# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from collections import OrderedDict
import logging
import re
import shutil
from textwrap import dedent

from icu import Transliterator
import psycopg2
import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyTokenizer(dsn, data_dir)


def _install_module(config_module_path, src_dir, module_dir):
    """ Copies the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.
    """
    # Custom module locations are simply used as is.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        return config_module_path

    # Compatibility mode for builddir installations.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return module_dir

    # In any other case install the module in the project directory.
    if not module_dir.exists():
        module_dir.mkdir()

    destfile = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
    destfile.chmod(0o755)

    LOG.info('Database module installed at %s', str(destfile))

    return module_dir


def _check_module(module_dir, conn):
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.
    """
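    # A sketch of the idea: ask PostgreSQL to bind a throw-away C function
    # against the 'transliteration' symbol in nominatim.so and drop it again.
    # If the server cannot load the shared object, the CREATE FUNCTION
    # statement fails and the error is converted into a UsageError below.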
    with conn.cursor() as cur:
        try:
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS %s, 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """, (f'{module_dir}/nominatim.so', ))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err


class LegacyTokenizer(AbstractTokenizer):
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config, overwrite=True)

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)
            conn.commit()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)

        if not (config.project_dir / 'module' / 'nominatim.so').exists():
            _install_module(config.DATABASE_MODULE_PATH,
                            config.lib_dir.module,
                            config.project_dir / 'module')

        self._install_php(config, overwrite=False)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
            modulepath = config.DATABASE_MODULE_PATH or \
                         str((config.project_dir / 'module').resolve())
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                              max_word_freq=max_word_freq,
                              modulepath=modulepath)


    def check_database(self, _):
        """ Check that the tokenizer is set up correctly.
        """
        hint = """\
             The PostgreSQL extension nominatim.so was not correctly loaded.

             Error: {error}

             Hints:
             * Check the output of the CMake/make installation step
             * Does nominatim.so exist?
             * Does nominatim.so exist on the database server?
             * Can nominatim.so be accessed by the database user?
             """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                try:
                    out = cur.scalar("SELECT make_standard_name('a')")
                except psycopg2.Error as err:
                    return hint.format(error=str(err))

        if out != 'a':
            return hint.format(error='Unexpected result for make_standard_name()')

        return None


    def migrate_database(self, config):
        """ Initialise the project directory of an existing database for
            use with this tokenizer.

            This is a special migration function for updating existing databases
            to new software versions.
        """
        self.normalization = config.TERM_NORMALIZATION
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)


    def update_statistics(self):
        """ Recompute the frequency of full words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word SET search_name_count = count
                                   FROM word_frequencies
                                   WHERE word_token like ' %' and word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()


    def update_word_tokens(self):
        """ No house-keeping implemented for the legacy tokenizer.
        """
        LOG.info("No tokenizer clean-up available.")


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        normalizer = Transliterator.createFromRules("phrase normalizer",
                                                    self.normalization)
        return LegacyNameAnalyzer(self.dsn, normalizer)


    def _install_php(self, config, overwrite=True):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"

        if not php_file.exists() or overwrite:
            php_file.write_text(dedent(f"""\
                <?php
                @define('CONST_Max_Word_Frequency', {config.MAX_WORD_FREQUENCY});
                @define('CONST_Term_Normalization_Rules', "{config.TERM_NORMALIZATION}");
                require_once('{config.lib_dir.php}/tokenizer/legacy_tokenizer.php');
                """), encoding='utf-8')


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')


    def _save_config(self, conn, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)


class LegacyNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the special PostgreSQL module for
        splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, normalizer):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
        psycopg2.extras.register_hstore(self.conn)

        self._cache = _TokenCache(self.conn)
300 """ Free all resources used by the analyzer.


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
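        # Hypothetical example: get_word_token_info(['#Alte Burg', 'burg'])
        # might return [('#Alte Burg', ' alte burg', 1234), ('burg', 'burg', 56)];
        # the ids depend entirely on the database contents.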
        with self.conn.cursor() as cur:
            cur.execute("""SELECT t.term, word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = (CASE
                                   WHEN left(t.term, 1) = '#' THEN
                                     ' ' || make_standard_name(substring(t.term from 2))
                                   ELSE
                                     make_standard_name(t.term)
                                   END)
                                 and class is null and country_code is null""",
                        (words, ))

            return [(r[0], r[1], r[2]) for r in cur]


    def normalize(self, phrase):
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return self.normalizer.transliterate(phrase)


    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
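        # e.g. '  ab-12 34 ' -> 'AB-12 34' (hypothetical input); whitespace
        # inside the postcode is deliberately left untouched.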
        return postcode.strip().upper()


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            to_delete = []
            to_add = []

            for postcode, word in cur:
                if postcode is None:
                    # Postcode is in the word table but no longer used.
                    to_delete.append(word)
                else:
                    # Postcode is new in location_postcode.
                    to_add.append(postcode)

            if to_delete:
                cur.execute("""DELETE FROM WORD
                               WHERE class ='place' and type = 'postcode'
                                     and word = any(%s)
                            """, (to_delete, ))
            if to_add:
                cur.execute("""SELECT count(create_postcode_id(pc))
                               FROM unnest(%s) as pc
                            """, (to_add, ))


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                cur.execute_values(
                    """ INSERT INTO word (word_id, word_token, word, class, type,
                                          search_name_count, operator)
                        (SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
                                class, type, 0,
                                CASE WHEN op in ('in', 'near') THEN op ELSE null END
                           FROM (VALUES %s) as v(name, class, type, op))""",
                    to_add)

            if to_delete and should_replace:
                cur.execute_values(
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        with self.conn.cursor() as cur:
            cur.execute(
                """INSERT INTO word (word_id, word_token, country_code)
                   (SELECT nextval('seq_word'), lookup_token, %s
                      FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
                              FROM unnest(%s) n) y
                      WHERE NOT EXISTS(SELECT * FROM word
                                       WHERE word_token = lookup_token and country_code = %s))
                """, (country_code, list(names.values()), country_code))


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
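        # The result is a plain dict, for example (hypothetical values):
        #   {'names': '{1,2,3}', 'hnr_tokens': '{45}', 'hnr': '23'}
        # The caller serialises it to JSON for the token_info column.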
        token_info = _TokenInfo(self._cache)

        names = place.name

        if names:
            token_info.add_names(self.conn, names)

            if place.is_country():
                self.add_country_names(place.country_code, names)

        address = place.address
        if address:
            self._process_place_address(token_info, address)

        return token_info.data


    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []

        for key, value in address.items():
            if key == 'postcode':
                # Make sure the normalized postcode is present in the word table.
                if re.search(r'[:,;]', value) is None:
                    self._cache.add_postcode(self.conn,
                                             self.normalize_postcode(value))
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(self.conn, value)
            elif key == 'place':
                token_info.add_place(self.conn, value)
            elif not key.startswith('_') and key not in ('country', 'full'):
                addr_terms.append((key, value))

        if hnrs:
            token_info.add_housenumbers(self.conn, hnrs)

        if addr_terms:
            token_info.add_address_terms(self.conn, addr_terms)
491 """ Collect token information to be sent back to the database.
493 def __init__(self, cache):


    def add_names(self, conn, names):
        """ Add token information for the names of the place.
        """
        with conn.cursor() as cur:
            # Create the token IDs for all names.
            self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
                                            (names, ))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from the address.
        """
        if len(hnrs) == 1:
            token = self.cache.get_housenumber(hnrs[0])
            if token is not None:
                self.data['hnr_tokens'] = token
                self.data['hnr'] = hnrs[0]
                return

        # split numbers if necessary
        simple_list = []
        for hnr in hnrs:
            simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

        if len(simple_list) > 1:
            simple_list = list(set(simple_list))

        with conn.cursor() as cur:
            cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
            self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()


    def add_street(self, conn, street):
        """ Add addr:street match terms.
        """
        def _get_street(name):
            with conn.cursor() as cur:
                return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))

        tokens = self.cache.streets.get(street, _get_street)
        if tokens:
            self.data['street'] = tokens


    def add_place(self, conn, place):
        """ Add addr:place search and match terms.
        """
        def _get_place(name):
            with conn.cursor() as cur:
                cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cur.fetchone()

        self.data['place_search'], self.data['place_match'] = \
            self.cache.places.get(place, _get_place)


    def add_address_terms(self, conn, terms):
        """ Add additional address terms.
        """
        def _get_address_term(name):
            with conn.cursor() as cur:
                cur.execute("""SELECT addr_ids_from_name(%s)::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cur.fetchone()

        tokens = {}
        for key, value in terms:
            items = self.cache.address_terms.get(value, _get_address_term)
            if items[0] or items[1]:
                tokens[key] = items

        if tokens:
            self.data['addr'] = tokens
577 """ Least recently used cache that accepts a generator function to
578 produce the item when there is a cache miss.
581 def __init__(self, maxsize=128, init_data=None):
582 self.data = init_data or OrderedDict()
583 self.maxsize = maxsize
584 if init_data is not None and len(init_data) > maxsize:
585 self.maxsize = len(init_data)
587 def get(self, key, generator):
588 """ Get the item with the given key from the cache. If nothing
589 is found in the cache, generate the value through the
590 generator function and store it in the cache.
592 value = self.data.get(key)
593 if value is not None:
594 self.data.move_to_end(key)
596 value = generator(key)
597 if len(self.data) >= self.maxsize:
598 self.data.popitem(last=False)
599 self.data[key] = value
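

# Usage sketch for _LRU (illustrative only): the generator runs on a miss,
# cached entries are returned directly, and the least recently used entry
# is evicted once maxsize is exceeded.
#
#   cache = _LRU(maxsize=2)
#   cache.get('a', str.upper)   # miss -> generator produces 'A'
#   cache.get('a', str.upper)   # hit  -> cached 'A', generator not called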
605 """ Cache for token information to avoid repeated database queries.
607 This cache is not thread-safe and needs to be instantiated per
610 def __init__(self, conn):
612 self.streets = _LRU(maxsize=256)
613 self.places = _LRU(maxsize=128)
614 self.address_terms = _LRU(maxsize=1024)
616 # Lookup houseunumbers up to 100 and cache them
617 with conn.cursor() as cur:
618 cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
619 FROM generate_series(1, 100) as i""")
620 self._cached_housenumbers = {str(r[0]): r[1] for r in cur}
622 # For postcodes remember the ones that have already been added
623 self.postcodes = set()


    def get_housenumber(self, number):
        """ Get a housenumber token from the cache.
        """
        return self._cached_housenumbers.get(number)


    def add_postcode(self, conn, postcode):
        """ Make sure the given postcode is in the database.
        """
        if postcode not in self.postcodes:
            with conn.cursor() as cur:
                cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
            self.postcodes.add(postcode)