# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from collections import OrderedDict
import logging
import re
import shutil
from textwrap import dedent

from icu import Transliterator
import psycopg2
import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyTokenizer(dsn, data_dir)


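# Usage sketch (illustrative only; the DSN and project path below are
# hypothetical):
#
#     from pathlib import Path
#
#     tokenizer = create('dbname=nominatim', Path('/srv/nominatim-project'))
#     tokenizer.init_from_project(None)  # load settings of an existing database

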
def _install_module(config_module_path, src_dir, module_dir):
    """ Copies the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.
    """
    # Custom module locations are simply used as is.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        return config_module_path

    # Compatibility mode for builddir installations.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return module_dir

    # In any other case install the module in the project directory.
    if not module_dir.exists():
        module_dir.mkdir()

    destfile = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
    destfile.chmod(0o755)

    LOG.info('Database module installed at %s', str(destfile))

    return module_dir


def _check_module(module_dir, conn):
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.
    """
    with conn.cursor() as cur:
        try:
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS '{}/nominatim.so', 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """.format(module_dir))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err


class LegacyTokenizer(AbstractTokenizer):
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config)

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)
            conn.commit()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, _):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
            modulepath = config.DATABASE_MODULE_PATH or \
                         str((config.project_dir / 'module').resolve())
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                              max_word_freq=max_word_freq,
                              modulepath=modulepath)


    def check_database(self, _):
        """ Check that the tokenizer is set up correctly.
        """
        hint = """\
             The Postgresql extension nominatim.so was not correctly loaded.

             Error: {error}

             Hints:
             * Check the output of the CMake/make installation step
             * Does nominatim.so exist?
             * Does nominatim.so exist on the database server?
             * Can nominatim.so be accessed by the database user?
             """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                try:
                    out = cur.scalar("SELECT make_standard_name('a')")
                except psycopg2.Error as err:
                    return hint.format(error=str(err))

        if out != 'a':
            return hint.format(error='Unexpected result for make_standard_name()')

        return None


    def migrate_database(self, config):
        """ Initialise the project directory of an existing database for
            use with this tokenizer.

            This is a special migration function for updating existing databases
            to new software versions.
        """
        self.normalization = config.TERM_NORMALIZATION
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)


    def update_statistics(self):
        """ Recompute the frequency of full words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word SET search_name_count = count
                                   FROM word_frequencies
                                   WHERE word_token like ' %' and word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        normalizer = Transliterator.createFromRules("phrase normalizer",
                                                    self.normalization)
        return LegacyNameAnalyzer(self.dsn, normalizer)


    def _install_php(self, config):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
            @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
            require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
            """.format(config)))


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')


    def _save_config(self, conn, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)


class LegacyNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the special Postgresql module for
        splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, normalizer):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
        psycopg2.extras.register_hstore(self.conn)

        self._cache = _TokenCache(self.conn)


285 """ Free all resources used by the analyzer.
    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        with self.conn.cursor() as cur:
            cur.execute("""SELECT t.term, word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = (CASE
                                   WHEN left(t.term, 1) = '#' THEN
                                     ' ' || make_standard_name(substring(t.term from 2))
                                   ELSE
                                     make_standard_name(t.term)
                                   END)
                                 and class is null and country_code is null""",
                        (words, ))

            return [(r[0], r[1], r[2]) for r in cur]


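    # Example (hypothetical data): look up a full name (prefixed with '#') and
    # a partial name. Full-name tokens are stored with a leading blank, which
    # is why the query above prepends ' ' in the '#' branch of the CASE:
    #
    #     with tokenizer.name_analyzer() as analyzer:
    #         analyzer.get_word_token_info(['#Main Street', 'main'])
    #         # -> e.g. [('#Main Street', ' main street', 123), ('main', 'main', 456)]

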
    def normalize(self, phrase):
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return self.normalizer.transliterate(phrase)


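    # Sketch, assuming `analyzer` is a LegacyNameAnalyzer instance; the exact
    # output depends entirely on the configured TERM_NORMALIZATION rules:
    #
    #     analyzer.normalize('Bäcker Straße')  # -> e.g. 'backer strasse'

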
    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()


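    # Examples (these follow directly from the implementation above):
    #
    #     LegacyNameAnalyzer.normalize_postcode(' ab1 2cd ')  # -> 'AB1 2CD'
    #     LegacyNameAnalyzer.normalize_postcode('75001')      # -> '75001'

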
    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            to_delete = []
            to_add = []

            for postcode, word in cur:
                if postcode is None:
                    to_delete.append(word)
                else:
                    to_add.append(postcode)

            if to_delete:
                cur.execute("""DELETE FROM WORD
                               WHERE class ='place' and type = 'postcode'
                                     and word = any(%s)
                            """, (to_delete, ))
            if to_add:
                cur.execute("""SELECT count(create_postcode_id(pc))
                               FROM unnest(%s) as pc
                            """, (to_add, ))


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                cur.execute_values(
                    """ INSERT INTO word (word_id, word_token, word, class, type,
                                          search_name_count, operator)
                        (SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
                                class, type, 0,
                                CASE WHEN op in ('in', 'near') THEN op ELSE null END
                           FROM (VALUES %s) as v(name, class, type, op))""",
                    to_add)

            if to_delete and should_replace:
                cur.execute_values(
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))


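    # Example (hypothetical phrases): each phrase is a tuple of
    # (label, class, type, operator), where operator '-' stands for
    # "no operator":
    #
    #     analyzer.update_special_phrases(
    #         [('Restaurant', 'amenity', 'restaurant', '-'),
    #          ('Restaurants in', 'amenity', 'restaurant', 'in')],
    #         should_replace=True)

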
    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        with self.conn.cursor() as cur:
            cur.execute(
                """INSERT INTO word (word_id, word_token, country_code)
                   (SELECT nextval('seq_word'), lookup_token, %s
                      FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
                            FROM unnest(%s) n) v
                      WHERE NOT EXISTS(SELECT * FROM word
                                       WHERE word_token = lookup_token and country_code = %s))
                """, (country_code, list(names.values()), country_code))


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.name

        if names:
            token_info.add_names(self.conn, names)

            if place.is_country():
                self.add_country_names(place.country_code, names)

        address = place.address
        if address:
            self._process_place_address(token_info, address)

        return token_info.data


    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []

        for key, value in address.items():
            if key == 'postcode':
                # Make sure the normalized postcode is present in the word table.
                if re.search(r'[:,;]', value) is None:
                    self._cache.add_postcode(self.conn,
                                             self.normalize_postcode(value))
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(self.conn, value)
            elif key == 'place':
                token_info.add_place(self.conn, value)
            elif not key.startswith('_') and key not in ('country', 'full'):
                addr_terms.append((key, value))

        if hnrs:
            token_info.add_housenumbers(self.conn, hnrs)

        if addr_terms:
            token_info.add_address_terms(self.conn, addr_terms)


476 """ Collect token information to be sent back to the database.
478 def __init__(self, cache):
    def add_names(self, conn, names):
        """ Add token information for the names of the place.
        """
        with conn.cursor() as cur:
            # Create the token IDs for all names.
            self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
                                            (names, ))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from the address.
        """
        if len(hnrs) == 1:
            token = self.cache.get_housenumber(hnrs[0])
            if token is not None:
                self.data['hnr_tokens'] = token
                self.data['hnr'] = hnrs[0]
                return

        # split numbers if necessary
        simple_list = []
        for hnr in hnrs:
            simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

        if len(simple_list) > 1:
            simple_list = list(set(simple_list))

        with conn.cursor() as cur:
            cur.execute("SELECT (create_housenumbers(%s)).* ", (simple_list, ))
            self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()


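    # Example of the splitting behaviour (hypothetical input): a list like
    # ['1;2b, 3'] misses the per-number cache, is split on ';' and ',' into
    # ['1', '2b', '3'], deduplicated, and passed to create_housenumbers().

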
    def add_street(self, conn, street):
        """ Add addr:street match terms.
        """
        def _get_street(name):
            with conn.cursor() as cur:
                return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))

        tokens = self.cache.streets.get(street, _get_street)
        if tokens:
            self.data['street'] = tokens


    def add_place(self, conn, place):
        """ Add addr:place search and match terms.
        """
        def _get_place(name):
            with conn.cursor() as cur:
                cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cur.fetchone()

        self.data['place_search'], self.data['place_match'] = \
            self.cache.places.get(place, _get_place)


    def add_address_terms(self, conn, terms):
        """ Add additional address terms.
        """
        def _get_address_term(name):
            with conn.cursor() as cur:
                cur.execute("""SELECT addr_ids_from_name(%s)::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cur.fetchone()

        tokens = {}
        for key, value in terms:
            items = self.cache.address_terms.get(value, _get_address_term)
            if items[0] or items[1]:
                tokens[key] = items

        if tokens:
            self.data['addr'] = tokens


562 """ Least recently used cache that accepts a generator function to
563 produce the item when there is a cache miss.
566 def __init__(self, maxsize=128, init_data=None):
567 self.data = init_data or OrderedDict()
568 self.maxsize = maxsize
569 if init_data is not None and len(init_data) > maxsize:
570 self.maxsize = len(init_data)
    def get(self, key, generator):
        """ Get the item with the given key from the cache. If nothing
            is found in the cache, generate the value through the
            generator function and store it in the cache.
        """
        value = self.data.get(key)
        if value is not None:
            self.data.move_to_end(key)
        else:
            value = generator(key)
            if len(self.data) >= self.maxsize:
                self.data.popitem(last=False)
            self.data[key] = value

        return value


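    # Minimal usage sketch of the cache (self-contained, no database needed):
    #
    #     cache = _LRU(maxsize=2)
    #     cache.get('a', str.upper)  # miss -> generates and stores 'A'
    #     cache.get('a', str.upper)  # hit  -> returns cached 'A'
    #     cache.get('b', str.upper)  # miss -> stores 'B'
    #     cache.get('c', str.upper)  # miss -> evicts 'a', the least recently used

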
590 """ Cache for token information to avoid repeated database queries.
592 This cache is not thread-safe and needs to be instantiated per
595 def __init__(self, conn):
597 self.streets = _LRU(maxsize=256)
598 self.places = _LRU(maxsize=128)
599 self.address_terms = _LRU(maxsize=1024)
601 # Lookup houseunumbers up to 100 and cache them
602 with conn.cursor() as cur:
603 cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
604 FROM generate_series(1, 100) as i""")
605 self._cached_housenumbers = {str(r[0]): r[1] for r in cur}
607 # For postcodes remember the ones that have already been added
608 self.postcodes = set()
    def get_housenumber(self, number):
        """ Get a housenumber token from the cache.
        """
        return self._cached_housenumbers.get(number)


    def add_postcode(self, conn, postcode):
        """ Make sure the given postcode is in the database.
        """
        if postcode not in self.postcodes:
            with conn.cursor() as cur:
                cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
            self.postcodes.add(postcode)