1 # SPDX-License-Identifier: GPL-2.0-only
3 # This file is part of Nominatim. (https://nominatim.org)
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from collections import OrderedDict
import logging
import re
import shutil
from textwrap import dedent

from icu import Transliterator
import psycopg2
import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
# Keys under which the tokenizer persists its configuration in the
# database property table, so that it stays stable over software updates.
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

LOG = logging.getLogger()
def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyTokenizer(dsn, data_dir)
def _install_module(config_module_path, src_dir, module_dir):
    """ Copies the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.

        Returns the directory from which the module should be loaded.
    """
    # Custom module locations are simply used as is.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        return config_module_path

    # Compatibility mode for builddir installations.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return module_dir

    # In any other case install the module in the project directory.
    if not module_dir.exists():
        module_dir.mkdir()
    destfile = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
    # The database server must be able to execute the module.
    destfile.chmod(0o755)

    LOG.info('Database module installed at %s', str(destfile))

    return module_dir
def _check_module(module_dir, conn):
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.

        Raises a UsageError when the database server cannot load the module.
    """
    with conn.cursor() as cur:
        try:
            # Creating (and immediately dropping) a function backed by the
            # shared object fails if the server cannot load it.
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS '{}/nominatim.so', 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """.format(module_dir))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err
class LegacyTokenizer(AbstractTokenizer):
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        # Loaded lazily from config or from the database properties.
        self.normalization = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config)

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)
            conn.commit()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, _):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
            modulepath = config.DATABASE_MODULE_PATH or \
                         str((config.project_dir / 'module').resolve())
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                              max_word_freq=max_word_freq,
                              modulepath=modulepath)


    def check_database(self, _):
        """ Check that the tokenizer is set up correctly.

            Returns an error message for the user when the database module
            does not work, otherwise None.
        """
        hint = """\
             The Postgresql extension nominatim.so was not correctly loaded.

             Error: {error}

             Hints:
             * Check the output of the CMake/make installation step
             * Does nominatim.so exist?
             * Does nominatim.so exist on the database server?
             * Can nominatim.so be accessed by the database user?
             """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                try:
                    out = cur.scalar("SELECT make_standard_name('a')")
                except psycopg2.Error as err:
                    return hint.format(error=str(err))

        if out != 'a':
            return hint.format(error='Unexpected result for make_standard_name()')

        return None


    def migrate_database(self, config):
        """ Initialise the project directory of an existing database for
            use with this tokenizer.

            This is a special migration function for updating existing databases
            to new software versions.
        """
        self.normalization = config.TERM_NORMALIZATION
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)


    def update_statistics(self):
        """ Recompute the frequency of full words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    # Only full-word tokens (leading blank) get a frequency.
                    cur.execute("""UPDATE word SET search_name_count = count
                                   FROM word_frequencies
                                   WHERE word_token like ' %' and word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()


    def update_word_tokens(self):
        """ No house-keeping implemented for the legacy tokenizer.
        """
        LOG.info("No tokenizer clean-up available.")


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        normalizer = Transliterator.createFromRules("phrase normalizer",
                                                    self.normalization)
        return LegacyNameAnalyzer(self.dsn, normalizer)


    def _install_php(self, config):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
            @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
            require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
            """.format(config)))


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')


    def _save_config(self, conn, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
class LegacyNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the special Postgresql module for
        splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, normalizer):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
        # hstore values are used for name dictionaries in make_keywords().
        psycopg2.extras.register_hstore(self.conn)

        self._cache = _TokenCache(self.conn)


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name
            otherwise is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        with self.conn.cursor() as cur:
            cur.execute("""SELECT t.term, word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = (CASE
                                   WHEN left(t.term, 1) = '#' THEN
                                     ' ' || make_standard_name(substring(t.term from 2))
                                   ELSE
                                     make_standard_name(t.term)
                                   END)
                                 and class is null and country_code is null""",
                        (words, ))

            return [(r[0], r[1], r[2]) for r in cur]


    def normalize(self, phrase):
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return self.normalizer.transliterate(phrase)


    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            to_delete = []
            to_add = []

            for postcode, word in cur:
                if postcode is None:
                    # postcode only exists in the word table: token is obsolete
                    to_delete.append(word)
                else:
                    # postcode is missing from the word table: needs a token
                    to_add.append(postcode)

            if to_delete:
                cur.execute("""DELETE FROM WORD
                               WHERE class ='place' and type = 'postcode'
                                     and word = any(%s)
                            """, (to_delete, ))
            if to_add:
                cur.execute("""SELECT count(create_postcode_id(pc))
                               FROM unnest(%s) as pc
                            """, (to_add, ))


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                cur.execute_values(
                    """ INSERT INTO word (word_id, word_token, word, class, type,
                                          search_name_count, operator)
                        (SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
                                class, type, 0,
                                CASE WHEN op in ('in', 'near') THEN op ELSE null END
                           FROM (VALUES %s) as v(name, class, type, op))""",
                    to_add)

            if to_delete and should_replace:
                cur.execute_values(
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        with self.conn.cursor() as cur:
            cur.execute(
                """INSERT INTO word (word_id, word_token, country_code)
                   (SELECT nextval('seq_word'), lookup_token, %s
                      FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
                              FROM unnest(%s)n) y
                      WHERE NOT EXISTS(SELECT * FROM word
                                       WHERE word_token = lookup_token and country_code = %s))
                """, (country_code, list(names.values()), country_code))


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.name

        if names:
            token_info.add_names(self.conn, names)

            if place.is_country():
                self.add_country_names(place.country_code, names)

        address = place.address
        if address:
            self._process_place_address(token_info, address)

        return token_info.data


    def _process_place_address(self, token_info, address):
        """ Add tokens for the address parts (postcode, housenumber,
            street, place and remaining addr:* terms) to the token info.
        """
        hnrs = []
        addr_terms = []

        for key, value in address.items():
            if key == 'postcode':
                # Make sure the normalized postcode is present in the word table.
                if re.search(r'[:,;]', value) is None:
                    self._cache.add_postcode(self.conn,
                                             self.normalize_postcode(value))
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(self.conn, value)
            elif key == 'place':
                token_info.add_place(self.conn, value)
            elif not key.startswith('_') and key not in ('country', 'full'):
                addr_terms.append((key, value))

        if hnrs:
            token_info.add_housenumbers(self.conn, hnrs)

        if addr_terms:
            token_info.add_address_terms(self.conn, addr_terms)
483 """ Collect token information to be sent back to the database.
485 def __init__(self, cache):
490 def add_names(self, conn, names):
491 """ Add token information for the names of the place.
493 with conn.cursor() as cur:
494 # Create the token IDs for all names.
495 self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
499 def add_housenumbers(self, conn, hnrs):
500 """ Extract housenumber information from the address.
503 token = self.cache.get_housenumber(hnrs[0])
504 if token is not None:
505 self.data['hnr_tokens'] = token
506 self.data['hnr'] = hnrs[0]
509 # split numbers if necessary
512 simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
514 if len(simple_list) > 1:
515 simple_list = list(set(simple_list))
517 with conn.cursor() as cur:
518 cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
519 self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
522 def add_street(self, conn, street):
523 """ Add addr:street match terms.
525 def _get_street(name):
526 with conn.cursor() as cur:
527 return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))
529 tokens = self.cache.streets.get(street, _get_street)
531 self.data['street'] = tokens
534 def add_place(self, conn, place):
535 """ Add addr:place search and match terms.
537 def _get_place(name):
538 with conn.cursor() as cur:
539 cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
540 word_ids_from_name(%s)::text""",
542 return cur.fetchone()
544 self.data['place_search'], self.data['place_match'] = \
545 self.cache.places.get(place, _get_place)
548 def add_address_terms(self, conn, terms):
549 """ Add additional address terms.
551 def _get_address_term(name):
552 with conn.cursor() as cur:
553 cur.execute("""SELECT addr_ids_from_name(%s)::text,
554 word_ids_from_name(%s)::text""",
556 return cur.fetchone()
559 for key, value in terms:
560 items = self.cache.address_terms.get(value, _get_address_term)
561 if items[0] or items[1]:
565 self.data['addr'] = tokens
569 """ Least recently used cache that accepts a generator function to
570 produce the item when there is a cache miss.
573 def __init__(self, maxsize=128, init_data=None):
574 self.data = init_data or OrderedDict()
575 self.maxsize = maxsize
576 if init_data is not None and len(init_data) > maxsize:
577 self.maxsize = len(init_data)
579 def get(self, key, generator):
580 """ Get the item with the given key from the cache. If nothing
581 is found in the cache, generate the value through the
582 generator function and store it in the cache.
584 value = self.data.get(key)
585 if value is not None:
586 self.data.move_to_end(key)
588 value = generator(key)
589 if len(self.data) >= self.maxsize:
590 self.data.popitem(last=False)
591 self.data[key] = value
class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self, conn):
        # LRU caches for the different kinds of address tokens.
        self.streets = _LRU(maxsize=256)
        self.places = _LRU(maxsize=128)
        self.address_terms = _LRU(maxsize=1024)

        # Lookup housenumbers up to 100 and cache them
        with conn.cursor() as cur:
            cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
                           FROM generate_series(1, 100) as i""")
            self._cached_housenumbers = {str(r[0]): r[1] for r in cur}

        # For postcodes remember the ones that have already been added
        self.postcodes = set()


    def get_housenumber(self, number):
        """ Get a housenumber token from the cache.

            Returns None when the number is not one of the precomputed ones.
        """
        return self._cached_housenumbers.get(number)


    def add_postcode(self, conn, postcode):
        """ Make sure the given postcode is in the database.
        """
        if postcode not in self.postcodes:
            with conn.cursor() as cur:
                cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
            self.postcodes.add(postcode)