1 # SPDX-License-Identifier: GPL-2.0-only
3 # This file is part of Nominatim. (https://nominatim.org)
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
8 Tokenizer implementing normalisation as used before Nominatim 4.
10 from collections import OrderedDict
14 from textwrap import dedent
16 from icu import Transliterator
18 import psycopg2.extras
20 from nominatim.db.connection import connect
21 from nominatim.db import properties
22 from nominatim.db import utils as db_utils
23 from nominatim.db.sql_preprocessor import SQLPreprocessor
24 from nominatim.errors import UsageError
25 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
27 DBCFG_NORMALIZATION = "tokenizer_normalization"
28 DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
30 LOG = logging.getLogger()
32 def create(dsn, data_dir):
33 """ Create a new instance of the tokenizer provided by this module.
35 return LegacyTokenizer(dsn, data_dir)
38 def _install_module(config_module_path, src_dir, module_dir):
39 """ Copies the PostgreSQL normalisation module into the project
40 directory if necessary. For historical reasons the module is
41 saved in the '/module' subdirectory and not with the other tokenizer
44 The function detects when the installation is run from the
45 build directory. It doesn't touch the module in that case.
47 # Custom module locations are simply used as is.
48 if config_module_path:
49 LOG.info("Using custom path for database module at '%s'", config_module_path)
50 return config_module_path
52 # Compatibility mode for builddir installations.
53 if module_dir.exists() and src_dir.samefile(module_dir):
54 LOG.info('Running from build directory. Leaving database module as is.')
57 # In any other case install the module in the project directory.
58 if not module_dir.exists():
61 destfile = module_dir / 'nominatim.so'
62 shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
65 LOG.info('Database module installed at %s', str(destfile))
70 def _check_module(module_dir, conn):
71 """ Try to use the PostgreSQL module to confirm that it is correctly
72 installed and accessible from PostgreSQL.
74 with conn.cursor() as cur:
76 cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
77 RETURNS text AS %s, 'transliteration'
78 LANGUAGE c IMMUTABLE STRICT;
79 DROP FUNCTION nominatim_test_import_func(text)
80 """, (f'{module_dir}/nominatim.so', ))
81 except psycopg2.DatabaseError as err:
82 LOG.fatal("Error accessing database module: %s", err)
83 raise UsageError("Database module cannot be accessed.") from err
86 class LegacyTokenizer(AbstractTokenizer):
87 """ The legacy tokenizer uses a special PostgreSQL module to normalize
88 names and queries. The tokenizer thus implements normalization through
89 calls to the database.
92 def __init__(self, dsn, data_dir):
94 self.data_dir = data_dir
95 self.normalization = None
98 def init_new_db(self, config, init_db=True):
99 """ Set up a new tokenizer for the database.
101 This copies all necessary data in the project directory to make
102 sure the tokenizer remains stable even over updates.
104 module_dir = _install_module(config.DATABASE_MODULE_PATH,
105 config.lib_dir.module,
106 config.project_dir / 'module')
108 self.normalization = config.TERM_NORMALIZATION
110 self._install_php(config, overwrite=True)
112 with connect(self.dsn) as conn:
113 _check_module(module_dir, conn)
114 self._save_config(conn, config)
118 self.update_sql_functions(config)
119 self._init_db_tables(config)
122 def init_from_project(self, config):
123 """ Initialise the tokenizer from the project directory.
125 with connect(self.dsn) as conn:
126 self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
128 if not (config.project_dir / 'module' / 'nominatim.so').exists():
129 _install_module(config.DATABASE_MODULE_PATH,
130 config.lib_dir.module,
131 config.project_dir / 'module')
133 self._install_php(config, overwrite=False)
135 def finalize_import(self, config):
136 """ Do any required postprocessing to make the tokenizer data ready
139 with connect(self.dsn) as conn:
140 sqlp = SQLPreprocessor(conn, config)
141 sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
144 def update_sql_functions(self, config):
145 """ Reimport the SQL functions for this tokenizer.
147 with connect(self.dsn) as conn:
148 max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
149 modulepath = config.DATABASE_MODULE_PATH or \
150 str((config.project_dir / 'module').resolve())
151 sqlp = SQLPreprocessor(conn, config)
152 sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
153 max_word_freq=max_word_freq,
154 modulepath=modulepath)
157 def check_database(self, _):
158 """ Check that the tokenizer is set up correctly.
161 The Postgresql extension nominatim.so was not correctly loaded.
166 * Check the output of the CMmake/make installation step
167 * Does nominatim.so exist?
168 * Does nominatim.so exist on the database server?
169 * Can nominatim.so be accessed by the database user?
171 with connect(self.dsn) as conn:
172 with conn.cursor() as cur:
174 out = cur.scalar("SELECT make_standard_name('a')")
175 except psycopg2.Error as err:
176 return hint.format(error=str(err))
179 return hint.format(error='Unexpected result for make_standard_name()')
184 def migrate_database(self, config):
185 """ Initialise the project directory of an existing database for
186 use with this tokenizer.
188 This is a special migration function for updating existing databases
189 to new software versions.
191 self.normalization = config.TERM_NORMALIZATION
192 module_dir = _install_module(config.DATABASE_MODULE_PATH,
193 config.lib_dir.module,
194 config.project_dir / 'module')
196 with connect(self.dsn) as conn:
197 _check_module(module_dir, conn)
198 self._save_config(conn, config)
201 def update_statistics(self):
202 """ Recompute the frequency of full words.
204 with connect(self.dsn) as conn:
205 if conn.table_exists('search_name'):
206 with conn.cursor() as cur:
207 cur.drop_table("word_frequencies")
208 LOG.info("Computing word frequencies")
209 cur.execute("""CREATE TEMP TABLE word_frequencies AS
210 SELECT unnest(name_vector) as id, count(*)
211 FROM search_name GROUP BY id""")
212 cur.execute("CREATE INDEX ON word_frequencies(id)")
213 LOG.info("Update word table with recomputed frequencies")
214 cur.execute("""UPDATE word SET search_name_count = count
215 FROM word_frequencies
216 WHERE word_token like ' %' and word_id = id""")
217 cur.drop_table("word_frequencies")
221 def update_word_tokens(self):
222 """ No house-keeping implemented for the legacy tokenizer.
224 LOG.info("No tokenizer clean-up available.")
227 def name_analyzer(self):
228 """ Create a new analyzer for tokenizing names and queries
229 using this tokinzer. Analyzers are context managers and should
233 with tokenizer.name_analyzer() as analyzer:
237 When used outside the with construct, the caller must ensure to
238 call the close() function before destructing the analyzer.
240 Analyzers are not thread-safe. You need to instantiate one per thread.
242 normalizer = Transliterator.createFromRules("phrase normalizer",
244 return LegacyNameAnalyzer(self.dsn, normalizer)
247 def _install_php(self, config, overwrite=True):
248 """ Install the php script for the tokenizer.
250 php_file = self.data_dir / "tokenizer.php"
252 if not php_file.exists() or overwrite:
253 php_file.write_text(dedent(f"""\
255 @define('CONST_Max_Word_Frequency', {config.MAX_WORD_FREQUENCY});
256 @define('CONST_Term_Normalization_Rules', "{config.TERM_NORMALIZATION}");
257 require_once('{config.lib_dir.php}/tokenizer/legacy_tokenizer.php');
258 """), encoding='utf-8')
261 def _init_db_tables(self, config):
262 """ Set up the word table and fill it with pre-computed word
265 with connect(self.dsn) as conn:
266 sqlp = SQLPreprocessor(conn, config)
267 sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
270 LOG.warning("Precomputing word tokens")
271 db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
274 def _save_config(self, conn, config):
275 """ Save the configuration that needs to remain stable for the given
276 database as database properties.
278 properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
279 properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
282 class LegacyNameAnalyzer(AbstractAnalyzer):
283 """ The legacy analyzer uses the special Postgresql module for
286 Each instance opens a connection to the database to request the
290 def __init__(self, dsn, normalizer):
291 self.conn = connect(dsn).connection
292 self.conn.autocommit = True
293 self.normalizer = normalizer
294 psycopg2.extras.register_hstore(self.conn)
296 self._cache = _TokenCache(self.conn)
300 """ Free all resources used by the analyzer.
307 def get_word_token_info(self, words):
308 """ Return token information for the given list of words.
309 If a word starts with # it is assumed to be a full name
310 otherwise is a partial name.
312 The function returns a list of tuples with
313 (original word, word token, word id).
315 The function is used for testing and debugging only
316 and not necessarily efficient.
318 with self.conn.cursor() as cur:
319 cur.execute("""SELECT t.term, word_token, word_id
320 FROM word, (SELECT unnest(%s::TEXT[]) as term) t
321 WHERE word_token = (CASE
322 WHEN left(t.term, 1) = '#' THEN
323 ' ' || make_standard_name(substring(t.term from 2))
325 make_standard_name(t.term)
327 and class is null and country_code is null""",
330 return [(r[0], r[1], r[2]) for r in cur]
333 def normalize(self, phrase):
334 """ Normalize the given phrase, i.e. remove all properties that
335 are irrelevant for search.
337 return self.normalizer.transliterate(phrase)
340 def normalize_postcode(self, postcode):
341 """ Convert the postcode to a standardized form.
343 This function must yield exactly the same result as the SQL function
344 'token_normalized_postcode()'.
346 return postcode.strip().upper()
349 def update_postcodes_from_db(self):
350 """ Update postcode tokens in the word table from the location_postcode
353 with self.conn.cursor() as cur:
354 # This finds us the rows in location_postcode and word that are
355 # missing in the other table.
356 cur.execute("""SELECT * FROM
357 (SELECT pc, word FROM
358 (SELECT distinct(postcode) as pc FROM location_postcode) p
360 (SELECT word FROM word
361 WHERE class ='place' and type = 'postcode') w
363 WHERE pc is null or word is null""")
368 for postcode, word in cur:
370 to_delete.append(word)
372 to_add.append(postcode)
375 cur.execute("""DELETE FROM WORD
376 WHERE class ='place' and type = 'postcode'
380 cur.execute("""SELECT count(create_postcode_id(pc))
381 FROM unnest(%s) as pc
386 def update_special_phrases(self, phrases, should_replace):
387 """ Replace the search index for special phrases with the new phrases.
389 norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
392 with self.conn.cursor() as cur:
393 # Get the old phrases.
394 existing_phrases = set()
395 cur.execute("""SELECT word, class, type, operator FROM word
396 WHERE class != 'place'
397 OR (type != 'house' AND type != 'postcode')""")
398 for label, cls, typ, oper in cur:
399 existing_phrases.add((label, cls, typ, oper or '-'))
401 to_add = norm_phrases - existing_phrases
402 to_delete = existing_phrases - norm_phrases
406 """ INSERT INTO word (word_id, word_token, word, class, type,
407 search_name_count, operator)
408 (SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
410 CASE WHEN op in ('in', 'near') THEN op ELSE null END
411 FROM (VALUES %s) as v(name, class, type, op))""",
414 if to_delete and should_replace:
416 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
417 WHERE word = name and class = in_class and type = in_type
418 and ((op = '-' and operator is null) or op = operator)""",
421 LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
422 len(norm_phrases), len(to_add), len(to_delete))
425 def add_country_names(self, country_code, names):
426 """ Add names for the given country to the search index.
428 with self.conn.cursor() as cur:
430 """INSERT INTO word (word_id, word_token, country_code)
431 (SELECT nextval('seq_word'), lookup_token, %s
432 FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
434 WHERE NOT EXISTS(SELECT * FROM word
435 WHERE word_token = lookup_token and country_code = %s))
436 """, (country_code, list(names.values()), country_code))
439 def process_place(self, place):
440 """ Determine tokenizer information about the given place.
442 Returns a JSON-serialisable structure that will be handed into
443 the database via the token_info field.
445 token_info = _TokenInfo(self._cache)
450 token_info.add_names(self.conn, names)
452 if place.is_country():
453 self.add_country_names(place.country_code, names)
455 address = place.address
457 self._process_place_address(token_info, address)
459 return token_info.data
462 def _process_place_address(self, token_info, address):
466 for key, value in address.items():
467 if key == 'postcode':
468 # Make sure the normalized postcode is present in the word table.
469 if re.search(r'[:,;]', value) is None:
470 self._cache.add_postcode(self.conn,
471 self.normalize_postcode(value))
472 elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
474 elif key == 'street':
475 token_info.add_street(self.conn, value)
477 token_info.add_place(self.conn, value)
478 elif not key.startswith('_') and key not in ('country', 'full'):
479 addr_terms.append((key, value))
482 token_info.add_housenumbers(self.conn, hnrs)
485 token_info.add_address_terms(self.conn, addr_terms)
490 """ Collect token information to be sent back to the database.
492 def __init__(self, cache):
497 def add_names(self, conn, names):
498 """ Add token information for the names of the place.
500 with conn.cursor() as cur:
501 # Create the token IDs for all names.
502 self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
506 def add_housenumbers(self, conn, hnrs):
507 """ Extract housenumber information from the address.
510 token = self.cache.get_housenumber(hnrs[0])
511 if token is not None:
512 self.data['hnr_tokens'] = token
513 self.data['hnr'] = hnrs[0]
516 # split numbers if necessary
519 simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
521 if len(simple_list) > 1:
522 simple_list = list(set(simple_list))
524 with conn.cursor() as cur:
525 cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
526 self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
529 def add_street(self, conn, street):
530 """ Add addr:street match terms.
532 def _get_street(name):
533 with conn.cursor() as cur:
534 return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))
536 tokens = self.cache.streets.get(street, _get_street)
538 self.data['street'] = tokens
541 def add_place(self, conn, place):
542 """ Add addr:place search and match terms.
544 def _get_place(name):
545 with conn.cursor() as cur:
546 cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
547 word_ids_from_name(%s)::text""",
549 return cur.fetchone()
551 self.data['place_search'], self.data['place_match'] = \
552 self.cache.places.get(place, _get_place)
555 def add_address_terms(self, conn, terms):
556 """ Add additional address terms.
558 def _get_address_term(name):
559 with conn.cursor() as cur:
560 cur.execute("""SELECT addr_ids_from_name(%s)::text,
561 word_ids_from_name(%s)::text""",
563 return cur.fetchone()
566 for key, value in terms:
567 items = self.cache.address_terms.get(value, _get_address_term)
568 if items[0] or items[1]:
572 self.data['addr'] = tokens
576 """ Least recently used cache that accepts a generator function to
577 produce the item when there is a cache miss.
580 def __init__(self, maxsize=128, init_data=None):
581 self.data = init_data or OrderedDict()
582 self.maxsize = maxsize
583 if init_data is not None and len(init_data) > maxsize:
584 self.maxsize = len(init_data)
586 def get(self, key, generator):
587 """ Get the item with the given key from the cache. If nothing
588 is found in the cache, generate the value through the
589 generator function and store it in the cache.
591 value = self.data.get(key)
592 if value is not None:
593 self.data.move_to_end(key)
595 value = generator(key)
596 if len(self.data) >= self.maxsize:
597 self.data.popitem(last=False)
598 self.data[key] = value
604 """ Cache for token information to avoid repeated database queries.
606 This cache is not thread-safe and needs to be instantiated per
609 def __init__(self, conn):
611 self.streets = _LRU(maxsize=256)
612 self.places = _LRU(maxsize=128)
613 self.address_terms = _LRU(maxsize=1024)
615 # Lookup houseunumbers up to 100 and cache them
616 with conn.cursor() as cur:
617 cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
618 FROM generate_series(1, 100) as i""")
619 self._cached_housenumbers = {str(r[0]): r[1] for r in cur}
621 # For postcodes remember the ones that have already been added
622 self.postcodes = set()
624 def get_housenumber(self, number):
625 """ Get a housenumber token from the cache.
627 return self._cached_housenumbers.get(number)
630 def add_postcode(self, conn, postcode):
631 """ Make sure the given postcode is in the database.
633 if postcode not in self.postcodes:
634 with conn.cursor() as cur:
635 cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
636 self.postcodes.add(postcode)