# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from collections import OrderedDict
import logging
import re
import shutil
from textwrap import dedent

from icu import Transliterator
import psycopg2
import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyTokenizer(dsn, data_dir)


def _install_module(config_module_path, src_dir, module_dir):
    """ Copies the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.
    """
    # Custom module locations are simply used as is.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        return config_module_path

    # Compatibility mode for builddir installations.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return module_dir

    # In any other case install the module in the project directory.
    if not module_dir.exists():
        module_dir.mkdir()

    destfile = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
    # Make sure the copied module is readable and executable for the database server.
    destfile.chmod(0o755)

    LOG.info('Database module installed at %s', str(destfile))

    return module_dir


def _check_module(module_dir, conn):
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.
    """
    with conn.cursor() as cur:
        try:
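            # Create a throw-away C function backed by the module and drop it
            # again. If this succeeds, PostgreSQL can load nominatim.so.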
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS %s, 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """, (f'{module_dir}/nominatim.so', ))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err


class LegacyTokenizer(AbstractTokenizer):
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config, overwrite=True)

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)
            conn.commit()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)

        if not (config.project_dir / 'module' / 'nominatim.so').exists():
            _install_module(config.DATABASE_MODULE_PATH,
                            config.lib_dir.module,
                            config.project_dir / 'module')

        self._install_php(config, overwrite=False)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
            modulepath = config.DATABASE_MODULE_PATH or \
                         str((config.project_dir / 'module').resolve())
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                              max_word_freq=max_word_freq,
                              modulepath=modulepath)


    def check_database(self, _):
        """ Check that the tokenizer is set up correctly.
        """
        hint = """\
             The PostgreSQL extension nominatim.so was not correctly loaded.

             Error: {error}

             Hints:
             * Check the output of the CMake/make installation step
             * Does nominatim.so exist?
             * Does nominatim.so exist on the database server?
             * Can nominatim.so be accessed by the database user?
             """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                try:
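                    # make_standard_name() is implemented by the C module, so
                    # normalising a trivial string proves the module is loaded.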
                    out = cur.scalar("SELECT make_standard_name('a')")
                except psycopg2.Error as err:
                    return hint.format(error=str(err))

        if out != 'a':
            return hint.format(error='Unexpected result for make_standard_name()')

        return None


    def migrate_database(self, config):
        """ Initialise the project directory of an existing database for
            use with this tokenizer.

            This is a special migration function for updating existing databases
            to new software versions.
        """
        self.normalization = config.TERM_NORMALIZATION
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)


    def update_statistics(self):
        """ Recompute the frequency of full words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word SET search_name_count = count
                                   FROM word_frequencies
                                   WHERE word_token like ' %' and word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()


    def update_word_tokens(self):
        """ No house-keeping implemented for the legacy tokenizer.
        """
        LOG.info("No tokenizer clean-up available.")


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must make sure
            to call close() before the analyzer is destroyed.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        normalizer = Transliterator.createFromRules("phrase normalizer",
                                                    self.normalization)
        return LegacyNameAnalyzer(self.dsn, normalizer)


    def _install_php(self, config, overwrite=True):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"

        if not php_file.exists() or overwrite:
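            # The PHP frontend reads its tokenizer settings from this
            # generated file.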
            php_file.write_text(dedent(f"""\
                <?php
                @define('CONST_Max_Word_Frequency', {config.MAX_WORD_FREQUENCY});
                @define('CONST_Term_Normalization_Rules', "{config.TERM_NORMALIZATION}");
                require_once('{config.lib_dir.php}/tokenizer/legacy_tokenizer.php');
                """), encoding='utf-8')


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')


    def _save_config(self, conn, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)


class LegacyNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the special PostgreSQL module for
        splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, normalizer):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
        psycopg2.extras.register_hstore(self.conn)

        self._cache = _TokenCache(self.conn)
300 """ Free all resources used by the analyzer.


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with '#', it is assumed to be a full name,
            otherwise a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        with self.conn.cursor() as cur:
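            # Full-name tokens are stored with a leading blank, which is why
            # '#'-prefixed terms are matched against ' ' || make_standard_name(...).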
            cur.execute("""SELECT t.term, word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = (CASE
                                   WHEN left(t.term, 1) = '#' THEN
                                     ' ' || make_standard_name(substring(t.term from 2))
                                   ELSE
                                     make_standard_name(t.term)
                                   END)
                                 and class is null and country_code is null""",
                        (words, ))

            return [(r[0], r[1], r[2]) for r in cur]


    def normalize(self, phrase):
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return self.normalizer.transliterate(phrase)


    def normalize_postcode(self, postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        with self.conn.cursor() as cur:
            # This finds the rows in location_postcode and word that are
            # missing from the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")
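            # A missing postcode means the word entry is stale and must go;
            # a missing word means the postcode still needs a token.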
            to_delete = []
            to_add = []

            for postcode, word in cur:
                if postcode is None:
                    to_delete.append(word)
                else:
                    to_add.append(postcode)

            if to_delete:
                cur.execute("""DELETE FROM word
                               WHERE class ='place' and type = 'postcode'
                                     and word = any(%s)
                            """, (to_delete, ))
            if to_add:
                cur.execute("""SELECT count(create_postcode_id(pc))
                               FROM unnest(%s) as pc
                            """, (to_add, ))


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))
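            # Plain set differences yield the phrases to insert and the ones
            # that have disappeared from the configuration.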
            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                cur.execute_values(
                    """ INSERT INTO word (word_id, word_token, word, class, type,
                                          search_name_count, operator)
                        (SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
                                class, type, 0,
                                CASE WHEN op in ('in', 'near') THEN op ELSE null END
                           FROM (VALUES %s) as v(name, class, type, op))""",
                    to_add)

            if to_delete and should_replace:
                cur.execute_values(
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        with self.conn.cursor() as cur:
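            # The NOT EXISTS guard keeps the insert idempotent: names that
            # already have a token for this country are skipped.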
430 """INSERT INTO word (word_id, word_token, country_code)
431 (SELECT nextval('seq_word'), lookup_token, %s
432 FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
434 WHERE NOT EXISTS(SELECT * FROM word
435 WHERE word_token = lookup_token and country_code = %s))
436 """, (country_code, list(names.values()), country_code))


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.name

        if names:
            token_info.add_names(self.conn, names)

            if place.is_country():
                self.add_country_names(place.country_code, names)

        address = place.address
        if address:
            self._process_place_address(token_info, address)

        return token_info.data


    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []

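        # Sort the address parts into housenumbers, street/place links,
        # postcodes and generic address terms.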
        for key, value in address.items():
            if key == 'postcode':
                # Make sure the normalized postcode is present in the word table.
                if re.search(r'[:,;]', value) is None:
                    norm_pc = self.normalize_postcode(value)
                    token_info.set_postcode(norm_pc)
                    self._cache.add_postcode(self.conn, norm_pc)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(self.conn, value)
            elif key == 'place':
                token_info.add_place(self.conn, value)
            elif not key.startswith('_') \
                 and key not in ('country', 'full', 'inclusion'):
                addr_terms.append((key, value))

        if hnrs:
            token_info.add_housenumbers(self.conn, hnrs)

        if addr_terms:
            token_info.add_address_terms(self.conn, addr_terms)
492 """ Collect token information to be sent back to the database.
494 def __init__(self, cache):
499 def add_names(self, conn, names):
500 """ Add token information for the names of the place.
502 with conn.cursor() as cur:
503 # Create the token IDs for all names.
504 self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from the address.
        """
        if len(hnrs) == 1:
            token = self.cache.get_housenumber(hnrs[0])
            if token is not None:
                self.data['hnr_tokens'] = token
                self.data['hnr'] = hnrs[0]
                return

        # split numbers if necessary
        simple_list = []
        for hnr in hnrs:
            simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

        if len(simple_list) > 1:
            simple_list = list(set(simple_list))

        with conn.cursor() as cur:
            cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
            self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()


    def set_postcode(self, postcode):
        """ Set or replace the postcode token with the given value.
        """
        self.data['postcode'] = postcode


    def add_street(self, conn, street):
        """ Add addr:street match terms.
        """
        def _get_street(name):
            with conn.cursor() as cur:
                return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))

        tokens = self.cache.streets.get(street, _get_street)
        if tokens:
            self.data['street'] = tokens


    def add_place(self, conn, place):
        """ Add addr:place search and match terms.
        """
        def _get_place(name):
            with conn.cursor() as cur:
                cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cur.fetchone()

        self.data['place_search'], self.data['place_match'] = \
            self.cache.places.get(place, _get_place)


    def add_address_terms(self, conn, terms):
        """ Add additional address terms.
        """
        def _get_address_term(name):
            with conn.cursor() as cur:
                cur.execute("""SELECT addr_ids_from_name(%s)::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cur.fetchone()

        tokens = {}
        for key, value in terms:
            items = self.cache.address_terms.get(value, _get_address_term)
            if items[0] or items[1]:
                tokens[key] = items

        if tokens:
            self.data['addr'] = tokens
583 """ Least recently used cache that accepts a generator function to
584 produce the item when there is a cache miss.
587 def __init__(self, maxsize=128, init_data=None):
588 self.data = init_data or OrderedDict()
589 self.maxsize = maxsize
590 if init_data is not None and len(init_data) > maxsize:
591 self.maxsize = len(init_data)


    def get(self, key, generator):
        """ Get the item with the given key from the cache. If nothing
            is found in the cache, generate the value through the
            generator function and store it in the cache.
        """
        value = self.data.get(key)
        if value is not None:
            self.data.move_to_end(key)
        else:
            value = generator(key)
            if len(self.data) >= self.maxsize:
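                # The OrderedDict is kept in access order, so the first
                # entry is the least recently used one and gets evicted.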
                self.data.popitem(last=False)
            self.data[key] = value

        return value
611 """ Cache for token information to avoid repeated database queries.
613 This cache is not thread-safe and needs to be instantiated per
616 def __init__(self, conn):
618 self.streets = _LRU(maxsize=256)
619 self.places = _LRU(maxsize=128)
620 self.address_terms = _LRU(maxsize=1024)
622 # Lookup houseunumbers up to 100 and cache them
623 with conn.cursor() as cur:
624 cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
625 FROM generate_series(1, 100) as i""")
626 self._cached_housenumbers = {str(r[0]): r[1] for r in cur}
628 # For postcodes remember the ones that have already been added
629 self.postcodes = set()


    def get_housenumber(self, number):
        """ Get a housenumber token from the cache.
        """
        return self._cached_housenumbers.get(number)


    def add_postcode(self, conn, postcode):
        """ Make sure the given postcode is in the database.
        """
        if postcode not in self.postcodes:
            with conn.cursor() as cur:
                cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
            self.postcodes.add(postcode)