# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, cast, Dict, Set
from collections import OrderedDict
import logging
from pathlib import Path
import re
import shutil
from textwrap import dedent

from icu import Transliterator
import psycopg2
import psycopg2.extras

from nominatim.db.connection import connect, Connection
from nominatim.config import Configuration
from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.data.place_info import PlaceInfo
from nominatim.errors import UsageError
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

LOG = logging.getLogger()
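

# Minimal usage sketch (illustrative only; in practice the tokenizer is loaded
# and set up through Nominatim's tokenizer factory):
#
#   tokenizer = create(dsn, project_dir / 'tokenizer')
#   tokenizer.init_from_project(config)
#   with tokenizer.name_analyzer() as analyzer:
#       print(analyzer.normalize('Main Street'))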


def create(dsn: str, data_dir: Path) -> 'LegacyTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyTokenizer(dsn, data_dir)


def _install_module(config_module_path: str, src_dir: Path, module_dir: Path) -> str:
    """ Copies the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.
    """
    # Custom module locations are simply used as is.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        return config_module_path

    # Compatibility mode for builddir installations.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return str(module_dir)

    # In any other case install the module in the project directory.
    if not module_dir.exists():
        module_dir.mkdir()

    destfile = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
    # Make sure the module is readable and executable for the database server.
    destfile.chmod(0o755)

    LOG.info('Database module installed at %s', str(destfile))

    return str(module_dir)


def _check_module(module_dir: str, conn: Connection) -> None:
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.
    """
    with conn.cursor() as cur:
        try:
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS %s, 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """, (f'{module_dir}/nominatim.so', ))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err


class LegacyTokenizer(AbstractTokenizer):
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization: Optional[str] = None


    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config, overwrite=True)

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)
            conn.commit()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)

        if not (config.project_dir / 'module' / 'nominatim.so').exists():
            _install_module(config.DATABASE_MODULE_PATH,
                            config.lib_dir.module,
                            config.project_dir / 'module')

        self._install_php(config, overwrite=False)


    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
            modulepath = config.DATABASE_MODULE_PATH or \
                         str((config.project_dir / 'module').resolve())
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                              max_word_freq=max_word_freq,
                              modulepath=modulepath)


    def check_database(self, _: Configuration) -> Optional[str]:
        """ Check that the tokenizer is set up correctly.
        """
        hint = """\
              The PostgreSQL extension nominatim.so was not correctly loaded.

              Error: {error}

              Hints:
              * Check the output of the CMake/make installation step
              * Does nominatim.so exist?
              * Does nominatim.so exist on the database server?
              * Can nominatim.so be accessed by the database user?
              """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                try:
                    out = cur.scalar("SELECT make_standard_name('a')")
                except psycopg2.Error as err:
                    return hint.format(error=str(err))

        if out != 'a':
            return hint.format(error='Unexpected result for make_standard_name()')

        return None


    def migrate_database(self, config: Configuration) -> None:
        """ Initialise the project directory of an existing database for
            use with this tokenizer.

            This is a special migration function for updating existing databases
            to new software versions.
        """
        self.normalization = config.TERM_NORMALIZATION
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)


    def update_statistics(self) -> None:
        """ Recompute the frequency of full words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
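                    # Tokens of full words are stored with a leading blank,
                    # hence the ' %' pattern in the UPDATE below.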
                    cur.execute("""UPDATE word SET search_name_count = count
                                   FROM word_frequencies
                                   WHERE word_token like ' %' and word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()


    def update_word_tokens(self) -> None:
        """ No house-keeping implemented for the legacy tokenizer.
        """
        LOG.info("No tokenizer clean-up available.")


    def name_analyzer(self) -> 'LegacyNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.normalize(...)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        normalizer = Transliterator.createFromRules("phrase normalizer",
                                                    self.normalization)
        return LegacyNameAnalyzer(self.dsn, normalizer)


    def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"

        if not php_file.exists() or overwrite:
            php_file.write_text(dedent(f"""\
                <?php
                @define('CONST_Max_Word_Frequency', {config.MAX_WORD_FREQUENCY});
                @define('CONST_Term_Normalization_Rules', "{config.TERM_NORMALIZATION}");
                require_once('{config.lib_dir.php}/tokenizer/legacy_tokenizer.php');
                """), encoding='utf-8')


    def _init_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')


    def _save_config(self, conn: Connection, config: Configuration) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.normalization is not None

        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)


class LegacyNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the special PostgreSQL module for
        normalizing names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, normalizer: Any):
        self.conn: Optional[Connection] = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
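        # Register the hstore type so that Python dicts can be passed directly
        # as query parameters (e.g. the name dict handed to make_keywords()).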
        psycopg2.extras.register_hstore(self.conn)

        self._cache = _TokenCache(self.conn)


    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.

            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        assert self.conn is not None
        with self.conn.cursor() as cur:
            cur.execute("""SELECT t.term, word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = (CASE
                                   WHEN left(t.term, 1) = '#' THEN
                                     ' ' || make_standard_name(substring(t.term from 2))
                                   ELSE
                                     make_standard_name(t.term)
                                   END)
                                 and class is null and country_code is null""",
                        (words, ))

            return [(r[0], r[1], r[2]) for r in cur]


    def normalize(self, phrase: str) -> str:
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return cast(str, self.normalizer.transliterate(phrase))


    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()


    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None

        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")
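
            # A row with pc NULL is a stale postcode entry in the word table;
            # a row with word NULL is a postcode that still needs a token.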
            to_delete = []
            to_add = []

            for postcode, word in cur:
                if postcode is None:
                    to_delete.append(word)
                else:
                    to_add.append(postcode)

            if to_delete:
                cur.execute("""DELETE FROM WORD
                               WHERE class ='place' and type = 'postcode'
                                     and word = any(%s)
                            """, (to_delete, ))
            if to_add:
                cur.execute("""SELECT count(create_postcode_id(pc))
                               FROM unnest(%s) as pc
                            """, (to_add, ))


    def update_special_phrases(self, phrases: Sequence[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
        """
        assert self.conn is not None

        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))
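
            # NULL operators are read back as '-' so that the tuples compare
            # cleanly with the incoming phrases; the statements below map '-'
            # back to NULL.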
            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                psycopg2.extras.execute_values(
                    cur,
                    """ INSERT INTO word (word_id, word_token, word, class, type,
                                          search_name_count, operator)
                        (SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
                                class, type, 0,
                                CASE WHEN op in ('in', 'near') THEN op ELSE null END
                           FROM (VALUES %s) as v(name, class, type, op))""",
                    to_add)

            if to_delete and should_replace:
                psycopg2.extras.execute_values(
                    cur,
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))


    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add names for the given country to the search index.
        """
        assert self.conn is not None

        with self.conn.cursor() as cur:
            cur.execute(
                """INSERT INTO word (word_id, word_token, country_code)
                   (SELECT nextval('seq_word'), lookup_token, %s
                      FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
                              FROM unnest(%s) AS n) y
                     WHERE NOT EXISTS(SELECT * FROM word
                                      WHERE word_token = lookup_token and country_code = %s))
                """, (country_code, list(names.values()), country_code))


    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        assert self.conn is not None

        token_info = _TokenInfo(self._cache)

        names = place.name

        if names:
            token_info.add_names(self.conn, names)

            if place.is_country():
                assert place.country_code is not None
                self.add_country_names(place.country_code, names)

        address = place.address
        if address:
            self._process_place_address(token_info, address)

        return token_info.data


    def _process_place_address(self, token_info: '_TokenInfo', address: Mapping[str, str]) -> None:
        assert self.conn is not None
        hnrs = []
        addr_terms = []

        for key, value in address.items():
            if key == 'postcode':
                # Make sure the normalized postcode is present in the word table.
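                # Values containing ',', ';' or ':' are lists of postcodes and
                # are not added to the table.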
                if re.search(r'[:,;]', value) is None:
                    norm_pc = self.normalize_postcode(value)
                    token_info.set_postcode(norm_pc)
                    self._cache.add_postcode(self.conn, norm_pc)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(self.conn, value)
            elif key == 'place':
                token_info.add_place(self.conn, value)
            elif not key.startswith('_') \
                 and key not in ('country', 'full', 'inclusion'):
                addr_terms.append((key, value))

        if hnrs:
            token_info.add_housenumbers(self.conn, hnrs)

        if addr_terms:
            token_info.add_address_terms(self.conn, addr_terms)


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache: '_TokenCache') -> None:
        self.cache = cache
        self.data: Dict[str, Any] = {}


    def add_names(self, conn: Connection, names: Mapping[str, str]) -> None:
        """ Add token information for the names of the place.
        """
        with conn.cursor() as cur:
            # Create the token IDs for all names.
            self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
                                            (names, ))


    def add_housenumbers(self, conn: Connection, hnrs: Sequence[str]) -> None:
        """ Extract housenumber information from the address.
        """
        if len(hnrs) == 1:
            token = self.cache.get_housenumber(hnrs[0])
            if token is not None:
                self.data['hnr_tokens'] = token
                self.data['hnr'] = hnrs[0]
                return

        # split numbers if necessary
        simple_list: List[str] = []
        for hnr in hnrs:
            simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

        if len(simple_list) > 1:
            simple_list = list(set(simple_list))

        with conn.cursor() as cur:
            cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
            self.data['hnr_tokens'], self.data['hnr'] = \
                cur.fetchone()  # type: ignore[no-untyped-call]


    def set_postcode(self, postcode: str) -> None:
        """ Set or replace the postcode token with the given value.
        """
        self.data['postcode'] = postcode


    def add_street(self, conn: Connection, street: str) -> None:
        """ Add addr:street match terms.
        """
        def _get_street(name: str) -> List[int]:
            with conn.cursor() as cur:
                return cast(List[int],
                            cur.scalar("SELECT word_ids_from_name(%s)::text", (name, )))

        tokens = self.cache.streets.get(street, _get_street)
        if tokens:
            self.data['street'] = tokens


    def add_place(self, conn: Connection, place: str) -> None:
        """ Add addr:place search and match terms.
        """
        def _get_place(name: str) -> Tuple[List[int], List[int]]:
            with conn.cursor() as cur:
                cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cast(Tuple[List[int], List[int]],
                            cur.fetchone())  # type: ignore[no-untyped-call]

        self.data['place_search'], self.data['place_match'] = \
            self.cache.places.get(place, _get_place)


    def add_address_terms(self, conn: Connection, terms: Sequence[Tuple[str, str]]) -> None:
        """ Add additional address terms.
        """
        def _get_address_term(name: str) -> Tuple[List[int], List[int]]:
            with conn.cursor() as cur:
                cur.execute("""SELECT addr_ids_from_name(%s)::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cast(Tuple[List[int], List[int]],
                            cur.fetchone())  # type: ignore[no-untyped-call]

        tokens = {}
        for key, value in terms:
            items = self.cache.address_terms.get(value, _get_address_term)
            if items[0] or items[1]:
                tokens[key] = items

        if tokens:
            self.data['addr'] = tokens


class _LRU:
    """ Least recently used cache that accepts a generator function to
        produce the item when there is a cache miss.
    """

    def __init__(self, maxsize: int = 128):
        self.data: 'OrderedDict[str, Any]' = OrderedDict()
        self.maxsize = maxsize


    def get(self, key: str, generator: Callable[[str], Any]) -> Any:
        """ Get the item with the given key from the cache. If nothing
            is found in the cache, generate the value through the
            generator function and store it in the cache.
        """
        value = self.data.get(key)
        if value is not None:
            # Mark the entry as most recently used.
            self.data.move_to_end(key)
        else:
            value = generator(key)
            if len(self.data) >= self.maxsize:
                # Evict the least recently used entry.
                self.data.popitem(last=False)
            self.data[key] = value

        return value


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self, conn: Connection):
        # various LRU caches
        self.streets = _LRU(maxsize=256)
        self.places = _LRU(maxsize=128)
        self.address_terms = _LRU(maxsize=1024)

        # Look up housenumbers up to 100 and cache them
        with conn.cursor() as cur:
            cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
                             FROM generate_series(1, 100) as i""")
            self._cached_housenumbers: Dict[str, str] = {str(r[0]): r[1] for r in cur}

        # For postcodes remember the ones that have already been added
        self.postcodes: Set[str] = set()

    def get_housenumber(self, number: str) -> Optional[str]:
        """ Get a housenumber token from the cache.
        """
        return self._cached_housenumbers.get(number)


    def add_postcode(self, conn: Connection, postcode: str) -> None:
        """ Make sure the given postcode is in the database.
        """
        if postcode not in self.postcodes:
            with conn.cursor() as cur:
                cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
            self.postcodes.add(postcode)