# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, \
                   cast, Dict, Set, Iterable
from collections import OrderedDict
import logging
from pathlib import Path
import re
import shutil
from textwrap import dedent

from icu import Transliterator
import psycopg2
import psycopg2.extras

from ..errors import UsageError
from ..db.connection import connect, Connection
from ..config import Configuration
from ..db import properties
from ..db import utils as db_utils
from ..db.sql_preprocessor import SQLPreprocessor
from ..data.place_info import PlaceInfo
from .base import AbstractAnalyzer, AbstractTokenizer
# Names of the database properties in which the tokenizer settings are saved.
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

# All messages of this module go to the root logger.
LOG = logging.getLogger()
def create(dsn: str, data_dir: Path) -> 'LegacyTokenizer':
    """ Create a new instance of the tokenizer provided by this module.

        Both arguments are handed on unchanged to the constructor of
        `LegacyTokenizer`.
    """
    return LegacyTokenizer(dsn, data_dir)
43 def _install_module(config_module_path: str, src_dir: Path, module_dir: Path) -> str:
44 """ Copies the PostgreSQL normalisation module into the project
45 directory if necessary. For historical reasons the module is
46 saved in the '/module' subdirectory and not with the other tokenizer
49 The function detects when the installation is run from the
50 build directory. It doesn't touch the module in that case.
52 # Custom module locations are simply used as is.
53 if config_module_path:
54 LOG.info("Using custom path for database module at '%s'", config_module_path)
55 return config_module_path
57 # Compatibility mode for builddir installations.
58 if module_dir.exists() and src_dir.samefile(module_dir):
59 LOG.info('Running from build directory. Leaving database module as is.')
60 return str(module_dir)
62 # In any other case install the module in the project directory.
63 if not module_dir.exists():
66 destfile = module_dir / 'nominatim.so'
67 shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
70 LOG.info('Database module installed at %s', str(destfile))
72 return str(module_dir)
def _check_module(module_dir: str, conn: Connection) -> None:
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.

        The check creates a C function from `<module_dir>/nominatim.so`
        and drops it again in the same statement. When the database
        reports an error, the error is logged and a UsageError is raised.
    """
    with conn.cursor() as cur:
        try:
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS %s, 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """, (f'{module_dir}/nominatim.so', ))
        except psycopg2.DatabaseError as err:
            # critical() is the supported spelling of the fatal() alias.
            LOG.critical("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err
class LegacyTokenizer(AbstractTokenizer):
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        # Normalization rules. Remains None until one of the init or
        # migration functions has been called.
        self.normalization: Optional[str] = None


    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.

            When `init_db` is false, the configuration is saved but the
            SQL functions and word tables are not created.
        """
        assert config.project_dir is not None
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config, overwrite=True)

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)
            conn.commit()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.

            The normalization rules are read back from the database
            properties. The database module and the PHP script are only
            installed when they are missing.
        """
        assert config.project_dir is not None

        with connect(self.dsn) as conn:
            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)

        if not (config.project_dir / 'module' / 'nominatim.so').exists():
            _install_module(config.DATABASE_MODULE_PATH,
                            config.lib_dir.module,
                            config.project_dir / 'module')

        self._install_php(config, overwrite=False)


    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.

            The maximum word frequency is taken from the database properties,
            the module path from the configuration or, when it is not
            configured, from the 'module' directory of the project.
        """
        assert config.project_dir is not None

        with connect(self.dsn) as conn:
            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
            modulepath = config.DATABASE_MODULE_PATH or \
                         str((config.project_dir / 'module').resolve())
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                              max_word_freq=max_word_freq,
                              modulepath=modulepath)


    def check_database(self, _: Configuration) -> Optional[str]:
        """ Check that the tokenizer is set up correctly.

            Returns None when the check succeeded and a text with an
            error description and hints otherwise.
        """
        hint = """\
             The Postgresql extension nominatim.so was not correctly loaded.

             Error: {error}

             Hints:
             * Check the output of the CMmake/make installation step
             * Does nominatim.so exist?
             * Does nominatim.so exist on the database server?
             * Can nominatim.so be accessed by the database user?
             """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                try:
                    out = cur.scalar("SELECT make_standard_name('a')")
                except psycopg2.Error as err:
                    return hint.format(error=str(err))

        if out != 'a':
            return hint.format(error='Unexpected result for make_standard_name()')

        return None


    def migrate_database(self, config: Configuration) -> None:
        """ Initialise the project directory of an existing database for
            use with this tokenizer.

            This is a special migration function for updating existing databases
            to new software versions.
        """
        assert config.project_dir is not None

        self.normalization = config.TERM_NORMALIZATION
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)


    def update_statistics(self, config: Configuration, threads: int = 1) -> None:
        """ Recompute the frequency of full words.

            Nothing is computed when the table 'search_name' does not exist.
            The parameters `config` and `threads` are not used by this
            implementation.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word SET search_name_count = count
                                   FROM word_frequencies
                                   WHERE word_token like ' %' and word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()


    def update_word_tokens(self) -> None:
        """ No house-keeping implemented for the legacy tokenizer.
        """
        LOG.info("No tokenizer clean-up available.")


    def name_analyzer(self) -> 'LegacyNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokinzer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        normalizer = Transliterator.createFromRules("phrase normalizer",
                                                    self.normalization)
        return LegacyNameAnalyzer(self.dsn, normalizer)


    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
        """
        with conn.cursor() as cur:
            cur.execute(""" SELECT word FROM word WHERE word is not null
                              ORDER BY search_name_count DESC LIMIT %s""", (num,))
            return [row[0] for row in cur]


    def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
        """ Install the php script for the tokenizer.

            Nothing is done when no PHP library directory is configured.
            An existing script is only replaced when `overwrite` is set.
        """
        if config.lib_dir.php is not None:
            php_file = self.data_dir / "tokenizer.php"

            if not php_file.exists() or overwrite:
                php_file.write_text(dedent(f"""\
                    <?php
                    @define('CONST_Max_Word_Frequency', {config.MAX_WORD_FREQUENCY});
                    @define('CONST_Term_Normalization_Rules', "{config.TERM_NORMALIZATION}");
                    require_once('{config.lib_dir.php}/tokenizer/legacy_tokenizer.php');
                    """), encoding='utf-8')


    def _init_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')


    def _save_config(self, conn: Connection, config: Configuration) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.

            The function does not commit. That is left to the caller.
        """
        assert self.normalization is not None

        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
class LegacyNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the special Postgresql module for
        splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, normalizer: Any):
        self.conn: Optional[Connection] = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
        psycopg2.extras.register_hstore(self.conn)

        self._cache = _TokenCache(self.conn)


    def close(self) -> None:
        """ Free all resources used by the analyzer.

            The function may be called more than once.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name
            otherwise is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        assert self.conn is not None
        with self.conn.cursor() as cur:
            cur.execute("""SELECT t.term, word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = (CASE
                                   WHEN left(t.term, 1) = '#' THEN
                                     ' ' || make_standard_name(substring(t.term from 2))
                                   ELSE
                                     make_standard_name(t.term)
                                   END)
                                 and class is null and country_code is null""",
                        (words, ))

            return [(r[0], r[1], r[2]) for r in cur]


    def normalize(self, phrase: str) -> str:
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return cast(str, self.normalizer.transliterate(phrase))


    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()


    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None

        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            to_delete = []
            to_add = []

            for postcode, word in cur:
                if postcode is None:
                    # Token exists but the postcode is gone from the table.
                    to_delete.append(word)
                else:
                    to_add.append(postcode)

            if to_delete:
                cur.execute("""DELETE FROM WORD
                               WHERE class ='place' and type = 'postcode'
                                     and word = any(%s)
                            """, (to_delete, ))
            if to_add:
                cur.execute("""SELECT count(create_postcode_id(pc))
                               FROM unnest(%s) as pc
                            """, (to_add, ))


    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.

            Each phrase is a tuple of (label, class, type, operator). Missing
            phrases are always added. Phrases that are in the database but
            not in `phrases` are only removed when `should_replace` is set.
        """
        assert self.conn is not None

        norm_phrases = {(self.normalize(p[0]), p[1], p[2], p[3])
                        for p in phrases}

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                # A missing operator is represented by '-' in the phrase tuples.
                existing_phrases.add((label, cls, typ, oper or '-'))

            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                cur.execute_values(
                    """ INSERT INTO word (word_id, word_token, word, class, type,
                                          search_name_count, operator)
                        (SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
                                class, type, 0,
                                CASE WHEN op in ('in', 'near') THEN op ELSE null END
                           FROM (VALUES %s) as v(name, class, type, op))""",
                    to_add)

            if to_delete and should_replace:
                cur.execute_values(
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))


    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add names for the given country to the search index.

            Only the values of `names` are used. Tokens that already exist
            for the country are not added again.
        """
        assert self.conn is not None

        with self.conn.cursor() as cur:
            cur.execute(
                """INSERT INTO word (word_id, word_token, country_code)
                   (SELECT nextval('seq_word'), lookup_token, %s
                      FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
                            FROM unnest(%s)n) y
                      WHERE NOT EXISTS(SELECT * FROM word
                                       WHERE word_token = lookup_token and country_code = %s))
                """, (country_code, list(names.values()), country_code))


    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        assert self.conn is not None

        token_info = _TokenInfo(self._cache)

        names = place.name

        if names:
            token_info.add_names(self.conn, names)

            if place.is_country():
                assert place.country_code is not None
                self.add_country_names(place.country_code, names)

        address = place.address
        if address:
            self._process_place_address(token_info, address)

        return token_info.data


    def _process_place_address(self, token_info: '_TokenInfo', address: Mapping[str, str]) -> None:
        """ Add the tokens for the parts of the address to `token_info`.

            House numbers and generic address terms are collected first
            and then each handed over in a single call.
        """
        assert self.conn is not None
        hnrs = []
        addr_terms = []

        for key, value in address.items():
            if key == 'postcode':
                # Make sure the normalized postcode is present in the word table.
                if re.search(r'[:,;]', value) is None:
                    norm_pc = self.normalize_postcode(value)
                    token_info.set_postcode(norm_pc)
                    self._cache.add_postcode(self.conn, norm_pc)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(self.conn, value)
            elif key == 'place':
                token_info.add_place(self.conn, value)
            elif not key.startswith('_') \
                 and key not in ('country', 'full', 'inclusion'):
                addr_terms.append((key, value))

        if hnrs:
            token_info.add_housenumbers(self.conn, hnrs)

        if addr_terms:
            token_info.add_address_terms(self.conn, addr_terms)
class _TokenInfo:
    """ Collect token information to be sent back to the database.

        The collected information is available in the attribute `data`.
    """
    def __init__(self, cache: '_TokenCache') -> None:
        self.cache = cache
        self.data: Dict[str, Any] = {}


    def add_names(self, conn: Connection, names: Mapping[str, str]) -> None:
        """ Add token information for the names of the place.
        """
        with conn.cursor() as cur:
            # Create the token IDs for all names.
            self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
                                            (names, ))


    def add_housenumbers(self, conn: Connection, hnrs: Sequence[str]) -> None:
        """ Extract housenumber information from the address.
        """
        if len(hnrs) == 1:
            # A single house number may be answered from the cache
            # without going through the database.
            token = self.cache.get_housenumber(hnrs[0])
            if token is not None:
                self.data['hnr_tokens'] = token
                self.data['hnr'] = hnrs[0]
                return

        # split numbers if necessary
        simple_list: List[str] = []
        for hnr in hnrs:
            simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

        if len(simple_list) > 1:
            # Remove duplicates. The order of the numbers is not preserved.
            simple_list = list(set(simple_list))

        with conn.cursor() as cur:
            cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
            result = cur.fetchone()
            assert result is not None
            self.data['hnr_tokens'], self.data['hnr'] = result


    def set_postcode(self, postcode: str) -> None:
        """ Set or replace the postcode token with the given value.
        """
        self.data['postcode'] = postcode


    def add_street(self, conn: Connection, street: str) -> None:
        """ Add addr:street match terms.
        """
        def _get_street(name: str) -> Optional[str]:
            with conn.cursor() as cur:
                return cast(Optional[str],
                            cur.scalar("SELECT word_ids_from_name(%s)::text", (name, )))

        tokens = self.cache.streets.get(street, _get_street)
        # An empty array is saved when no tokens were found.
        self.data['street'] = tokens or '{}'


    def add_place(self, conn: Connection, place: str) -> None:
        """ Add addr:place search and match terms.
        """
        def _get_place(name: str) -> Tuple[List[int], List[int]]:
            with conn.cursor() as cur:
                cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cast(Tuple[List[int], List[int]], cur.fetchone())

        self.data['place_search'], self.data['place_match'] = \
            self.cache.places.get(place, _get_place)


    def add_address_terms(self, conn: Connection, terms: Sequence[Tuple[str, str]]) -> None:
        """ Add additional address terms.

            Terms for which neither search nor match tokens are found
            are left out.
        """
        def _get_address_term(name: str) -> Tuple[List[int], List[int]]:
            with conn.cursor() as cur:
                cur.execute("""SELECT addr_ids_from_name(%s)::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cast(Tuple[List[int], List[int]], cur.fetchone())

        tokens = {}
        for key, value in terms:
            items = self.cache.address_terms.get(value, _get_address_term)
            if items[0] or items[1]:
                tokens[key] = items

        if tokens:
            self.data['addr'] = tokens
622 """ Least recently used cache that accepts a generator function to
623 produce the item when there is a cache miss.
626 def __init__(self, maxsize: int = 128):
627 self.data: 'OrderedDict[str, Any]' = OrderedDict()
628 self.maxsize = maxsize
631 def get(self, key: str, generator: Callable[[str], Any]) -> Any:
632 """ Get the item with the given key from the cache. If nothing
633 is found in the cache, generate the value through the
634 generator function and store it in the cache.
636 value = self.data.get(key)
637 if value is not None:
638 self.data.move_to_end(key)
640 value = generator(key)
641 if len(self.data) >= self.maxsize:
642 self.data.popitem(last=False)
643 self.data[key] = value
class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self, conn: Connection):
        # various LRU caches
        self.streets = _LRU(maxsize=256)
        self.places = _LRU(maxsize=128)
        self.address_terms = _LRU(maxsize=1024)

        # Lookup housenumbers up to 100 and cache them
        with conn.cursor() as cur:
            cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
                           FROM generate_series(1, 100) as i""")
            self._cached_housenumbers: Dict[str, str] = {str(r[0]): r[1] for r in cur}

        # For postcodes remember the ones that have already been added
        self.postcodes: Set[str] = set()


    def get_housenumber(self, number: str) -> Optional[str]:
        """ Get a housenumber token from the cache.

            Returns None when the number is not in the cache.
        """
        return self._cached_housenumbers.get(number)


    def add_postcode(self, conn: Connection, postcode: str) -> None:
        """ Make sure the given postcode is in the database.

            The database is only queried the first time a postcode is seen
            by this cache.
        """
        if postcode not in self.postcodes:
            with conn.cursor() as cur:
                cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
            self.postcodes.add(postcode)