# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, \
                   cast, Dict, Set, Iterable
from collections import OrderedDict
import logging
from pathlib import Path
import re
import shutil
from textwrap import dedent

from icu import Transliterator
import psycopg
from psycopg import sql as pysql

from ..errors import UsageError
from ..db.connection import connect, Connection, drop_tables, table_exists,\
                            execute_scalar, register_hstore
from ..config import Configuration
from ..db import properties
from ..db import utils as db_utils
from ..db.sql_preprocessor import SQLPreprocessor
from ..data.place_info import PlaceInfo
from .base import AbstractAnalyzer, AbstractTokenizer

DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

LOG = logging.getLogger()


def create(dsn: str, data_dir: Path) -> 'LegacyTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyTokenizer(dsn, data_dir)
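
# A minimal usage sketch (not part of the module): how the factory above is
# typically driven. The DSN and data directory below are hypothetical examples.
#
#   tokenizer = create('dbname=nominatim', Path('project/tokenizer'))
#   tokenizer.init_new_db(config)            # 'config' is a loaded Configuration
#   with tokenizer.name_analyzer() as analyzer:
#       print(analyzer.normalize('Main Street'))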


def _install_module(config_module_path: str, src_dir: Path, module_dir: Path) -> str:
    """ Copies the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.
    """
    # Custom module locations are simply used as is.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        return config_module_path

    # Compatibility mode for builddir installations.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return str(module_dir)

    # In any other case install the module in the project directory.
    if not module_dir.exists():
        module_dir.mkdir()

    destfile = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))

    LOG.info('Database module installed at %s', str(destfile))

    return str(module_dir)
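
# Illustrative call (an assumption for illustration, not from the original source):
# with an empty DATABASE_MODULE_PATH the shared object is copied into the
# project's 'module' directory and that directory is returned.
#
#   module_dir = _install_module('', Path('/usr/lib/nominatim/module'),
#                                 Path('/srv/project/module'))
#   # -> '/srv/project/module', now containing a copy of nominatim.so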


def _check_module(module_dir: str, conn: Connection) -> None:
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.
    """
    with conn.cursor() as cur:
        try:
            cur.execute(pysql.SQL("""CREATE FUNCTION nominatim_test_import_func(text)
                                     RETURNS text AS {}, 'transliteration'
                                     LANGUAGE c IMMUTABLE STRICT;
                                     DROP FUNCTION nominatim_test_import_func(text)
                                  """).format(pysql.Literal(f'{module_dir}/nominatim.so')))
        except psycopg.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err


class LegacyTokenizer(AbstractTokenizer):
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization: Optional[str] = None


    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        assert config.project_dir is not None
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config, overwrite=True)

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)
            conn.commit()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        assert config.project_dir is not None

        with connect(self.dsn) as conn:
            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)

        if not (config.project_dir / 'module' / 'nominatim.so').exists():
            _install_module(config.DATABASE_MODULE_PATH,
                            config.lib_dir.module,
                            config.project_dir / 'module')

        self._install_php(config, overwrite=False)


    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        assert config.project_dir is not None

        with connect(self.dsn) as conn:
            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
            modulepath = config.DATABASE_MODULE_PATH or \
                         str((config.project_dir / 'module').resolve())
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                              max_word_freq=max_word_freq,
                              modulepath=modulepath)


    def check_database(self, _: Configuration) -> Optional[str]:
        """ Check that the tokenizer is set up correctly.
        """
        hint = """\
             The PostgreSQL extension nominatim.so was not correctly loaded.

             Error: {error}

             Hints:
             * Check the output of the CMake/make installation step
             * Does nominatim.so exist?
             * Does nominatim.so exist on the database server?
             * Can nominatim.so be accessed by the database user?
             """
        with connect(self.dsn) as conn:
            try:
                out = execute_scalar(conn, "SELECT make_standard_name('a')")
            except psycopg.Error as err:
                return hint.format(error=str(err))

        if out != 'a':
            return hint.format(error='Unexpected result for make_standard_name()')

        return None


    def migrate_database(self, config: Configuration) -> None:
        """ Initialise the project directory of an existing database for
            use with this tokenizer.

            This is a special migration function for updating existing databases
            to new software versions.
        """
        assert config.project_dir is not None

        self.normalization = config.TERM_NORMALIZATION
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)


    def update_statistics(self, config: Configuration, threads: int = 1) -> None:
        """ Recompute the frequency of full words.
        """
        with connect(self.dsn) as conn:
            if table_exists(conn, 'search_name'):
                drop_tables(conn, "word_frequencies")
                with conn.cursor() as cur:
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word SET search_name_count = count
                                   FROM word_frequencies
                                   WHERE word_token like ' %' and word_id = id""")
                drop_tables(conn, "word_frequencies")
            conn.commit()


    def update_word_tokens(self) -> None:
        """ No house-keeping implemented for the legacy tokenizer.
        """
        LOG.info("No tokenizer clean-up available.")


    def name_analyzer(self) -> 'LegacyNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.normalize('some name')

            When used outside the with construct, the caller must ensure that
            close() is called before the analyzer is destroyed.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        normalizer = Transliterator.createFromRules("phrase normalizer",
                                                    self.normalization)
        return LegacyNameAnalyzer(self.dsn, normalizer)


    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
        """
        with conn.cursor() as cur:
            cur.execute(""" SELECT word FROM word WHERE word is not null
                              ORDER BY search_name_count DESC LIMIT %s""", (num,))
            return list(s[0] for s in cur)


    def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
        """ Install the php script for the tokenizer.
        """
        if config.lib_dir.php is not None:
            php_file = self.data_dir / "tokenizer.php"

            if not php_file.exists() or overwrite:
                php_file.write_text(dedent(f"""\
                    <?php
                    @define('CONST_Max_Word_Frequency', {config.MAX_WORD_FREQUENCY});
                    @define('CONST_Term_Normalization_Rules', "{config.TERM_NORMALIZATION}");
                    require_once('{config.lib_dir.php}/tokenizer/legacy_tokenizer.php');
                    """), encoding='utf-8')


    def _init_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')


    def _save_config(self, conn: Connection, config: Configuration) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.normalization is not None

        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)


class LegacyNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the special PostgreSQL module for
        splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, normalizer: Any):
        self.conn: Optional[Connection] = connect(dsn)
        self.conn.autocommit = True
        self.normalizer = normalizer
        register_hstore(self.conn)

        self._cache = _TokenCache(self.conn)


    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is assumed to be a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        assert self.conn is not None
        with self.conn.cursor() as cur:
            cur.execute("""SELECT t.term, word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = (CASE
                                   WHEN left(t.term, 1) = '#' THEN
                                     ' ' || make_standard_name(substring(t.term from 2))
                                   ELSE
                                     make_standard_name(t.term)
                                   END)
                                 and class is null and country_code is null""",
                        (words, ))

            return [(r[0], r[1], r[2]) for r in cur]


    def normalize(self, phrase: str) -> str:
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return cast(str, self.normalizer.transliterate(phrase))
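
    # Example (sketch): the normalizer is an ICU Transliterator built from the
    # TERM_NORMALIZATION rules, so normalize() is just a transliteration call.
    # The rule string below is a simplified assumption for illustration only.
    #
    #   norm = Transliterator.createFromRules("phrase normalizer",
    #                                         ":: NFD (); :: lower (); :: NFC ();")
    #   norm.transliterate('Großer Platz')   # -> 'großer platz'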


    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
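
    # For example, normalize_postcode('  ab1 2cd ') returns 'AB1 2CD': the
    # function only trims whitespace and upper-cases the value.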


    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None

        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            to_delete = []
            to_add = []

            for postcode, word in cur:
                if postcode is None:
                    to_delete.append(word)
                else:
                    to_add.append(postcode)

            if to_delete:
                cur.execute("""DELETE FROM WORD
                               WHERE class ='place' and type = 'postcode'
                                     and word = any(%s)
                            """, (to_delete, ))
            if to_add:
                cur.execute("""SELECT count(create_postcode_id(pc))
                               FROM unnest(%s::text[]) as pc
                            """, (to_add, ))


    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
        """
        assert self.conn is not None

        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class as cls, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                cur.executemany(
                    """ INSERT INTO word (word_id, word_token, word, class, type,
                                          search_name_count, operator)
                        (SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
                                class, type, 0,
                                CASE WHEN op in ('in', 'near') THEN op ELSE null END
                           FROM (VALUES (%s, %s, %s, %s)) as v(name, class, type, op))""",
                    to_add)

            if to_delete and should_replace:
                cur.executemany(
                    """ DELETE FROM word
                          USING (VALUES (%s, %s, %s, %s)) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))


    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add names for the given country to the search index.
        """
        assert self.conn is not None

        with self.conn.cursor() as cur:
            cur.execute(
                """INSERT INTO word (word_id, word_token, country_code)
                   (SELECT nextval('seq_word'), lookup_token, %s
                      FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
                              FROM unnest(%s::TEXT[])n) y
                      WHERE NOT EXISTS(SELECT * FROM word
                                       WHERE word_token = lookup_token and country_code = %s))
                """, (country_code, list(names.values()), country_code))


    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        assert self.conn is not None

        token_info = _TokenInfo(self._cache)

        names = place.name
        if names:
            token_info.add_names(self.conn, names)

            if place.is_country():
                assert place.country_code is not None
                self.add_country_names(place.country_code, names)

        address = place.address
        if address:
            self._process_place_address(token_info, address)

        return token_info.data


    def _process_place_address(self, token_info: '_TokenInfo', address: Mapping[str, str]) -> None:
        assert self.conn is not None
        hnrs = []
        addr_terms = []

        for key, value in address.items():
            if key == 'postcode':
                # Make sure the normalized postcode is present in the word table.
                if re.search(r'[:,;]', value) is None:
                    norm_pc = self.normalize_postcode(value)
                    token_info.set_postcode(norm_pc)
                    self._cache.add_postcode(self.conn, norm_pc)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(self.conn, value)
            elif key == 'place':
                token_info.add_place(self.conn, value)
            elif not key.startswith('_') \
                 and key not in ('country', 'full', 'inclusion'):
                addr_terms.append((key, value))

        if hnrs:
            token_info.add_housenumbers(self.conn, hnrs)

        if addr_terms:
            token_info.add_address_terms(self.conn, addr_terms)
530 """ Collect token information to be sent back to the database.
532 def __init__(self, cache: '_TokenCache') -> None:
534 self.data: Dict[str, Any] = {}
537 def add_names(self, conn: Connection, names: Mapping[str, str]) -> None:
538 """ Add token information for the names of the place.
540 # Create the token IDs for all names.
541 self.data['names'] = execute_scalar(conn, "SELECT make_keywords(%s)::text",


    def add_housenumbers(self, conn: Connection, hnrs: Sequence[str]) -> None:
        """ Extract housenumber information from the address.
        """
        if len(hnrs) == 1:
            token = self.cache.get_housenumber(hnrs[0])
            if token is not None:
                self.data['hnr_tokens'] = token
                self.data['hnr'] = hnrs[0]
                return

        # split numbers if necessary
        simple_list: List[str] = []
        for hnr in hnrs:
            simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

        if len(simple_list) > 1:
            simple_list = list(set(simple_list))

        with conn.cursor() as cur:
            cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
            result = cur.fetchone()
            assert result is not None
            self.data['hnr_tokens'], self.data['hnr'] = result
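
    # Worked example (sketch): for hnrs = ['12;14', '16'] the splitting above
    # yields ['12', '14', '16'], duplicates are removed, and the token/number
    # pair is then requested from the SQL function create_housenumbers().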


    def set_postcode(self, postcode: str) -> None:
        """ Set or replace the postcode token with the given value.
        """
        self.data['postcode'] = postcode


    def add_street(self, conn: Connection, street: str) -> None:
        """ Add addr:street match terms.
        """
        def _get_street(name: str) -> Optional[str]:
            return cast(Optional[str],
                        execute_scalar(conn, "SELECT word_ids_from_name(%s)::text", (name, )))

        tokens = self.cache.streets.get(street, _get_street)
        self.data['street'] = tokens or '{}'


    def add_place(self, conn: Connection, place: str) -> None:
        """ Add addr:place search and match terms.
        """
        def _get_place(name: str) -> Tuple[List[int], List[int]]:
            with conn.cursor() as cur:
                cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cast(Tuple[List[int], List[int]], cur.fetchone())

        self.data['place_search'], self.data['place_match'] = \
            self.cache.places.get(place, _get_place)


    def add_address_terms(self, conn: Connection, terms: Sequence[Tuple[str, str]]) -> None:
        """ Add additional address terms.
        """
        def _get_address_term(name: str) -> Tuple[List[int], List[int]]:
            with conn.cursor() as cur:
                cur.execute("""SELECT addr_ids_from_name(%s)::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cast(Tuple[List[int], List[int]], cur.fetchone())

        tokens = {}
        for key, value in terms:
            items = self.cache.address_terms.get(value, _get_address_term)
            if items[0] or items[1]:
                tokens[key] = items

        if tokens:
            self.data['addr'] = tokens
621 """ Least recently used cache that accepts a generator function to
622 produce the item when there is a cache miss.
625 def __init__(self, maxsize: int = 128):
626 self.data: 'OrderedDict[str, Any]' = OrderedDict()
627 self.maxsize = maxsize
630 def get(self, key: str, generator: Callable[[str], Any]) -> Any:
631 """ Get the item with the given key from the cache. If nothing
632 is found in the cache, generate the value through the
633 generator function and store it in the cache.
635 value = self.data.get(key)
636 if value is not None:
637 self.data.move_to_end(key)
639 value = generator(key)
640 if len(self.data) >= self.maxsize:
641 self.data.popitem(last=False)
642 self.data[key] = value
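
# Usage sketch for the cache above (hypothetical values, for illustration only):
#
#   cache = _LRU(maxsize=2)
#   cache.get('a', lambda k: k.upper())   # miss -> generates and stores 'A'
#   cache.get('a', lambda k: k.upper())   # hit  -> returns cached 'A'
#   cache.get('b', lambda k: k.upper())
#   cache.get('c', lambda k: k.upper())   # evicts least recently used key 'a'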
648 """ Cache for token information to avoid repeated database queries.
650 This cache is not thread-safe and needs to be instantiated per
653 def __init__(self, conn: Connection):
655 self.streets = _LRU(maxsize=256)
656 self.places = _LRU(maxsize=128)
657 self.address_terms = _LRU(maxsize=1024)
659 # Lookup houseunumbers up to 100 and cache them
660 with conn.cursor() as cur:
661 cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
662 FROM generate_series(1, 100) as i""")
663 self._cached_housenumbers: Dict[str, str] = {str(r[0]): r[1] for r in cur}
665 # For postcodes remember the ones that have already been added
666 self.postcodes: Set[str] = set()


    def get_housenumber(self, number: str) -> Optional[str]:
        """ Get a housenumber token from the cache.
        """
        return self._cached_housenumbers.get(number)


    def add_postcode(self, conn: Connection, postcode: str) -> None:
        """ Make sure the given postcode is in the database.
        """
        if postcode not in self.postcodes:
            with conn.cursor() as cur:
                cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
            self.postcodes.add(postcode)
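
# End-to-end sketch (an assumption for illustration, not part of the module):
# how an analyzer and its caches cooperate when indexing a single place.
#
#   tokenizer = create('dbname=nominatim', Path('project/tokenizer'))
#   tokenizer.init_from_project(config)
#   with tokenizer.name_analyzer() as analyzer:
#       info = analyzer.process_place(place)   # 'place' is a PlaceInfo
#       # info now holds the JSON-serialisable data for the token_info field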