# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, \
                   cast, Dict, Set, Iterable
from collections import OrderedDict
import logging
from pathlib import Path
import re
import shutil
from textwrap import dedent

from icu import Transliterator
import psycopg2

from ..errors import UsageError
from ..db.connection import connect, Connection, drop_tables, table_exists, \
                            execute_scalar, register_hstore
from ..config import Configuration
from ..db import properties
from ..db import utils as db_utils
from ..db.sql_preprocessor import SQLPreprocessor
from ..data.place_info import PlaceInfo
from .base import AbstractAnalyzer, AbstractTokenizer

DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

LOG = logging.getLogger()

def create(dsn: str, data_dir: Path) -> 'LegacyTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyTokenizer(dsn, data_dir)
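

# Note (illustrative, not part of the original file): callers obtain the
# tokenizer through this factory, typically along the lines of
#   tokenizer = create(dsn, config.project_dir / 'tokenizer')
#   tokenizer.init_from_project(config)   # for an already imported database
# where data_dir is the per-project directory holding tokenizer data such as
# the generated tokenizer.php.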

def _install_module(config_module_path: str, src_dir: Path, module_dir: Path) -> str:
    """ Copies the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.
    """
    # Custom module locations are simply used as is.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        return config_module_path

    # Compatibility mode for builddir installations.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return str(module_dir)

    # In any other case install the module in the project directory.
    if not module_dir.exists():
        module_dir.mkdir()

    destfile = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
    destfile.chmod(0o755)

    LOG.info('Database module installed at %s', str(destfile))

    return str(module_dir)
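

# nominatim.so is loaded by the PostgreSQL server process, not by this Python
# process, so the path returned above must be readable from the database server.
# _check_module() verifies this by creating and immediately dropping a dummy
# C-language function backed by the module.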

def _check_module(module_dir: str, conn: Connection) -> None:
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.
    """
    with conn.cursor() as cur:
        try:
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS %s, 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """, (f'{module_dir}/nominatim.so', ))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err


class LegacyTokenizer(AbstractTokenizer):
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization: Optional[str] = None
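

    # Lifecycle: init_new_db() is run once when the database is imported and
    # persists its settings via _save_config(); later tool runs go through
    # init_from_project(), which restores the normalization rules from the
    # database properties.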

    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        assert config.project_dir is not None
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config, overwrite=True)

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)
            conn.commit()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        assert config.project_dir is not None

        with connect(self.dsn) as conn:
            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)

        if not (config.project_dir / 'module' / 'nominatim.so').exists():
            _install_module(config.DATABASE_MODULE_PATH,
                            config.lib_dir.module,
                            config.project_dir / 'module')

        self._install_php(config, overwrite=False)


    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
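

    # update_sql_functions() re-creates the tokenizer's SQL functions; the
    # SQLPreprocessor expands legacy_tokenizer.sql as a template, substituting
    # the max_word_freq and modulepath values passed below.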

    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        assert config.project_dir is not None

        with connect(self.dsn) as conn:
            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
            modulepath = config.DATABASE_MODULE_PATH or \
                         str((config.project_dir / 'module').resolve())
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                              max_word_freq=max_word_freq,
                              modulepath=modulepath)


    def check_database(self, _: Configuration) -> Optional[str]:
        """ Check that the tokenizer is set up correctly.
        """
        hint = """\
             The PostgreSQL extension nominatim.so was not correctly loaded.

             Error: {error}

             Hints:
             * Check the output of the CMake/make installation step
             * Does nominatim.so exist?
             * Does nominatim.so exist on the database server?
             * Can nominatim.so be accessed by the database user?
             """
        with connect(self.dsn) as conn:
            try:
                out = execute_scalar(conn, "SELECT make_standard_name('a')")
            except psycopg2.Error as err:
                return hint.format(error=str(err))

        if out != 'a':
            return hint.format(error='Unexpected result for make_standard_name()')

        return None


    def migrate_database(self, config: Configuration) -> None:
        """ Initialise the project directory of an existing database for
            use with this tokenizer.

            This is a special migration function for updating existing databases
            to new software versions.
        """
        assert config.project_dir is not None

        self.normalization = config.TERM_NORMALIZATION
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)
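

    # In the legacy word table, tokens for full words carry a leading blank
    # while partial terms do not; update_statistics() therefore restricts the
    # recomputed counts to word_token LIKE ' %'.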

    def update_statistics(self, config: Configuration, threads: int = 1) -> None:
        """ Recompute the frequency of full words.
        """
        with connect(self.dsn) as conn:
            if table_exists(conn, 'search_name'):
                drop_tables(conn, "word_frequencies")
                with conn.cursor() as cur:
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word SET search_name_count = count
                                   FROM word_frequencies
                                   WHERE word_token like ' %' and word_id = id""")
                drop_tables(conn, "word_frequencies")
            conn.commit()


    def update_word_tokens(self) -> None:
        """ No house-keeping implemented for the legacy tokenizer.
        """
        LOG.info("No tokenizer clean-up available.")
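

    # Each analyzer gets its own ICU transliterator compiled from the
    # normalization rules loaded from the database, so query-time normalization
    # matches what was used during import.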

    def name_analyzer(self) -> 'LegacyNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.normalize(...)
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        normalizer = Transliterator.createFromRules("phrase normalizer",
                                                    self.normalization)
        return LegacyNameAnalyzer(self.dsn, normalizer)


    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
        """
        with conn.cursor() as cur:
            cur.execute(""" SELECT word FROM word WHERE word is not null
                              ORDER BY search_name_count DESC LIMIT %s""", (num,))
            return list(s[0] for s in cur)


    def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
        """ Install the php script for the tokenizer.
        """
        if config.lib_dir.php is not None:
            php_file = self.data_dir / "tokenizer.php"

            if not php_file.exists() or overwrite:
                php_file.write_text(dedent(f"""\
                    <?php
                    @define('CONST_Max_Word_Frequency', {config.MAX_WORD_FREQUENCY});
                    @define('CONST_Term_Normalization_Rules', "{config.TERM_NORMALIZATION}");
                    require_once('{config.lib_dir.php}/tokenizer/legacy_tokenizer.php');
                    """), encoding='utf-8')


    def _init_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
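

    # _save_config() stores the settings that must stay stable for the lifetime
    # of the database (normalization rules and maximum word frequency) as
    # database properties, where init_from_project() and update_sql_functions()
    # read them back.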

    def _save_config(self, conn: Connection, config: Configuration) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.normalization is not None

        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)


class LegacyNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the special PostgreSQL module for
        splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, normalizer: Any):
        self.conn: Optional[Connection] = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
        register_hstore(self.conn)

        self._cache = _TokenCache(self.conn)


    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.
            If a word starts with '#' it is assumed to be a full name,
            otherwise a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        assert self.conn is not None
        with self.conn.cursor() as cur:
            cur.execute("""SELECT t.term, word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = (CASE
                                   WHEN left(t.term, 1) = '#' THEN
                                     ' ' || make_standard_name(substring(t.term from 2))
                                   ELSE
                                     make_standard_name(t.term)
                                   END)
                                 and class is null and country_code is null""",
                        (words, ))

            return [(r[0], r[1], r[2]) for r in cur]


    def normalize(self, phrase: str) -> str:
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return cast(str, self.normalizer.transliterate(phrase))


    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
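

    # update_postcodes_from_db() reconciles postcode tokens with the
    # location_postcode table: tokens without a matching postcode are deleted,
    # and missing postcodes get a token via the SQL function create_postcode_id().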

    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None

        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            to_delete = []
            to_add = []

            for postcode, word in cur:
                if postcode is None:
                    to_delete.append(word)
                else:
                    to_add.append(postcode)

            if to_delete:
                cur.execute("""DELETE FROM WORD
                               WHERE class ='place' and type = 'postcode'
                                     and word = any(%s)
                            """, (to_delete, ))
            if to_add:
                cur.execute("""SELECT count(create_postcode_id(pc))
                               FROM unnest(%s) as pc
                            """, (to_add, ))
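

    # Special phrases are updated as a set difference: phrases not yet in the
    # word table are inserted and, when should_replace is set, phrases that are
    # no longer in the input are deleted.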

    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
        """
        assert self.conn is not None

        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                cur.execute_values(
                    """ INSERT INTO word (word_id, word_token, word, class, type,
                                          search_name_count, operator)
                        (SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
                                class, type, 0,
                                CASE WHEN op in ('in', 'near') THEN op ELSE null END
                           FROM (VALUES %s) as v(name, class, type, op))""",
                    to_add)

            if to_delete and should_replace:
                cur.execute_values(
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))


    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add names for the given country to the search index.
        """
        assert self.conn is not None

        with self.conn.cursor() as cur:
            cur.execute(
                """INSERT INTO word (word_id, word_token, country_code)
                   (SELECT nextval('seq_word'), lookup_token, %s
                      FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
                            FROM unnest(%s)n) y
                      WHERE NOT EXISTS(SELECT * FROM word
                                       WHERE word_token = lookup_token and country_code = %s))
                """, (country_code, list(names.values()), country_code))
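

    # process_place() aggregates name, housenumber, street/place and other
    # address tokens into a _TokenInfo dictionary that is handed back to the
    # indexer through the place's token_info field.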

    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        assert self.conn is not None

        token_info = _TokenInfo(self._cache)

        names = place.name

        if names:
            token_info.add_names(self.conn, names)

            if place.is_country():
                assert place.country_code is not None
                self.add_country_names(place.country_code, names)

        address = place.address
        if address:
            self._process_place_address(token_info, address)

        return token_info.data


    def _process_place_address(self, token_info: '_TokenInfo', address: Mapping[str, str]) -> None:
        assert self.conn is not None
        hnrs = []
        addr_terms = []

        for key, value in address.items():
            if key == 'postcode':
                # Make sure the normalized postcode is present in the word table.
                if re.search(r'[:,;]', value) is None:
                    norm_pc = self.normalize_postcode(value)
                    token_info.set_postcode(norm_pc)
                    self._cache.add_postcode(self.conn, norm_pc)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(self.conn, value)
            elif key == 'place':
                token_info.add_place(self.conn, value)
            elif not key.startswith('_') \
                 and key not in ('country', 'full', 'inclusion'):
                addr_terms.append((key, value))

        if hnrs:
            token_info.add_housenumbers(self.conn, hnrs)

        if addr_terms:
            token_info.add_address_terms(self.conn, addr_terms)


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache: '_TokenCache') -> None:
        self.cache = cache
        self.data: Dict[str, Any] = {}


    def add_names(self, conn: Connection, names: Mapping[str, str]) -> None:
        """ Add token information for the names of the place.
        """
        # Create the token IDs for all names.
        self.data['names'] = execute_scalar(conn, "SELECT make_keywords(%s)::text",
                                            (names, ))
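

    # Plain housenumbers from 1 to 100 are served from the precomputed cache in
    # _TokenCache; anything else is split on ';' or ',' and resolved with a
    # single call to the SQL function create_housenumbers().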

    def add_housenumbers(self, conn: Connection, hnrs: Sequence[str]) -> None:
        """ Extract housenumber information from the address.
        """
        if len(hnrs) == 1:
            token = self.cache.get_housenumber(hnrs[0])
            if token is not None:
                self.data['hnr_tokens'] = token
                self.data['hnr'] = hnrs[0]
                return

        # split numbers if necessary
        simple_list: List[str] = []
        for hnr in hnrs:
            simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

        if len(simple_list) > 1:
            simple_list = list(set(simple_list))

        with conn.cursor() as cur:
            cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
            result = cur.fetchone()
            assert result is not None
            self.data['hnr_tokens'], self.data['hnr'] = result


    def set_postcode(self, postcode: str) -> None:
        """ Set or replace the postcode token with the given value.
        """
        self.data['postcode'] = postcode


    def add_street(self, conn: Connection, street: str) -> None:
        """ Add addr:street match terms.
        """
        def _get_street(name: str) -> Optional[str]:
            return cast(Optional[str],
                        execute_scalar(conn, "SELECT word_ids_from_name(%s)::text", (name, )))

        tokens = self.cache.streets.get(street, _get_street)
        self.data['street'] = tokens or '{}'


    def add_place(self, conn: Connection, place: str) -> None:
        """ Add addr:place search and match terms.
        """
        def _get_place(name: str) -> Tuple[List[int], List[int]]:
            with conn.cursor() as cur:
                cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cast(Tuple[List[int], List[int]], cur.fetchone())

        self.data['place_search'], self.data['place_match'] = \
            self.cache.places.get(place, _get_place)


    def add_address_terms(self, conn: Connection, terms: Sequence[Tuple[str, str]]) -> None:
        """ Add additional address terms.
        """
        def _get_address_term(name: str) -> Tuple[List[int], List[int]]:
            with conn.cursor() as cur:
                cur.execute("""SELECT addr_ids_from_name(%s)::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cast(Tuple[List[int], List[int]], cur.fetchone())

        tokens = {}
        for key, value in terms:
            items = self.cache.address_terms.get(value, _get_address_term)
            if items[0] or items[1]:
                tokens[key] = items

        if tokens:
            self.data['addr'] = tokens
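

# A minimal sketch of how the _LRU cache below behaves (illustrative only):
#   cache = _LRU(maxsize=2)
#   cache.get('a', len)   # miss: calls len('a') and caches the result
#   cache.get('a', len)   # hit: returns the cached value, marks 'a' as recent
# When the cache is full, the least recently used entry is evicted.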

class _LRU:
    """ Least recently used cache that accepts a generator function to
        produce the item when there is a cache miss.
    """

    def __init__(self, maxsize: int = 128):
        self.data: 'OrderedDict[str, Any]' = OrderedDict()
        self.maxsize = maxsize


    def get(self, key: str, generator: Callable[[str], Any]) -> Any:
        """ Get the item with the given key from the cache. If nothing
            is found in the cache, generate the value through the
            generator function and store it in the cache.
        """
        value = self.data.get(key)
        if value is not None:
            self.data.move_to_end(key)
        else:
            value = generator(key)
            if len(self.data) >= self.maxsize:
                self.data.popitem(last=False)
            self.data[key] = value

        return value


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self, conn: Connection):
        # various LRU caches
        self.streets = _LRU(maxsize=256)
        self.places = _LRU(maxsize=128)
        self.address_terms = _LRU(maxsize=1024)

        # Look up housenumbers up to 100 and cache them
        with conn.cursor() as cur:
            cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
                           FROM generate_series(1, 100) as i""")
            self._cached_housenumbers: Dict[str, str] = {str(r[0]): r[1] for r in cur}

        # For postcodes remember the ones that have already been added
        self.postcodes: Set[str] = set()

    def get_housenumber(self, number: str) -> Optional[str]:
        """ Get a housenumber token from the cache.
        """
        return self._cached_housenumbers.get(number)


    def add_postcode(self, conn: Connection, postcode: str) -> None:
        """ Make sure the given postcode is in the database.
        """
        if postcode not in self.postcodes:
            with conn.cursor() as cur:
                cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
            self.postcodes.add(postcode)