# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, \
                   cast, Dict, Set, Iterable
from collections import OrderedDict
import logging
from pathlib import Path
import re
import shutil

from icu import Transliterator
import psycopg
from psycopg import sql as pysql

from ..errors import UsageError
from ..db.connection import connect, Connection, drop_tables, table_exists,\
                            execute_scalar, register_hstore
from ..config import Configuration
from ..db import properties
from ..db import utils as db_utils
from ..db.sql_preprocessor import SQLPreprocessor
from ..data.place_info import PlaceInfo
from .base import AbstractAnalyzer, AbstractTokenizer

DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

LOG = logging.getLogger()


def create(dsn: str, data_dir: Path) -> 'LegacyTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    LOG.warning('WARNING: the legacy tokenizer is deprecated '
                'and will be removed in Nominatim 5.0.')
    return LegacyTokenizer(dsn, data_dir)
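
# A minimal usage sketch of the factory above (illustrative only; in practice
# the function is invoked through Nominatim's tokenizer loading machinery):
#
#   tokenizer = create('dbname=nominatim', Path('/srv/project/tokenizer'))
#   tokenizer.init_from_project(config)   # with `config` a loaded Configuration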


def _install_module(config_module_path: str, src_dir: Optional[Path], module_dir: Path) -> str:
    """ Copies the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.
    """
    # Custom module locations are simply used as is.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        return config_module_path

    # Otherwise a source dir must be given.
    if src_dir is None:
        raise UsageError("The legacy tokenizer cannot be used with the Nominatim pip module.")

    # Compatibility mode for builddir installations.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return str(module_dir)

    # In any other case install the module in the project directory.
    if not module_dir.exists():
        module_dir.mkdir()

    destfile = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))

    LOG.info('Database module installed at %s', str(destfile))

    return str(module_dir)
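
# For illustration only (hypothetical paths): with no custom module path
# configured, a call like
#
#   _install_module('', Path('/usr/local/lib/nominatim/module'),
#                   Path('/srv/nominatim-project/module'))
#
# copies nominatim.so into the project's 'module' directory and returns
# '/srv/nominatim-project/module'.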


def _check_module(module_dir: str, conn: Connection) -> None:
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.
    """
    with conn.cursor() as cur:
        try:
            cur.execute(pysql.SQL("""CREATE FUNCTION nominatim_test_import_func(text)
                                     RETURNS text AS {}, 'transliteration'
                                     LANGUAGE c IMMUTABLE STRICT;
                                     DROP FUNCTION nominatim_test_import_func(text)
                                  """).format(pysql.Literal(f'{module_dir}/nominatim.so')))
        except psycopg.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err


class LegacyTokenizer(AbstractTokenizer):
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization: Optional[str] = None

    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        assert config.project_dir is not None
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        self.normalization = config.TERM_NORMALIZATION

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)
            conn.commit()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        assert config.project_dir is not None

        with connect(self.dsn) as conn:
            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)

        if not (config.project_dir / 'module' / 'nominatim.so').exists():
            _install_module(config.DATABASE_MODULE_PATH,
                            config.lib_dir.module,
                            config.project_dir / 'module')

    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')

    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        assert config.project_dir is not None

        with connect(self.dsn) as conn:
            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
            modulepath = config.DATABASE_MODULE_PATH or \
                         str((config.project_dir / 'module').resolve())
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                              max_word_freq=max_word_freq,
                              modulepath=modulepath)

    def check_database(self, _: Configuration) -> Optional[str]:
        """ Check that the tokenizer is set up correctly.
        """
        hint = """\
             The PostgreSQL extension nominatim.so was not correctly loaded.

             Error: {error}

             Possible causes:
               * Check the output of the CMake/make installation step
               * Does nominatim.so exist?
               * Does nominatim.so exist on the database server?
               * Can nominatim.so be accessed by the database user?
             """
        with connect(self.dsn) as conn:
            try:
                out = execute_scalar(conn, "SELECT make_standard_name('a')")
            except psycopg.Error as err:
                return hint.format(error=str(err))

        if out != 'a':
            return hint.format(error='Unexpected result for make_standard_name()')

        return None

    def migrate_database(self, config: Configuration) -> None:
        """ Initialise the project directory of an existing database for
            use with this tokenizer.

            This is a special migration function for updating existing databases
            to new software versions.
        """
        assert config.project_dir is not None

        self.normalization = config.TERM_NORMALIZATION
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)

    def update_statistics(self, config: Configuration, threads: int = 1) -> None:
        """ Recompute the frequency of full words.
        """
        with connect(self.dsn) as conn:
            if table_exists(conn, 'search_name'):
                drop_tables(conn, "word_frequencies")
                with conn.cursor() as cur:
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word SET search_name_count = count
                                   FROM word_frequencies
                                   WHERE word_token like ' %' and word_id = id""")
                drop_tables(conn, "word_frequencies")
            conn.commit()

    def update_word_tokens(self) -> None:
        """ No house-keeping implemented for the legacy tokenizer.
        """
        LOG.info("No tokenizer clean-up available.")

    def name_analyzer(self) -> 'LegacyNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    ...

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        normalizer = Transliterator.createFromRules("phrase normalizer",
                                                    self.normalization)
        return LegacyNameAnalyzer(self.dsn, normalizer)

    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
        """
        with conn.cursor() as cur:
            cur.execute(""" SELECT word FROM word WHERE word is not null
                              ORDER BY search_name_count DESC LIMIT %s""", (num,))
            return list(s[0] for s in cur)

    def _init_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')

    def _save_config(self, conn: Connection, config: Configuration) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.normalization is not None

        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)


class LegacyNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the special PostgreSQL module for
        splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, normalizer: Any):
        self.conn: Optional[Connection] = connect(dsn)
        self.conn.autocommit = True
        self.normalizer = normalizer
        register_hstore(self.conn)

        self._cache = _TokenCache(self.conn)

    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        assert self.conn is not None
        with self.conn.cursor() as cur:
            cur.execute("""SELECT t.term, word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = (CASE
                                   WHEN left(t.term, 1) = '#' THEN
                                     ' ' || make_standard_name(substring(t.term from 2))
                                   ELSE
                                     make_standard_name(t.term)
                                   END)
                                 and class is null and country_code is null""",
                        (words, ))

            return [(r[0], r[1], r[2]) for r in cur]
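
    # Illustrative result shape (values made up): querying ['#Main Street', 'main']
    # might return [('#Main Street', ' main street', 1234), ('main', 'main', 567)],
    # i.e. full-name tokens carry a leading space, partial-name tokens do not.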

    def normalize(self, phrase: str) -> str:
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return cast(str, self.normalizer.transliterate(phrase))

    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
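
    # For example, ' ab1 2cd ' is normalized to 'AB1 2CD' (whitespace stripped,
    # letters upper-cased).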

    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None

        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            to_delete = []
            to_add = []

            for postcode, word in cur:
                if postcode is None:
                    to_delete.append(word)
                else:
                    to_add.append(postcode)

            if to_delete:
                cur.execute("""DELETE FROM WORD
                               WHERE class ='place' and type = 'postcode'
                                     and word = any(%s)
                            """, (to_delete, ))
            if to_add:
                cur.execute("""SELECT count(create_postcode_id(pc))
                               FROM unnest(%s::text[]) as pc
                            """, (to_add, ))

    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
        """
        assert self.conn is not None

        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class as cls, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                cur.executemany(
                    """ INSERT INTO word (word_id, word_token, word, class, type,
                                          search_name_count, operator)
                        (SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
                                class, type, 0,
                                CASE WHEN op in ('in', 'near') THEN op ELSE null END
                           FROM (VALUES (%s, %s, %s, %s)) as v(name, class, type, op))""",
                    to_add)

            if to_delete and should_replace:
                cur.executemany(
                    """ DELETE FROM word
                          USING (VALUES (%s, %s, %s, %s)) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))
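
    # Each phrase is a 4-tuple of (label, class, type, operator), for example
    # (illustrative values): ('restaurant', 'amenity', 'restaurant', '-') or
    # ('hotel in', 'tourism', 'hotel', 'in'); '-' stands for "no operator".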

    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add names for the given country to the search index.
        """
        assert self.conn is not None

        with self.conn.cursor() as cur:
            cur.execute(
                """INSERT INTO word (word_id, word_token, country_code)
                   (SELECT nextval('seq_word'), lookup_token, %s
                      FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
                            FROM unnest(%s::TEXT[])n) y
                      WHERE NOT EXISTS(SELECT * FROM word
                                       WHERE word_token = lookup_token and country_code = %s))
                """, (country_code, list(names.values()), country_code))
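
    # A typical call (illustrative): add_country_names('de', {'name': 'Deutschland',
    # 'name:en': 'Germany'}) inserts one country word token per distinct name that
    # is not yet in the word table.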

    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        assert self.conn is not None

        token_info = _TokenInfo(self._cache)

        names = place.name

        if names:
            token_info.add_names(self.conn, names)

            if place.is_country():
                assert place.country_code is not None
                self.add_country_names(place.country_code, names)

        address = place.address
        if address:
            self._process_place_address(token_info, address)

        return token_info.data
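
    # Depending on the place, the returned dict may contain the keys produced by
    # _TokenInfo below: 'names', 'hnr'/'hnr_tokens', 'street', 'place_search',
    # 'place_match', 'addr' and 'postcode'.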

    def _process_place_address(self, token_info: '_TokenInfo', address: Mapping[str, str]) -> None:
        assert self.conn is not None

        hnrs = []
        addr_terms = []

        for key, value in address.items():
            if key == 'postcode':
                # Make sure the normalized postcode is present in the word table.
                if re.search(r'[:,;]', value) is None:
                    norm_pc = self.normalize_postcode(value)
                    token_info.set_postcode(norm_pc)
                    self._cache.add_postcode(self.conn, norm_pc)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(self.conn, value)
            elif key == 'place':
                token_info.add_place(self.conn, value)
            elif not key.startswith('_') \
                 and key not in ('country', 'full', 'inclusion'):
                addr_terms.append((key, value))

        if hnrs:
            token_info.add_housenumbers(self.conn, hnrs)

        if addr_terms:
            token_info.add_address_terms(self.conn, addr_terms)


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """

    def __init__(self, cache: '_TokenCache') -> None:
        self.cache = cache
        self.data: Dict[str, Any] = {}

    def add_names(self, conn: Connection, names: Mapping[str, str]) -> None:
        """ Add token information for the names of the place.
        """
        # Create the token IDs for all names.
        self.data['names'] = execute_scalar(conn, "SELECT make_keywords(%s)::text",
                                            (names, ))

    def add_housenumbers(self, conn: Connection, hnrs: Sequence[str]) -> None:
        """ Extract housenumber information from the address.
        """
        if len(hnrs) == 1:
            token = self.cache.get_housenumber(hnrs[0])
            if token is not None:
                self.data['hnr_tokens'] = token
                self.data['hnr'] = hnrs[0]
                return

        # split numbers if necessary
        simple_list: List[str] = []
        for hnr in hnrs:
            simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

        if len(simple_list) > 1:
            simple_list = list(set(simple_list))

        with conn.cursor() as cur:
            cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
            result = cur.fetchone()
            assert result is not None
            self.data['hnr_tokens'], self.data['hnr'] = result
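
    # Illustrative behaviour: for hnrs == ['1;2b, 3'] the precomputed cache is
    # bypassed and the numbers '1', '2b' and '3' are handed to the SQL function
    # create_housenumbers() as a deduplicated list.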

    def set_postcode(self, postcode: str) -> None:
        """ Set or replace the postcode token with the given value.
        """
        self.data['postcode'] = postcode

    def add_street(self, conn: Connection, street: str) -> None:
        """ Add addr:street match terms.
        """
        def _get_street(name: str) -> Optional[str]:
            return cast(Optional[str],
                        execute_scalar(conn, "SELECT word_ids_from_name(%s)::text", (name, )))

        tokens = self.cache.streets.get(street, _get_street)
        self.data['street'] = tokens or '{}'

    def add_place(self, conn: Connection, place: str) -> None:
        """ Add addr:place search and match terms.
        """
        def _get_place(name: str) -> Tuple[List[int], List[int]]:
            with conn.cursor() as cur:
                cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cast(Tuple[List[int], List[int]], cur.fetchone())

        self.data['place_search'], self.data['place_match'] = \
            self.cache.places.get(place, _get_place)

    def add_address_terms(self, conn: Connection, terms: Sequence[Tuple[str, str]]) -> None:
        """ Add additional address terms.
        """
        def _get_address_term(name: str) -> Tuple[List[int], List[int]]:
            with conn.cursor() as cur:
                cur.execute("""SELECT addr_ids_from_name(%s)::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cast(Tuple[List[int], List[int]], cur.fetchone())

        tokens = {}
        for key, value in terms:
            items = self.cache.address_terms.get(value, _get_address_term)
            if items[0] or items[1]:
                tokens[key] = items

        if tokens:
            self.data['addr'] = tokens


class _LRU:
    """ Least recently used cache that accepts a generator function to
        produce the item when there is a cache miss.
    """

    def __init__(self, maxsize: int = 128):
        self.data: 'OrderedDict[str, Any]' = OrderedDict()
        self.maxsize = maxsize

    def get(self, key: str, generator: Callable[[str], Any]) -> Any:
        """ Get the item with the given key from the cache. If nothing
            is found in the cache, generate the value through the
            generator function and store it in the cache.
        """
        value = self.data.get(key)
        if value is not None:
            self.data.move_to_end(key)
        else:
            value = generator(key)
            if len(self.data) >= self.maxsize:
                self.data.popitem(last=False)
            self.data[key] = value

        return value
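
# A minimal usage sketch of the cache above (hypothetical generator function):
#
#   cache = _LRU(maxsize=2)
#   cache.get('a', len)                       # miss: stores len('a') == 1
#   cache.get('a', len)                       # hit: cached value is returned
#   cache.get('b', len); cache.get('c', len)  # 'a' is evicted (least recently used)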


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """

    def __init__(self, conn: Connection):
        # Various LRU caches
        self.streets = _LRU(maxsize=256)
        self.places = _LRU(maxsize=128)
        self.address_terms = _LRU(maxsize=1024)

        # Look up housenumbers up to 100 and cache them
        with conn.cursor() as cur:
            cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
                           FROM generate_series(1, 100) as i""")
            self._cached_housenumbers: Dict[str, str] = {str(r[0]): r[1] for r in cur}

        # For postcodes remember the ones that have already been added
        self.postcodes: Set[str] = set()

    def get_housenumber(self, number: str) -> Optional[str]:
        """ Get a housenumber token from the cache.
        """
        return self._cached_housenumbers.get(number)

    def add_postcode(self, conn: Connection, postcode: str) -> None:
        """ Make sure the given postcode is in the database.
        """
        if postcode not in self.postcodes:
            with conn.cursor() as cur:
                cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
            self.postcodes.add(postcode)