# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, \
                   cast, Dict, Set, Iterable
from collections import OrderedDict
import logging
from pathlib import Path
import re
import shutil
from textwrap import dedent

from icu import Transliterator
import psycopg
from psycopg import sql as pysql

from ..errors import UsageError
from ..db.connection import connect, Connection, drop_tables, table_exists,\
                            execute_scalar, register_hstore
from ..config import Configuration
from ..db import properties
from ..db import utils as db_utils
from ..db.sql_preprocessor import SQLPreprocessor
from ..data.place_info import PlaceInfo
from .base import AbstractAnalyzer, AbstractTokenizer

DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

LOG = logging.getLogger()


def create(dsn: str, data_dir: Path) -> 'LegacyTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    LOG.warning('WARNING: the legacy tokenizer is deprecated '
                'and will be removed in Nominatim 5.0.')
    return LegacyTokenizer(dsn, data_dir)
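
# Illustrative usage sketch (not executed here): `create()` is normally invoked
# by Nominatim's tokenizer loading machinery rather than called directly. The
# DSN, paths and the `config` object below are assumptions for illustration:
#
#     tokenizer = create('dbname=nominatim', Path('my-project') / 'tokenizer')
#     tokenizer.init_from_project(config)        # `config` is a loaded Configuration
#     with tokenizer.name_analyzer() as analyzer:
#         analyzer.normalize('Main Street')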


def _install_module(config_module_path: str, src_dir: Optional[Path], module_dir: Path) -> str:
    """ Copies the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.
    """
    # Custom module locations are simply used as is.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        return config_module_path

    # Otherwise a source directory must be given.
    if src_dir is None:
        raise UsageError("The legacy tokenizer cannot be used with the Nominatim pip module.")

    # Compatibility mode for builddir installations.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return str(module_dir)

    # In any other case install the module in the project directory.
    if not module_dir.exists():
        module_dir.mkdir()

    destfile = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
    destfile.chmod(0o755)

    LOG.info('Database module installed at %s', str(destfile))

    return str(module_dir)


def _check_module(module_dir: str, conn: Connection) -> None:
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.
    """
    with conn.cursor() as cur:
        try:
            cur.execute(pysql.SQL("""CREATE FUNCTION nominatim_test_import_func(text)
                                     RETURNS text AS {}, 'transliteration'
                                     LANGUAGE c IMMUTABLE STRICT;
                                     DROP FUNCTION nominatim_test_import_func(text)
                                 """).format(pysql.Literal(f'{module_dir}/nominatim.so')))
        except psycopg.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err


class LegacyTokenizer(AbstractTokenizer):
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization: Optional[str] = None


    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even across updates.
        """
        assert config.project_dir is not None
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config, overwrite=True)

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)
            conn.commit()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        assert config.project_dir is not None

        with connect(self.dsn) as conn:
            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)

        if not (config.project_dir / 'module' / 'nominatim.so').exists():
            _install_module(config.DATABASE_MODULE_PATH,
                            config.lib_dir.module,
                            config.project_dir / 'module')

        self._install_php(config, overwrite=False)


    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        assert config.project_dir is not None

        with connect(self.dsn) as conn:
            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
            modulepath = config.DATABASE_MODULE_PATH or \
                         str((config.project_dir / 'module').resolve())
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                              max_word_freq=max_word_freq,
                              modulepath=modulepath)


    def check_database(self, _: Configuration) -> Optional[str]:
        """ Check that the tokenizer is set up correctly.
        """
        hint = """\
             The PostgreSQL module 'nominatim.so' was not correctly loaded.

             Error: {error}

             Hints:
             * Check the output of the CMake/make installation step
             * Does nominatim.so exist?
             * Does nominatim.so exist on the database server?
             * Can nominatim.so be accessed by the database user?
             """
        with connect(self.dsn) as conn:
            try:
                out = execute_scalar(conn, "SELECT make_standard_name('a')")
            except psycopg.Error as err:
                return hint.format(error=str(err))

        if out != 'a':
            return hint.format(error='Unexpected result for make_standard_name()')

        return None


    def migrate_database(self, config: Configuration) -> None:
        """ Initialise the project directory of an existing database for
            use with this tokenizer.

            This is a special migration function for updating existing databases
            to new software versions.
        """
        assert config.project_dir is not None

        self.normalization = config.TERM_NORMALIZATION
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)


    def update_statistics(self, config: Configuration, threads: int = 1) -> None:
        """ Recompute the frequency of full words.
        """
        with connect(self.dsn) as conn:
            if table_exists(conn, 'search_name'):
                drop_tables(conn, "word_frequencies")
                with conn.cursor() as cur:
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word SET search_name_count = count
                                   FROM word_frequencies
                                   WHERE word_token like ' %' and word_id = id""")
                    drop_tables(conn, "word_frequencies")
            conn.commit()


    def update_word_tokens(self) -> None:
        """ No house-keeping implemented for the legacy tokenizer.
        """
        LOG.info("No tokenizer clean-up available.")


    def name_analyzer(self) -> 'LegacyNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must make sure
            to call close() before the analyzer is destroyed.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        normalizer = Transliterator.createFromRules("phrase normalizer",
                                                    self.normalization)
        return LegacyNameAnalyzer(self.dsn, normalizer)


    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
        """
        with conn.cursor() as cur:
            cur.execute("""SELECT word FROM word WHERE word is not null
                           ORDER BY search_name_count DESC LIMIT %s""", (num,))
            return list(s[0] for s in cur)


    def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
        """ Install the PHP script for the tokenizer.
        """
        if config.lib_dir.php is not None:
            php_file = self.data_dir / "tokenizer.php"

            if not php_file.exists() or overwrite:
                php_file.write_text(dedent(f"""\
                    <?php
                    @define('CONST_Max_Word_Frequency', {config.MAX_WORD_FREQUENCY});
                    @define('CONST_Term_Normalization_Rules', "{config.TERM_NORMALIZATION}");
                    require_once('{config.lib_dir.php}/tokenizer/legacy_tokenizer.php');
                    """), encoding='utf-8')


    def _init_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')


    def _save_config(self, conn: Connection, config: Configuration) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.normalization is not None

        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)


class LegacyNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the special PostgreSQL module for
        splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, normalizer: Any):
        self.conn: Optional[Connection] = connect(dsn)
        self.conn.autocommit = True
        self.normalizer = normalizer
        register_hstore(self.conn)

        self._cache = _TokenCache(self.conn)


    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.
            If a word starts with '#', it is assumed to be a full name,
            otherwise a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is meant for testing and debugging only
            and is not necessarily efficient.
        """
        assert self.conn is not None
        with self.conn.cursor() as cur:
            cur.execute("""SELECT t.term, word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = (CASE
                                   WHEN left(t.term, 1) = '#' THEN
                                     ' ' || make_standard_name(substring(t.term from 2))
                                   ELSE
                                     make_standard_name(t.term)
                                   END)
                                 and class is null and country_code is null""",
                        (words, ))

            return [(r[0], r[1], r[2]) for r in cur]
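
    # Example of the lookup convention above (illustrative only): a leading '#'
    # marks a full name, anything else is looked up as a partial term. The
    # returned tokens and ids depend entirely on the database contents:
    #
    #     analyzer.get_word_token_info(['#Main Street', 'main'])
    #     # -> e.g. [('#Main Street', ' main street', 1234), ('main', 'main', 567)]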


    def normalize(self, phrase: str) -> str:
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return cast(str, self.normalizer.transliterate(phrase))


    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
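
    # For example (follows directly from the implementation above):
    #
    #     analyzer.normalize_postcode(' se10 8xy ')   # -> 'SE10 8XY'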


    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None

        with self.conn.cursor() as cur:
            # Find the rows in location_postcode and word that are
            # missing from the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            to_delete = []
            to_add = []

            for postcode, word in cur:
                if postcode is None:
                    to_delete.append(word)
                else:
                    to_add.append(postcode)

            if to_delete:
                cur.execute("""DELETE FROM WORD
                               WHERE class ='place' and type = 'postcode'
                                     and word = any(%s)
                            """, (to_delete, ))
            if to_add:
                cur.execute("""SELECT count(create_postcode_id(pc))
                               FROM unnest(%s::text[]) as pc
                            """, (to_add, ))


    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
        """
        assert self.conn is not None

        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class as cls, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                cur.executemany(
                    """ INSERT INTO word (word_id, word_token, word, class, type,
                                          search_name_count, operator)
                        (SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
                                class, type, 0,
                                CASE WHEN op in ('in', 'near') THEN op ELSE null END
                           FROM (VALUES (%s, %s, %s, %s)) as v(name, class, type, op))""",
                    to_add)

            if to_delete and should_replace:
                cur.executemany(
                    """ DELETE FROM word
                          USING (VALUES (%s, %s, %s, %s)) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))
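
    # Illustrative call (not executed): each phrase is a tuple of
    # (label, class, type, operator); '-' stands for "no operator" and only
    # 'in' and 'near' are stored as operators:
    #
    #     analyzer.update_special_phrases(
    #         [('restaurant', 'amenity', 'restaurant', '-'),
    #          ('hotels in', 'tourism', 'hotel', 'in')],
    #         should_replace=True)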


    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add names for the given country to the search index.
        """
        assert self.conn is not None

        with self.conn.cursor() as cur:
            cur.execute(
                """INSERT INTO word (word_id, word_token, country_code)
                   (SELECT nextval('seq_word'), lookup_token, %s
                      FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
                              FROM unnest(%s::TEXT[])n) y
                      WHERE NOT EXISTS(SELECT * FROM word
                                       WHERE word_token = lookup_token and country_code = %s))
                """, (country_code, list(names.values()), country_code))


    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        assert self.conn is not None

        token_info = _TokenInfo(self._cache)

        names = place.name

        if names:
            token_info.add_names(self.conn, names)

            if place.is_country():
                assert place.country_code is not None
                self.add_country_names(place.country_code, names)

        address = place.address
        if address:
            self._process_place_address(token_info, address)

        return token_info.data
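
    # Sketch of the returned structure (keys depend on the available data; see
    # _TokenInfo below, where each key is filled in). Values are the textual
    # array/id representations produced by the SQL helper functions, not
    # Python lists; the concrete values here are made up for illustration:
    #
    #     {'names': '{1,2,3}', 'hnr_tokens': '{45}', 'hnr': '12',
    #      'street': '{67}', 'postcode': 'SE10 8XY', 'addr': {...}}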


    def _process_place_address(self, token_info: '_TokenInfo', address: Mapping[str, str]) -> None:
        assert self.conn is not None

        hnrs = []
        addr_terms = []

        for key, value in address.items():
            if key == 'postcode':
                # Make sure the normalized postcode is present in the word table.
                if re.search(r'[:,;]', value) is None:
                    norm_pc = self.normalize_postcode(value)
                    token_info.set_postcode(norm_pc)
                    self._cache.add_postcode(self.conn, norm_pc)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(self.conn, value)
            elif key == 'place':
                token_info.add_place(self.conn, value)
            elif not key.startswith('_') \
                 and key not in ('country', 'full', 'inclusion'):
                addr_terms.append((key, value))

        if hnrs:
            token_info.add_housenumbers(self.conn, hnrs)

        if addr_terms:
            token_info.add_address_terms(self.conn, addr_terms)
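
    # For example, an address mapping like
    #
    #     {'housenumber': '12;14', 'street': 'High Street',
    #      'postcode': 'SE10 8XY', 'city': 'London'}
    #
    # routes the housenumber and street to their dedicated token helpers,
    # records the normalized postcode and collects 'city' as a generic
    # address term (the values shown are illustrative only).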


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache: '_TokenCache') -> None:
        self.cache = cache
        self.data: Dict[str, Any] = {}


    def add_names(self, conn: Connection, names: Mapping[str, str]) -> None:
        """ Add token information for the names of the place.
        """
        # Create the token IDs for all names.
        self.data['names'] = execute_scalar(conn, "SELECT make_keywords(%s)::text",
                                            (names, ))


    def add_housenumbers(self, conn: Connection, hnrs: Sequence[str]) -> None:
        """ Extract housenumber information from the address.
        """
        if len(hnrs) == 1:
            token = self.cache.get_housenumber(hnrs[0])
            if token is not None:
                self.data['hnr_tokens'] = token
                self.data['hnr'] = hnrs[0]
                return

        # Split multi-value housenumbers if necessary.
        simple_list: List[str] = []
        for hnr in hnrs:
            simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

        if len(simple_list) > 1:
            simple_list = list(set(simple_list))

        with conn.cursor() as cur:
            cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
            result = cur.fetchone()
            assert result is not None
            self.data['hnr_tokens'], self.data['hnr'] = result
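
    # Example of the splitting above (illustrative): hnrs == ['4;6', '8'] is
    # flattened and deduplicated to ['4', '6', '8'] before being handed to the
    # SQL function create_housenumbers(); a single housenumber between 1 and
    # 100 is served from the cache and skips the query entirely.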


    def set_postcode(self, postcode: str) -> None:
        """ Set or replace the postcode token with the given value.
        """
        self.data['postcode'] = postcode


    def add_street(self, conn: Connection, street: str) -> None:
        """ Add addr:street match terms.
        """
        def _get_street(name: str) -> Optional[str]:
            return cast(Optional[str],
                        execute_scalar(conn, "SELECT word_ids_from_name(%s)::text", (name, )))

        tokens = self.cache.streets.get(street, _get_street)
        self.data['street'] = tokens or '{}'


    def add_place(self, conn: Connection, place: str) -> None:
        """ Add addr:place search and match terms.
        """
        def _get_place(name: str) -> Tuple[List[int], List[int]]:
            with conn.cursor() as cur:
                cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cast(Tuple[List[int], List[int]], cur.fetchone())

        self.data['place_search'], self.data['place_match'] = \
            self.cache.places.get(place, _get_place)


    def add_address_terms(self, conn: Connection, terms: Sequence[Tuple[str, str]]) -> None:
        """ Add additional address terms.
        """
        def _get_address_term(name: str) -> Tuple[List[int], List[int]]:
            with conn.cursor() as cur:
                cur.execute("""SELECT addr_ids_from_name(%s)::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cast(Tuple[List[int], List[int]], cur.fetchone())

        tokens = {}
        for key, value in terms:
            items = self.cache.address_terms.get(value, _get_address_term)
            if items[0] or items[1]:
                tokens[key] = items

        if tokens:
            self.data['addr'] = tokens


class _LRU:
    """ Least recently used cache that accepts a generator function to
        produce the item when there is a cache miss.
    """

    def __init__(self, maxsize: int = 128):
        self.data: 'OrderedDict[str, Any]' = OrderedDict()
        self.maxsize = maxsize


    def get(self, key: str, generator: Callable[[str], Any]) -> Any:
        """ Get the item with the given key from the cache. If nothing
            is found in the cache, generate the value through the
            generator function and store it in the cache.
        """
        value = self.data.get(key)
        if value is not None:
            self.data.move_to_end(key)
        else:
            value = generator(key)
            if len(self.data) >= self.maxsize:
                self.data.popitem(last=False)
            self.data[key] = value

        return value
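
    # Usage sketch (illustrative only): the generator is called at most once
    # per missing key and the least recently used entry is evicted once
    # maxsize is reached.
    #
    #     cache = _LRU(maxsize=2)
    #     cache.get('a', lambda k: k.upper())   # miss -> 'A'
    #     cache.get('a', lambda k: k.upper())   # hit, no generator call
    #     cache.get('b', lambda k: k.upper())   # miss -> 'B'
    #     cache.get('c', lambda k: k.upper())   # evicts 'a'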


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self, conn: Connection):
        # Various LRU caches.
        self.streets = _LRU(maxsize=256)
        self.places = _LRU(maxsize=128)
        self.address_terms = _LRU(maxsize=1024)

        # Look up housenumbers up to 100 and cache them.
        with conn.cursor() as cur:
            cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
                           FROM generate_series(1, 100) as i""")
            self._cached_housenumbers: Dict[str, str] = {str(r[0]): r[1] for r in cur}

        # For postcodes remember the ones that have already been added.
        self.postcodes: Set[str] = set()

    def get_housenumber(self, number: str) -> Optional[str]:
        """ Get a housenumber token from the cache.
        """
        return self._cached_housenumbers.get(number)


    def add_postcode(self, conn: Connection, postcode: str) -> None:
        """ Make sure the given postcode is in the database.
        """
        if postcode not in self.postcodes:
            with conn.cursor() as cur:
                cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
            self.postcodes.add(postcode)