# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, \
                   cast, Dict, Set, Iterable
from collections import OrderedDict
import logging
from pathlib import Path
import re
import shutil
from textwrap import dedent

from icu import Transliterator
import psycopg2
import psycopg2.extras

from nominatim.db.connection import connect, Connection
from nominatim.config import Configuration
from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.data.place_info import PlaceInfo
from nominatim.errors import UsageError
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

LOG = logging.getLogger()


def create(dsn: str, data_dir: Path) -> 'LegacyTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyTokenizer(dsn, data_dir)


def _install_module(config_module_path: str, src_dir: Path, module_dir: Path) -> str:
    """ Copy the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.
    """
    # Custom module locations are simply used as is.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        return config_module_path

    # Compatibility mode for builddir installations.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return str(module_dir)

    # In any other case install the module in the project directory.
    if not module_dir.exists():
        module_dir.mkdir()

    destfile = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
    destfile.chmod(0o755)

    LOG.info('Database module installed at %s', str(destfile))

    return str(module_dir)


def _check_module(module_dir: str, conn: Connection) -> None:
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.
    """
    with conn.cursor() as cur:
        try:
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS %s, 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """, (f'{module_dir}/nominatim.so', ))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err


class LegacyTokenizer(AbstractTokenizer):
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization: Optional[str] = None


    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        assert config.project_dir is not None
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config, overwrite=True)

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)
            conn.commit()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        assert config.project_dir is not None

        with connect(self.dsn) as conn:
            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)

        if not (config.project_dir / 'module' / 'nominatim.so').exists():
            _install_module(config.DATABASE_MODULE_PATH,
                            config.lib_dir.module,
                            config.project_dir / 'module')

        self._install_php(config, overwrite=False)


    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        assert config.project_dir is not None

        with connect(self.dsn) as conn:
            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
            modulepath = config.DATABASE_MODULE_PATH or \
                         str((config.project_dir / 'module').resolve())
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                              max_word_freq=max_word_freq,
                              modulepath=modulepath)


    def check_database(self, _: Configuration) -> Optional[str]:
        """ Check that the tokenizer is set up correctly.
        """
        hint = """\
             The PostgreSQL extension nominatim.so was not correctly loaded.

             Error: {error}

             Hints:
             * Check the output of the CMake/make installation step
             * Does nominatim.so exist?
             * Does nominatim.so exist on the database server?
             * Can nominatim.so be accessed by the database user?
             """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                try:
                    out = cur.scalar("SELECT make_standard_name('a')")
                except psycopg2.Error as err:
                    return hint.format(error=str(err))

        if out != 'a':
            return hint.format(error='Unexpected result for make_standard_name()')

        return None


    def migrate_database(self, config: Configuration) -> None:
        """ Initialise the project directory of an existing database for
            use with this tokenizer.

            This is a special migration function for updating existing databases
            to new software versions.
        """
        assert config.project_dir is not None

        self.normalization = config.TERM_NORMALIZATION
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)


    def update_statistics(self) -> None:
        """ Recompute the frequency of full words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Updating word table with recomputed frequencies")
                    cur.execute("""UPDATE word SET search_name_count = count
                                   FROM word_frequencies
                                   WHERE word_token like ' %' and word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()


    def update_word_tokens(self) -> None:
        """ No house-keeping is implemented for the legacy tokenizer.
        """
        LOG.info("No tokenizer clean-up available.")


    def name_analyzer(self) -> 'LegacyNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        normalizer = Transliterator.createFromRules("phrase normalizer",
                                                    self.normalization)
        return LegacyNameAnalyzer(self.dsn, normalizer)


    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
        """
        with conn.cursor() as cur:
            cur.execute("""SELECT word FROM word WHERE word is not null
                             ORDER BY search_name_count DESC LIMIT %s""", (num,))
            return list(s[0] for s in cur)


    def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
        """ Install the PHP script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"

        if not php_file.exists() or overwrite:
            php_file.write_text(dedent(f"""\
                <?php
                @define('CONST_Max_Word_Frequency', {config.MAX_WORD_FREQUENCY});
                @define('CONST_Term_Normalization_Rules', "{config.TERM_NORMALIZATION}");
                require_once('{config.lib_dir.php}/tokenizer/legacy_tokenizer.php');
                """), encoding='utf-8')


    def _init_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')


    def _save_config(self, conn: Connection, config: Configuration) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.normalization is not None

        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)


class LegacyNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the special PostgreSQL module for
        splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, normalizer: Any):
        self.conn: Optional[Connection] = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
        psycopg2.extras.register_hstore(self.conn)

        self._cache = _TokenCache(self.conn)


    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.

            If a word starts with '#' it is assumed to be a full name,
            otherwise it is treated as a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        assert self.conn is not None
        with self.conn.cursor() as cur:
            cur.execute("""SELECT t.term, word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = (CASE
                                   WHEN left(t.term, 1) = '#' THEN
                                     ' ' || make_standard_name(substring(t.term from 2))
                                   ELSE
                                     make_standard_name(t.term)
                                   END)
                                 and class is null and country_code is null""",
                        (words, ))

            return [(r[0], r[1], r[2]) for r in cur]

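    # Illustration only (hypothetical tokens and word ids, assuming a
    # populated word table): a '#'-prefixed term is looked up as a full
    # name, a plain term as a partial name.
    #
    #   analyzer.get_word_token_info(['#Main Street', 'main'])
    #   # -> [('#Main Street', ' main street', 123), ('main', 'main', 456)]

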
    def normalize(self, phrase: str) -> str:
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return cast(str, self.normalizer.transliterate(phrase))


    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()

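    # For example, normalize_postcode(' se10 8xy ') returns 'SE10 8XY'.

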
    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None

        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            to_delete = []
            to_add = []

            for postcode, word in cur:
                if postcode is None:
                    to_delete.append(word)
                else:
                    to_add.append(postcode)

            if to_delete:
                cur.execute("""DELETE FROM word
                               WHERE class ='place' and type = 'postcode'
                                     and word = any(%s)
                            """, (to_delete, ))
            if to_add:
                cur.execute("""SELECT count(create_postcode_id(pc))
                               FROM unnest(%s) as pc
                            """, (to_add, ))


    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
        """
        assert self.conn is not None

        norm_phrases = set((self.normalize(p[0]), p[1], p[2], p[3])
                           for p in phrases)

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                cur.execute_values(
                    """ INSERT INTO word (word_id, word_token, word, class, type,
                                          search_name_count, operator)
                        (SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
                                class, type, 0,
                                CASE WHEN op in ('in', 'near') THEN op ELSE null END
                           FROM (VALUES %s) as v(name, class, type, op))""",
                    to_add)

            if to_delete and should_replace:
                cur.execute_values(
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))

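    # A minimal usage sketch (hypothetical phrases): each tuple is
    # (label, class, type, operator), where the operator is 'in', 'near'
    # or '-' for no operator, matching the comparison logic above.
    #
    #   with tokenizer.name_analyzer() as analyzer:
    #       analyzer.update_special_phrases(
    #           [('swimming pool', 'leisure', 'swimming_pool', '-'),
    #            ('restaurant in', 'amenity', 'restaurant', 'in')],
    #           should_replace=True)

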
    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add names for the given country to the search index.
        """
        assert self.conn is not None

        with self.conn.cursor() as cur:
            cur.execute(
                """INSERT INTO word (word_id, word_token, country_code)
                   (SELECT nextval('seq_word'), lookup_token, %s
                      FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
                            FROM unnest(%s) n) y
                      WHERE NOT EXISTS(SELECT * FROM word
                                       WHERE word_token = lookup_token and country_code = %s))
                """, (country_code, list(names.values()), country_code))


    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        assert self.conn is not None

        token_info = _TokenInfo(self._cache)

        names = place.name

        if names:
            token_info.add_names(self.conn, names)

            if place.is_country():
                assert place.country_code is not None
                self.add_country_names(place.country_code, names)

        address = place.address
        if address:
            self._process_place_address(token_info, address)

        return token_info.data

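    # Sketch of a possible return value (hypothetical token ids): depending
    # on the place, the dict may contain keys such as 'names', 'hnr',
    # 'hnr_tokens', 'postcode', 'street', 'place_search', 'place_match'
    # and 'addr' (see _TokenInfo below), e.g.
    #
    #   {'names': '{1,2,3}', 'hnr_tokens': '{45}', 'hnr': '12',
    #    'postcode': 'AB1 2CD', 'street': '{67}'}

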
    def _process_place_address(self, token_info: '_TokenInfo', address: Mapping[str, str]) -> None:
        assert self.conn is not None

        hnrs = []
        addr_terms = []

        for key, value in address.items():
            if key == 'postcode':
                # Make sure the normalized postcode is present in the word table.
                if re.search(r'[:,;]', value) is None:
                    norm_pc = self.normalize_postcode(value)
                    token_info.set_postcode(norm_pc)
                    self._cache.add_postcode(self.conn, norm_pc)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(self.conn, value)
            elif key == 'place':
                token_info.add_place(self.conn, value)
            elif not key.startswith('_') \
                 and key not in ('country', 'full', 'inclusion'):
                addr_terms.append((key, value))

        if hnrs:
            token_info.add_housenumbers(self.conn, hnrs)

        if addr_terms:
            token_info.add_address_terms(self.conn, addr_terms)


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache: '_TokenCache') -> None:
        self.cache = cache
        self.data: Dict[str, Any] = {}


    def add_names(self, conn: Connection, names: Mapping[str, str]) -> None:
        """ Add token information for the names of the place.
        """
        with conn.cursor() as cur:
            # Create the token IDs for all names.
            self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
                                            (names, ))


    def add_housenumbers(self, conn: Connection, hnrs: Sequence[str]) -> None:
        """ Extract housenumber information from the address.
        """
        if len(hnrs) == 1:
            token = self.cache.get_housenumber(hnrs[0])
            if token is not None:
                self.data['hnr_tokens'] = token
                self.data['hnr'] = hnrs[0]
                return

        # Split multi-value housenumbers if necessary.
        simple_list: List[str] = []
        for hnr in hnrs:
            simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

        if len(simple_list) > 1:
            simple_list = list(set(simple_list))

        with conn.cursor() as cur:
            cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
            result = cur.fetchone()
            assert result is not None
            self.data['hnr_tokens'], self.data['hnr'] = result

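    # For example, hnrs = ['1;2b', '3'] is split into ['1', '2b', '3']
    # before being handed to the SQL function create_housenumbers().

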
    def set_postcode(self, postcode: str) -> None:
        """ Set or replace the postcode token with the given value.
        """
        self.data['postcode'] = postcode


    def add_street(self, conn: Connection, street: str) -> None:
        """ Add addr:street match terms.
        """
        def _get_street(name: str) -> Optional[str]:
            with conn.cursor() as cur:
                return cast(Optional[str],
                            cur.scalar("SELECT word_ids_from_name(%s)::text", (name, )))

        tokens = self.cache.streets.get(street, _get_street)
        self.data['street'] = tokens or '{}'


    def add_place(self, conn: Connection, place: str) -> None:
        """ Add addr:place search and match terms.
        """
        def _get_place(name: str) -> Tuple[List[int], List[int]]:
            with conn.cursor() as cur:
                cur.execute("""SELECT make_keywords(hstore('name', %s))::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cast(Tuple[List[int], List[int]], cur.fetchone())

        self.data['place_search'], self.data['place_match'] = \
            self.cache.places.get(place, _get_place)


    def add_address_terms(self, conn: Connection, terms: Sequence[Tuple[str, str]]) -> None:
        """ Add additional address terms.
        """
        def _get_address_term(name: str) -> Tuple[List[int], List[int]]:
            with conn.cursor() as cur:
                cur.execute("""SELECT addr_ids_from_name(%s)::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cast(Tuple[List[int], List[int]], cur.fetchone())

        tokens = {}
        for key, value in terms:
            items = self.cache.address_terms.get(value, _get_address_term)
            if items[0] or items[1]:
                tokens[key] = items

        if tokens:
            self.data['addr'] = tokens


class _LRU:
    """ Least recently used cache that accepts a generator function to
        produce the item when there is a cache miss.
    """

    def __init__(self, maxsize: int = 128):
        self.data: 'OrderedDict[str, Any]' = OrderedDict()
        self.maxsize = maxsize


    def get(self, key: str, generator: Callable[[str], Any]) -> Any:
        """ Get the item with the given key from the cache. If nothing
            is found in the cache, generate the value through the
            generator function and store it in the cache.
        """
        value = self.data.get(key)
        if value is not None:
            self.data.move_to_end(key)
        else:
            value = generator(key)
            if len(self.data) >= self.maxsize:
                self.data.popitem(last=False)
            self.data[key] = value

        return value

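# A minimal usage sketch of _LRU; the generator is only invoked on a miss
# and the least recently used entry is evicted once maxsize is reached:
#
#   cache = _LRU(maxsize=2)
#   cache.get('a', str.upper)   # miss -> calls str.upper('a'), caches 'A'
#   cache.get('a', str.upper)   # hit  -> returns the cached 'A'

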
class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self, conn: Connection):
        # various LRU caches
        self.streets = _LRU(maxsize=256)
        self.places = _LRU(maxsize=128)
        self.address_terms = _LRU(maxsize=1024)

        # Look up housenumbers up to 100 and cache them.
        with conn.cursor() as cur:
            cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
                           FROM generate_series(1, 100) as i""")
            self._cached_housenumbers: Dict[str, str] = {str(r[0]): r[1] for r in cur}

        # For postcodes, remember the ones that have already been added.
        self.postcodes: Set[str] = set()


    def get_housenumber(self, number: str) -> Optional[str]:
        """ Get a housenumber token from the cache.
        """
        return self._cached_housenumbers.get(number)


    def add_postcode(self, conn: Connection, postcode: str) -> None:
        """ Make sure the given postcode is in the database.
        """
        if postcode not in self.postcodes:
            with conn.cursor() as cur:
                cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
            self.postcodes.add(postcode)