"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from collections import OrderedDict
import logging
import re
import shutil
from textwrap import dedent

from icu import Transliterator
import psycopg2
import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError

DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyTokenizer(dsn, data_dir)


def _install_module(config_module_path, src_dir, module_dir):
    """ Copies the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.
    """
    # Custom module locations are simply used as is.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        return config_module_path

    # Compatibility mode for builddir installations.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return module_dir

    # In any other case install the module in the project directory.
    if not module_dir.exists():
        module_dir.mkdir()

    destfile = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
    destfile.chmod(0o755)

    LOG.info('Database module installed at %s', str(destfile))

    return module_dir


def _check_module(module_dir, conn):
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.
    """
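    # The check creates a throwaway SQL function backed by nominatim.so and
    # immediately drops it again; if PostgreSQL cannot load the library,
    # the CREATE FUNCTION statement fails.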
    with conn.cursor() as cur:
        try:
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS '{}/nominatim.so', 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """.format(module_dir))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err


class LegacyTokenizer:
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config)

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)
            conn.commit()
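
        # When init_db is set, also create the SQL functions and the
        # pre-computed word table right away.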
        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
            modulepath = config.DATABASE_MODULE_PATH or \
                         str((config.project_dir / 'module').resolve())
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                              max_word_freq=max_word_freq,
                              modulepath=modulepath)


    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        hint = """\
             The PostgreSQL extension nominatim.so was not correctly loaded.

             Error: {error}

             Hints:
             * Check the output of the CMake/make installation step
             * Does nominatim.so exist?
             * Does nominatim.so exist on the database server?
             * Can nominatim.so be accessed by the database user?
             """
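        # Probe the tokenizer functions: make_standard_name() relies on
        # nominatim.so and normalising 'a' is expected to return 'a'.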
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                try:
                    out = cur.scalar("SELECT make_standard_name('a')")
                except psycopg2.Error as err:
                    return hint.format(error=str(err))

        if out != 'a':
            return hint.format(error='Unexpected result for make_standard_name()')

        return None


    def migrate_database(self, config):
        """ Initialise the project directory of an existing database for
            use with this tokenizer.

            This is a special migration function for updating existing databases
            to new software versions.
        """
        self.normalization = config.TERM_NORMALIZATION
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
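        # Build the ICU transliterator from the normalization rules stored
        # for this tokenizer.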
        normalizer = Transliterator.createFromRules("phrase normalizer",
                                                    self.normalization)
        return LegacyNameAnalyzer(self.dsn, normalizer)


    def _install_php(self, config):
        """ Install the php script for the tokenizer.
        """
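        # Write a small PHP wrapper that defines the tokenizer constants and
        # pulls in the legacy tokenizer implementation from the PHP library.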
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
            @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
            require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
            """.format(config)))


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')


    def _save_config(self, conn, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)


class LegacyNameAnalyzer:
    """ The legacy analyzer uses the special PostgreSQL module for
        splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, normalizer):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
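        # Register the hstore type so that Python dicts can be passed
        # directly as query parameters.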
        psycopg2.extras.register_hstore(self.conn)

        self._cache = _TokenCache(self.conn)


    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    @staticmethod
    def get_word_token_info(conn, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        with conn.cursor() as cur:
            cur.execute("""SELECT t.term, word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = (CASE
                                   WHEN left(t.term, 1) = '#' THEN
                                     ' ' || make_standard_name(substring(t.term from 2))
                                   ELSE
                                     make_standard_name(t.term)
                                   END)
                                 and class is null and country_code is null""",
                        (words, ))

            return [(r[0], r[1], r[2]) for r in cur]


    def normalize(self, phrase):
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return self.normalizer.transliterate(phrase)


    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            to_delete = []
            to_add = []

            for postcode, word in cur:
                if postcode is None:
                    to_delete.append(word)
                else:
                    to_add.append(postcode)
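
            # Remove postcode words that no longer have a matching row in
            # location_postcode and create tokens for newly added postcodes.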
            if to_delete:
                cur.execute("""DELETE FROM WORD
                               WHERE class ='place' and type = 'postcode'
                                     and word = any(%s)
                            """, (to_delete, ))
            if to_add:
                cur.execute("""SELECT count(create_postcode_id(pc))
                               FROM unnest(%s) as pc
                            """, (to_add, ))


    def update_special_phrases(self, phrases):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases
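
            # Apply the difference: insert phrases that are new and delete
            # the ones that are no longer in the given phrase list.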
            if to_add:
                psycopg2.extras.execute_values(
                    cur,
                    """ INSERT INTO word (word_id, word_token, word, class, type,
                                          search_name_count, operator)
                        (SELECT nextval('seq_word'), make_standard_name(name), name,
                                class, type, 0,
                                CASE WHEN op in ('in', 'near') THEN op ELSE null END
                           FROM (VALUES %s) as v(name, class, type, op))""",
                    to_add)
            if to_delete:
                psycopg2.extras.execute_values(
                    cur,
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
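        # Insert a token for each name unless an identical token already
        # exists for this country code.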
        with self.conn.cursor() as cur:
            cur.execute(
                """INSERT INTO word (word_id, word_token, country_code)
                   (SELECT nextval('seq_word'), lookup_token, %s
                      FROM (SELECT ' ' || make_standard_name(n) as lookup_token
                            FROM unnest(%s) n) y
                      WHERE NOT EXISTS(SELECT * FROM word
                                       WHERE word_token = lookup_token and country_code = %s))
                """, (country_code, names, country_code))


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            token_info.add_names(self.conn, names)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), list(names.values()))

        address = place.get('address')
        if address:
            hnrs = []
            addr_terms = []
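
            # Dispatch each address part to its specific handler; any other
            # non-internal key is collected as a generic address term.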
            for key, value in address.items():
                if key == 'postcode':
                    self._add_postcode(value)
                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                    hnrs.append(value)
                elif key == 'street':
                    token_info.add_street(self.conn, value)
                elif key == 'place':
                    token_info.add_place(self.conn, value)
                elif not key.startswith('_') and \
                     key not in ('country', 'full'):
                    addr_terms.append((key, value))

            if hnrs:
                token_info.add_housenumbers(self.conn, hnrs)

            if addr_terms:
                token_info.add_address_terms(self.conn, addr_terms)

        return token_info.data


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        def _create_postcode_from_db(pcode):
            with self.conn.cursor() as cur:
                cur.execute('SELECT create_postcode_id(%s)', (pcode, ))
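
        # Postcodes containing the separators ':', ',' or ';' are skipped;
        # all others are normalized and created on first use via the cache.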
        if re.search(r'[:,;]', postcode) is None:
            self._cache.postcodes.get(self.normalize_postcode(postcode),
                                      _create_postcode_from_db)


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self.cache = cache
        self.data = {}


    def add_names(self, conn, names):
        """ Add token information for the names of the place.
        """
        with conn.cursor() as cur:
            # Create the token IDs for all names.
            self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
                                            (names, ))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from the address.
        """
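        # For a single housenumber, try the precomputed token cache first.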
        if len(hnrs) == 1:
            token = self.cache.get_housenumber(hnrs[0])
            if token is not None:
                self.data['hnr_tokens'] = token
                self.data['hnr'] = hnrs[0]
                return

        # split numbers if necessary
        simple_list = []
        for hnr in hnrs:
            simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

        if len(simple_list) > 1:
            simple_list = list(set(simple_list))

        with conn.cursor() as cur:
            cur.execute("SELECT (create_housenumbers(%s)).* ", (simple_list, ))
            self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()


    def add_street(self, conn, street):
        """ Add addr:street match terms.
        """
        def _get_street(name):
            with conn.cursor() as cur:
                return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))

        self.data['street'] = self.cache.streets.get(street, _get_street)


    def add_place(self, conn, place):
        """ Add addr:place search and match terms.
        """
        def _get_place(name):
            with conn.cursor() as cur:
                cur.execute("""SELECT (addr_ids_from_name(%s)
                                       || getorcreate_name_id(make_standard_name(%s), ''))::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name, name))
                return cur.fetchone()

        self.data['place_search'], self.data['place_match'] = \
            self.cache.places.get(place, _get_place)


    def add_address_terms(self, conn, terms):
        """ Add additional address terms.
        """
        def _get_address_term(name):
            with conn.cursor() as cur:
                cur.execute("""SELECT addr_ids_from_name(%s)::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cur.fetchone()

        tokens = {}
        for key, value in terms:
            tokens[key] = self.cache.address_terms.get(value, _get_address_term)

        self.data['addr'] = tokens


class _LRU:
    """ Least recently used cache that accepts a generator function to
        produce the item when there is a cache miss.
    """

    def __init__(self, maxsize=128, init_data=None):
        self.data = init_data or OrderedDict()
        self.maxsize = maxsize
        if init_data is not None and len(init_data) > maxsize:
            self.maxsize = len(init_data)

    def get(self, key, generator):
        """ Get the item with the given key from the cache. If nothing
            is found in the cache, generate the value through the
            generator function and store it in the cache.
        """
        value = self.data.get(key)
        if value is not None:
            self.data.move_to_end(key)
        else:
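            # Cache miss: generate the value and evict the least recently
            # used entry once the cache is full.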
            value = generator(key)
            if len(self.data) >= self.maxsize:
                self.data.popitem(last=False)
            self.data[key] = value

        return value


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self, conn):
        self.streets = _LRU(maxsize=256)
        self.places = _LRU(maxsize=128)
        self.address_terms = _LRU(maxsize=1024)

        # Look up housenumbers up to 100 and cache them
        with conn.cursor() as cur:
            cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
                           FROM generate_series(1, 100) as i""")
            self._cached_housenumbers = {str(r[0]) : r[1] for r in cur}

        # Get postcodes that are already saved
        postcodes = OrderedDict()
        with conn.cursor() as cur:
            cur.execute("""SELECT word FROM word
                           WHERE class ='place' and type = 'postcode'""")
            for row in cur:
                postcodes[row[0]] = None
        self.postcodes = _LRU(maxsize=32, init_data=postcodes)


    def get_housenumber(self, number):
        """ Get a housenumber token from the cache.
        """
        return self._cached_housenumbers.get(number)