"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from collections import OrderedDict
import logging
import re
import shutil
from textwrap import dedent

from icu import Transliterator
import psycopg2
import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

LOG = logging.getLogger()

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyTokenizer(dsn, data_dir)

def _install_module(config_module_path, src_dir, module_dir):
    """ Copies the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.
    """
    # Custom module locations are simply used as is.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        return config_module_path

    # Compatibility mode for builddir installations.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return module_dir

    # In any other case install the module in the project directory.
    if not module_dir.exists():
        module_dir.mkdir()

    destfile = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
    destfile.chmod(0o755)

    LOG.info('Database module installed at %s', str(destfile))

    return module_dir

def _check_module(module_dir, conn):
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.
    """
    with conn.cursor() as cur:
        try:
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS '{}/nominatim.so', 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """.format(module_dir))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err

class LegacyTokenizer(AbstractTokenizer):
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config)

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)
            conn.commit()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

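    # A minimal setup sketch (not part of the module). It assumes a libpq DSN
    # string `dsn`, a loaded Nominatim `config` object and a tokenizer data
    # directory; all three names are placeholders:
    #
    #     tokenizer = create(dsn, config.project_dir / 'tokenizer')
    #     tokenizer.init_new_db(config)           # install module, SQL and PHP
    #     tokenizer.update_sql_functions(config)  # re-run later, e.g. after an upgrade
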
    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)

    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
            modulepath = config.DATABASE_MODULE_PATH or \
                         str((config.project_dir / 'module').resolve())
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                              max_word_freq=max_word_freq,
                              modulepath=modulepath)

    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        hint = """\
             The PostgreSQL extension nominatim.so was not correctly loaded.

             Error: {error}

             Hints:
             * Check the output of the CMake/make installation step
             * Does nominatim.so exist?
             * Does nominatim.so exist on the database server?
             * Can nominatim.so be accessed by the database user?
             """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                try:
                    out = cur.scalar("SELECT make_standard_name('a')")
                except psycopg2.Error as err:
                    return hint.format(error=str(err))

        if out != 'a':
            return hint.format(error='Unexpected result for make_standard_name()')

        return None

    def migrate_database(self, config):
        """ Initialise the project directory of an existing database for
            use with this tokenizer.

            This is a special migration function for updating existing databases
            to new software versions.
        """
        self.normalization = config.TERM_NORMALIZATION
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure that
            close() is called before the analyzer is destructed.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        normalizer = Transliterator.createFromRules("phrase normalizer",
                                                    self.normalization)
        return LegacyNameAnalyzer(self.dsn, normalizer)

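    # Sketch of the non-context-manager lifecycle mentioned above (`place` is a
    # placeholder for an input object):
    #
    #     analyzer = tokenizer.name_analyzer()
    #     try:
    #         analyzer.process_place(place)
    #     finally:
    #         analyzer.close()
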
    def _install_php(self, config):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
            @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
            require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
            """.format(config)))

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')

    def _save_config(self, conn, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)

class LegacyNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the special PostgreSQL module for
        splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, normalizer):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
        psycopg2.extras.register_hstore(self.conn)

        self._cache = _TokenCache(self.conn)

    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        with self.conn.cursor() as cur:
            cur.execute("""SELECT t.term, word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = (CASE
                                   WHEN left(t.term, 1) = '#' THEN
                                     ' ' || make_standard_name(substring(t.term from 2))
                                   ELSE
                                     make_standard_name(t.term)
                                   END)
                                 and class is null and country_code is null""",
                        (words, ))

            return [(r[0], r[1], r[2]) for r in cur]

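    # Example sketch (assuming `analyzer` is a LegacyNameAnalyzer): a '#'-prefixed
    # term is looked up as a full name, a plain term as a partial name, and each
    # result is an (original word, word token, word id) tuple:
    #
    #     analyzer.get_word_token_info(['#Main Street', 'main'])
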
    def normalize(self, phrase):
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return self.normalizer.transliterate(phrase)

    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()

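    # For illustration: the Python side of the normalization is just strip and
    # uppercase, e.g.
    #
    #     LegacyNameAnalyzer.normalize_postcode(' ab1 2cd ')   # -> 'AB1 2CD'
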
    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        with self.conn.cursor() as cur:
            # This finds the rows in location_postcode and word that are
            # missing from the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            to_delete = []
            to_add = []

            for postcode, word in cur:
                if postcode is None:
                    to_delete.append(word)
                else:
                    to_add.append(postcode)

            if to_delete:
                cur.execute("""DELETE FROM WORD
                               WHERE class ='place' and type = 'postcode'
                                     and word = any(%s)
                            """, (to_delete, ))
            if to_add:
                cur.execute("""SELECT count(create_postcode_id(pc))
                               FROM unnest(%s) as pc
                            """, (to_add, ))

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                cur.execute_values(
                    """ INSERT INTO word (word_id, word_token, word, class, type,
                                          search_name_count, operator)
                        (SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
                                class, type, 0,
                                CASE WHEN op in ('in', 'near') THEN op ELSE null END
                           FROM (VALUES %s) as v(name, class, type, op))""",
                    to_add)

            if to_delete and should_replace:
                cur.execute_values(
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))

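    # Input sketch: each phrase is a (label, class, type, operator) tuple, where
    # the operator is 'in', 'near' or '-' for none (values below are illustrative
    # only):
    #
    #     analyzer.update_special_phrases(
    #         [('restaurant', 'amenity', 'restaurant', '-'),
    #          ('hotel in', 'tourism', 'hotel', 'in')],
    #         should_replace=True)
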
    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        with self.conn.cursor() as cur:
            cur.execute(
                """INSERT INTO word (word_id, word_token, country_code)
                   (SELECT nextval('seq_word'), lookup_token, %s
                      FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
                              FROM unnest(%s) n) y
                      WHERE NOT EXISTS(SELECT * FROM word
                                       WHERE word_token = lookup_token and country_code = %s))
                """, (country_code, list(names.values()), country_code))

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            token_info.add_names(self.conn, names)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), names)

        address = place.get('address')
        if address:
            self._process_place_address(token_info, address)

        return token_info.data

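    # Sketch of input and output (keys follow the code above, values are
    # illustrative only):
    #
    #     place = {'name': {'name': 'Main Street'},
    #              'address': {'postcode': '12345', 'housenumber': '3'}}
    #     token_info = analyzer.process_place(place)
    #     # token_info is a JSON-serialisable dict with entries such as
    #     # 'names', 'hnr' and 'hnr_tokens' filled in by _TokenInfo.
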
    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []

        for key, value in address.items():
            if key == 'postcode':
                # Make sure the normalized postcode is present in the word table.
                if re.search(r'[:,;]', value) is None:
                    self._cache.add_postcode(self.conn,
                                             self.normalize_postcode(value))
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(self.conn, value)
            elif key == 'place':
                token_info.add_place(self.conn, value)
            elif not key.startswith('_') and key not in ('country', 'full'):
                addr_terms.append((key, value))

        if hnrs:
            token_info.add_housenumbers(self.conn, hnrs)

        if addr_terms:
            token_info.add_address_terms(self.conn, addr_terms)

452 """ Collect token information to be sent back to the database.
454 def __init__(self, cache):
459 def add_names(self, conn, names):
460 """ Add token information for the names of the place.
462 with conn.cursor() as cur:
463 # Create the token IDs for all names.
464 self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from the address.
        """
        if len(hnrs) == 1:
            token = self.cache.get_housenumber(hnrs[0])
            if token is not None:
                self.data['hnr_tokens'] = token
                self.data['hnr'] = hnrs[0]
                return

        # split numbers if necessary
        simple_list = []
        for hnr in hnrs:
            simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

        if len(simple_list) > 1:
            simple_list = list(set(simple_list))

        with conn.cursor() as cur:
            cur.execute("SELECT (create_housenumbers(%s)).* ", (simple_list, ))
            self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()

    def add_street(self, conn, street):
        """ Add addr:street match terms.
        """
        def _get_street(name):
            with conn.cursor() as cur:
                return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))

        self.data['street'] = self.cache.streets.get(street, _get_street)

    def add_place(self, conn, place):
        """ Add addr:place search and match terms.
        """
        def _get_place(name):
            with conn.cursor() as cur:
                cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cur.fetchone()

        self.data['place_search'], self.data['place_match'] = \
            self.cache.places.get(place, _get_place)

    def add_address_terms(self, conn, terms):
        """ Add additional address terms.
        """
        def _get_address_term(name):
            with conn.cursor() as cur:
                cur.execute("""SELECT addr_ids_from_name(%s)::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cur.fetchone()

        tokens = {}
        for key, value in terms:
            tokens[key] = self.cache.address_terms.get(value, _get_address_term)

        self.data['addr'] = tokens

533 """ Least recently used cache that accepts a generator function to
534 produce the item when there is a cache miss.
537 def __init__(self, maxsize=128, init_data=None):
538 self.data = init_data or OrderedDict()
539 self.maxsize = maxsize
540 if init_data is not None and len(init_data) > maxsize:
541 self.maxsize = len(init_data)
543 def get(self, key, generator):
544 """ Get the item with the given key from the cache. If nothing
545 is found in the cache, generate the value through the
546 generator function and store it in the cache.
548 value = self.data.get(key)
549 if value is not None:
550 self.data.move_to_end(key)
552 value = generator(key)
553 if len(self.data) >= self.maxsize:
554 self.data.popitem(last=False)
555 self.data[key] = value
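# A minimal usage sketch for the cache (not part of the module): the generator
# is only called on a miss; once maxsize entries are stored, the least recently
# used one is evicted.
#
#     cache = _LRU(maxsize=2)
#     cache.get('a', lambda key: key.upper())   # miss -> generator returns 'A'
#     cache.get('a', lambda key: key.upper())   # hit -> cached 'A', no generator call
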
561 """ Cache for token information to avoid repeated database queries.
563 This cache is not thread-safe and needs to be instantiated per
566 def __init__(self, conn):
568 self.streets = _LRU(maxsize=256)
569 self.places = _LRU(maxsize=128)
570 self.address_terms = _LRU(maxsize=1024)
572 # Lookup houseunumbers up to 100 and cache them
573 with conn.cursor() as cur:
574 cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
575 FROM generate_series(1, 100) as i""")
576 self._cached_housenumbers = {str(r[0]): r[1] for r in cur}
578 # For postcodes remember the ones that have already been added
579 self.postcodes = set()
    def get_housenumber(self, number):
        """ Get a housenumber token from the cache.
        """
        return self._cached_housenumbers.get(number)

    def add_postcode(self, conn, postcode):
        """ Make sure the given postcode is in the database.
        """
        if postcode not in self.postcodes:
            with conn.cursor() as cur:
                cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
            self.postcodes.add(postcode)