"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from collections import OrderedDict
import logging
import re
import shutil
from textwrap import dedent

from icu import Transliterator
import psycopg2
import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError

DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyTokenizer(dsn, data_dir)
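
# Illustrative usage (assumption, for orientation only): the factory above is
# invoked by Nominatim's tokenizer loading code roughly as
#   tokenizer = create('dbname=nominatim', project_dir / 'tokenizer')
# with the dsn and the data directory taken from the project configuration.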


def _install_module(config_module_path, src_dir, module_dir):
    """ Copies the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.
    """
    # Custom module locations are simply used as is.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        return config_module_path

    # Compatibility mode for builddir installations.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return module_dir

    # In any other case install the module in the project directory.
    if not module_dir.exists():
        module_dir.mkdir()

    destfile = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
    destfile.chmod(0o755)

    LOG.info('Database module installed at %s', str(destfile))

    return module_dir


def _check_module(module_dir, conn):
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.
    """
    with conn.cursor() as cur:
        try:
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS '{}/nominatim.so', 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """.format(module_dir))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err


class LegacyTokenizer:
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config)

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)
            conn.commit()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
            modulepath = config.DATABASE_MODULE_PATH or \
                         str((config.project_dir / 'module').resolve())
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                              max_word_freq=max_word_freq,
                              modulepath=modulepath)


    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        hint = """\
             The PostgreSQL extension nominatim.so was not correctly loaded.

             Error: {error}

             Hints:
             * Check the output of the CMake/make installation step
             * Does nominatim.so exist?
             * Does nominatim.so exist on the database server?
             * Can nominatim.so be accessed by the database user?
             """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                try:
                    out = cur.scalar("SELECT make_standard_name('a')")
                except psycopg2.Error as err:
                    return hint.format(error=str(err))

        if out != 'a':
            return hint.format(error='Unexpected result for make_standard_name()')

        return None
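
    # Note: check_database() returns None when make_standard_name() behaves as
    # expected and the formatted hint string above when the module cannot be
    # used, so callers can show the hint to the user directly.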


    def migrate_database(self, config):
        """ Initialise the project directory of an existing database for
            use with this tokenizer.

            This is a special migration function for updating existing databases
            to new software versions.
        """
        self.normalization = config.TERM_NORMALIZATION
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        normalizer = Transliterator.createFromRules("phrase normalizer",
                                                    self.normalization)
        return LegacyNameAnalyzer(self.dsn, normalizer)


    def _install_php(self, config):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
            @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
            require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
            """.format(config)))


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')


    def _save_config(self, conn, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)


class LegacyNameAnalyzer:
    """ The legacy analyzer uses the special PostgreSQL module for
        splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, normalizer):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
        psycopg2.extras.register_hstore(self.conn)

        self._cache = _TokenCache(self.conn)


    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def normalize(self, phrase):
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return self.normalizer.transliterate(phrase)
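
    # Illustrative example: with typical normalization rules a call like
    # analyzer.normalize('Main Street') is expected to return a lowercased
    # form such as 'main street'; the exact output depends entirely on the
    # TERM_NORMALIZATION rules the tokenizer was configured with.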


    def add_postcodes_from_db(self):
        """ Add postcodes from the location_postcode table to the word table.
        """
        with self.conn.cursor() as cur:
            cur.execute("""SELECT count(create_postcode_id(pc))
                           FROM (SELECT distinct(postcode) as pc
                                 FROM location_postcode) x""")


    def update_special_phrases(self, phrases):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                psycopg2.extras.execute_values(
                    cur,
                    """ INSERT INTO word (word_id, word_token, word, class, type,
                                          search_name_count, operator)
                        (SELECT nextval('seq_word'), make_standard_name(name), name,
                                class, type, 0,
                                CASE WHEN op in ('in', 'near') THEN op ELSE null END
                           FROM (VALUES %s) as v(name, class, type, op))""",
                    to_add)

            if to_delete:
                psycopg2.extras.execute_values(
                    cur,
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))
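
    # Assumed input format (illustrative): each entry in `phrases` is a 4-tuple
    # of (label, class, type, operator), e.g. ('bar', 'amenity', 'bar', '-'),
    # matching the word table columns compared above.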


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        with self.conn.cursor() as cur:
            cur.execute(
                """INSERT INTO word (word_id, word_token, country_code)
                   (SELECT nextval('seq_word'), lookup_token, %s
                      FROM (SELECT ' ' || make_standard_name(n) as lookup_token
                            FROM unnest(%s)n) y
                      WHERE NOT EXISTS(SELECT * FROM word
                                       WHERE word_token = lookup_token and country_code = %s))
                """, (country_code, names, country_code))


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            token_info.add_names(self.conn, names)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), list(names.values()))

        address = place.get('address')

        if address:
            hnrs = []
            addr_terms = []
            for key, value in address.items():
                if key == 'postcode':
                    self._add_postcode(value)
                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                    hnrs.append(value)
                elif key == 'street':
                    token_info.add_street(self.conn, value)
                elif key == 'place':
                    token_info.add_place(self.conn, value)
                elif not key.startswith('_') and key not in ('country', 'full'):
                    addr_terms.append((key, value))

            if hnrs:
                token_info.add_housenumbers(self.conn, hnrs)

            if addr_terms:
                token_info.add_address_terms(self.conn, addr_terms)

        return token_info.data
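
    # Illustrative example (assumed shapes): for an input like
    #   {'name': {'name': 'Main Street'}, 'address': {'housenumber': '1;2'}}
    # the returned dict may contain keys such as 'names', 'hnr_tokens' and
    # 'hnr', which end up in the database via the token_info field.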


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        def _create_postcode_from_db(pcode):
            with self.conn.cursor() as cur:
                cur.execute('SELECT create_postcode_id(%s)', (pcode, ))

        if re.search(r'[:,;]', postcode) is None:
            self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)
392 """ Collect token information to be sent back to the database.
394 def __init__(self, cache):
399 def add_names(self, conn, names):
400 """ Add token information for the names of the place.
402 with conn.cursor() as cur:
403 # Create the token IDs for all names.
404 self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from the address.
        """
        if len(hnrs) == 1:
            token = self.cache.get_housenumber(hnrs[0])
            if token is not None:
                self.data['hnr_tokens'] = token
                self.data['hnr'] = hnrs[0]
                return

        # split numbers if necessary
        simple_list = []
        for hnr in hnrs:
            simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

        if len(simple_list) > 1:
            simple_list = list(set(simple_list))

        with conn.cursor() as cur:
            cur.execute("SELECT (create_housenumbers(%s)).* ", (simple_list, ))
            self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
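
    # Illustrative example: a value like '1;2, 2' is split on ';' and ',' into
    # ['1', '2', '2'], deduplicated to two entries and then passed to
    # create_housenumbers() as a single list.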


    def add_street(self, conn, street):
        """ Add addr:street match terms.
        """
        def _get_street(name):
            with conn.cursor() as cur:
                return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))

        self.data['street'] = self.cache.streets.get(street, _get_street)


    def add_place(self, conn, place):
        """ Add addr:place search and match terms.
        """
        def _get_place(name):
            with conn.cursor() as cur:
                cur.execute("""SELECT (addr_ids_from_name(%s)
                                       || getorcreate_name_id(make_standard_name(%s), ''))::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name, name))
                return cur.fetchone()

        self.data['place_search'], self.data['place_match'] = \
            self.cache.places.get(place, _get_place)


    def add_address_terms(self, conn, terms):
        """ Add additional address terms.
        """
        def _get_address_term(name):
            with conn.cursor() as cur:
                cur.execute("""SELECT addr_ids_from_name(%s)::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cur.fetchone()

        tokens = {}
        for key, value in terms:
            tokens[key] = self.cache.address_terms.get(value, _get_address_term)

        self.data['addr'] = tokens
474 """ Least recently used cache that accepts a generator function to
475 produce the item when there is a cache miss.
478 def __init__(self, maxsize=128, init_data=None):
479 self.data = init_data or OrderedDict()
480 self.maxsize = maxsize
481 if init_data is not None and len(init_data) > maxsize:
482 self.maxsize = len(init_data)
484 def get(self, key, generator):
485 """ Get the item with the given key from the cache. If nothing
486 is found in the cache, generate the value through the
487 generator function and store it in the cache.
489 value = self.data.get(key)
490 if value is not None:
491 self.data.move_to_end(key)
493 value = generator(key)
494 if len(self.data) >= self.maxsize:
495 self.data.popitem(last=False)
496 self.data[key] = value
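
    # Illustrative usage:
    #   cache = _LRU(maxsize=2)
    #   cache.get('a', compute)   # miss: compute('a') is called and stored
    #   cache.get('a', compute)   # hit: the cached value is returned
    # Once the cache is full, the least recently used entry is evicted.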
502 """ Cache for token information to avoid repeated database queries.
504 This cache is not thread-safe and needs to be instantiated per
507 def __init__(self, conn):
509 self.streets = _LRU(maxsize=256)
510 self.places = _LRU(maxsize=128)
511 self.address_terms = _LRU(maxsize=1024)
513 # Lookup houseunumbers up to 100 and cache them
514 with conn.cursor() as cur:
515 cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
516 FROM generate_series(1, 100) as i""")
517 self._cached_housenumbers = {str(r[0]) : r[1] for r in cur}
519 # Get postcodes that are already saved
520 postcodes = OrderedDict()
521 with conn.cursor() as cur:
522 cur.execute("""SELECT word FROM word
523 WHERE class ='place' and type = 'postcode'""")
525 postcodes[row[0]] = None
526 self.postcodes = _LRU(maxsize=32, init_data=postcodes)
528 def get_housenumber(self, number):
529 """ Get a housenumber token from the cache.
531 return self._cached_housenumbers.get(number)