"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from collections import OrderedDict
import logging
import re
import shutil
from textwrap import dedent

from icu import Transliterator
import psycopg2
import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError

DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

LOG = logging.getLogger()

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyTokenizer(dsn, data_dir)

def _install_module(config_module_path, src_dir, module_dir):
    """ Copies the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.
    """
    # Custom module locations are simply used as is.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        return config_module_path

    # Compatibility mode for builddir installations.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return module_dir

    # In any other case install the module in the project directory.
    if not module_dir.exists():
        module_dir.mkdir()

    destfile = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
    destfile.chmod(0o755)

    LOG.info('Database module installed at %s', str(destfile))

    return module_dir

def _check_module(module_dir, conn):
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.
    """
    with conn.cursor() as cur:
        try:
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS '{}/nominatim.so', 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """.format(module_dir))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err

class LegacyTokenizer:
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config)

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)
            conn.commit()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

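    # Rough usage sketch for init_new_db() above; names outside this module are
    # illustrative, not part of the API described here:
    #
    #   tokenizer = create(dsn, project_dir / 'tokenizer')
    #   tokenizer.init_new_db(config)
    #
    # Passing init_db=False writes only the configuration and skips
    # update_sql_functions() and _init_db_tables().
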
    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
            modulepath = config.DATABASE_MODULE_PATH or \
                         str((config.project_dir / 'module').resolve())
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                              max_word_freq=max_word_freq,
                              modulepath=modulepath)

    def migrate_database(self, config):
        """ Initialise the project directory of an existing database for
            use with this tokenizer.

            This is a special migration function for updating existing databases
            to new software versions.
        """
        self.normalization = config.TERM_NORMALIZATION
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    ...

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        normalizer = Transliterator.createFromRules("phrase normalizer",
                                                    self.normalization)
        return LegacyNameAnalyzer(self.dsn, normalizer)

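    # Illustrative only: indexing code would typically request one analyzer per
    # worker thread and feed places through it, e.g.
    #
    #   with tokenizer.name_analyzer() as analyzer:
    #       token_info = analyzer.process_place(place)
    #
    # where `place` stands for the dict handed in by the indexer (a hypothetical
    # variable here); see LegacyNameAnalyzer.process_place() for the keys it reads.
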
    def _install_php(self, config):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
            @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
            require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
            """.format(config)))

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')

    def _save_config(self, conn, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)

class LegacyNameAnalyzer:
    """ The legacy analyzer uses the special PostgreSQL module for
        splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, normalizer):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
        psycopg2.extras.register_hstore(self.conn)

        self._cache = _TokenCache(self.conn)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def normalize(self, phrase):
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return self.normalizer.transliterate(phrase)

    def add_postcodes_from_db(self):
        """ Add postcodes from the location_postcode table to the word table.
        """
        with self.conn.cursor() as cur:
            cur.execute("""SELECT count(create_postcode_id(pc))
                           FROM (SELECT distinct(postcode) as pc
                                 FROM location_postcode) x""")

    def update_special_phrases(self, phrases):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                psycopg2.extras.execute_values(
                    cur,
                    """ INSERT INTO word (word_id, word_token, word, class, type,
                                          search_name_count, operator)
                        (SELECT nextval('seq_word'), make_standard_name(name), name,
                                class, type, 0,
                                CASE WHEN op in ('in', 'near') THEN op ELSE null END
                           FROM (VALUES %s) as v(name, class, type, op))""",
                    to_add)

            if to_delete:
                psycopg2.extras.execute_values(
                    cur,
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))

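    # Expected input shape, derived from how p[0]..p[3] are read above: each
    # phrase is a 4-tuple (label, class, type, operator), e.g. the illustrative
    #
    #   analyzer.update_special_phrases([('bar', 'amenity', 'bar', '-')])
    #
    # where '-' means "no operator" and only 'in'/'near' are kept as operators.
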
    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        with self.conn.cursor() as cur:
            cur.execute(
                """INSERT INTO word (word_id, word_token, country_code)
                   (SELECT nextval('seq_word'), lookup_token, %s
                      FROM (SELECT ' ' || make_standard_name(n) as lookup_token
                            FROM unnest(%s)n) y
                      WHERE NOT EXISTS(SELECT * FROM word
                                       WHERE word_token = lookup_token and country_code = %s))
                """, (country_code, names, country_code))

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            token_info.add_names(self.conn, names)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), list(names.values()))

        address = place.get('address')

        if address:
            hnrs = []
            addr_terms = []
            for key, value in address.items():
                if key == 'postcode':
                    self._add_postcode(value)
                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                    hnrs.append(value)
                elif key == 'street':
                    token_info.add_street(self.conn, value)
                elif key == 'place':
                    token_info.add_place(self.conn, value)
                elif not key.startswith('_') and \
                     key not in ('country', 'full'):
                    addr_terms.append((key, value))

            if hnrs:
                token_info.add_housenumbers(self.conn, hnrs)

            if addr_terms:
                token_info.add_address_terms(self.conn, addr_terms)

        return token_info.data

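    # The returned dict is filled by _TokenInfo below. Depending on the tags of
    # the place it may contain the keys 'names', 'hnr_tokens', 'hnr', 'street',
    # 'place_search', 'place_match' and 'addr'; this list mirrors the
    # assignments in _TokenInfo and is not a fixed schema.
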
    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        def _create_postcode_from_db(pcode):
            with self.conn.cursor() as cur:
                cur.execute('SELECT create_postcode_id(%s)', (pcode, ))

        if re.search(r'[:,;]', postcode) is None:
            self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)

365 """ Collect token information to be sent back to the database.
367 def __init__(self, cache):
372 def add_names(self, conn, names):
373 """ Add token information for the names of the place.
375 with conn.cursor() as cur:
376 # Create the token IDs for all names.
377 self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from the address.
        """
        if len(hnrs) == 1:
            token = self.cache.get_housenumber(hnrs[0])
            if token is not None:
                self.data['hnr_tokens'] = token
                self.data['hnr'] = hnrs[0]
                return

        # split numbers if necessary
        simple_list = []
        for hnr in hnrs:
            simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

        if len(simple_list) > 1:
            simple_list = list(set(simple_list))

        with conn.cursor() as cur:
            cur.execute("SELECT (create_housenumbers(%s)).* ", (simple_list, ))
            self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()

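    # Example of the splitting behaviour (illustrative values): hnrs == ['4;6', '8']
    # becomes ['4', '6', '8'] before being handed to the create_housenumbers()
    # SQL function, while a single plain number such as '12' is normally served
    # from the precomputed cache in _TokenCache.
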
    def add_street(self, conn, street):
        """ Add addr:street match terms.
        """
        def _get_street(name):
            with conn.cursor() as cur:
                return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))

        self.data['street'] = self.cache.streets.get(street, _get_street)

    def add_place(self, conn, place):
        """ Add addr:place search and match terms.
        """
        def _get_place(name):
            with conn.cursor() as cur:
                cur.execute("""SELECT (addr_ids_from_name(%s)
                                       || getorcreate_name_id(make_standard_name(%s), ''))::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name, name))
                return cur.fetchone()

        self.data['place_search'], self.data['place_match'] = \
            self.cache.places.get(place, _get_place)

    def add_address_terms(self, conn, terms):
        """ Add additional address terms.
        """
        def _get_address_term(name):
            with conn.cursor() as cur:
                cur.execute("""SELECT addr_ids_from_name(%s)::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cur.fetchone()

        tokens = {}
        for key, value in terms:
            tokens[key] = self.cache.address_terms.get(value, _get_address_term)

        self.data['addr'] = tokens

447 """ Least recently used cache that accepts a generator function to
448 produce the item when there is a cache miss.
451 def __init__(self, maxsize=128, init_data=None):
452 self.data = init_data or OrderedDict()
453 self.maxsize = maxsize
454 if init_data is not None and len(init_data) > maxsize:
455 self.maxsize = len(init_data)
457 def get(self, key, generator):
458 """ Get the item with the given key from the cache. If nothing
459 is found in the cache, generate the value through the
460 generator function and store it in the cache.
462 value = self.data.get(key)
463 if value is not None:
464 self.data.move_to_end(key)
466 value = generator(key)
467 if len(self.data) >= self.maxsize:
468 self.data.popitem(last=False)
469 self.data[key] = value
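# Minimal usage sketch for _LRU (the generator function is hypothetical):
#
#   cache = _LRU(maxsize=2)
#   cache.get('a', lambda key: key.upper())   # miss: generates and stores 'A'
#   cache.get('a', lambda key: key.upper())   # hit: returns the cached 'A'
#
# Note that a stored value of None is indistinguishable from a miss because
# get() tests the cached value with `is not None`.
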
475 """ Cache for token information to avoid repeated database queries.
477 This cache is not thread-safe and needs to be instantiated per
480 def __init__(self, conn):
482 self.streets = _LRU(maxsize=256)
483 self.places = _LRU(maxsize=128)
484 self.address_terms = _LRU(maxsize=1024)
486 # Lookup houseunumbers up to 100 and cache them
487 with conn.cursor() as cur:
488 cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
489 FROM generate_series(1, 100) as i""")
490 self._cached_housenumbers = {str(r[0]) : r[1] for r in cur}
492 # Get postcodes that are already saved
493 postcodes = OrderedDict()
494 with conn.cursor() as cur:
495 cur.execute("""SELECT word FROM word
496 WHERE class ='place' and type = 'postcode'""")
498 postcodes[row[0]] = None
499 self.postcodes = _LRU(maxsize=32, init_data=postcodes)
501 def get_housenumber(self, number):
502 """ Get a housenumber token from the cache.
504 return self._cached_housenumbers.get(number)
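    # Note: get_housenumber() only knows the tokens precomputed for '1'..'100'
    # in __init__; any other number returns None and is resolved later through
    # create_housenumbers() in _TokenInfo.add_housenumbers().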