"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from collections import OrderedDict
import logging
import re
import shutil
from textwrap import dedent

from icu import Transliterator
import psycopg2
import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

LOG = logging.getLogger()

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyTokenizer(dsn, data_dir)
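
# Illustrative sketch (assumption, not from the original source): tokenizer modules
# are loaded dynamically and are expected to expose this create() entry point. The
# dsn and directory below are placeholder values.
#
#   from nominatim.tokenizer import legacy_tokenizer
#
#   tokenizer = legacy_tokenizer.create('dbname=nominatim', project_dir / 'tokenizer')
#   tokenizer.init_from_project(None)   # load settings of an existing database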

def _install_module(config_module_path, src_dir, module_dir):
    """ Copies the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.
    """
    # Custom module locations are simply used as is.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        return config_module_path

    # Compatibility mode for builddir installations.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return module_dir

    # In any other case install the module in the project directory.
    if not module_dir.exists():
        module_dir.mkdir()

    destfile = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))

    LOG.info('Database module installed at %s', str(destfile))
    return module_dir

def _check_module(module_dir, conn):
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.
    """
    with conn.cursor() as cur:
        try:
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS '{}/nominatim.so', 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """.format(module_dir))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err

class LegacyTokenizer(AbstractTokenizer):
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config)

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)
            conn.commit()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self, _):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)

    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
            modulepath = config.DATABASE_MODULE_PATH or \
                         str((config.project_dir / 'module').resolve())
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                              max_word_freq=max_word_freq,
                              modulepath=modulepath)

    def check_database(self, _):
        """ Check that the tokenizer is set up correctly.
        """
        hint = """\
             The PostgreSQL extension nominatim.so was not correctly loaded.

             Error: {error}

             Hints:
             * Check the output of the CMake/make installation step
             * Does nominatim.so exist?
             * Does nominatim.so exist on the database server?
             * Can nominatim.so be accessed by the database user?
             """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                try:
                    out = cur.scalar("SELECT make_standard_name('a')")
                except psycopg2.Error as err:
                    return hint.format(error=str(err))

        if out != 'a':
            return hint.format(error='Unexpected result for make_standard_name()')

        return None

    def migrate_database(self, config):
        """ Initialise the project directory of an existing database for
            use with this tokenizer.

            This is a special migration function for updating existing databases
            to new software versions.
        """
        self.normalization = config.TERM_NORMALIZATION
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)

    def update_statistics(self):
        """ Recompute the frequency of full words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word SET search_name_count = count
                                   FROM word_frequencies
                                   WHERE word_token like ' %' and word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        normalizer = Transliterator.createFromRules("phrase normalizer",
                                                    self.normalization)
        return LegacyNameAnalyzer(self.dsn, normalizer)

    def _install_php(self, config):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
            @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
            require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
            """.format(config)))

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')

    def _save_config(self, conn, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)

class LegacyNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the special PostgreSQL module for
        splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, normalizer):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
        psycopg2.extras.register_hstore(self.conn)

        self._cache = _TokenCache(self.conn)

    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        with self.conn.cursor() as cur:
            cur.execute("""SELECT t.term, word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = (CASE
                                   WHEN left(t.term, 1) = '#' THEN
                                     ' ' || make_standard_name(substring(t.term from 2))
                                   ELSE
                                     make_standard_name(t.term)
                                   END)
                                 and class is null and country_code is null""",
                        (words, ))

            return [(r[0], r[1], r[2]) for r in cur]
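
    # Illustrative sketch (assumption, not from the original source): querying token
    # information for one full name (prefixed with '#') and one partial term. The
    # returned tokens and ids depend entirely on the contents of the word table.
    #
    #   with tokenizer.name_analyzer() as analyzer:
    #       info = analyzer.get_word_token_info(['#Main Street', 'main'])
    #       # e.g. [('#Main Street', ' main street', 1234), ('main', 'main', 567)]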

    def normalize(self, phrase):
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return self.normalizer.transliterate(phrase)

    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
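
    # Example (sketch): normalization simply trims whitespace and upper-cases,
    # so ' se1 9pz ' becomes 'SE1 9PZ'.
    #
    #   LegacyNameAnalyzer.normalize_postcode(' se1 9pz ')   # -> 'SE1 9PZ'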

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            to_delete = []
            to_add = []

            for postcode, word in cur:
                if postcode is None:
                    to_delete.append(word)
                else:
                    to_add.append(postcode)

            if to_delete:
                cur.execute("""DELETE FROM WORD
                               WHERE class ='place' and type = 'postcode'
                                     and word = any(%s)
                            """, (to_delete, ))
            if to_add:
                cur.execute("""SELECT count(create_postcode_id(pc))
                               FROM unnest(%s) as pc
                            """, (to_add, ))

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                cur.execute_values(
                    """ INSERT INTO word (word_id, word_token, word, class, type,
                                          search_name_count, operator)
                        (SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
                                class, type, 0,
                                CASE WHEN op in ('in', 'near') THEN op ELSE null END
                           FROM (VALUES %s) as v(name, class, type, op))""",
                    to_add)

            if to_delete and should_replace:
                cur.execute_values(
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))
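
    # Illustrative sketch (assumption, not from the original source): each phrase is a
    # 4-tuple of (label, class, type, operator), matching how p[0]..p[3] are used above.
    #
    #   phrases = [('Restaurant', 'amenity', 'restaurant', '-'),
    #              ('Hotels in', 'tourism', 'hotel', 'in')]
    #   with tokenizer.name_analyzer() as analyzer:
    #       analyzer.update_special_phrases(phrases, should_replace=True)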

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        with self.conn.cursor() as cur:
            cur.execute(
                """INSERT INTO word (word_id, word_token, country_code)
                   (SELECT nextval('seq_word'), lookup_token, %s
                      FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
                            FROM unnest(%s) n) y
                      WHERE NOT EXISTS(SELECT * FROM word
                                       WHERE word_token = lookup_token and country_code = %s))
                """, (country_code, list(names.values()), country_code))
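
    # Illustrative sketch (assumption): 'names' is the name dictionary of the country
    # place, e.g. OSM name tags keyed by language suffix. Only the values are indexed.
    #
    #   with tokenizer.name_analyzer() as analyzer:
    #       analyzer.add_country_names('de', {'name': 'Deutschland', 'name:en': 'Germany'})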

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.name

        if names:
            token_info.add_names(self.conn, names)

            if place.is_country():
                self.add_country_names(place.country_code, names)

        address = place.address
        if address:
            self._process_place_address(token_info, address)

        return token_info.data
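
    # Illustrative sketch (assumption, not from the original source) of the kind of
    # structure returned for a place with a name, a house number and a street:
    #
    #   {'names': '{123,456}',       # from make_keywords()
    #    'hnr_tokens': '{789}',      # from create_housenumbers()
    #    'hnr': '12',
    #    'street': '{321}'}          # from word_ids_from_name()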

    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []

        for key, value in address.items():
            if key == 'postcode':
                # Make sure the normalized postcode is present in the word table.
                if re.search(r'[:,;]', value) is None:
                    self._cache.add_postcode(self.conn,
                                             self.normalize_postcode(value))
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(self.conn, value)
            elif key == 'place':
                token_info.add_place(self.conn, value)
            elif not key.startswith('_') and key not in ('country', 'full'):
                addr_terms.append((key, value))

        if hnrs:
            token_info.add_housenumbers(self.conn, hnrs)

        if addr_terms:
            token_info.add_address_terms(self.conn, addr_terms)

class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self.cache = cache
        self.data = {}

    def add_names(self, conn, names):
        """ Add token information for the names of the place.
        """
        with conn.cursor() as cur:
            # Create the token IDs for all names.
            self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
                                            (names, ))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from the address.
        """
        if len(hnrs) == 1:
            token = self.cache.get_housenumber(hnrs[0])
            if token is not None:
                self.data['hnr_tokens'] = token
                self.data['hnr'] = hnrs[0]
                return

        # split numbers if necessary
        simple_list = []
        for hnr in hnrs:
            simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

        if len(simple_list) > 1:
            simple_list = list(set(simple_list))

        with conn.cursor() as cur:
            cur.execute("SELECT (create_housenumbers(%s)).* ", (simple_list, ))
            self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
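
    # Illustrative sketch (assumption): a list like ['4;6', '8'] is split on ';' and ','
    # into the deduplicated set {'4', '6', '8'} before being handed to the SQL function
    # create_housenumbers(), which returns the token string and the canonical value.
    #
    #   re.split(r'[;,]', '4;6')   # -> ['4', '6']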

    def add_street(self, conn, street):
        """ Add addr:street match terms.
        """
        def _get_street(name):
            with conn.cursor() as cur:
                return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))

        tokens = self.cache.streets.get(street, _get_street)
        if tokens:
            self.data['street'] = tokens

    def add_place(self, conn, place):
        """ Add addr:place search and match terms.
        """
        def _get_place(name):
            with conn.cursor() as cur:
                cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cur.fetchone()

        self.data['place_search'], self.data['place_match'] = \
            self.cache.places.get(place, _get_place)

    def add_address_terms(self, conn, terms):
        """ Add additional address terms.
        """
        def _get_address_term(name):
            with conn.cursor() as cur:
                cur.execute("""SELECT addr_ids_from_name(%s)::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cur.fetchone()

        tokens = {}
        for key, value in terms:
            items = self.cache.address_terms.get(value, _get_address_term)
            if items[0] or items[1]:
                tokens[key] = items

        if tokens:
            self.data['addr'] = tokens

class _LRU:
    """ Least recently used cache that accepts a generator function to
        produce the item when there is a cache miss.
    """

    def __init__(self, maxsize=128, init_data=None):
        self.data = init_data or OrderedDict()
        self.maxsize = maxsize
        if init_data is not None and len(init_data) > maxsize:
            self.maxsize = len(init_data)

    def get(self, key, generator):
        """ Get the item with the given key from the cache. If nothing
            is found in the cache, generate the value through the
            generator function and store it in the cache.
        """
        value = self.data.get(key)
        if value is not None:
            self.data.move_to_end(key)
        else:
            value = generator(key)
            if len(self.data) >= self.maxsize:
                self.data.popitem(last=False)
            self.data[key] = value

        return value
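
    # Illustrative, self-contained usage sketch (not part of the original module):
    # the generator is only invoked on a cache miss and receives the key.
    #
    #   cache = _LRU(maxsize=2)
    #   cache.get('a', str.upper)   # miss -> calls str.upper('a') -> 'A'
    #   cache.get('a', str.upper)   # hit  -> returns cached 'A'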

class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self, conn):
        # Various LRU caches for repeated street/place/address term lookups.
        self.streets = _LRU(maxsize=256)
        self.places = _LRU(maxsize=128)
        self.address_terms = _LRU(maxsize=1024)

        # Look up housenumbers up to 100 and cache them.
        with conn.cursor() as cur:
            cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
                           FROM generate_series(1, 100) as i""")
            self._cached_housenumbers = {str(r[0]): r[1] for r in cur}

        # For postcodes remember the ones that have already been added
        self.postcodes = set()

    def get_housenumber(self, number):
        """ Get a housenumber token from the cache.
        """
        return self._cached_housenumbers.get(number)

    def add_postcode(self, conn, postcode):
        """ Make sure the given postcode is in the database.
        """
        if postcode not in self.postcodes:
            with conn.cursor() as cur:
                cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
            self.postcodes.add(postcode)
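
# Illustrative end-to-end sketch (assumption, not part of the original module): a
# typical import loop creates one analyzer per worker thread and feeds places to it.
#
#   tokenizer = create(dsn, data_dir)
#   tokenizer.init_from_project(None)
#   with tokenizer.name_analyzer() as analyzer:
#       for place in places_to_index:          # placeholder iterable
#           token_info = analyzer.process_place(place)
#           # token_info is then written to the database's token_info field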