"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from collections import OrderedDict
import logging
import re
import shutil
from textwrap import dedent

from icu import Transliterator
import psycopg2
import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyTokenizer(dsn, data_dir)


def _install_module(config_module_path, src_dir, module_dir):
    """ Copies the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.
    """
    # Custom module locations are simply used as is.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        return config_module_path

    # Compatibility mode for builddir installations.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return module_dir

    # In any other case install the module in the project directory.
    if not module_dir.exists():
        module_dir.mkdir()

    destfile = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
    destfile.chmod(0o755)

    LOG.info('Database module installed at %s', str(destfile))

    return module_dir


def _check_module(module_dir, conn):
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.
    """
    with conn.cursor() as cur:
        try:
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS '{}/nominatim.so', 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """.format(module_dir))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err


class LegacyTokenizer(AbstractTokenizer):
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable over updates.
        """
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config)

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)
            conn.commit()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, _):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
            modulepath = config.DATABASE_MODULE_PATH or \
                         str((config.project_dir / 'module').resolve())
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                              max_word_freq=max_word_freq,
                              modulepath=modulepath)


    def check_database(self, _):
        """ Check that the tokenizer is set up correctly.
        """
        hint = """\
             The PostgreSQL extension nominatim.so was not correctly loaded.

             Error: {error}

             Hints:
             * Check the output of the CMake/make installation step
             * Does nominatim.so exist?
             * Does nominatim.so exist on the database server?
             * Can nominatim.so be accessed by the database user?
             """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                try:
                    out = cur.scalar("SELECT make_standard_name('a')")
                except psycopg2.Error as err:
                    return hint.format(error=str(err))

        if out != 'a':
            return hint.format(error='Unexpected result for make_standard_name()')

        return None


    def migrate_database(self, config):
        """ Initialise the project directory of an existing database for
            use with this tokenizer.

            This is a special migration function for updating existing databases
            to new software versions.
        """
        self.normalization = config.TERM_NORMALIZATION
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)


    def update_statistics(self):
        """ Recompute the frequency of full words.
        """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.drop_table("word_frequencies")
                LOG.info("Computing word frequencies")
                cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                 SELECT unnest(name_vector) as id, count(*)
                                 FROM search_name GROUP BY id""")
                cur.execute("CREATE INDEX ON word_frequencies(id)")
                LOG.info("Update word table with recomputed frequencies")
                cur.execute("""UPDATE word SET search_name_count = count
                               FROM word_frequencies
                               WHERE word_token like ' %' and word_id = id""")
                cur.drop_table("word_frequencies")
            conn.commit()


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.normalize(...)
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        normalizer = Transliterator.createFromRules("phrase normalizer",
                                                    self.normalization)
        return LegacyNameAnalyzer(self.dsn, normalizer)


    def _install_php(self, config):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
            @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
            require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
            """.format(config)))


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')


    def _save_config(self, conn, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)


class LegacyNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the special PostgreSQL module for
        splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, normalizer):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
        psycopg2.extras.register_hstore(self.conn)

        self._cache = _TokenCache(self.conn)


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with '#' it is assumed to be a full name,
            otherwise it is taken to be a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not guaranteed to be efficient.
        """
        with self.conn.cursor() as cur:
            cur.execute("""SELECT t.term, word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = (CASE
                                   WHEN left(t.term, 1) = '#' THEN
                                     ' ' || make_standard_name(substring(t.term from 2))
                                   ELSE
                                     make_standard_name(t.term)
                                   END)
                                 and class is null and country_code is null""",
                        (words, ))

            return [(r[0], r[1], r[2]) for r in cur]
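
    # Illustrative sketch, not part of the original module: full-name tokens
    # are stored with a leading blank, partial names without one, so a lookup
    # might look like this (the word ids are invented; real values depend on
    # the database):
    #
    #   analyzer.get_word_token_info(['#Main Street', 'main'])
    #   # -> [('#Main Street', ' main street', 1042), ('main', 'main', 553)]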


    def normalize(self, phrase):
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return self.normalizer.transliterate(phrase)


    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
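
    # Example (pure Python, mirrors the line above): normalization only
    # strips surrounding whitespace and upper-cases the rest.
    #
    #   LegacyNameAnalyzer.normalize_postcode(' ab-1 234 ')   # -> 'AB-1 234'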


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            to_delete = []
            to_add = []

            for postcode, word in cur:
                if postcode is None:
                    to_delete.append(word)
                else:
                    to_add.append(postcode)

            if to_delete:
                cur.execute("""DELETE FROM WORD
                               WHERE class ='place' and type = 'postcode'
                                     and word = any(%s)
                            """, (to_delete, ))
            if to_add:
                cur.execute("""SELECT count(create_postcode_id(pc))
                               FROM unnest(%s) as pc
                            """, (to_add, ))


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                cur.execute_values(
                    """ INSERT INTO word (word_id, word_token, word, class, type,
                                          search_name_count, operator)
                        (SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
                                class, type, 0,
                                CASE WHEN op in ('in', 'near') THEN op ELSE null END
                           FROM (VALUES %s) as v(name, class, type, op))""",
                    to_add)

            if to_delete and should_replace:
                cur.execute_values(
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))
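
    # Illustrative sketch of the expected input, inferred from the code above
    # (not from the original sources): each phrase is a tuple of
    # (label, class, type, operator), where the operator is '-' when there is
    # none and otherwise one of 'in' or 'near'.
    #
    #   analyzer.update_special_phrases(
    #       [('restaurant', 'amenity', 'restaurant', '-'),
    #        ('bar in', 'amenity', 'bar', 'in')],
    #       should_replace=True)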


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        with self.conn.cursor() as cur:
            cur.execute(
                """INSERT INTO word (word_id, word_token, country_code)
                   (SELECT nextval('seq_word'), lookup_token, %s
                      FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
                              FROM unnest(%s) AS n) v
                      WHERE NOT EXISTS(SELECT * FROM word
                                       WHERE word_token = lookup_token and country_code = %s))
                """, (country_code, list(names.values()), country_code))


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.name

        if names:
            token_info.add_names(self.conn, names)

            if place.is_country():
                self.add_country_names(place.country_code, names)

        address = place.address
        if address:
            self._process_place_address(token_info, address)

        return token_info.data
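
    # Illustrative sketch of the returned structure, inferred from _TokenInfo
    # below (which keys appear depends on the place; all token ids here are
    # invented):
    #
    #   {'names': '{552,53071}', 'hnr_tokens': '{371}', 'hnr': '4',
    #    'street': '{4412}', 'addr': {'city': ('{1234}', '{5678}')}}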


    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []

        for key, value in address.items():
            if key == 'postcode':
                # Make sure the normalized postcode is present in the word table.
                if re.search(r'[:,;]', value) is None:
                    self._cache.add_postcode(self.conn,
                                             self.normalize_postcode(value))
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(self.conn, value)
            elif key == 'place':
                token_info.add_place(self.conn, value)
            elif not key.startswith('_') and key not in ('country', 'full'):
                addr_terms.append((key, value))

        if hnrs:
            token_info.add_housenumbers(self.conn, hnrs)

        if addr_terms:
            token_info.add_address_terms(self.conn, addr_terms)


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self.cache = cache
        self.data = {}


    def add_names(self, conn, names):
        """ Add token information for the names of the place.
        """
        with conn.cursor() as cur:
            # Create the token IDs for all names.
            self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
                                            (names, ))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from the address.
        """
        if len(hnrs) == 1:
            token = self.cache.get_housenumber(hnrs[0])
            if token is not None:
                self.data['hnr_tokens'] = token
                self.data['hnr'] = hnrs[0]
                return

        # Split numbers if necessary.
        simple_list = []
        for hnr in hnrs:
            simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

        if len(simple_list) > 1:
            simple_list = list(set(simple_list))

        with conn.cursor() as cur:
            cur.execute("SELECT (create_housenumbers(%s)).* ", (simple_list, ))
            self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
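
    # Example (illustrative): for hnrs == ['4;6', '8'] the list is split into
    # ['4', '6', '8'] and handed to create_housenumbers(), while a single
    # plain number such as ['7'] is answered from the precomputed cache
    # without a database roundtrip.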


    def add_street(self, conn, street):
        """ Add addr:street match terms.
        """
        def _get_street(name):
            with conn.cursor() as cur:
                return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))

        self.data['street'] = self.cache.streets.get(street, _get_street)


    def add_place(self, conn, place):
        """ Add addr:place search and match terms.
        """
        def _get_place(name):
            with conn.cursor() as cur:
                cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cur.fetchone()

        self.data['place_search'], self.data['place_match'] = \
            self.cache.places.get(place, _get_place)


    def add_address_terms(self, conn, terms):
        """ Add additional address terms.
        """
        def _get_address_term(name):
            with conn.cursor() as cur:
                cur.execute("""SELECT addr_ids_from_name(%s)::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cur.fetchone()

        tokens = {}
        for key, value in terms:
            tokens[key] = self.cache.address_terms.get(value, _get_address_term)

        self.data['addr'] = tokens


class _LRU:
    """ Least recently used cache that accepts a generator function to
        produce the item when there is a cache miss.
    """

    def __init__(self, maxsize=128, init_data=None):
        self.data = init_data or OrderedDict()
        self.maxsize = maxsize
        if init_data is not None and len(init_data) > maxsize:
            self.maxsize = len(init_data)


    def get(self, key, generator):
        """ Get the item with the given key from the cache. If nothing
            is found in the cache, generate the value through the
            generator function and store it in the cache.
        """
        value = self.data.get(key)
        if value is not None:
            self.data.move_to_end(key)
        else:
            value = generator(key)
            if len(self.data) >= self.maxsize:
                self.data.popitem(last=False)
            self.data[key] = value

        return value
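
    # Illustrative usage, not part of the original module:
    #
    #   cache = _LRU(maxsize=2)
    #   cache.get('a', str.upper)   # miss: generates and stores 'A'
    #   cache.get('b', str.upper)   # miss: generates and stores 'B'
    #   cache.get('a', str.upper)   # hit: 'a' becomes most recently used
    #   cache.get('c', str.upper)   # miss: evicts 'b', the least recently used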


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self, conn):
        # Various LRU caches.
        self.streets = _LRU(maxsize=256)
        self.places = _LRU(maxsize=128)
        self.address_terms = _LRU(maxsize=1024)

        # Look up housenumbers up to 100 and cache them.
        with conn.cursor() as cur:
            cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
                           FROM generate_series(1, 100) as i""")
            self._cached_housenumbers = {str(r[0]): r[1] for r in cur}

        # For postcodes remember the ones that have already been added.
        self.postcodes = set()


    def get_housenumber(self, number):
        """ Get a housenumber token from the cache.
        """
        return self._cached_housenumbers.get(number)


    def add_postcode(self, conn, postcode):
        """ Make sure the given postcode is in the database.
        """
        if postcode not in self.postcodes:
            with conn.cursor() as cur:
                cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
            self.postcodes.add(postcode)