"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from collections import OrderedDict
import logging
import re
import shutil
from textwrap import dedent

from icu import Transliterator
import psycopg2
import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

LOG = logging.getLogger()

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyTokenizer(dsn, data_dir)

def _install_module(config_module_path, src_dir, module_dir):
    """ Copy the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.
    """
    # Custom module locations are simply used as is.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        return config_module_path

    # Compatibility mode for builddir installations.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return module_dir

    # In any other case install the module in the project directory.
    if not module_dir.exists():
        module_dir.mkdir()

    destfile = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))

    LOG.info('Database module installed at %s', str(destfile))

    return module_dir

def _check_module(module_dir, conn):
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.
    """
    with conn.cursor() as cur:
        try:
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS '{}/nominatim.so', 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """.format(module_dir))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err

class LegacyTokenizer(AbstractTokenizer):
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config)

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)
            conn.commit()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self, _):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)

    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
            modulepath = config.DATABASE_MODULE_PATH or \
                         str((config.project_dir / 'module').resolve())
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                              max_word_freq=max_word_freq,
                              modulepath=modulepath)

    def check_database(self, _):
        """ Check that the tokenizer is set up correctly.
        """
        hint = """\
             The PostgreSQL extension nominatim.so was not correctly loaded.

             Error: {error}

             Hints:
             * Check the output of the CMake/make installation step
             * Does nominatim.so exist?
             * Does nominatim.so exist on the database server?
             * Can nominatim.so be accessed by the database user?
             """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                try:
                    out = cur.scalar("SELECT make_standard_name('a')")
                except psycopg2.Error as err:
                    return hint.format(error=str(err))

        if out != 'a':
            return hint.format(error='Unexpected result for make_standard_name()')

        return None

    def migrate_database(self, config):
        """ Initialise the project directory of an existing database for
            use with this tokenizer.

            This is a special migration function for updating existing databases
            to new software versions.
        """
        self.normalization = config.TERM_NORMALIZATION
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    ...

            When used outside the with construct, the caller must make sure
            to call the close() function before the analyzer is destroyed.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        normalizer = Transliterator.createFromRules("phrase normalizer",
                                                    self.normalization)
        return LegacyNameAnalyzer(self.dsn, normalizer)

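    # Illustrative use outside a ``with`` block; the caller then has to call
    # close() explicitly (``place`` stands for a hypothetical place object):
    #
    #   analyzer = tokenizer.name_analyzer()
    #   try:
    #       info = analyzer.process_place(place)
    #   finally:
    #       analyzer.close()
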
    def _install_php(self, config):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
            @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
            require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
            """.format(config)))

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')

    def _save_config(self, conn, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)

class LegacyNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the special PostgreSQL module for
        normalizing names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, normalizer):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
        psycopg2.extras.register_hstore(self.conn)

        self._cache = _TokenCache(self.conn)

    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.

            If a word starts with # it is assumed to be a full name,
            otherwise it is treated as a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            It is intended for testing and debugging only and is not
            necessarily efficient.
        """
        with self.conn.cursor() as cur:
            cur.execute("""SELECT t.term, word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = (CASE
                                   WHEN left(t.term, 1) = '#' THEN
                                     ' ' || make_standard_name(substring(t.term from 2))
                                   ELSE
                                     make_standard_name(t.term)
                                   END)
                                 and class is null and country_code is null""",
                        (words, ))

            return [(r[0], r[1], r[2]) for r in cur]

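    # Illustrative call; the tokens and word ids returned depend entirely on
    # the database contents, so the values below are made up:
    #
    #   analyzer.get_word_token_info(['#Main Street', 'main'])
    #   # -> [('#Main Street', ' main street', 123), ('main', 'main', 456)]
    #
    # The leading '#' marks a full-name lookup; full-name tokens carry a
    # leading space, as produced by the SQL above.
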
    def normalize(self, phrase):
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return self.normalizer.transliterate(phrase)

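    # The result depends entirely on the configured TERM_NORMALIZATION rules;
    # with a typical lower-casing rule set a phrase like "Main Street" would
    # come back as "main street" (illustrative only).
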
    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()

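    # Example: normalize_postcode('  ab1 2cd ') returns 'AB1 2CD'
    # (surrounding whitespace is stripped, letters are upper-cased).
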
    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class = 'place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            to_delete = []
            to_add = []

            for postcode, word in cur:
                if postcode is None:
                    to_delete.append(word)
                else:
                    to_add.append(postcode)

            if to_delete:
                cur.execute("""DELETE FROM word
                               WHERE class = 'place' and type = 'postcode'
                                     and word = any(%s)
                            """, (to_delete, ))
            if to_add:
                cur.execute("""SELECT count(create_postcode_id(pc))
                               FROM unnest(%s) as pc
                            """, (to_add, ))

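    # Note on the query above: the full outer join yields one row per mismatch,
    # so a postcode missing from the word table arrives with word = NULL (and
    # gets added), while a stale word entry arrives with pc = NULL (and gets
    # deleted).
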
    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                cur.execute_values(
                    """ INSERT INTO word (word_id, word_token, word, class, type,
                                          search_name_count, operator)
                        (SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
                                class, type, 0,
                                CASE WHEN op in ('in', 'near') THEN op ELSE null END
                           FROM (VALUES %s) as v(name, class, type, op))""",
                    to_add)

            if to_delete and should_replace:
                cur.execute_values(
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))

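    # Illustrative input (the values are hypothetical): each phrase is a tuple
    # of (label, class, type, operator), with '-' standing for "no operator":
    #
    #   analyzer.update_special_phrases(
    #       [('pharmacy', 'amenity', 'pharmacy', '-')], True)
    #
    # Phrases already present are left alone; new ones are inserted and, when
    # should_replace is set, obsolete ones are removed.
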
    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        with self.conn.cursor() as cur:
            cur.execute(
                """INSERT INTO word (word_id, word_token, country_code)
                   (SELECT nextval('seq_word'), lookup_token, %s
                      FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
                              FROM unnest(%s) AS n) v
                      WHERE NOT EXISTS(SELECT * FROM word
                                       WHERE word_token = lookup_token and country_code = %s))
                """, (country_code, list(names.values()), country_code))

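    # Illustrative call (country code and names are example values):
    #
    #   analyzer.add_country_names('de', {'name': 'Deutschland',
    #                                     'name:en': 'Germany'})
    #
    # Only the name values are used; each distinct normalized name gets a word
    # entry with the given country_code unless one already exists.
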
    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.name

        if names:
            token_info.add_names(self.conn, names)

            if place.is_country():
                self.add_country_names(place.country_code, names)

        address = place.address
        if address:
            self._process_place_address(token_info, address)

        return token_info.data

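    # Illustrative shape of the returned data; which keys are present depends
    # on the place, and all values are token strings computed by the SQL
    # helper functions (the literals below are placeholders):
    #
    #   {'names': '...', 'hnr_tokens': '...', 'hnr': '1;2',
    #    'street': '...', 'addr': {'city': (...)}}
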
    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []

        for key, value in address.items():
            if key == 'postcode':
                # Make sure the normalized postcode is present in the word table.
                if re.search(r'[:,;]', value) is None:
                    self._cache.add_postcode(self.conn,
                                             self.normalize_postcode(value))
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(self.conn, value)
            elif key == 'place':
                token_info.add_place(self.conn, value)
            elif not key.startswith('_') and key not in ('country', 'full'):
                addr_terms.append((key, value))

        if hnrs:
            token_info.add_housenumbers(self.conn, hnrs)

        if addr_terms:
            token_info.add_address_terms(self.conn, addr_terms)

class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self.cache = cache
        self.data = {}


    def add_names(self, conn, names):
        """ Add token information for the names of the place.
        """
        with conn.cursor() as cur:
            # Create the token IDs for all names.
            self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
                                            (names, ))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from the address.
        """
        if len(hnrs) == 1:
            token = self.cache.get_housenumber(hnrs[0])
            if token is not None:
                self.data['hnr_tokens'] = token
                self.data['hnr'] = hnrs[0]
                return

        # split numbers if necessary
        simple_list = []
        for hnr in hnrs:
            simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

        if len(simple_list) > 1:
            simple_list = list(set(simple_list))

        with conn.cursor() as cur:
            cur.execute("SELECT (create_housenumbers(%s)).* ", (simple_list, ))
            self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()

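    # Example of the splitting above (illustrative): an input list such as
    # ['1;2b, 3'] is expanded to the individual numbers '1', '2b' and '3'
    # before they are handed to create_housenumbers().
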
    def add_street(self, conn, street):
        """ Add addr:street match terms.
        """
        def _get_street(name):
            with conn.cursor() as cur:
                return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))

        self.data['street'] = self.cache.streets.get(street, _get_street)

    def add_place(self, conn, place):
        """ Add addr:place search and match terms.
        """
        def _get_place(name):
            with conn.cursor() as cur:
                cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cur.fetchone()

        self.data['place_search'], self.data['place_match'] = \
            self.cache.places.get(place, _get_place)

    def add_address_terms(self, conn, terms):
        """ Add additional address terms.
        """
        def _get_address_term(name):
            with conn.cursor() as cur:
                cur.execute("""SELECT addr_ids_from_name(%s)::text,
                                      word_ids_from_name(%s)::text""",
                            (name, name))
                return cur.fetchone()

        tokens = {}
        for key, value in terms:
            tokens[key] = self.cache.address_terms.get(value, _get_address_term)

        self.data['addr'] = tokens

class _LRU:
    """ Least recently used cache that accepts a generator function to
        produce the item when there is a cache miss.
    """

    def __init__(self, maxsize=128, init_data=None):
        self.data = init_data or OrderedDict()
        self.maxsize = maxsize
        if init_data is not None and len(init_data) > maxsize:
            self.maxsize = len(init_data)


    def get(self, key, generator):
        """ Get the item with the given key from the cache. If nothing
            is found in the cache, generate the value through the
            generator function and store it in the cache.
        """
        value = self.data.get(key)
        if value is not None:
            self.data.move_to_end(key)
        else:
            value = generator(key)
            if len(self.data) >= self.maxsize:
                self.data.popitem(last=False)
            self.data[key] = value

        return value

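# A minimal usage sketch for the cache above (keys and generator are
# illustrative):
#
#   cache = _LRU(maxsize=2)
#   cache.get('a', lambda key: key.upper())  # miss: generator runs, 'A' is stored
#   cache.get('a', lambda key: key.upper())  # hit: cached 'A', generator not called
#
# When the cache is full, the least recently used entry is evicted to make
# room for the new one.
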
class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """

    def __init__(self, conn):
        self.streets = _LRU(maxsize=256)
        self.places = _LRU(maxsize=128)
        self.address_terms = _LRU(maxsize=1024)

        # Look up housenumbers up to 100 and cache them.
        with conn.cursor() as cur:
            cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
                           FROM generate_series(1, 100) as i""")
            self._cached_housenumbers = {str(r[0]): r[1] for r in cur}

        # For postcodes remember the ones that have already been added
        self.postcodes = set()

    def get_housenumber(self, number):
        """ Get a housenumber token from the cache.
        """
        return self._cached_housenumbers.get(number)


    def add_postcode(self, conn, postcode):
        """ Make sure the given postcode is in the database.
        """
        if postcode not in self.postcodes:
            with conn.cursor() as cur:
                cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
            self.postcodes.add(postcode)