"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
import itertools
import json
import logging
import re
from textwrap import dedent
from nominatim.db.connection import connect
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)
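
# Typical use of the factory above (illustrative sketch only; the DSN and the
# directory are made-up placeholders, not values from this module):
#
#   tokenizer = create('dbname=nominatim', project_dir / 'tokenizer')
#   tokenizer.init_from_project(config)   # attach to an already set-up project
#
# A fresh import would call init_new_db() instead of init_from_project().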


class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)

    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()
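
    # Illustration of the UPDATE above (numbers invented): a word row whose
    # info column is '{}' and whose id appears 42 times in search_name ends
    # up with info = '{"count": 42}', because `info || jsonb_build_object(...)`
    # merges the freshly computed count into the existing JSONB object.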

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    ...

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())

    def _install_php(self, phpdir):
        """ Install the PHP script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))

    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()

    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()

    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.

            If a word starts with '#' it is assumed to be a full name,
            otherwise it is treated as a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
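
    # Example (purely illustrative, tokens and ids invented): '#Main Street'
    # is looked up as a full name (type 'W'), 'Main' as a partial (type 'w'):
    #
    #   analyzer.get_word_token_info(['#Main Street', 'Main'])
    #   -> [('#Main Street', 'main street', 123), ('Main', 'main', 456)]
    #
    # The second element is the transliterated search token, the third the
    # word id from the word table (None if the token is unknown).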

    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
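
    # For example, '  ab1 2cd ' becomes 'AB1 2CD': surrounding whitespace is
    # stripped and letters are upper-cased; no country-specific reformatting
    # happens at this stage.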

    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self._search_normalized(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM word
                                   WHERE type = 'P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)
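
    # Each incoming phrase is a 4-tuple of (label, class, type, operator),
    # e.g. ('Restaurants', 'amenity', 'restaurant', '-') (example values).
    # Only the operators 'in' and 'near' are kept when a phrase is written
    # to the word table; anything else is stored as null (see below).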

    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added

    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0])
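
    # Example call (illustrative): `names` is an OSM-style name dictionary,
    # e.g.
    #
    #   analyzer.add_country_names('de', {'name': 'Deutschland',
    #                                     'name:en': 'Germany'})
    #
    # The dictionary is wrapped in a PlaceInfo and run through the sanitizers
    # first, so country names get the same preprocessing as ordinary places.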

    def _add_country_full_names(self, country_code, names):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names.
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names, address = self.sanitizer.process_names(place)

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.data
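
    # Sketch of the returned structure (token ids invented); the individual
    # keys are filled by the _TokenInfo helper further below:
    #
    #   {'names': '{1,2,3}',                 # full and partial name tokens
    #    'hnr': '25', 'hnr_tokens': '{4}',   # housenumbers and their tokens
    #    'street': '{5}',                    # addr:street match tokens
    #    'addr': {'city': '{6,7}'}}          # other address parts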

    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        streets = []
        for item in address:
            if item.kind == 'postcode':
                self._add_postcode(item.name)
            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(item.name)
            elif item.kind == 'street':
                streets.extend(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full'):
                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)

        if streets:
            token_info.add_street(streets)

    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens

    def _retrieve_full_tokens(self, name):
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        norm_name = self._search_normalized(name)

        # Return the cached result if possible.
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full

    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            list of names.
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            norm_name = self._normalized(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens
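
    # Cache keys combine the normalized name with the analyzer that produced
    # the variants: the plain normalized name when no 'analyzer' attribute is
    # set, otherwise '<normalized name>@<analyzer id>'. For example
    # (illustrative), a name using analyzer 'ja' that normalizes to 'tokyo'
    # is cached as 'tokyo@ja', separate from the plain 'tokyo' entry.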

    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self._search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)

    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
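
    # For example, ['1;2b', '3'] is split into ['1', '2b', '3'] (duplicates
    # removed, order not guaranteed), while a single plain value like ['12']
    # is returned unchanged.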


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}

    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))
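
    # _mk_array() renders token ids as a PostgreSQL array literal, e.g. the
    # tokens [12, 34, 56] become the string '{12,34,56}'.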

    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        self.data['street'] = self._mk_array(tokens)

    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        if tokens:
            self.data['place'] = self._mk_array(tokens)

    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {key: self._mk_array(partials)
                  for key, partials in terms if partials}

        if tokens:
            self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.partials = {}
        self.fulls = {}
        self.postcodes = set()
        self.housenumbers = {}

    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens