"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
import itertools
import json
import logging
import re
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)
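
    # Set-up order: the PHP stub and the tokenizer configuration are written
    # to the project directory/database first, so that a later
    # init_from_project() finds everything it needs; creating the SQL
    # functions and the word table can be skipped via init_db=False.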

    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)

    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()
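
    # The statistics update assumes that auxiliary token data lives in the
    # JSONB `info` column of the word table; the recomputed 'count' entry is
    # what the search code can later use to judge how frequent a term is.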

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.update_special_phrases(...)
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())
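
    # Each LegacyICUNameAnalyzer opens its own database connection (see
    # __init__ below), which is why analyzers must be closed after use and
    # must not be shared between threads.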

    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
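
    # The generated tokenizer.php only defines configuration constants: the
    # term normalization and transliteration rules exported here let the PHP
    # search frontend apply the same ICU rules to incoming queries that were
    # used during import.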

    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()

    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()

    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()
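
    # Note the split: _normalized() yields the form that is stored in and
    # compared against the `word` column, while _search_normalized() adds the
    # transliteration step and yields the ASCII form kept in `word_token`.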

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is assumed to be a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                           FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
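
    # Row types used in the word table by this module:
    #   'W' - full names, 'w' - partial names, 'P' - postcodes,
    #   'S' - special phrases, 'C' - country names.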

    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
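
    # Only whitespace and case are touched here; for example (illustration
    # only): normalize_postcode('  se10 8xj ') returns 'SE10 8XJ'.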

    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self._search_normalized(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])
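
    # The full outer join above acts as a diff: postcodes present in
    # location_postcode but not yet in the word table are added via COPY,
    # while 'P' rows without a matching postcode are deleted.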

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases will
            be completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)
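
    # Special phrases are handled as 4-tuples of (label, class, type, operator),
    # e.g. a made-up example ('swimming pool', 'leisure', 'swimming_pool', '-'),
    # where '-' stands for "no operator"; only 'in' and 'near' are stored
    # explicitly.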

    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added

    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0])

    def _add_country_full_names(self, country_code, names):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names.
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names, address = self.sanitizer.process_names(place)

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.data
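
    # The returned dictionary is what ends up JSON-serialized in the
    # token_info column. Depending on the available name and address parts it
    # may contain (see _TokenInfo below): 'names', 'hnr_tokens', 'hnr',
    # 'street', 'place' and 'addr'.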

    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        streets = []
        for item in address:
            if item.kind == 'postcode':
                self._add_postcode(item.name)
            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(item.name)
            elif item.kind == 'street':
                token = self._retrieve_full_token(item.name)
                if token:
                    streets.append(token)
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full'):
                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)

        if streets:
            token_info.add_street(streets)

    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))
                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens
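
    # getorcreate_partial_word() is a database-side helper (presumably created
    # from tokenizer/icu_tokenizer.sql via update_sql_functions) that returns
    # the word_id for a partial term, creating the row if needed; results are
    # memoised in self._cache.partials to avoid repeated lookups.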

    def _retrieve_full_token(self, name):
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        norm_name = self._normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        full, _ = self._cache.names.get(norm_name, (None, None))

        if full is None:
            with self.conn.cursor() as cur:
                cur.execute("SELECT word_id FROM word WHERE word = %s and type = 'W' LIMIT 1",
                            (norm_name, ))
                if cur.rowcount > 0:
                    full = cur.fetchone()[0]

        self._cache.fulls[norm_name] = full

        return full

    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            norm_name = self._normalized(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens
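
    # getorcreate_full_word() is expected to return a composite of
    # (full-name token id, array of partial token ids) for the name and its
    # ICU variants. Results are cached per normalised name; names processed by
    # a special analyzer get an '@<analyzer>' suffix in the cache key.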

    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self._search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)

    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
568 """ Collect token information to be sent back to the database.
570 def __init__(self, cache):
575 def _mk_array(tokens):
576 return '{%s}' % ','.join((str(s) for s in tokens))
579 def add_names(self, fulls, partials):
580 """ Adds token information for the normalised names.
582 self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
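
    # _mk_array() renders token ids as a PostgreSQL array literal, e.g.
    # _mk_array([23, 42]) == '{23,42}', which is presumably cast back to an
    # integer array on the SQL side.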

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        self.data['street'] = self._mk_array(tokens)

    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        if tokens:
            self.data['place'] = self._mk_array(tokens)

    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {key: self._mk_array(partials)
                  for key, partials in terms if partials}

        if tokens:
            self.data['addr'] = tokens
617 """ Cache for token information to avoid repeated database queries.
619 This cache is not thread-safe and needs to be instantiated per
626 self.postcodes = set()
627 self.housenumbers = {}

    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens