# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
import itertools
import json
import logging
import re
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()
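
# A minimal usage sketch for illustration only (dsn, project_dir, config and
# place are assumed to be provided by the caller, they are not defined here):
#
#   tokenizer = create(dsn, project_dir / 'tokenizer')
#   tokenizer.init_from_project(config)
#   with tokenizer.name_analyzer() as analyzer:
#       token_info = analyzer.process_place(place)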
def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)

class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)
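
    # update_statistics() below rebuilds the per-word occurrence counts: it
    # collects the frequency of every token id referenced in search_name into
    # a temporary table and merges the result into word.info as 'count'.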
    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.process_place(place)
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())

    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))

    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()
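
# Token types used in the 'word' table by the analyzer below, as can be seen
# from the queries it issues: 'W' full names, 'w' partial names, 'P' postcodes,
# 'S' special phrases and 'C' country names.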
class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()
181 """ Free all resources used by the analyzer.

    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()

    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()
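
    # Example with hypothetical values: get_word_token_info(['#Main Street', 'main'])
    # returns tuples like ('#Main Street', 'main street', 123) for the full name
    # and ('main', 'main', 456) for the partial name; the id is None when the
    # token is not present in the word table.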
    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with '#', it is assumed to be a full name,
            otherwise it is treated as a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
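
    # normalize_postcode(' ab1 2cd ') returns 'AB1 2CD': surrounding whitespace
    # is stripped and the postcode is upper-cased.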
    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()

    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self._search_normalized(hnr)
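
    # update_postcodes_from_db() keeps the postcode rows (type 'P') in the word
    # table in sync with location_postcode: postcodes missing from the word
    # table are added, word entries without a matching postcode are deleted.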
    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])
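
    # Special phrases are handed in as 4-tuples of (phrase, class, type,
    # operator) and stored as type 'S' rows with class, type and operator
    # encoded in the JSON info column.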
    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases will be
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added

    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0])

    def _add_country_full_names(self, country_code, names):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get the names that are already in the word table.
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names, address = self.sanitizer.process_names(place)

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.data
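
    # Address items are dispatched by kind: postcodes go straight into the word
    # table, housenumber-like kinds are collected for housenumber tokens,
    # 'street' and 'place' get their own token lists, and all remaining kinds
    # (except internal '_' kinds, 'country' and 'full') become generic address
    # terms.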
    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        streets = []
        for item in address:
            if item.kind == 'postcode':
                self._add_postcode(item.name)
            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(item.name)
            elif item.kind == 'street':
                streets.extend(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full'):
                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)

        if streets:
            token_info.add_street(streets)

    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens

    def _retrieve_full_tokens(self, name):
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full
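
    # Full-name tokens are cached per normalized name; when a name carries an
    # 'analyzer' attribute, the cache key becomes '<name>@<analyzer id>' so
    # that variants produced by different analysis modules do not clash.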
    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            list of names.
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            norm_name = self._normalized(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens
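
    # Postcodes containing ':', ',' or ';' are skipped here because they are
    # not single postcode values; everything else is normalized and inserted
    # into the word table once (type 'P'), guarded by the postcode cache.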
    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self._search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)
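
    # _split_housenumbers(['3;5', '7']) returns the individual numbers
    # ['3', '5', '7'] (order not guaranteed because duplicates are removed
    # via a set).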
    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
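
# _TokenInfo assembles the JSON structure stored in the token_info column;
# the keys it may set are 'names', 'hnr_tokens', 'hnr', 'street', 'place'
# and 'addr'.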
567 """ Collect token information to be sent back to the database.
569 def __init__(self, cache):
574 def _mk_array(tokens):
575 return '{%s}' % ','.join((str(s) for s in tokens))

    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        self.data['street'] = self._mk_array(tokens)

    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        if tokens:
            self.data['place'] = self._mk_array(tokens)

    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {key: self._mk_array(partials)
                  for key, partials in terms if partials}

        if tokens:
            self.data['addr'] = tokens
616 """ Cache for token information to avoid repeated database queries.
618 This cache is not thread-safe and needs to be instantiated per
625 self.postcodes = set()
626 self.housenumbers = {}

    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens