# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
import itertools
import json
import logging
import re
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()
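
# Token types used in the `word` table by this tokenizer (see the queries below):
#   'W' - full name, 'w' - partial name, 'H' - house number,
#   'P' - postcode, 'S' - special phrase, 'C' - country name.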

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)


    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()


    def _cleanup_housenumbers(self):
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT word_id, word_token FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(word_token) > 6
                                      OR word_token not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))


    def update_word_tokens(self):
        """ Remove unused tokens.
        """
        LOG.info("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.info("Tokenizer house-keeping done.")


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    ...

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())


    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))


    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()
214 """ Free all resources used by the analyzer.


    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()


    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is meant for testing and debugging only
            and is not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}
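
        # Each input word is paired with its transliterated token and the
        # token's id in the word table (None if the token is unknown).
        # Illustrative result, with hypothetical ids and tokens that depend
        # on the configured ICU rules:
        #   get_word_token_info(['#Rose Street', 'rose'])
        #   -> [('#Rose Street', 'rose street', 1042), ('rose', 'rose', 517)]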
        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]


    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
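        # The current rule is a simple trim-and-uppercase,
        # e.g. ' ab1 2cd ' -> 'AB1 2CD'.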
        return postcode.strip().upper()


    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self._search_normalized(hnr)


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        # Postcode no longer exists in location_postcode: drop it.
                        to_delete.append(word)
                    else:
                        # Postcode is missing from the word table: add it.
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases will be
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)


    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added


    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0])


    def _add_country_full_names(self, country_code, names):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get the names that are already in the word table.
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names, address = self.sanitizer.process_names(place)

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)
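
        # The dict below ends up in placex.token_info. Roughly, with
        # illustrative values only:
        #   {'names': '{1,2,3}', 'hnr_tokens': '{4}', 'hnr': '12;12a',
        #    'street': '{5}', 'place': '{6,7}', 'addr': {'city': '{8,9}'}}
        # See the _TokenInfo helper class below for the exact keys.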
        return token_info.data


    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        streets = []
        for item in address:
            if item.kind == 'postcode':
                self._add_postcode(item.name)
            elif item.kind == 'housenumber':
                norm_name = self._make_standard_hnr(item.name)
                if norm_name:
                    hnrs.append(norm_name)
            elif item.kind == 'street':
                streets.extend(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full'):
                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))

        if hnrs:
            token_info.add_housenumbers(self.conn, hnrs)

        if addr_terms:
            token_info.add_address_terms(addr_terms)

        if streets:
            token_info.add_street(streets)


    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens


    def _retrieve_full_tokens(self, name):
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyzer.
        """
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full


    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            norm_name = self._normalized(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
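
                # getorcreate_full_word() is one of the tokenizer's SQL
                # functions (installed via update_sql_functions()); it returns
                # the id of the full-name token together with the ids of its
                # partial-word tokens, creating missing entries on the fly.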
                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self._search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)
583 """ Collect token information to be sent back to the database.
585 def __init__(self, cache):

    @staticmethod
    def _mk_array(tokens):
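        # Format the token ids as a PostgreSQL array literal,
        # e.g. _mk_array([1, 2, 3]) -> '{1,2,3}'.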
        return '{%s}' % ','.join((str(s) for s in tokens))


    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)


    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        self.data['street'] = self._mk_array(tokens)


    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        if tokens:
            self.data['place'] = self._mk_array(tokens)


    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {key: self._mk_array(partials)
                  for key, partials in terms if partials}

        if tokens:
            self.data['addr'] = tokens
632 """ Cache for token information to avoid repeated database queries.
634 This cache is not thread-safe and needs to be instantiated per
641 self.postcodes = set()
642 self.housenumbers = {}


    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens