"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import json
import logging
import re
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)

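# A minimal usage sketch, not executed as part of the module. 'dsn', 'config'
# and 'place' are assumed to stand for a libpq connection string, a loaded
# Nominatim configuration and a PlaceInfo object respectively:
#
#   tokenizer = create(dsn, config.project_dir / 'tokenizer')
#   tokenizer.init_from_project(config)
#   with tokenizer.name_analyzer() as analyzer:
#       token_info = analyzer.process_place(place)
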
class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None
        self.term_normalization = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self.term_normalization = config.TERM_NORMALIZATION

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)


    def finalize_import(self, _):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project(config)

        if self.term_normalization is None:
            return "Configuration for tokenizer 'icu' is missing."

        return None


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())


    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))


    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = self._count_partial_terms(conn)

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for term, cnt in words.items():
                    copystr.add('w', term, json.dumps({'count': cnt}))

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['type', 'word_token', 'info'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null and type = 'w'""")

            conn.commit()


    def _count_partial_terms(self, conn):
        """ Count the partial terms from the names in the place table.
        """
        words = Counter()
        analysis = self.loader.make_token_analysis()

        with conn.cursor(name="words") as cur:
            cur.execute(""" SELECT v, count(*) FROM
                              (SELECT svals(name) as v FROM place)x
                            WHERE length(v) < 75 GROUP BY v""")

            for name, cnt in cur:
                word = analysis.search.transliterate(name)
                if word and ' ' in word:
                    for term in set(word.split()):
                        words[term] += cnt

        return words


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()


    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                           FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]

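    # Illustrative call and result shape (the word ids below are invented):
    #
    #   analyzer.get_word_token_info(['#Main Street', 'main'])
    #   -> [('#Main Street', 'main street', 123), ('main', 'main', 456)]
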
    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()

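    # Example: normalize_postcode(' ab1 2cd ') -> 'AB1 2CD'; only surrounding
    # whitespace and letter case are adjusted here.
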
    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self._search_normalized(hnr)


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

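    # Each phrase is handled as a 4-tuple of (label, class, type, operator),
    # e.g. a hypothetical ('restaurant', 'amenity', 'restaurant', '-'); what to
    # add and delete is computed as a set difference against the phrases that
    # are already stored in the word table.
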
    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added


    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0])


    def _add_country_full_names(self, country_code, names):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get the names that are already in the word table.
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names, address = self.sanitizer.process_names(place)

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.data

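    # The returned structure ends up in the token_info column. An illustrative
    # value (all token ids invented):
    #
    #   {'names': '{1,2,3}', 'hnr_tokens': '{4}', 'hnr': '12',
    #    'street': '{5,6}', 'addr': {'city': '{7}'}}
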
    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        for item in address:
            if item.kind == 'postcode':
                self._add_postcode(item.name)
            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(item.name)
            elif item.kind == 'street':
                token_info.add_street(self._compute_partial_tokens(item.name))
            elif item.kind == 'place':
                token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and \
                 item.kind not in ('country', 'full'):
                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)


    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens


    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            list of names.
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            norm_name = self._normalized(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

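    # Cache keys are either the normalized name itself or, when a name carries
    # an analyzer attribute, a combined key such as a hypothetical
    # 'hauptstrasse@de', so results from different analysis modules do not collide.
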
    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self._search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)


    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs

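    # Example: _split_housenumbers(['4;6', '8']) yields the housenumbers
    # ['4', '6', '8'] (in arbitrary order, since duplicates are removed via a set).
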
class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}

    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))

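    # _mk_array([1, 2, 3]) returns '{1,2,3}', the literal form expected for
    # PostgreSQL array columns.
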
    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)


    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        if tokens:
            self.data['street'] = self._mk_array(tokens)


    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        if tokens:
            self.data['place'] = self._mk_array(tokens)


    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {key: self._mk_array(partials)
                  for key, partials in terms if partials}

        if tokens:
            self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.partials = {}
        self.postcodes = set()
        self.housenumbers = {}


    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []

        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens