"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import json
import logging
import re
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)
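

# Note on usage (illustrative only): Nominatim selects a tokenizer module by
# name and calls its module-level create() function. A minimal sketch, assuming
# a DSN string and a project data directory that are not defined in this file:
#
#   tokenizer = create('dbname=nominatim', project_dir / 'tokenizer')
#   tokenizer.init_new_db(config)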


class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None
        self.term_normalization = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self.term_normalization = config.TERM_NORMALIZATION

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)


    def finalize_import(self, _):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project(config)

        if self.term_normalization is None:
            return "Configuration for tokenizer 'icu' is missing."

        return None


    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.drop_table("word_frequencies")
                LOG.info("Computing word frequencies")
                cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                 SELECT unnest(name_vector) as id, count(*)
                                 FROM search_name GROUP BY id""")
                cur.execute("CREATE INDEX ON word_frequencies(id)")
                LOG.info("Update word table with recomputed frequencies")
                cur.execute("""UPDATE word
                               SET info = info || jsonb_build_object('count', count)
                               FROM word_frequencies WHERE word_id = id""")
                cur.drop_table("word_frequencies")
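
    # Illustrative effect of update_statistics(): any word whose token id
    # appears in search_name.name_vector gets its frequency merged into the
    # 'info' JSONB column, e.g. (made-up values):
    #
    #   word_token | type | info
    #   -----------+------+-----------------
    #   berlin     | W    | {"count": 1532}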


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(...)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())
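
    # A short usage sketch (illustrative only; 'dsn', 'data_dir', 'config' and
    # 'place' are assumed to exist elsewhere). Create one analyzer per thread
    # and let the context manager close it:
    #
    #   tokenizer = create(dsn, data_dir)
    #   tokenizer.init_from_project(config)
    #   with tokenizer.name_analyzer() as analyzer:
    #       token_info = analyzer.process_place(place)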


    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))


    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = self._count_partial_terms(conn)

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for term, cnt in words.items():
                    copystr.add('w', term, json.dumps({'count': cnt}))

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['type', 'word_token', 'info'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                    WHERE word_id is null and type = 'w'""")


    def _count_partial_terms(self, conn):
        """ Count the partial terms from the names in the place table.
        """
        words = Counter()
        analysis = self.loader.make_token_analysis()

        with conn.cursor(name="words") as cur:
            cur.execute(""" SELECT v, count(*) FROM
                              (SELECT svals(name) as v FROM place)x
                            WHERE length(v) < 75 GROUP BY v""")

            for name, cnt in cur:
                word = analysis.search.transliterate(name)
                if word and ' ' in word:
                    for term in set(word.split()):
                        words[term] += cnt

        return words
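
    # Worked example for _count_partial_terms(): only names that transliterate
    # to more than one word contribute. A name like 'Great North Road' that
    # occurs 3 times in the place table adds 3 to the counts of 'great',
    # 'north' and 'road'; a single-word name such as 'Broadway' is skipped by
    # the "' ' in word" check above.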


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()


    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is treated as a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
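
    # Usage sketch for get_word_token_info() (debugging only; token ids are
    # made up). Words prefixed with '#' are looked up as full names (type 'W'),
    # everything else as partial names (type 'w'):
    #
    #   analyzer.get_word_token_info(['#Main Street', 'main'])
    #   # -> [('#Main Street', 'main street', 1042), ('main', 'main', 77)]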


    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
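
    # Example: normalize_postcode(' ab-12 ') returns 'AB-12'. Only surrounding
    # whitespace is stripped and letters are upper-cased, so the SQL function
    # 'token_normalized_postcode()' must behave identically.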


    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self._search_normalized(hnr)


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases will be
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)


    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added


    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase set.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0])


    def _add_country_full_names(self, country_code, names):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get the names that are already in the word table.
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names, address = self.sanitizer.process_names(place)

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.data
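
    # Illustrative shape of the structure returned by process_place(); keys are
    # only present when the corresponding data was found and all token ids are
    # made up:
    #
    #   {
    #       'names': '{1,2,3}',          # full and partial name tokens
    #       'hnr_tokens': '{4}',         # housenumber tokens
    #       'hnr': '12;14',              # normalised housenumbers
    #       'street': '{5,6}',           # addr:street match tokens
    #       'place': '{7}',              # addr:place tokens
    #       'addr': {'city': '{8,9}'}    # other address parts keyed by kind
    #   }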


    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        for item in address:
            if item.kind == 'postcode':
                self._add_postcode(item.name)
            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(item.name)
            elif item.kind == 'street':
                token_info.add_street(self._compute_partial_tokens(item.name))
            elif item.kind == 'place':
                token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and \
                 item.kind not in ('country', 'full'):
                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)


    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens


    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            norm_name = self._normalized(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens
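
    # Note on the cache key used in _compute_name_tokens(): names processed
    # with the default analyzer are cached under their normalised form alone,
    # while names with an explicit analyzer get a suffixed key, e.g.
    # 'bahnhofstrasse' vs. 'bahnhofstrasse@de' (example values only).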


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self._search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                            (SELECT * FROM word
                                             WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)


    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
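
    # Example: _split_housenumbers(['4;6', '8']) yields ['4', '6', '8'] (order
    # is not guaranteed because duplicates are removed via a set), while a
    # plain ['12a'] is returned unchanged.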
580 """ Collect token information to be sent back to the database.
582 def __init__(self, cache):
587 def _mk_array(tokens):
588 return '{%s}' % ','.join((str(s) for s in tokens))
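
    # _mk_array() renders a list of token ids as a PostgreSQL array literal,
    # e.g. _mk_array([1, 2, 3]) -> '{1,2,3}'.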


    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)


    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        if tokens:
            self.data['street'] = self._mk_array(tokens)


    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        if tokens:
            self.data['place'] = self._mk_array(tokens)


    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {key: self._mk_array(partials)
                  for key, partials in terms if partials}

        if tokens:
            self.data['addr'] = tokens
630 """ Cache for token information to avoid repeated database queries.
632 This cache is not thread-safe and needs to be instantiated per
638 self.postcodes = set()
639 self.housenumbers = {}


    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []

        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens