"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
import itertools
import json
import logging
import re
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None
        self.term_normalization = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self.term_normalization = config.TERM_NORMALIZATION

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)

    def finalize_import(self, _):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project(config)

        if self.term_normalization is None:
            return "Configuration for tokenizer 'icu' is missing."

        return None

    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.drop_table("word_frequencies")
                LOG.info("Computing word frequencies")
                cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                 SELECT unnest(name_vector) as id, count(*)
                                 FROM search_name GROUP BY id""")
                cur.execute("CREATE INDEX ON word_frequencies(id)")
                LOG.info("Update word table with recomputed frequencies")
                cur.execute("""UPDATE word
                               SET info = info || jsonb_build_object('count', count)
                               FROM word_frequencies WHERE word_id = id""")
                cur.drop_table("word_frequencies")
            conn.commit()

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.add_country_names(country_code, names)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())

    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))

    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()

    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()

    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is assumed to be a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not guaranteed to be efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                           FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]

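    # Illustrative example for get_word_token_info() (token values depend on
    # the configured rules and the database content):
    #   get_word_token_info(['#Hauptstrasse', 'haupt'])
    # could return
    #   [('#Hauptstrasse', 'hauptstrasse', 123), ('haupt', 'haupt', 456)]
    # where the word id is None for tokens not yet present in the word table.
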
    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()

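    # For example, normalize_postcode(' ab1 2cd ') returns 'AB1 2CD'.
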
    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self._search_normalized(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])

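    # Note on update_postcodes_from_db(): the full outer join above yields two
    # interesting cases: postcodes that exist only in location_postcode are
    # added to the word table via the copy buffer, while 'P' entries that no
    # longer have a matching postcode are collected in to_delete and removed.
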
    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases will
            be completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added

    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

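    # Special phrases are handled as tuples of
    # (normalized name, class, type, operator), where the operator is '-'
    # when no operator is set. An illustrative entry would be
    # ('bar', 'amenity', 'bar', '-').
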
    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0])

    def _add_country_full_names(self, country_code, names):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names for the country.
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.

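    # Illustrative example for _add_country_full_names(): for country_code 'de'
    # and the name 'Deutschland' this adds a row with word_token roughly
    # 'deutschland', type 'C' and word 'de'; the exact token depends on the
    # configured transliteration rules.
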
    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names, address = self.sanitizer.process_names(place)

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.data

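    # The structure returned by process_place() is a plain dict, for example
    # (illustrative values):
    #   {'names': '{615,312,1413}', 'hnr_tokens': '{871}', 'hnr': '25;27',
    #    'street': '{312}', 'place': '{42}', 'addr': {'city': '{413,511}'}}
    # where the brace-delimited strings are PostgreSQL array literals produced
    # by _TokenInfo._mk_array().
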
    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        for item in address:
            if item.kind == 'postcode':
                self._add_postcode(item.name)
            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(item.name)
            elif item.kind == 'street':
                token_info.add_street(self._compute_partial_tokens(item.name))
            elif item.kind == 'place':
                token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and \
                 item.kind not in ('country', 'full'):
                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)

    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))
                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens

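    # Note on _compute_partial_tokens(): partial tokens are created per
    # space-separated word of the search-normalized name, so an illustrative
    # input like 'Rue de la Paix' yields one token each for 'rue', 'de', 'la'
    # and 'paix' (the exact forms depend on the transliteration rules).
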
    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            list of names.
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            norm_name = self._normalized(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

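    # Note on _compute_name_tokens(): the cache key (token_id) is the
    # normalized name, optionally suffixed with '@' and the name's analyzer id,
    # e.g. 'hauptstrasse' or 'hauptstrasse@de' (illustrative values).
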
    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self._search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                            (SELECT * FROM word
                                             WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)

    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list
        return hnrs


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}

    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))

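    # Example: _mk_array([1, 2, 3]) returns the PostgreSQL array literal '{1,2,3}'.
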
    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        if tokens:
            self.data['street'] = self._mk_array(tokens)

    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        if tokens:
            self.data['place'] = self._mk_array(tokens)

    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {key: self._mk_array(partials)
                  for key, partials in terms if partials}

        if tokens:
            self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.partials = {}
        self.postcodes = set()
        self.housenumbers = {}

    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens