"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
import itertools
import json
import logging
import re
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)
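
# Illustrative usage sketch only (not executed): how calling code might obtain
# and drive this tokenizer. The names `config`, `dsn`, `data_dir` and
# `place_info` are assumed to come from the surrounding Nominatim setup code.
#
#     tokenizer = create(dsn, data_dir)
#     tokenizer.init_new_db(config)            # for a fresh import
#     # ... or, against an already imported database:
#     tokenizer.init_from_project(config)
#     with tokenizer.name_analyzer() as analyzer:
#         token_info = analyzer.process_place(place_info)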

class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None
        self.term_normalization = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self.term_normalization = config.TERM_NORMALIZATION

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)

    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project(config)

        if self.term_normalization is None:
            return "Configuration for tokenizer 'icu' is missing."

        return None

    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.drop_table("word_frequencies")
                LOG.info("Computing word frequencies")
                cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                 SELECT unnest(name_vector) as id, count(*)
                                 FROM search_name GROUP BY id""")
                cur.execute("CREATE INDEX ON word_frequencies(id)")
                LOG.info("Update word table with recomputed frequencies")
                cur.execute("""UPDATE word
                               SET info = info || jsonb_build_object('count', count)
                               FROM word_frequencies WHERE word_id = id""")
                cur.drop_table("word_frequencies")
            conn.commit()
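
    # Illustrative effect of the UPDATE above (values purely made up):
    # `info || jsonb_build_object('count', count)` merges a 'count' key into
    # the existing JSONB, e.g. {} becomes {"count": 57}.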

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())

    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))

    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()

class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()


    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                           FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
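
    # Illustrative call (tokens and word ids are made up, not real data):
    #
    #     analyzer.get_word_token_info(['#Main Street', 'main'])
    #     # -> [('#Main Street', 'main street', 1023), ('main', 'main', 87)]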

    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
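
    # For example, normalize_postcode(' se1 9gf ') returns 'SE1 9GF'.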

    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self._search_normalized(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        # postcode no longer exists, mark word entry for deletion
                        to_delete.append(word)
                    else:
                        # new postcode, add it to the word table
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases
            is completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)
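
    # Illustrative input (hypothetical values): each phrase is a 4-tuple of
    # (label, class, type, operator), e.g.
    #
    #     analyzer.update_special_phrases(
    #         [('Restaurants', 'amenity', 'restaurant', 'in')],
    #         should_replace=True)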

    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added

    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0])
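
    # Illustrative call (hypothetical name data):
    #
    #     analyzer.add_country_names('de', {'name': 'Deutschland',
    #                                       'name:en': 'Germany'})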

    def _add_country_full_names(self, country_code, names):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Drop tokens that are already in the word table.
            cur.execute("""SELECT word_token FROM word
                           WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names, address = self.sanitizer.process_names(place)

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.data
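
    # Illustrative shape of the returned structure (which keys appear depends
    # on the place; token ids are made up):
    #
    #     {'names': '{615,5,214}', 'hnr_tokens': '{341}', 'hnr': '12;14',
    #      'street': '{78,99}', 'addr': {'city': '{23,42}'}}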

    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        for item in address:
            if item.kind == 'postcode':
                self._add_postcode(item.name)
            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(item.name)
            elif item.kind == 'street':
                token_info.add_street(self._compute_partial_tokens(item.name))
            elif item.kind == 'place':
                token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and \
                 item.kind not in ('country', 'full'):
                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)

    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens

    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            norm_name = self._normalized(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self._search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)

    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
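
    # For example, _split_housenumbers(['4;6', '8']) yields the individual
    # numbers ['4', '6', '8'] (order not guaranteed because of the set()).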
545 """ Collect token information to be sent back to the database.
547 def __init__(self, cache):
552 def _mk_array(tokens):
553 return '{%s}' % ','.join((str(s) for s in tokens))
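
    # For example, _mk_array([1, 2, 3]) returns the Postgres array literal
    # string '{1,2,3}'.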

    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        if tokens:
            self.data['street'] = self._mk_array(tokens)


    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        if tokens:
            self.data['place'] = self._mk_array(tokens)


    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {key: self._mk_array(partials)
                  for key, partials in terms if partials}

        if tokens:
            self.data['addr'] = tokens
595 """ Cache for token information to avoid repeated database queries.
597 This cache is not thread-safe and needs to be instantiated per
603 self.postcodes = set()
604 self.housenumbers = {}

    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens
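
# Illustrative cache behaviour (hypothetical housenumbers): the first call for
# a housenumber hits the database via getorcreate_hnr_id(), later calls are
# answered from the in-memory dict.
#
#     cache = _TokenCache()
#     cache.get_hnr_tokens(conn, ['12', '14'])   # queries the database
#     cache.get_hnr_tokens(conn, ['12'])         # served from cache.housenumbers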