"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
import itertools
import json
import logging
import re
from textwrap import dedent
from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)
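
# Illustrative usage sketch only; the `config` object, `project_dir` and the
# call to config.get_libpq_dsn() are assumptions about the surrounding
# Nominatim tooling, not part of this module:
#
#   tokenizer = create(config.get_libpq_dsn(), project_dir / 'tokenizer')
#   tokenizer.init_new_db(config)        # for a fresh import
#   # ...or, when the database already exists:
#   tokenizer.init_from_project(config)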


class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None
        self.term_normalization = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self.term_normalization = config.TERM_NORMALIZATION

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)

    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project(config)

        if self.term_normalization is None:
            return "Configuration for tokenizer 'icu' is missing."

        return None

    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()
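
    # Note on the update above: `info || jsonb_build_object('count', count)`
    # merges a {"count": <n>} member into the existing info jsonb. For example,
    # a word whose id appears 42 times in search_name.name_vector ends up with
    # "count": 42 in its info column (the number is illustrative only).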

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    ...

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())
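
    # Illustrative sketch of analyzer use outside a `with` block; the PlaceInfo
    # payload is a minimal assumed example, not prescribed by this module:
    #
    #   analyzer = tokenizer.name_analyzer()
    #   try:
    #       info = analyzer.process_place(PlaceInfo({'name': {'name': 'Main Street'}}))
    #   finally:
    #       analyzer.close()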

    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))

    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()
185 """ Free all resources used by the analyzer.

    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()

    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
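
    # Illustrative result shape (tokens and ids depend on the configured rules
    # and the database contents, the values below are assumed):
    #
    #   get_word_token_info(['#Main Street', 'main'])
    #   -> [('#Main Street', 'main street', 1234), ('main', 'main', 5678)]
    #
    # The id is None when the token is not yet present in the word table.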

    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()

    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self._search_normalized(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases will be
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)
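
    # Illustrative phrase format: each entry is handled as a 4-tuple of
    # (normalized phrase, class, type, operator), e.g.
    # ('pharmacy', 'amenity', 'pharmacy', '-'), where '-' stands for
    # "no operator" (the example values are assumed).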

    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added

    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0])

    def _add_country_full_names(self, country_code, names):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get the names that are already in the word table.
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names, address = self.sanitizer.process_names(place)

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.data
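
    # Illustrative shape of the returned structure (keys appear only when the
    # corresponding name or address parts exist; the values are made up):
    #
    #   {'names': '{615,1641}',
    #    'hnr_tokens': '{28}', 'hnr': '12;14',
    #    'street': '{72}', 'place': '{99}',
    #    'addr': {'city': '{1002}'}}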

    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        for item in address:
            if item.kind == 'postcode':
                self._add_postcode(item.name)
            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(item.name)
            elif item.kind == 'street':
                token_info.add_street(self._compute_partial_tokens(item.name))
            elif item.kind == 'place':
                token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and \
                 item.kind not in ('country', 'full'):
                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)

    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))
                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens

    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            list of names.
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            norm_name = self._normalized(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens
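
    # Note: the cache key is the normalized name, optionally suffixed with the
    # analyzer id, e.g. 'main street' or 'main street@en' (examples assumed).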

    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self._search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)
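
    # Illustrative effect: a postcode such as ' ab1 2cd ' is first normalized
    # to 'AB1 2CD' (normalize_postcode) and stored as a row of type 'P', with
    # its search transliteration as word_token (example values assumed).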

    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
546 """ Collect token information to be sent back to the database.
548 def __init__(self, cache):
553 def _mk_array(tokens):
554 return '{%s}' % ','.join((str(s) for s in tokens))
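
    # Example: _mk_array([615, 1641]) returns '{615,1641}', a literal that
    # PostgreSQL accepts as an array value.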

    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        if tokens:
            self.data['street'] = self._mk_array(tokens)

    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        if tokens:
            self.data['place'] = self._mk_array(tokens)

    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {key: self._mk_array(partials)
                  for key, partials in terms if partials}

        if tokens:
            self.data['addr'] = tokens
596 """ Cache for token information to avoid repeated database queries.
598 This cache is not thread-safe and needs to be instantiated per
604 self.postcodes = set()
605 self.housenumbers = {}

    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens