"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import json
import logging
import re
from textwrap import dedent
from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)

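# Illustrative sketch only (not part of the original module): the factory above
# is typically driven roughly as follows during an import run. `dsn`,
# `data_dir`, `config` and `place` are assumed placeholders here.
#
#     tokenizer = create(dsn, data_dir)
#     tokenizer.init_new_db(config)
#     with tokenizer.name_analyzer() as analyzer:
#         analyzer.process_place(place)
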

class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.naming_rules = None
        self.term_normalization = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
                                                             config='TOKENIZER_CONFIG'))
        self.naming_rules = ICUNameProcessorRules(loader=loader)
        self.term_normalization = config.TERM_NORMALIZATION

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.naming_rules = ICUNameProcessorRules(conn=conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)

    def finalize_import(self, _):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.naming_rules is None:
            return "Configuration for tokenizer 'icu' is missing."

        return None

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))

    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))

    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.naming_rules.save_rules(conn)

            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = self._count_partial_terms(conn)

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for term, cnt in words.items():
                    copystr.add('w', term, json.dumps({'count': cnt}))

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['type', 'word_token', 'info'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null and type = 'w'""")

            conn.commit()

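    # For orientation (not from the original source): each pre-computed partial
    # word ends up as one row of the word table, e.g. (values made up)
    #
    #     copystr.add('w', 'main', json.dumps({'count': 1523}))
    #
    # which becomes a row with type='w', word_token='main', info='{"count": 1523}'
    # and a word_id assigned by the final UPDATE statement above.
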
    def _count_partial_terms(self, conn):
        """ Count the partial terms from the names in the place table.
        """
        words = Counter()
        name_proc = ICUNameProcessor(self.naming_rules)

        with conn.cursor(name="words") as cur:
            cur.execute(""" SELECT v, count(*) FROM
                              (SELECT svals(name) as v FROM place)x
                            WHERE length(v) < 75 GROUP BY v""")

            for name, cnt in cur:
                terms = set()
                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                    if ' ' in word:
                        terms.update(word.split())
                for term in terms:
                    words[term] += cnt

        return words


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.name_processor = name_proc

        self._cache = _TokenCache()

200 """ Free all resources used by the analyzer.
    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
            else:
                partial_tokens[word] = self.name_processor.get_search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                             FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]

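    # Illustrative sketch (not from the original source): word ids depend on
    # the contents of the word table, so the numbers below are made up.
    #
    #     analyzer.get_word_token_info(['#Main Street', 'main'])
    #     # -> [('#Main Street', 'main street', 1234), ('main', 'main', 567)]
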
    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()

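    # Example (not from the original source): normalisation only trims
    # whitespace and uppercases, e.g.
    #
    #     LegacyICUNameAnalyzer.normalize_postcode(' ab1 2cd ')  # -> 'AB1 2CD'
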
    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self.name_processor.get_search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

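    # For orientation (not from the original source): each entry in `phrases`
    # is expected to be a 4-tuple of (phrase, class, type, operator), e.g.
    #
    #     analyzer.update_special_phrases(
    #         [('Hotels', 'tourism', 'hotel', '-')], should_replace=True)
    #
    # where the operator is 'in', 'near' or '-' for no operator.
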
    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self.name_processor.get_search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added

    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            norm_name = self.name_processor.get_search_normalized(name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # get the names that are already in the word table
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.name

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            if place.is_country():
                self.add_country_names(place.country_code, names)

        address = place.address
        if address:
            self._process_place_address(token_info, address)

        return token_info.data

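    # For orientation (not from the original source): the returned structure is
    # the dictionary built up by _TokenInfo below, i.e. roughly of the form
    #
    #     {'names': '{1,2,3}', 'hnr_tokens': '{4}', 'hnr': '12;12a',
    #      'street': '{5,6}', 'addr': {'city': '{7}'}}
    #
    # with PostgreSQL array literals holding the token ids (values made up).
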
    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        for key, value in address.items():
            if key == 'postcode':
                self._add_postcode(value)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(self._compute_partial_tokens(value))
            elif key == 'place':
                token_info.add_place(self._compute_partial_tokens(value))
            elif not key.startswith('_') and \
                 key not in ('country', 'full'):
                addr_terms.append((key, self._compute_partial_tokens(value)))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)

    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self.name_processor.get_search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens

    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full names to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
            if name:
                full_names.add(name)

                brace_idx = name.find('(')
                if brace_idx >= 0:
                    full_names.add(name[:brace_idx].strip())

        return full_names

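    # For orientation (not from the original source): names are split on ';'
    # and ',' and a bracketed suffix additionally yields the unbracketed form:
    #
    #     _compute_full_names({'name': 'Victoria Hospital (Annex);The Vic'})
    #     # -> {'Victoria Hospital (Annex)', 'Victoria Hospital', 'The Vic'}
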
    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.name_processor.get_search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                            (SELECT * FROM word
                                             WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)

    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs

547 """ Collect token information to be sent back to the database.
549 def __init__(self, cache):
554 def _mk_array(tokens):
555 return '{%s}' % ','.join((str(s) for s in tokens))
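    # Example (not from the original source): _mk_array turns an iterable of
    # token ids into a PostgreSQL array literal, e.g.
    #
    #     _TokenInfo._mk_array([23, 42])  # -> '{23,42}'
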
    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        if tokens:
            self.data['street'] = self._mk_array(tokens)

    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        if tokens:
            self.data['place'] = self._mk_array(tokens)

    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {key: self._mk_array(partials)
                  for key, partials in terms if partials}

        if tokens:
            self.data['addr'] = tokens

597 """ Cache for token information to avoid repeated database queries.
599 This cache is not thread-safe and needs to be instantiated per
605 self.postcodes = set()
606 self.housenumbers = {}
    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []

        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens