"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import json
import logging
import re
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.naming_rules = None
        self.term_normalization = None
        self.max_word_frequency = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
                                                             config='TOKENIZER_CONFIG'))
        self.naming_rules = ICUNameProcessorRules(loader=loader)
        self.term_normalization = config.TERM_NORMALIZATION
        self.max_word_frequency = config.MAX_WORD_FREQUENCY

        self._install_php(config.lib_dir.php)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.naming_rules = ICUNameProcessorRules(conn=conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)

    def finalize_import(self, _):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql',
                              max_word_freq=max_word_freq)

    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.naming_rules is None:
            return "Configuration for tokenizer 'icu' is missing."

        return None

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    ...

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))

    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))

    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.naming_rules.save_rules(conn)

            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = self._count_partial_terms(conn)

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for term, cnt in words.items():
                    copystr.add('w', term, json.dumps({'count': cnt}))

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['type', 'word_token', 'info'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null and type = 'w'""")

            conn.commit()

    def _count_partial_terms(self, conn):
        """ Count the partial terms from the names in the place table.
        """
        words = Counter()
        name_proc = ICUNameProcessor(self.naming_rules)

        with conn.cursor(name="words") as cur:
            cur.execute(""" SELECT v, count(*) FROM
                              (SELECT svals(name) as v FROM place)x
                            WHERE length(v) < 75 GROUP BY v""")

            for name, cnt in cur:
                terms = set()
                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                    if ' ' in word:
                        terms.update(word.split())
                for term in terms:
                    words[term] += cnt

        return words


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.name_processor = name_proc

        self._cache = _TokenCache()

    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is taken to be a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
            else:
                partial_tokens[word] = self.name_processor.get_search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                             FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
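
    # Illustrative usage (not part of the original module), given an analyzer on
    # an already imported database; words prefixed with '#' are looked up as full
    # names, all others as partial names:
    #
    #   info = analyzer.get_word_token_info(['#Main Street', 'main'])
    #   # -> [('#Main Street', <search-normalized token>, <word id or None>),
    #   #     ('main', <search-normalized token>, <word id or None>)]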

    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
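
    # For illustration (not from the original source): normalization only strips
    # surrounding whitespace and upper-cases, e.g. ' se1 9gf ' becomes 'SE1 9GF'.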

    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self.name_processor.get_search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)
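
    # For illustration (not part of the original module): each entry in `phrases`
    # is a 4-tuple of (phrase, class, type, operator), e.g. the hypothetical
    # ('bar', 'amenity', 'bar', '-') where '-' means "no operator". The normalised
    # phrases are diffed against the existing type 'S' rows of the word table.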

    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self.name_processor.get_search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added

    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            norm_name = self.name_processor.get_search_normalized(name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Fetch the names already stored for this country.
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.
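
    # Illustrative call (not part of the original module), assuming OSM-style name
    # tags: add_country_names('de', {'name': 'Deutschland', 'name:en': 'Germany'})
    # stores the search-normalized token of each spelling as a type 'C' row keyed
    # by the country code.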

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), names)

        address = place.get('address')
        if address:
            self._process_place_address(token_info, address)

        return token_info.data
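
    # Shape of the returned structure, for illustration (derived from _TokenInfo
    # below, not a guaranteed schema): a dict that may contain the keys 'names',
    # 'hnr_tokens', 'hnr', 'street', 'place_search', 'place_match' and 'addr',
    # with token sets encoded as PostgreSQL array literals such as '{1,2,3}'.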

    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        for key, value in address.items():
            if key == 'postcode':
                self._add_postcode(value)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(*self._compute_name_tokens({'name': value}))
            elif key == 'place':
                token_info.add_place(*self._compute_name_tokens({'name': value}))
            elif not key.startswith('_') and \
                 key not in ('country', 'full'):
                addr_terms.append((key, *self._compute_name_tokens({'name': value})))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)

    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full names to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
            if name:
                full_names.add(name)

                brace_idx = name.find('(')
                if brace_idx >= 0:
                    full_names.add(name[:brace_idx].strip())

        return full_names
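
    # For illustration (not from the original source): a name value such as
    # 'Dom St. Peter;Regensburger Dom (Dom)' is first split on ';' and ',' into
    # 'Dom St. Peter' and 'Regensburger Dom (Dom)'; for the bracketed entry the
    # variant without the bracket part, 'Regensburger Dom', is added as well.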

    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.name_processor.get_search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)

    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
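
    # For illustration (not from the original source): ['4;5a', '7'] is split on
    # ';' and ',' into '4', '5a' and '7'; duplicates are removed via a set, so the
    # order of the returned list is not guaranteed.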


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}

    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))
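
    # For illustration (not from the original source): _mk_array([1, 2, 3])
    # returns the PostgreSQL array literal '{1,2,3}'.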

    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, fulls, _):
        """ Add addr:street match terms.
        """
        if fulls:
            self.data['street'] = self._mk_array(fulls)

    def add_place(self, fulls, partials):
        """ Add addr:place search and match terms.
        """
        if fulls:
            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
            self.data['place_match'] = self._mk_array(fulls)

    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, fulls, partials in terms:
            if fulls:
                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
                               self._mk_array(fulls)]

        if tokens:
            self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.postcodes = set()
        self.housenumbers = {}

    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []

        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens
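

# Illustrative end-to-end sketch (not part of the original module), assuming a
# configured project directory and an imported database; `dsn` and `project_dir`
# are placeholders:
#
#   tokenizer = create(dsn, project_dir)
#   tokenizer.init_from_project()
#   with tokenizer.name_analyzer() as analyzer:
#       token_info = analyzer.process_place({'name': {'name': 'Main Street'},
#                                            'address': {'housenumber': '4;5'}})
#   # token_info is the JSON-serialisable dict handed to the database via the
#   # token_info field.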