2 Tokenizer implementing normalisation as used before Nominatim 4 but using
3 libICU instead of the PostgreSQL module.
5 from collections import Counter
10 from textwrap import dedent
12 from nominatim.db.connection import connect
13 from nominatim.db.properties import set_property, get_property
14 from nominatim.db.utils import CopyBuffer
15 from nominatim.db.sql_preprocessor import SQLPreprocessor
16 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
17 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
18 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
20 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
22 LOG = logging.getLogger()
24 def create(dsn, data_dir):
25 """ Create a new instance of the tokenizer provided by this module.
27 return LegacyICUTokenizer(dsn, data_dir)
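# Illustrative usage only, not part of the original module: the factory is
# normally invoked by Nominatim's tokenizer loading code, roughly like this,
# where the dsn string and project path are hypothetical:
#
#     tokenizer = create('dbname=nominatim', Path('project-dir/tokenizer'))
#     tokenizer.init_from_project()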
30 class LegacyICUTokenizer(AbstractTokenizer):
31 """ This tokenizer uses libICU to covert names and queries to ASCII.
32 Otherwise it uses the same algorithms and data structures as the
33 normalization routines in Nominatim 3.
36 def __init__(self, dsn, data_dir):
38 self.data_dir = data_dir
39 self.naming_rules = None
40 self.term_normalization = None
43 def init_new_db(self, config, init_db=True):
44 """ Set up a new tokenizer for the database.
46 This copies all necessary data in the project directory to make
47 sure the tokenizer remains stable even over updates.
49 loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
50 config='TOKENIZER_CONFIG'))
51 self.naming_rules = ICUNameProcessorRules(loader=loader)
52 self.term_normalization = config.TERM_NORMALIZATION
54 self._install_php(config.lib_dir.php)
58 self.update_sql_functions(config)
59 self._init_db_tables(config)
62 def init_from_project(self):
63 """ Initialise the tokenizer from the project directory.
65 with connect(self.dsn) as conn:
66 self.naming_rules = ICUNameProcessorRules(conn=conn)
67 self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
70 def finalize_import(self, _):
71 """ Do any required postprocessing to make the tokenizer data ready
76 def update_sql_functions(self, config):
77 """ Reimport the SQL functions for this tokenizer.
79 with connect(self.dsn) as conn:
80 sqlp = SQLPreprocessor(conn, config)
81 sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
84 def check_database(self):
85 """ Check that the tokenizer is set up correctly.
87 self.init_from_project()
89 if self.naming_rules is None:
90 return "Configuration for tokenizer 'icu' are missing."
95 def name_analyzer(self):
96 """ Create a new analyzer for tokenizing names and queries
97 using this tokenizer. Analyzers are context managers and should
101 with tokenizer.name_analyzer() as analyzer:
105 When used outside the with construct, the caller must make sure to
106 call the close() function before destroying the analyzer.
108 Analyzers are not thread-safe. You need to instantiate one per thread.
110 return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
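# Sketch of the call pattern described in the docstring above; `tokenizer` and
# the `place` dict (with 'name'/'address' entries) are hypothetical inputs:
#
#     with tokenizer.name_analyzer() as analyzer:
#         token_info = analyzer.process_place(place)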
113 def _install_php(self, phpdir):
114 """ Install the php script for the tokenizer.
116 php_file = self.data_dir / "tokenizer.php"
117 php_file.write_text(dedent(f"""\
119 @define('CONST_Max_Word_Frequency', 10000000);
120 @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
121 @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
122 require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
125 def _save_config(self):
126 """ Save the configuration that needs to remain stable for the given
127 database as database properties.
129 with connect(self.dsn) as conn:
130 self.naming_rules.save_rules(conn)
132 set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
135 def _init_db_tables(self, config):
136 """ Set up the word table and fill it with pre-computed word
139 with connect(self.dsn) as conn:
140 sqlp = SQLPreprocessor(conn, config)
141 sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
144 LOG.warning("Precomputing word tokens")
146 # get partial words and their frequencies
147 words = self._count_partial_terms(conn)
149 # copy them back into the word table
150 with CopyBuffer() as copystr:
151 for term, cnt in words.items():
152 copystr.add('w', term, json.dumps({'count': cnt}))
154 with conn.cursor() as cur:
155 copystr.copy_out(cur, 'word',
156 columns=['type', 'word_token', 'info'])
157 cur.execute("""UPDATE word SET word_id = nextval('seq_word')
158 WHERE word_id is null and type = 'w'""")
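# At this point every precomputed partial term is stored in the word table as a
# row of the form (word_id, type 'w', word_token, info {'count': n}), with the
# word_id assigned from the seq_word sequence; the exact values are data-dependent.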
162 def _count_partial_terms(self, conn):
163 """ Count the partial terms from the names in the place table.
166 name_proc = ICUNameProcessor(self.naming_rules)
168 with conn.cursor(name="words") as cur:
169 cur.execute(""" SELECT v, count(*) FROM
170 (SELECT svals(name) as v FROM place)x
171 WHERE length(v) < 75 GROUP BY v""")
173 for name, cnt in cur:
175 for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
177 terms.update(word.split())
184 class LegacyICUNameAnalyzer(AbstractAnalyzer):
185 """ The legacy analyzer uses the ICU library for splitting names.
187 Each instance opens a connection to the database to request the
191 def __init__(self, dsn, name_proc):
192 self.conn = connect(dsn).connection
193 self.conn.autocommit = True
194 self.name_processor = name_proc
196 self._cache = _TokenCache()
200 """ Free all resources used by the analyzer.
207 def get_word_token_info(self, words):
208 """ Return token information for the given list of words.
209 If a word starts with # it is assumed to be a full name,
210 otherwise it is assumed to be a partial name.
212 The function returns a list of tuples with
213 (original word, word token, word id).
215 The function is used for testing and debugging only
216 and is not guaranteed to be efficient.
221 if word.startswith('#'):
222 full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
224 partial_tokens[word] = self.name_processor.get_search_normalized(word)
226 with self.conn.cursor() as cur:
227 cur.execute("""SELECT word_token, word_id
228 FROM word WHERE word_token = ANY(%s) and type = 'W'
229 """, (list(full_tokens.values()),))
230 full_ids = {r[0]: r[1] for r in cur}
231 cur.execute("""SELECT word_token, word_id
232 FROM word WHERE word_token = ANY(%s) and type = 'w'""",
233 (list(partial_tokens.values()),))
234 part_ids = {r[0]: r[1] for r in cur}
236 return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
237 + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
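# Illustrative call with made-up tokens and ids: looking up one full and one
# partial term might return something along the lines of
#
#     analyzer.get_word_token_info(['#Main Street', 'main'])
#     # -> [('#Main Street', 'main street', 123), ('main', 'main', 456)]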
241 def normalize_postcode(postcode):
242 """ Convert the postcode to a standardized form.
244 This function must yield exactly the same result as the SQL function
245 'token_normalized_postcode()'.
247 return postcode.strip().upper()
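# Example: normalize_postcode(' ab1 2cd ') yields 'AB1 2CD' (whitespace stripped,
# letters uppercased), mirroring what token_normalized_postcode() must produce.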
250 def _make_standard_hnr(self, hnr):
251 """ Create a normalised version of a housenumber.
253 This function takes minor shortcuts on transliteration.
255 return self.name_processor.get_search_normalized(hnr)
257 def update_postcodes_from_db(self):
258 """ Update postcode tokens in the word table from the location_postcode
262 with self.conn.cursor() as cur:
263 # This finds us the rows in location_postcode and word that are
264 # missing in the other table.
265 cur.execute("""SELECT * FROM
266 (SELECT pc, word FROM
267 (SELECT distinct(postcode) as pc FROM location_postcode) p
269 (SELECT word FROM word WHERE type = 'P') w
271 WHERE pc is null or word is null""")
273 with CopyBuffer() as copystr:
274 for postcode, word in cur:
276 to_delete.append(word)
278 copystr.add(self.name_processor.get_search_normalized(postcode),
282 cur.execute("""DELETE FROM WORD
283 WHERE type ='P' and word = any(%s)
286 copystr.copy_out(cur, 'word',
287 columns=['word_token', 'type', 'word'])
290 def update_special_phrases(self, phrases, should_replace):
291 """ Replace the search index for special phrases with the new phrases.
292 If `should_replace` is True, then the previous set of phrases will be
293 completely replaced. Otherwise the phrases are added to the
294 already existing ones.
296 norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
299 with self.conn.cursor() as cur:
300 # Get the old phrases.
301 existing_phrases = set()
302 cur.execute("SELECT word, info FROM word WHERE type = 'S'")
303 for word, info in cur:
304 existing_phrases.add((word, info['class'], info['type'],
305 info.get('op') or '-'))
307 added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
309 deleted = self._remove_special_phrases(cur, norm_phrases,
314 LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
315 len(norm_phrases), added, deleted)
318 def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
319 """ Add all phrases to the database that are not yet there.
321 to_add = new_phrases - existing_phrases
324 with CopyBuffer() as copystr:
325 for word, cls, typ, oper in to_add:
326 term = self.name_processor.get_search_normalized(word)
328 copystr.add(term, 'S', word,
329 json.dumps({'class': cls, 'type': typ,
330 'op': oper if oper in ('in', 'near') else None}))
333 copystr.copy_out(cursor, 'word',
334 columns=['word_token', 'type', 'word', 'info'])
340 def _remove_special_phrases(cursor, new_phrases, existing_phrases):
341 """ Remove all phrases from the databse that are no longer in the
344 to_delete = existing_phrases - new_phrases
347 cursor.execute_values(
348 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
349 WHERE type = 'S' and word = name
350 and info->>'class' = in_class and info->>'type' = in_type
351 and ((op = '-' and info->>'op' is null) or op = info->>'op')
354 return len(to_delete)
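# For reference: both `new_phrases` and `existing_phrases` are sets of 4-tuples
# (normalised word, class, type, operator), e.g. the hypothetical entry
# ('restaurant', 'amenity', 'restaurant', '-'), where '-' marks a missing
# operator as in the queries above.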
357 def add_country_names(self, country_code, names):
358 """ Add names for the given country to the search index.
361 for name in self._compute_full_names(names):
362 norm_name = self.name_processor.get_search_normalized(name)
364 word_tokens.add(norm_name)
366 with self.conn.cursor() as cur:
368 cur.execute("""SELECT word_token FROM word
369 WHERE type = 'C' and word = %s""",
371 word_tokens.difference_update((t[0] for t in cur))
373 # Only add those names that are not yet in the list.
375 cur.execute("""INSERT INTO word (word_token, type, word)
376 (SELECT token, 'C', %s
377 FROM unnest(%s) as token)
378 """, (country_code, list(word_tokens)))
380 # No names are deleted at the moment.
381 # If deletion is made possible, then the static names from the
382 # initial 'country_name' table should be kept.
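# Illustrative call with hypothetical data, following the same dict format as a
# place's 'name' tags:
#
#     analyzer.add_country_names('de', {'name': 'Deutschland', 'name:en': 'Germany'})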
385 def process_place(self, place):
386 """ Determine tokenizer information about the given place.
388 Returns a JSON-serialisable structure that will be handed into
389 the database via the token_info field.
391 token_info = _TokenInfo(self._cache)
393 names = place.get('name')
396 fulls, partials = self._compute_name_tokens(names)
398 token_info.add_names(fulls, partials)
400 country_feature = place.get('country_feature')
401 if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
402 self.add_country_names(country_feature.lower(), names)
404 address = place.get('address')
406 self._process_place_address(token_info, address)
408 return token_info.data
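# The returned dict is assembled by _TokenInfo below and, depending on the place,
# may contain keys such as 'names', 'hnr_tokens', 'hnr', 'street', 'place' and
# 'addr'. A purely illustrative result:
#
#     {'names': '{1,2,3}', 'hnr': '4;4a', 'hnr_tokens': '{5,6}',
#      'street': '{7}', 'addr': {'city': '{8,9}'}}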
411 def _process_place_address(self, token_info, address):
414 for key, value in address.items():
415 if key == 'postcode':
416 self._add_postcode(value)
417 elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
419 elif key == 'street':
420 token_info.add_street(self._compute_partial_tokens(value))
422 token_info.add_place(self._compute_partial_tokens(value))
423 elif not key.startswith('_') and \
424 key not in ('country', 'full'):
425 addr_terms.append((key, self._compute_partial_tokens(value)))
428 hnrs = self._split_housenumbers(hnrs)
429 token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
432 token_info.add_address_terms(addr_terms)
434 def _compute_partial_tokens(self, name):
435 """ Normalize the given term, split it into partial words and return
436 the token list for them.
438 norm_name = self.name_processor.get_search_normalized(name)
442 for partial in norm_name.split():
443 token = self._cache.partials.get(partial)
447 need_lookup.append(partial)
450 with self.conn.cursor() as cur:
451 cur.execute("""SELECT word, getorcreate_partial_word(word)
452 FROM unnest(%s) word""",
455 for partial, token in cur:
457 self._cache.partials[partial] = token
461 def _compute_name_tokens(self, names):
462 """ Computes the full name and partial name tokens for the given
465 full_names = self._compute_full_names(names)
467 partial_tokens = set()
469 for name in full_names:
470 norm_name = self.name_processor.get_normalized(name)
471 full, part = self._cache.names.get(norm_name, (None, None))
473 variants = self.name_processor.get_variants_ascii(norm_name)
477 with self.conn.cursor() as cur:
478 cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
479 (norm_name, variants))
480 full, part = cur.fetchone()
482 self._cache.names[norm_name] = (full, part)
484 full_tokens.add(full)
485 partial_tokens.update(part)
487 return full_tokens, partial_tokens
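# Note: getorcreate_full_word() hands back a single token for the full name and
# an iterable of tokens for its partial words, so both return values are sets of
# word ids ready to be passed to _TokenInfo.add_names().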
491 def _compute_full_names(names):
492 """ Return the set of all full name word ids to be used with the
493 given dictionary of names.
496 for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
500 brace_idx = name.find('(')
502 full_names.add(name[:brace_idx].strip())
507 def _add_postcode(self, postcode):
508 """ Make sure the normalized postcode is present in the word table.
510 if re.search(r'[:,;]', postcode) is None:
511 postcode = self.normalize_postcode(postcode)
513 if postcode not in self._cache.postcodes:
514 term = self.name_processor.get_search_normalized(postcode)
518 with self.conn.cursor() as cur:
519 # no word_id needed for postcodes
520 cur.execute("""INSERT INTO word (word_token, type, word)
521 (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
524 WHERE type = 'P' and word = pc))
525 """, (term, postcode))
526 self._cache.postcodes.add(postcode)
530 def _split_housenumbers(hnrs):
531 if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
532 # split numbers if necessary
535 simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
537 if len(simple_list) > 1:
538 hnrs = list(set(simple_list))
548 """ Collect token information to be sent back to the database.
550 def __init__(self, cache):
555 def _mk_array(tokens):
556 return '{%s}' % ','.join((str(s) for s in tokens))
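# Example: _mk_array([1, 2, 3]) returns '{1,2,3}', i.e. a string in PostgreSQL
# array literal syntax.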
559 def add_names(self, fulls, partials):
560 """ Adds token information for the normalised names.
562 self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
565 def add_housenumbers(self, conn, hnrs):
566 """ Extract housenumber information from a list of normalised
569 self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
570 self.data['hnr'] = ';'.join(hnrs)
573 def add_street(self, tokens):
574 """ Add addr:street match terms.
577 self.data['street'] = self._mk_array(tokens)
580 def add_place(self, tokens):
581 """ Add addr:place search and match terms.
584 self.data['place'] = self._mk_array(tokens)
587 def add_address_terms(self, terms):
588 """ Add additional address terms.
590 tokens = {key: self._mk_array(partials)
591 for key, partials in terms if partials}
594 self.data['addr'] = tokens
598 """ Cache for token information to avoid repeated database queries.
600 This cache is not thread-safe and needs to be instantiated per
606 self.postcodes = set()
607 self.housenumbers = {}
610 def get_hnr_tokens(self, conn, terms):
611 """ Get token ids for a list of housenumbers, looking them up in the
612 database if necessary. `terms` is an iterable of normalized
619 token = self.housenumbers.get(term)
626 with conn.cursor() as cur:
627 cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
629 for term, tid in cur:
630 self.housenumbers[term] = tid
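# Sketch with hypothetical values: get_hnr_tokens(conn, ['12', '12a']) returns
# the word ids for both housenumbers, creating any missing entries through the
# getorcreate_hnr_id() SQL function and caching them for later calls.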