"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import logging
import re
from textwrap import dedent
from pathlib import Path

import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules

DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.naming_rules = None
        self.term_normalization = None
        self.max_word_frequency = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'

        loader = ICURuleLoader(cfgfile)
        self.naming_rules = ICUNameProcessorRules(loader=loader)
        self.term_normalization = config.TERM_NORMALIZATION
        self.max_word_frequency = config.MAX_WORD_FREQUENCY

        self._install_php(config.lib_dir.php)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.naming_rules = ICUNameProcessorRules(conn=conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)


    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.naming_rules is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

        return None


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))


    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
            @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
            @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
            require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
            """.format(self, phpdir))) # pylint: disable=missing-format-attribute


    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.naming_rules.save_rules(conn)

            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = Counter()
            name_proc = ICUNameProcessor(self.naming_rules)
            with conn.cursor(name="words") as cur:
                cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")

                for name, cnt in cur:
                    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                        for term in word.split():
                            words[term] += cnt

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for args in words.items():
                    copystr.add(*args)

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['word_token', 'search_name_count'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null""")

            conn.commit()


class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.name_processor = name_proc

        self._cache = _TokenCache()


    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.

            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        tokens = {}
        for word in words:
            if word.startswith('#'):
                tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
            else:
                tokens[word] = self.name_processor.get_search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = t.term
                                 and class is null and country_code is null""",
                        (list(tokens.values()), ))
            ids = {r[0]: r[1] for r in cur}

        return [(k, v, ids.get(v, None)) for k, v in tokens.items()]
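
    # Illustrative sketch of the return format documented above (tokens and
    # ids are hypothetical, not actual database content):
    #
    #   analyzer.get_word_token_info(['#Main Street', 'main'])
    #   -> [('#Main Street', ' main street', 123), ('main', 'main', None)]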

    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
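
    # Normalization only trims whitespace and uppercases, so a hypothetical
    # input behaves like this:
    #
    #   normalize_postcode('  ec1a 1bb ')   # -> 'EC1A 1BB'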

    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(postcode,
                                    ' ' + self.name_processor.get_search_normalized(postcode),
                                    'place', 'postcode', 0)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE class ='place' and type = 'postcode'
                                         and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word', 'word_token', 'class', 'type',
                                          'search_name_count'])


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)
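
    # As consumed above, each phrase is a 4-tuple of (label, class, type,
    # operator). A hypothetical call could look like this (values are
    # illustrative only):
    #
    #   analyzer.update_special_phrases(
    #       [('Restaurants', 'amenity', 'restaurant', 'in')], should_replace=True)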


    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self.name_processor.get_search_normalized(word)
                if term:
                    copystr.add(word, term, cls, typ,
                                oper if oper in ('in', 'near') else None, 0)
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word', 'word_token', 'class', 'type',
                                      'operator', 'search_name_count'])

        return added


    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            psycopg2.extras.execute_values(
                cursor,
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE word = name and class = in_class and type = in_type
                          and ((op = '-' and operator is null) or op = operator)""",
                to_delete)

        return len(to_delete)


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            if name:
                word_tokens.add(' ' + self.name_processor.get_search_normalized(name))

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            if word_tokens:
                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                                 search_name_count)
                               (SELECT nextval('seq_word'), token, '{}', 0
                                FROM unnest(%s) as token)
                            """.format(country_code), (list(word_tokens),))
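
    # Hypothetical usage sketch: names are passed as a tag dictionary, the same
    # structure that process_place() reads from place['name'] (values are
    # illustrative only):
    #
    #   analyzer.add_country_names('de', {'name': 'Deutschland', 'name:en': 'Germany'})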


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), names)

        address = place.get('address')

        if address:
            hnrs = []
            addr_terms = []
            for key, value in address.items():
                if key == 'postcode':
                    self._add_postcode(value)
                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                    hnrs.append(value)
                elif key == 'street':
                    token_info.add_street(*self._compute_name_tokens({'name': value}))
                elif key == 'place':
                    token_info.add_place(*self._compute_name_tokens({'name': value}))
                elif not key.startswith('_') and \
                     key not in ('country', 'full'):
                    addr_terms.append((key, *self._compute_name_tokens({'name': value})))

            if hnrs:
                hnrs = self._split_housenumbers(hnrs)
                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

            if addr_terms:
                token_info.add_address_terms(addr_terms)

        return token_info.data
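
    # Sketch of the structure collected in token_info.data by the _TokenInfo
    # helpers below; keys only appear when the corresponding data is present
    # and the values shown here are purely illustrative:
    #
    #   {'names': '{1,2,3}', 'hnr_tokens': '{42}', 'hnr': '12;14',
    #    'street': '{5}', 'place_search': '{6,7}', 'place_match': '{6}',
    #    'addr': {'city': ['{8,9}', '{8}']}}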


    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)
                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full names to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
            full_names.add(name.strip())

            brace_idx = name.find('(')
            if brace_idx >= 0:
                full_names.add(name[:brace_idx].strip())

        return full_names
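
    # Illustrative example (hypothetical names): parts separated by ';' or ','
    # and bracketed suffixes produce additional full names:
    #
    #   _compute_full_names({'name': 'Hauptstraße;Main Street (old)'})
    #   -> {'Hauptstraße', 'Main Street (old)', 'Main Street'}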


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.name_processor.get_search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word, word_token, class, type,
                                                     search_name_count)
                                   (SELECT pc, %s, 'place', 'postcode', 0
                                    FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE word = pc and class='place' and type='postcode'))
                                """, (' ' + term, postcode))
                self._cache.postcodes.add(postcode)

    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
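
    # Illustrative example: entries containing separators are split up, so a
    # hypothetical input of ['3;5', '7'] yields ['3', '5', '7'] (order not
    # guaranteed because a set is used for deduplication).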


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}

    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))
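
    # _mk_array() renders tokens as a PostgreSQL array literal, e.g. tokens
    # [1, 2, 3] become the string '{1,2,3}'.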


    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)


    def add_street(self, fulls, _):
        """ Add addr:street match terms.
        """
        if fulls:
            self.data['street'] = self._mk_array(fulls)


    def add_place(self, fulls, partials):
        """ Add addr:place search and match terms.
        """
        if fulls:
            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
            self.data['place_match'] = self._mk_array(fulls)


    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, fulls, partials in terms:
            if fulls:
                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
                               self._mk_array(fulls)]

        if tokens:
            self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.postcodes = set()
        self.housenumbers = {}


    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary.
        """
        tokens = []
        askdb = []

        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens