"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import logging
import re
from textwrap import dedent
from pathlib import Path

import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules

DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)
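
# Rough usage sketch; the dsn and project path below are made-up examples:
#
#   tokenizer = create('dbname=nominatim', Path('/srv/project/tokenizer'))
#   tokenizer.init_new_db(config)       # on a fresh import
#   tokenizer.init_from_project()       # when reopening an existing database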


class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.naming_rules = None
        self.term_normalization = None
        self.max_word_frequency = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'

        loader = ICURuleLoader(cfgfile)
        self.naming_rules = ICUNameProcessorRules(loader=loader)
        self.term_normalization = config.TERM_NORMALIZATION
        self.max_word_frequency = config.MAX_WORD_FREQUENCY

        self._install_php(config.lib_dir.php)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.naming_rules = ICUNameProcessorRules(conn=conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)

    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)

    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.naming_rules is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

        return None

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.add_country_names(...)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
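
    # Sketch of how an analyzer is typically used during indexing; the place
    # dictionary below is invented for illustration:
    #
    #   with tokenizer.name_analyzer() as analyzer:
    #       info = analyzer.process_place({'name': {'name': 'Main Street'},
    #                                      'address': {'housenumber': '1'}})
    #       # 'info' is what ends up in the token_info column of the database.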

    # pylint: disable=missing-format-attribute
    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
            @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
            @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
            require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
            """.format(self, phpdir)))

    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.naming_rules.save_rules(conn)

            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = Counter()
            name_proc = ICUNameProcessor(self.naming_rules)
            with conn.cursor(name="words") as cur:
                cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")

                for name, cnt in cur:
                    terms = set()
                    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                        terms.update(word.split())
                    for term in terms:
                        words[term] += cnt

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for args in words.items():
                    copystr.add(*args)

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['word_token', 'search_name_count'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null""")

            conn.commit()
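
    # At this point the word table contains one row per partial term with only
    # word_token and search_name_count filled in; the final UPDATE assigns
    # word_id values in bulk from the seq_word sequence.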


class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.name_processor = name_proc

        self._cache = _TokenCache()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        tokens = {}
        for word in words:
            if word.startswith('#'):
                tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
            else:
                tokens[word] = self.name_processor.get_search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = t.term
                                 and class is null and country_code is null""",
                        (list(tokens.values()), ))
            ids = {r[0]: r[1] for r in cur}

        return [(k, v, ids.get(v, None)) for k, v in tokens.items()]
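
    # Illustrative call; the token strings depend on the configured ICU rules
    # and the word ids here are invented:
    #
    #   analyzer.get_word_token_info(['#Main Street', 'main'])
    #   -> [('#Main Street', ' main street', 123), ('main', 'main', 456)]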

    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
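
    # Example of the normalization above (follows directly from strip().upper()):
    #   normalize_postcode(' ec1a 1bb ')  ->  'EC1A 1BB'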

    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        # postcode was removed from location_postcode
                        to_delete.append(word)
                    else:
                        # postcode is missing from the word table
                        copystr.add(
                            postcode,
                            ' ' + self.name_processor.get_search_normalized(postcode),
                            'place', 'postcode', 0)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE class ='place' and type = 'postcode'
                                         and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word', 'word_token', 'class', 'type',
                                          'search_name_count'])

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)
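
    # Expected shape of the 'phrases' argument; the example values are made up:
    # an iterable of (label, class, type, operator) tuples, e.g.
    #   [('Restaurant', 'amenity', 'restaurant', '-'),
    #    ('Hotel in', 'tourism', 'hotel', 'in')]
    # Operators other than 'in' and 'near' are stored as NULL (see below).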

    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self.name_processor.get_search_normalized(word)
                if term:
                    copystr.add(word, ' ' + term, cls, typ,
                                oper if oper in ('in', 'near') else None, 0)
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word', 'word_token', 'class', 'type',
                                      'operator', 'search_name_count'])

        return added
352 def _remove_special_phrases(cursor, new_phrases, existing_phrases):
353 """ Remove all phrases from the databse that are no longer in the
356 to_delete = existing_phrases - new_phrases
359 psycopg2.extras.execute_values(
361 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
362 WHERE word = name and class = in_class and type = in_type
363 and ((op = '-' and operator is null) or op = operator)""",
366 return len(to_delete)

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            word_tokens.add(' ' + self.name_processor.get_search_normalized(name))

        with self.conn.cursor() as cur:
            # skip tokens that are already in the word table
            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            if word_tokens:
                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                                 search_name_count)
                               (SELECT nextval('seq_word'), token, '{}', 0
                                FROM unnest(%s) as token)
                            """.format(country_code), (list(word_tokens),))
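
    # Illustrative effect (the exact normalized forms depend on the ICU rules):
    #   add_country_names('de', {'name': 'Deutschland', 'name:en': 'Germany'})
    # adds rows with country_code='de' and word tokens roughly like
    # ' deutschland' and ' germany', skipping tokens that already exist.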

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), names)

        address = place.get('address')

        if address:
            hnrs = []
            addr_terms = []
            for key, value in address.items():
                if key == 'postcode':
                    self._add_postcode(value)
                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                    hnrs.append(value)
                elif key == 'street':
                    token_info.add_street(*self._compute_name_tokens({'name': value}))
                elif key == 'place':
                    token_info.add_place(*self._compute_name_tokens({'name': value}))
                elif not key.startswith('_') and \
                     key not in ('country', 'full'):
                    addr_terms.append((key, *self._compute_name_tokens({'name': value})))

            if hnrs:
                hnrs = self._split_housenumbers(hnrs)
                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

            if addr_terms:
                token_info.add_address_terms(addr_terms)

        return token_info.data
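
    # Shape of the returned structure; the token ids are invented. Depending on
    # the tags of the place, the dict contains a subset of
    #   {'names': '{1,2,3}',
    #    'hnr_tokens': '{4}', 'hnr': '1;1a',
    #    'street': '{5}',
    #    'place_search': '{5,6}', 'place_match': '{5}',
    #    'addr': {'city': ['{7,8}', '{7}']}}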

    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)
                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full names to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
            if name:
                full_names.add(name)

                brace_idx = name.find('(')
                if brace_idx >= 0:
                    full_names.add(name[:brace_idx].strip())

        return full_names
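
    # Example of the name splitting above:
    #   _compute_full_names({'name': 'Nordsee (Restaurant);Fischbude'})
    #   -> {'Nordsee (Restaurant)', 'Nordsee', 'Fischbude'}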

    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.name_processor.get_search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word, word_token, class, type,
                                                     search_name_count)
                                   (SELECT pc, %s, 'place', 'postcode', 0
                                    FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE word = pc and class='place' and type='postcode'))
                                """, (' ' + term, postcode))
                self._cache.postcodes.add(postcode)

    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
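
    # Behaviour of the splitting above: multi-value housenumber tags are broken
    # apart and de-duplicated, e.g.
    #   _split_housenumbers(['1;2;2'])  ->  ['1', '2']   (order not guaranteed)
    #   _split_housenumbers(['12'])     ->  ['12']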


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}

    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))

    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, fulls, _):
        """ Add addr:street match terms.
        """
        if fulls:
            self.data['street'] = self._mk_array(fulls)

    def add_place(self, fulls, partials):
        """ Add addr:place search and match terms.
        """
        if fulls:
            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
            self.data['place_match'] = self._mk_array(fulls)

    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {}
        for key, fulls, partials in terms:
            if fulls:
                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
                               self._mk_array(fulls)]

        if tokens:
            self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.postcodes = set()
        self.housenumbers = {}

    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary.
        """
        tokens = []
        askdb = []
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens