2 Tokenizer implementing normalisation as used before Nominatim 4 but using
3 libICU instead of the PostgreSQL module.
5 from collections import Counter
9 from textwrap import dedent
10 from pathlib import Path
12 import psycopg2.extras
14 from nominatim.db.connection import connect
15 from nominatim.db.properties import set_property, get_property
16 from nominatim.db.utils import CopyBuffer
17 from nominatim.db.sql_preprocessor import SQLPreprocessor
18 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
19 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
# Keys under which tokenizer settings are persisted in the database
# property table (written by _save_config, read by init_from_project).
21 DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
22 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
# Module logger; logging.getLogger() with no name returns the root logger.
24 LOG = logging.getLogger()
# Plugin entry point: returns the tokenizer instance for the given database
# DSN and project data directory.
# NOTE(review): this is an elided listing — the docstring's closing quotes
# (original line 28) are not visible here.
26 def create(dsn, data_dir):
27 """ Create a new instance of the tokenizer provided by this module.
29 return LegacyICUTokenizer(dsn, data_dir)
# NOTE(review): elided, line-numbered listing — gaps in the embedded
# numbering mark missing original lines (docstring closers, some statements).
# Comments below state only what the visible lines demonstrate.
32 class LegacyICUTokenizer:
33 """ This tokenizer uses libICU to convert names and queries to ASCII.
34 Otherwise it uses the same algorithms and data structures as the
35 normalization routines in Nominatim 3.
# Constructor stores connection info; the rule/config attributes stay None
# until init_new_db() or init_from_project() fills them.
# (Line 39 — presumably `self.dsn = dsn`, since self.dsn is used below —
# is elided; TODO confirm against the full file.)
38 def __init__(self, dsn, data_dir):
40 self.data_dir = data_dir
41 self.naming_rules = None
42 self.term_normalization = None
43 self.max_word_frequency = None
# Set up a brand-new tokenizer: load ICU rules from the configured YAML file
# (or the shipped default), record normalization settings, install the PHP
# stub and save the config; the init_db branch (guard elided between lines
# 63 and 66) also reinstalls SQL functions and creates the word table.
46 def init_new_db(self, config, init_db=True):
47 """ Set up a new tokenizer for the database.
49 This copies all necessary data in the project directory to make
50 sure the tokenizer remains stable even over updates.
52 if config.TOKENIZER_CONFIG:
53 cfgfile = Path(config.TOKENIZER_CONFIG)
55 cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
57 loader = ICURuleLoader(cfgfile)
58 self.naming_rules = ICUNameProcessorRules(loader=loader)
59 self.term_normalization = config.TERM_NORMALIZATION
60 self.max_word_frequency = config.MAX_WORD_FREQUENCY
62 self._install_php(config.lib_dir.php)
63 self._save_config(config)
66 self.update_sql_functions(config)
67 self._init_db_tables(config)
# Restore a previously initialised tokenizer from the properties stored in
# the database (rules, term normalization, max word frequency).
70 def init_from_project(self):
71 """ Initialise the tokenizer from the project directory.
73 with connect(self.dsn) as conn:
74 self.naming_rules = ICUNameProcessorRules(conn=conn)
75 self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
76 self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
# Post-import step: create the tokenizer's database indices.
79 def finalize_import(self, config):
80 """ Do any required postprocessing to make the tokenizer data ready
83 with connect(self.dsn) as conn:
84 sqlp = SQLPreprocessor(conn, config)
85 sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
# Reinstall the tokenizer's SQL functions, templated with the max word
# frequency read back from the database properties.
88 def update_sql_functions(self, config):
89 """ Reimport the SQL functions for this tokenizer.
91 with connect(self.dsn) as conn:
92 max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
93 sqlp = SQLPreprocessor(conn, config)
94 sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
95 max_word_freq=max_word_freq)
# Sanity check: reload the project config and return an error string when
# the naming rules are missing; the success path (lines 105 ff.) is elided —
# presumably returns None, TODO confirm.
98 def check_database(self):
99 """ Check that the tokenizer is set up correctly.
101 self.init_from_project()
103 if self.naming_rules is None:
104 return "Configuration for tokenizer 'legacy_icu' are missing."
# Factory for analyzers; each analyzer gets its own ICUNameProcessor and
# (per the analyzer's __init__) its own database connection.
109 def name_analyzer(self):
110 """ Create a new analyzer for tokenizing names and queries
111 using this tokenizer. Analyzers are context managers and should
115 with tokenizer.name_analyzer() as analyzer:
119 When used outside the with construct, the caller must ensure to
120 call the close() function before destructing the analyzer.
122 Analyzers are not thread-safe. You need to instantiate one per thread.
124 return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
# Write the PHP configuration stub consumed by the website frontend; the
# format() placeholders pull attributes off self, hence the pylint override.
126 # pylint: disable=missing-format-attribute
127 def _install_php(self, phpdir):
128 """ Install the php script for the tokenizer.
130 php_file = self.data_dir / "tokenizer.php"
131 php_file.write_text(dedent("""\
133 @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
134 @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
135 @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
136 require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
137 """.format(self, phpdir)))
# Persist rules and scalar settings as database properties so later runs can
# restore them via init_from_project().
140 def _save_config(self, config):
141 """ Save the configuration that needs to remain stable for the given
142 database as database properties.
144 with connect(self.dsn) as conn:
145 self.naming_rules.save_rules(conn)
147 set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
148 set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
# Create the word table and seed it with partial-word tokens precomputed
# from the `place` table.  Uses a server-side ("named") cursor so the full
# scan of place names is streamed rather than held in memory.
# NOTE(review): the initialisations of `terms` and `words` (and the loop
# filling `words` from `terms`) are elided from this listing — confirm
# against the full file.
151 def _init_db_tables(self, config):
152 """ Set up the word table and fill it with pre-computed word
155 with connect(self.dsn) as conn:
156 sqlp = SQLPreprocessor(conn, config)
157 sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
160 LOG.warning("Precomputing word tokens")
162 # get partial words and their frequencies
164 name_proc = ICUNameProcessor(self.naming_rules)
165 with conn.cursor(name="words") as cur:
166 cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
168 for name, cnt in cur:
170 for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
172 terms.update(word.split())
176 # copy them back into the word table
177 with CopyBuffer() as copystr:
178 for args in words.items():
181 with conn.cursor() as cur:
182 copystr.copy_out(cur, 'word',
183 columns=['word_token', 'search_name_count'])
184 cur.execute("""UPDATE word SET word_id = nextval('seq_word')
185 WHERE word_id is null""")
# NOTE(review): elided, line-numbered listing.  Several methods below take no
# `self` (normalize_postcode, _remove_special_phrases, _compute_full_names,
# _split_housenumbers) — their @staticmethod decorators are presumably on the
# elided lines directly above each `def`; confirm against the full file.
190 class LegacyICUNameAnalyzer:
191 """ The legacy analyzer uses the ICU library for splitting names.
193 Each instance opens a connection to the database to request the
# Opens a dedicated autocommit connection and a per-instance token cache.
197 def __init__(self, dsn, name_proc):
198 self.conn = connect(dsn).connection
199 self.conn.autocommit = True
200 self.name_processor = name_proc
202 self._cache = _TokenCache()
# Context-manager exit; __enter__ and the __exit__ body are elided.  The
# docstring on line 214 belongs to the close() method whose `def` line is
# also elided here.
209 def __exit__(self, exc_type, exc_value, traceback):
214 """ Free all resources used by the analyzer.
# Debug helper: map each query word to its search-normalized token and the
# matching word_id from the word table (None when the token is unknown).
# A leading '#' marks a full name and yields a space-prefixed token.
# NOTE(review): initialisation of `tokens` and the loop header over `words`
# are elided.
221 def get_word_token_info(self, words):
222 """ Return token information for the given list of words.
223 If a word starts with # it is assumed to be a full name
224 otherwise it is a partial name.
226 The function returns a list of tuples with
227 (original word, word token, word id).
229 The function is used for testing and debugging only
230 and not necessarily efficient.
234 if word.startswith('#'):
235 tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
237 tokens[word] = self.name_processor.get_search_normalized(word)
239 with self.conn.cursor() as cur:
240 cur.execute("""SELECT word_token, word_id
241 FROM word, (SELECT unnest(%s::TEXT[]) as term) t
242 WHERE word_token = t.term
243 and class is null and country_code is null""",
244 (list(tokens.values()), ))
245 ids = {r[0]: r[1] for r in cur}
247 return [(k, v, ids.get(v, None)) for k, v in tokens.items()]
# Canonical postcode form: stripped and upper-cased.  Must stay in sync with
# the SQL function named in the docstring.
251 def normalize_postcode(postcode):
252 """ Convert the postcode to a standardized form.
254 This function must yield exactly the same result as the SQL function
255 'token_normalized_postcode()'.
257 return postcode.strip().upper()
# Housenumber normalisation is just the search normalization of the term.
260 def _make_standard_hnr(self, hnr):
261 """ Create a normalised version of a housenumber.
263 This function takes minor shortcuts on transliteration.
265 return self.name_processor.get_search_normalized(hnr)
# Synchronise postcode tokens in the word table with location_postcode:
# the anti-join finds entries present on only one side; missing word rows
# are COPYed in, stale ones deleted.
# NOTE(review): the join clause of the SQL (line 278), the initialisation of
# `to_delete`, the branch adding rows to `copystr`, and the DELETE's
# parameter list are elided.
267 def update_postcodes_from_db(self):
268 """ Update postcode tokens in the word table from the location_postcode
272 with self.conn.cursor() as cur:
273 # This finds us the rows in location_postcode and word that are
274 # missing in the other table.
275 cur.execute("""SELECT * FROM
276 (SELECT pc, word FROM
277 (SELECT distinct(postcode) as pc FROM location_postcode) p
279 (SELECT word FROM word
280 WHERE class ='place' and type = 'postcode') w
282 WHERE pc is null or word is null""")
284 with CopyBuffer() as copystr:
285 for postcode, word in cur:
287 to_delete.append(word)
291 ' ' + self.name_processor.get_search_normalized(postcode),
292 'place', 'postcode', 0)
295 cur.execute("""DELETE FROM WORD
296 WHERE class ='place' and type = 'postcode'
300 copystr.copy_out(cur, 'word',
301 columns=['word', 'word_token', 'class', 'type',
302 'search_name_count'])
# Diff the normalised incoming phrases against what is already in the word
# table, add the new ones and (when should_replace — branch elided around
# line 321) remove the obsolete ones; logs a summary at the end.
305 def update_special_phrases(self, phrases, should_replace):
306 """ Replace the search index for special phrases with the new phrases.
308 norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
311 with self.conn.cursor() as cur:
312 # Get the old phrases.
313 existing_phrases = set()
314 cur.execute("""SELECT word, class, type, operator FROM word
315 WHERE class != 'place'
316 OR (type != 'house' AND type != 'postcode')""")
317 for label, cls, typ, oper in cur:
318 existing_phrases.add((label, cls, typ, oper or '-'))
320 added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
322 deleted = self._remove_special_phrases(cur, norm_phrases,
327 LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
328 len(norm_phrases), added, deleted)
# COPY the phrases that are not yet in the word table; only 'in'/'near'
# survive as operator values, anything else becomes NULL.
331 def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
332 """ Add all phrases to the database that are not yet there.
334 to_add = new_phrases - existing_phrases
337 with CopyBuffer() as copystr:
338 for word, cls, typ, oper in to_add:
339 term = self.name_processor.get_search_normalized(word)
341 copystr.add(word, ' ' + term, cls, typ,
342 oper if oper in ('in', 'near') else None, 0)
345 copystr.copy_out(cursor, 'word',
346 columns=['word', 'word_token', 'class', 'type',
347 'operator', 'search_name_count'])
# Bulk-delete obsolete phrases via execute_values; '-' encodes a NULL
# operator in the phrase tuples.  (The cursor/values arguments to
# execute_values on the elided lines 361/365-366 are not visible.)
353 def _remove_special_phrases(cursor, new_phrases, existing_phrases):
354 """ Remove all phrases from the database that are no longer in the
357 to_delete = existing_phrases - new_phrases
360 psycopg2.extras.execute_values(
362 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
363 WHERE word = name and class = in_class and type = in_type
364 and ((op = '-' and operator is null) or op = operator)""",
367 return len(to_delete)
# Insert country-name tokens, skipping tokens already present for this
# country.  NOTE(review): the country_code is spliced into the SQL via
# str.format rather than passed as a bind parameter — callers visible here
# only pass values matching [A-Za-z]{2} (see process_place), but this is
# worth verifying wherever else add_country_names is called.
370 def add_country_names(self, country_code, names):
371 """ Add names for the given country to the search index.
374 for name in self._compute_full_names(names):
376 word_tokens.add(' ' + self.name_processor.get_search_normalized(name))
378 with self.conn.cursor() as cur:
380 cur.execute("SELECT word_token FROM word WHERE country_code = %s",
382 word_tokens.difference_update((t[0] for t in cur))
385 cur.execute("""INSERT INTO word (word_id, word_token, country_code,
387 (SELECT nextval('seq_word'), token, '{}', 0
388 FROM unnest(%s) as token)
389 """.format(country_code), (list(word_tokens),))
# Main per-place entry point: collects name tokens, country names for
# two-letter country features, and address terms (postcode, housenumbers,
# street, place, generic addr:* keys) into a _TokenInfo dict.
# NOTE(review): guards around names/address (lines 401-402, 412 ff.), the
# housenumber-collecting branch body (line 420), the `elif key == 'place':`
# line (~423) and the initialisations of `hnrs`/`addr_terms` are elided.
392 def process_place(self, place):
393 """ Determine tokenizer information about the given place.
395 Returns a JSON-serialisable structure that will be handed into
396 the database via the token_info field.
398 token_info = _TokenInfo(self._cache)
400 names = place.get('name')
403 fulls, partials = self._compute_name_tokens(names)
405 token_info.add_names(fulls, partials)
407 country_feature = place.get('country_feature')
408 if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
409 self.add_country_names(country_feature.lower(), names)
411 address = place.get('address')
416 for key, value in address.items():
417 if key == 'postcode':
418 self._add_postcode(value)
419 elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
421 elif key == 'street':
422 token_info.add_street(*self._compute_name_tokens({'name': value}))
424 token_info.add_place(*self._compute_name_tokens({'name': value}))
425 elif not key.startswith('_') and \
426 key not in ('country', 'full'):
427 addr_terms.append((key, *self._compute_name_tokens({'name': value})))
430 hnrs = self._split_housenumbers(hnrs)
431 token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
434 token_info.add_address_terms(addr_terms)
436 return token_info.data
# Resolve full/partial token ids for every name, consulting the per-instance
# cache first and falling back to the SQL function getorcreate_full_word().
# NOTE(review): initialisation of `full_tokens` and the `if full is None:`
# cache-miss guard are elided.
439 def _compute_name_tokens(self, names):
440 """ Computes the full name and partial name tokens for the given
443 full_names = self._compute_full_names(names)
445 partial_tokens = set()
447 for name in full_names:
448 norm_name = self.name_processor.get_normalized(name)
449 full, part = self._cache.names.get(norm_name, (None, None))
451 variants = self.name_processor.get_variants_ascii(norm_name)
455 with self.conn.cursor() as cur:
456 cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
457 (norm_name, variants))
458 full, part = cur.fetchone()
460 self._cache.names[norm_name] = (full, part)
462 full_tokens.add(full)
463 partial_tokens.update(part)
465 return full_tokens, partial_tokens
# Split each name value on ';'/',' and additionally add the pre-bracket part
# of names containing '('.  (Set initialisation, the surrounding guard and
# the return statement are elided.)
469 def _compute_full_names(names):
470 """ Return the set of all full name word ids to be used with the
471 given dictionary of names.
474 for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
478 brace_idx = name.find('(')
480 full_names.add(name[:brace_idx].strip())
# Ensure the normalized postcode exists in the word table; postcodes
# containing ':', ',' or ';' are rejected, and an in-memory cache avoids
# re-inserting the same postcode.
485 def _add_postcode(self, postcode):
486 """ Make sure the normalized postcode is present in the word table.
488 if re.search(r'[:,;]', postcode) is None:
489 postcode = self.normalize_postcode(postcode)
491 if postcode not in self._cache.postcodes:
492 term = self.name_processor.get_search_normalized(postcode)
496 with self.conn.cursor() as cur:
497 # no word_id needed for postcodes
498 cur.execute("""INSERT INTO word (word, word_token, class, type,
500 (SELECT pc, %s, 'place', 'postcode', 0
501 FROM (VALUES (%s)) as v(pc)
504 WHERE word = pc and class='place' and type='postcode'))
505 """, (' ' + term, postcode))
506 self._cache.postcodes.add(postcode)
# Split collected housenumber strings on ';'/',' and deduplicate; the
# initialisation of `simple_list`, the loop header over `hnrs`, the `else`
# branch and the final `return hnrs` are elided from this listing.
510 def _split_housenumbers(hnrs):
511 if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
512 # split numbers if necessary
515 simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
517 if len(simple_list) > 1:
518 hnrs = list(set(simple_list))
528 """ Collect token information to be sent back to the database.
# NOTE(review): the enclosing class header (original line ~527, the class the
# analyzer instantiates as `_TokenInfo(self._cache)`) and the whole __init__
# body (lines 531 ff.) are elided from this listing.  Methods below read
# self._cache and self.data, so __init__ presumably sets both — TODO confirm
# against the full file.
530 def __init__(self, cache):
# Render a collection of tokens as a PostgreSQL array literal, e.g.
# "{12,7,99}".  Takes no `self`; the @staticmethod decorator is presumably
# on the elided line 534 — TODO confirm.
535 def _mk_array(tokens):
536 return '{%s}' % ','.join((str(s) for s in tokens))
# Store the combined full+partial name tokens as a PostgreSQL array string
# under the 'names' key.
539 def add_names(self, fulls, partials):
540 """ Adds token information for the normalised names.
542 self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
# Store housenumber token ids (resolved through the shared _TokenCache,
# which may query the database via `conn`) and the ';'-joined raw list.
545 def add_housenumbers(self, conn, hnrs):
546 """ Extract housenumber information from a list of normalised
549 self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
550 self.data['hnr'] = ';'.join(hnrs)
# Store full-name tokens for addr:street matching; partial tokens are
# deliberately ignored (parameter `_`).  A guard line between the docstring
# and the assignment may be elided from this listing.
553 def add_street(self, fulls, _):
554 """ Add addr:street match terms.
557 self.data['street'] = self._mk_array(fulls)
# Store addr:place terms: search uses full+partial tokens, matching only
# full tokens.
560 def add_place(self, fulls, partials):
561 """ Add addr:place search and match terms.
564 self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
565 self.data['place_match'] = self._mk_array(fulls)
# Store generic addr:* terms keyed by address key; each value is a
# [search_tokens, match_tokens] pair of array literals.  The initialisation
# of `tokens` and any guard before the final assignment are elided from this
# listing.
568 def add_address_terms(self, terms):
569 """ Add additional address terms.
573 for key, fulls, partials in terms:
575 tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
576 self._mk_array(fulls)]
579 self.data['addr'] = tokens
583 """ Cache for token information to avoid repeated database queries.
585 This cache is not thread-safe and needs to be instantiated per
# NOTE(review): the enclosing class header (the `_TokenCache` class the
# analyzer instantiates) and the `def __init__(self):` line are elided from
# this listing; the two assignments below are __init__ body lines
# initialising per-instance caches.  A `self.names` dict is also used by the
# analyzer's _compute_name_tokens but its initialisation (line ~589) is not
# visible — TODO confirm.
590 self.postcodes = set()
591 self.housenumbers = {}
# Map housenumber terms to token ids: serve repeats from self.housenumbers
# and resolve misses through the SQL function getorcreate_hnr_id(), caching
# the results.  NOTE(review): the lines partitioning cached vs. new terms,
# the execute() parameter tuple, the result collection and the return
# statement are elided from this listing.
594 def get_hnr_tokens(self, conn, terms):
595 """ Get token ids for a list of housenumbers, looking them up in the
596 database if necessary.
602 token = self.housenumbers.get(term)
609 with conn.cursor() as cur:
610 cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
612 for term, tid in cur:
613 self.housenumbers[term] = tid