"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import io
import itertools
import logging
import re
from textwrap import dedent
from pathlib import Path

import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules

DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)
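
# Example use of the factory (a sketch; the DSN and directory are placeholders):
#
#   tokenizer = create('dbname=nominatim', Path('project-dir/tokenizer'))
#   tokenizer.init_from_project()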


class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.naming_rules = None
        self.term_normalization = None
        self.max_word_frequency = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'

        loader = ICURuleLoader(cfgfile)
        self.naming_rules = ICUNameProcessorRules(loader=loader)
        self.term_normalization = config.TERM_NORMALIZATION
        self.max_word_frequency = config.MAX_WORD_FREQUENCY

        self._install_php(config.lib_dir.php)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.naming_rules = ICUNameProcessorRules(conn=conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
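
    # Sketch of the two initialisation paths (dsn, config and paths are
    # placeholders): a fresh import reads the ICU rules from the configuration,
    # later runs restore rules and properties from the database.
    #
    #   tok = create(dsn, project_dir / 'tokenizer')
    #   tok.init_new_db(config)        # import time: loads rules from config
    #
    #   tok = create(dsn, project_dir / 'tokenizer')
    #   tok.init_from_project()        # afterwards: loads rules from the DB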


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)


    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.naming_rules is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

        return None


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))


    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
            @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
            @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
            require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
            """.format(self, phpdir)))


    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.naming_rules.save_rules(conn)

            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = Counter()
            name_proc = ICUNameProcessor(self.naming_rules)
            with conn.cursor(name="words") as cur:
                cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")

                for name, cnt in cur:
                    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                        for term in word.split():
                            words[term] += cnt

            # copy them back into the word table
            copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))

            with conn.cursor() as cur:
                cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                               WHERE word_id is null""")

            conn.commit()
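
    # The COPY payload assembled above is plain tab-separated text with one
    # word token and its frequency per line, for example (made-up counts):
    #
    #   hauptstrasse\t421
    #   berlin\t118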


class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.name_processor = name_proc

        self._cache = _TokenCache()


    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.

            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not guaranteed to be efficient.
        """
        tokens = {}
        for word in words:
            if word.startswith('#'):
                tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
            else:
                tokens[word] = self.name_processor.get_search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = t.term
                                 and class is null and country_code is null""",
                        (list(tokens.values()), ))
            ids = {r[0]: r[1] for r in cur}

        return [(k, v, ids.get(v, None)) for k, v in tokens.items()]
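
    # Example call (a sketch; the returned tokens and ids depend on the
    # database content):
    #
    #   analyzer.get_word_token_info(['#Main Street', 'main'])
    #   # -> [('#Main Street', ' main street', 1234), ('main', 'main', 5678)]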


    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
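
    # A quick illustration of the normalisation above:
    #
    #   normalize_postcode(' ab1 2cd ')   # -> 'AB1 2CD'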


    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        copystr = io.StringIO()
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            for postcode, word in cur:
                if postcode is None:
                    to_delete.append(word)
                else:
                    copystr.write(postcode)
                    copystr.write('\t ')
                    copystr.write(self.name_processor.get_search_normalized(postcode))
                    copystr.write('\tplace\tpostcode\t0\n')

            if to_delete:
                cur.execute("""DELETE FROM WORD
                               WHERE class ='place' and type = 'postcode'
                                     and word = any(%s)
                            """, (to_delete, ))

            if copystr.getvalue():
                copystr.seek(0)
                cur.copy_from(copystr, 'word',
                              columns=['word', 'word_token', 'class', 'type',
                                       'search_name_count'])
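
    # Each line written to copystr above is one tab-separated row matching the
    # copy_from() column list: word, word_token, class ('place'),
    # type ('postcode') and search_name_count (0).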


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)
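
    # The phrases argument is an iterable of 4-tuples of
    # (label, class, type, operator). A made-up example:
    #
    #   analyzer.update_special_phrases(
    #       [('restaurant', 'amenity', 'restaurant', '-'),
    #        ('bar in', 'amenity', 'bar', 'in')],
    #       should_replace=True)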


    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        copystr = io.StringIO()
        for word, cls, typ, oper in to_add:
            term = self.name_processor.get_search_normalized(word)
            if term:
                # word, word_token (with leading space), class and type, tab-separated
                copystr.write('\t'.join((word, ' ' + term, cls, typ)))
                copystr.write('\t')
                copystr.write(oper if oper in ('in', 'near') else '\\N')
                copystr.write('\t0\n')
                added += 1

        if copystr.tell() > 0:
            copystr.seek(0)
            cursor.copy_from(copystr, 'word',
                             columns=['word', 'word_token', 'class', 'type',
                                      'operator', 'search_name_count'])

        return added


    def _remove_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            psycopg2.extras.execute_values(
                cursor,
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE word = name and class = in_class and type = in_type
                          and ((op = '-' and operator is null) or op = operator)""",
                to_delete)

        return len(to_delete)


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            if name:
                word_tokens.add(' ' + self.name_processor.get_search_normalized(name))

        with self.conn.cursor() as cur:
            # get existing names
            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            if word_tokens:
                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                                 search_name_count)
                               (SELECT nextval('seq_word'), token, '{}', 0
                                FROM unnest(%s) as token)
                            """.format(country_code), (list(word_tokens),))
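
    # Example (a sketch; the names dictionary is made up): this adds one token
    # per distinct full name that is not yet stored for the country code.
    #
    #   analyzer.add_country_names('de', {'name': 'Deutschland', 'name:en': 'Germany'})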


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), names)

        address = place.get('address')

        if address:
            hnrs = []
            addr_terms = []
            for key, value in address.items():
                if key == 'postcode':
                    self._add_postcode(value)
                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                    hnrs.append(value)
                elif key == 'street':
                    token_info.add_street(*self._compute_name_tokens({'name': value}))
                elif key == 'place':
                    token_info.add_place(*self._compute_name_tokens({'name': value}))
                elif not key.startswith('_') and \
                     key not in ('country', 'full'):
                    addr_terms.append((key, *self._compute_name_tokens({'name': value})))

            if hnrs:
                hnrs = self._split_housenumbers(hnrs)
                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

            if addr_terms:
                token_info.add_address_terms(addr_terms)

        return token_info.data
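
    # Sketch of the returned structure (token ids are invented for illustration):
    #
    #   {'names': '{1,2,3}', 'hnr_tokens': '{4}', 'hnr': '25a',
    #    'street': '{5}', 'addr': {'city': ['{6,7}', '{6}']}}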


    def _compute_name_tokens(self, names):
        """ Compute the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)
                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full names to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
            full_names.add(name.strip())

            brace_idx = name.find('(')
            if brace_idx >= 0:
                full_names.add(name[:brace_idx].strip())

        return full_names
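
    # Example: {'name': 'Main Street;Hwy 1 (old)'} yields
    # {'Main Street', 'Hwy 1 (old)', 'Hwy 1'}.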


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.name_processor.get_search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word, word_token, class, type,
                                                     search_name_count)
                                   (SELECT pc, %s, 'place', 'postcode', 0
                                    FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                          (SELECT * FROM word
                                           WHERE word = pc and class='place' and type='postcode'))
                                """, (' ' + term, postcode))
                self._cache.postcodes.add(postcode)


    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
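
    # Example: _split_housenumbers(['1;2b', '3']) returns ['1', '2b', '3']
    # (duplicates removed, order not guaranteed).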


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}


    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))
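
    # _mk_array renders token ids as a PostgreSQL array literal, e.g.
    # _mk_array([1, 2, 3]) == '{1,2,3}'.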


    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)


    def add_street(self, fulls, _):
        """ Add addr:street match terms.
        """
        if fulls:
            self.data['street'] = self._mk_array(fulls)


    def add_place(self, fulls, partials):
        """ Add addr:place search and match terms.
        """
        if fulls:
            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
            self.data['place_match'] = self._mk_array(fulls)


    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, fulls, partials in terms:
            if fulls:
                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
                               self._mk_array(fulls)]

        self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.postcodes = set()
        self.housenumbers = {}


    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary.
        """
        tokens = []
        askdb = []
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens