"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import functools
import io
import itertools
import json
import logging
import re
from textwrap import dedent
from pathlib import Path

from icu import Transliterator
import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.sql_preprocessor import SQLPreprocessor
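
# Keys under which the tokenizer stores its configuration as database
# properties (written by _save_config() below, read back in init_from_project()).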
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TRANSLITERATION = "tokenizer_transliteration"
DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)
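
# A minimal usage sketch. The DSN, project directory and the `config` object
# are illustrative only; `config` stands for the Nominatim configuration that
# the caller passes in:
#
#     tokenizer = create('dbname=nominatim', Path('/srv/nominatim-project'))
#     tokenizer.init_new_db(config)       # for a fresh import
#     tokenizer.init_from_project()       # to attach to an existing database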


class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization = None
        self.transliteration = None
        self.abbreviations = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
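
        # Sketch of the layout this code assumes for the JSON file (key names
        # taken from the lookups below; the example values are illustrative
        # only, not the shipped configuration):
        #
        #   {
        #       "normalization": ["<ICU transliteration rule>", "..."],
        #       "abbreviations": [["<full form>", "<abbreviated form>"], "..."]
        #   }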
        rules = json.loads(cfgfile.read_text())
        self.transliteration = ';'.join(rules['normalization']) + ';'
        self.abbreviations = rules["abbreviations"]
        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.normalization = get_property(conn, DBCFG_NORMALIZATION)
            self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
            self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))

    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)

    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.normalization is None\
           or self.transliteration is None\
           or self.abbreviations is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    ...

            When used outside the with construct, the caller must make sure to
            call the close() function before the analyzer is destroyed.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
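        # The two rule strings are compiled into ICU transliterators here
        # (Transliterator.createFromRules() comes from PyICU): one for term
        # normalisation and one for transliteration of names to ASCII.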
        norm = Transliterator.createFromRules("normalizer", self.normalization)
        trans = Transliterator.createFromRules("trans", self.transliteration)
        return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)

    def _install_php(self, config):
        """ Install the php script for the tokenizer.
        """
        abbr_inverse = list(zip(*self.abbreviations))
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            @define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
            @define('CONST_Term_Normalization_Rules', "{0.normalization}");
            @define('CONST_Transliteration', "{0.transliteration}");
            @define('CONST_Abbreviations', array(array('{2}'), array('{3}')));
            require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
            """.format(self, config,
                       "','".join(abbr_inverse[0]),
                       "','".join(abbr_inverse[1]))))

    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            set_property(conn, DBCFG_NORMALIZATION, self.normalization)
            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
            set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = Counter()
            with self.name_analyzer() as analyzer:
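                # conn.cursor(name=...) opens a server-side (named) cursor in
                # psycopg2, so the potentially huge list of name values is
                # streamed instead of materialised in memory at once.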
                with conn.cursor(name="words") as cur:
                    cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")

                    for name, cnt in cur:
                        term = analyzer.make_standard_word(name)
                        for word in term.split():
                            words[word] += cnt

            # copy them back into the word table
            copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
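
            # copy_from() consumes tab-separated text; each line built above
            # supplies the word_token and search_name_count columns named in
            # the COPY below.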
            with conn.cursor() as cur:
                cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                               WHERE word_id is null""")


class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, normalizer, transliterator, abbreviations):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
        self.transliterator = transliterator
        self.abbreviations = abbreviations

        self._cache = _TokenCache()

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """ Free all resources used by the analyzer.
        """

    def get_word_token_info(self, conn, words):
        """ Return token information for the given list of words.

            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
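        # Convention used throughout this tokenizer: tokens for full names
        # carry a leading blank in word_token, tokens for partial terms do not.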
        tokens = {}
        for word in words:
            if word.startswith('#'):
                tokens[word] = ' ' + self.make_standard_word(word[1:])
            else:
                tokens[word] = self.make_standard_word(word)

        with conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = t.term
                                 and class is null and country_code is null""",
                        (list(tokens.values()), ))
            ids = {r[0]: r[1] for r in cur}

        return [(k, v, ids[v]) for k, v in tokens.items()]

    def normalize(self, phrase):
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return self.normalizer.transliterate(phrase)

    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
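
    # functools.lru_cache memoises make_standard_word() below, keyed on the
    # (analyzer, name) pair, so repeated names skip the ICU transliteration
    # and the abbreviation replacement loop.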
    @functools.lru_cache(maxsize=1024)
    def make_standard_word(self, name):
        """ Create the normalised version of the input.
        """
        norm = ' ' + self.transliterator.transliterate(name) + ' '
        for full, abbr in self.abbreviations:
            norm = norm.replace(full, abbr)

        return norm.strip()

    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.transliterator.transliterate(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        copystr = io.StringIO()
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
307 cur.execute("""SELECT * FROM
308 (SELECT pc, word FROM
309 (SELECT distinct(postcode) as pc FROM location_postcode) p
311 (SELECT word FROM word
312 WHERE class ='place' and type = 'postcode') w
314 WHERE pc is null or word is null""")

            for postcode, word in cur:
                if postcode is None:
                    to_delete.append(word)
                else:
                    copystr.write(postcode)
                    copystr.write('\t ')
                    copystr.write(self.transliterator.transliterate(postcode))
                    copystr.write('\tplace\tpostcode\t0\n')

            if to_delete:
                cur.execute("""DELETE FROM WORD
                               WHERE class ='place' and type = 'postcode'
                                     and word = any(%s)
                            """, (to_delete, ))

            if copystr.getvalue():
                copystr.seek(0)
                cur.copy_from(copystr, 'word',
                              columns=['word', 'word_token', 'class', 'type',
                                       'search_name_count'])

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            copystr = io.StringIO()
            for word, cls, typ, oper in to_add:
                term = self.make_standard_word(word)
                copystr.write(oper if oper in ('in', 'near') else '\\N')
                copystr.write('\t0\n')

            cur.copy_from(copystr, 'word',
                          columns=['word', 'word_token', 'class', 'type',
                                   'operator', 'search_name_count'])

            if to_delete and should_replace:
                psycopg2.extras.execute_values(
                    cur,
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)
385 LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
386 len(norm_phrases), len(to_add), len(to_delete))

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        full_names = set((self.make_standard_word(n) for n in names))
        full_names.discard('')
        self._add_normalized_country_names(country_code, full_names)

    def _add_normalized_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set((' ' + name for name in names))
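
        # Country names are always full names, hence the leading blank in the
        # token. Tokens already stored for this country are removed from the
        # set below so that only genuinely new ones get inserted.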
        with self.conn.cursor() as cur:
            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                             search_name_count)
                           (SELECT nextval('seq_word'), token, '{}', 0
                            FROM unnest(%s) as token)
                        """.format(country_code), (list(word_tokens),))

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)
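        # _TokenInfo (defined further below) collects the individual pieces of
        # information (names, housenumbers, street/place/address terms) into a
        # dict that ends up as JSON in the token_info column.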

        names = place.get('name')

        if names:
            full_names = self._compute_full_names(names)

            token_info.add_names(self.conn, full_names)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self._add_normalized_country_names(country_feature.lower(),
                                                   list(full_names))

        address = place.get('address')

        if address:
            hnrs = []
            addr_terms = []
            for key, value in address.items():
                if key == 'postcode':
                    self._add_postcode(value)
                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                    hnrs.append(value)
                elif key == 'street':
                    token_info.add_street(self.conn, self.make_standard_word(value))
                elif key == 'place':
                    token_info.add_place(self.conn, self.make_standard_word(value))
                elif not key.startswith('_') and \
                     key not in ('country', 'full'):
                    addr_terms.append((key, self.make_standard_word(value)))

            if hnrs:
                hnrs = self._split_housenumbers(hnrs)
                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

            if addr_terms:
                token_info.add_address_terms(self.conn, addr_terms)

        return token_info.data

    def _compute_full_names(self, names):
        """ Return the set of all full names to be used with the
            given dictionary of names.
        """
        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
            word = self.make_standard_word(name)

            brace_split = name.split('(', 2)
            if len(brace_split) > 1:
                word = self.make_standard_word(brace_split[0])

    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.make_standard_word(postcode)

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word, word_token, class, type,
                                                     search_name_count)
                                   (SELECT pc, %s, 'place', 'postcode', 0
                                    FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE word = pc and class='place' and type='postcode'))
                                """, (' ' + term, postcode))
                self._cache.postcodes.add(postcode)

    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
            if len(simple_list) > 1:
                hnrs = list(set(simple_list))

        return hnrs
524 """ Collect token information to be sent back to the database.
526 def __init__(self, cache):
531 def _mk_array(tokens):
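        # Format the token ids as a PostgreSQL array literal, e.g. '{1,22,333}'.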
        return '{%s}' % ','.join((str(s) for s in tokens))

    def add_names(self, conn, names):
        """ Add token information for the normalised names.
        """
        # Start with all partial names
        terms = set((part for ns in names for part in ns.split()))
        terms.update((' ' + n for n in names))

        self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, conn, street):
        """ Add addr:street match terms.
        """
        term = ' ' + street
        tid = self.cache.names.get(term)
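
        # Street match terms are only looked up in the word table, never
        # created here (plain SELECT, no getorcreate), unlike the search
        # terms handled by _TokenCache.get_term_tokens().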
        if tid is None:
            with conn.cursor() as cur:
                cur.execute("""SELECT word_id FROM word
                               WHERE word_token = %s
                                     and class is null and type is null""",
                            (term, ))
                tid = cur.fetchone()[0]
                self.cache.names[term] = tid

        self.data['street'] = '{%d}' % tid

    def add_place(self, conn, place):
        """ Add addr:place search and match terms.
        """
        partial_ids = self.cache.get_term_tokens(conn, place.split())
        tid = self.cache.get_term_tokens(conn, [' ' + place])

        self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
        self.data['place_match'] = '{%s}' % tid[0]

    def add_address_terms(self, conn, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, value in terms:
            partial_ids = self.cache.get_term_tokens(conn, value.split())
            term = ' ' + value
            tid = self.cache.names.get(term)

            if tid is None:
                with conn.cursor() as cur:
                    cur.execute("""SELECT word_id FROM word
                                   WHERE word_token = %s
                                         and class is null and type is null""",
                                (term, ))
                    tid = cur.fetchone()[0]
                    self.cache.names[term] = tid

            tokens[key] = [self._mk_array(partial_ids),
                           '{%s}' % ('' if tid is None else str(tid))]

        self.data['addr'] = tokens
621 """ Cache for token information to avoid repeated database queries.
623 This cache is not thread-safe and needs to be instantiated per
628 self.postcodes = set()
629 self.housenumbers = {}

    def get_term_tokens(self, conn, terms):
        """ Get token ids for a list of terms, looking them up in the database
            if necessary.
        """
        token = self.names.get(term)

        with conn.cursor() as cur:
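            # getorcreate_term_id() is one of the tokenizer's SQL functions
            # (loaded by update_sql_functions()); as the name suggests, it
            # returns the id of an existing word entry or creates a new one.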
648 cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
650 for term, tid in cur:
651 self.names[term] = tid

    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary.
        """
        token = self.housenumbers.get(term)

        with conn.cursor() as cur:
            cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
            for term, tid in cur:
                self.housenumbers[term] = tid