"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import functools
import io
import itertools
import json
import logging
import re
from textwrap import dedent
from pathlib import Path

from icu import Transliterator
import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.sql_preprocessor import SQLPreprocessor

DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TRANSLITERATION = "tokenizer_transliteration"
DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization = None
        self.transliteration = None
        self.abbreviations = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'

        rules = json.loads(cfgfile.read_text())
        self._load_transliteration(rules['normalization'], cfgfile.parent)
        self.abbreviations = rules["abbreviations"]
        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def _load_transliteration(self, rules, cfg_path):
        if isinstance(rules, str):
            self.transliteration = (cfg_path / rules).read_text().replace('\n', ' ')
        else:
            self.transliteration = ';'.join(rules) + ';'
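
    # Illustrative sketch only, assuming the JSON layout implied by init_new_db()
    # and _load_transliteration() above: 'normalization' is either the name of a
    # file with ICU transliteration rules or a list of rule strings, and
    # 'abbreviations' is a list of (full term, abbreviated term) pairs.
    #
    #   {
    #       "normalization": [":: NFD ()", ":: lower ()", ":: Latin ()"],
    #       "abbreviations": [[" street ", " st "], [" road ", " rd "]]
    #   }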


    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.normalization = get_property(conn, DBCFG_NORMALIZATION)
            self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
            self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)


    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.normalization is None\
           or self.transliteration is None\
           or self.abbreviations is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

        return None


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    ...

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        norm = Transliterator.createFromRules("normalizer", self.normalization)
        trans = Transliterator.createFromRules("trans", self.transliteration)
        return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
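
    # A minimal usage sketch (an assumption for illustration, not part of the
    # documented API): the tokenizer is obtained via create(), initialised from
    # the project directory and then hands out analyzers as context managers.
    #
    #   tokenizer = create(dsn, data_dir)
    #   tokenizer.init_from_project()
    #   with tokenizer.name_analyzer() as analyzer:
    #       token_info = analyzer.process_place({'name': {'name': 'Main Street'}})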


    def _install_php(self, config):
        """ Install the php script for the tokenizer.
        """
        abbr_inverse = list(zip(*self.abbreviations))
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
            @define('CONST_Term_Normalization_Rules', "{0.normalization}");
            @define('CONST_Transliteration', "{0.transliteration}");
            @define('CONST_Abbreviations', array(array('{2}'), array('{3}')));
            require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
            """.format(self, config,
                       "','".join(abbr_inverse[0]),
                       "','".join(abbr_inverse[1]))))


    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            set_property(conn, DBCFG_NORMALIZATION, self.normalization)
            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
            set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = Counter()
            with self.name_analyzer() as analyzer:
                with conn.cursor(name="words") as cur:
                    cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")

                    for name, cnt in cur:
                        term = analyzer.make_standard_word(name)
                        if term:
                            for word in term.split():
                                words[word] += cnt

            # copy them back into the word table
            copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))

            with conn.cursor() as cur:
                cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                               WHERE word_id is null""")

            conn.commit()


class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, normalizer, transliterator, abbreviations):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
        self.transliterator = transliterator
        self.abbreviations = abbreviations

        self._cache = _TokenCache()


    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def get_word_token_info(self, conn, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        tokens = {}
        for word in words:
            if word.startswith('#'):
                tokens[word] = ' ' + self.make_standard_word(word[1:])
            else:
                tokens[word] = self.make_standard_word(word)

        with conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = t.term
                                 and class is null and country_code is null""",
                        (list(tokens.values()), ))
            ids = {r[0]: r[1] for r in cur}

        return [(k, v, ids[v]) for k, v in tokens.items()]
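
    # Hypothetical illustration of the convention above: '#Main Street' is looked
    # up as a full-name token (leading space), 'Main' as a partial term.
    #
    #   analyzer.get_word_token_info(conn, ['#Main Street', 'Main'])
    #   # -> [('#Main Street', ' main street', <word_id>), ('Main', 'main', <word_id>)]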


    def normalize(self, phrase):
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return self.normalizer.transliterate(phrase)


    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
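
    # For example, normalize_postcode(' ab1 2cd ') yields 'AB1 2CD'; the same
    # whitespace stripping and upper-casing is expected from the SQL counterpart.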


    @functools.lru_cache(maxsize=1024)
    def make_standard_word(self, name):
        """ Create the normalised version of the input.
        """
        norm = ' ' + self.transliterator.transliterate(name) + ' '
        for full, abbr in self.abbreviations:
            norm = norm.replace(full, abbr)

        return norm.strip()
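
    # Rough illustration (assuming an abbreviation pair (' street ', ' st ') and a
    # transliteration that lower-cases and strips accents):
    #
    #   make_standard_word('Königs Street')
    #   # transliterate -> ' konigs street ', abbreviate -> ' konigs st ', strip -> 'konigs st'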


    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.transliterator.transliterate(hnr)


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        copystr = io.StringIO()
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            for postcode, word in cur:
                if postcode is None:
                    to_delete.append(word)
                else:
                    copystr.write(postcode)
                    copystr.write('\t ')
                    copystr.write(self.transliterator.transliterate(postcode))
                    copystr.write('\tplace\tpostcode\t0\n')

            if to_delete:
                cur.execute("""DELETE FROM WORD
                               WHERE class ='place' and type = 'postcode'
                                     and word = any(%s)
                            """, (to_delete, ))

            if copystr.getvalue():
                copystr.seek(0)
                cur.copy_from(copystr, 'word',
                              columns=['word', 'word_token', 'class', 'type',
                                       'search_name_count'])


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                copystr = io.StringIO()
                for word, cls, typ, oper in to_add:
                    term = self.make_standard_word(word)
                    if term:
                        copystr.write(word)
                        copystr.write('\t ')
                        copystr.write(term)
                        copystr.write('\t')
                        copystr.write(cls)
                        copystr.write('\t')
                        copystr.write(typ)
                        copystr.write('\t')
                        copystr.write(oper if oper in ('in', 'near') else '\\N')
                        copystr.write('\t0\n')

                copystr.seek(0)
                cur.copy_from(copystr, 'word',
                              columns=['word', 'word_token', 'class', 'type',
                                       'operator', 'search_name_count'])

            if to_delete and should_replace:
                psycopg2.extras.execute_values(
                    cur,
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        full_names = set((self.make_standard_word(n) for n in names))
        full_names.discard('')
        self._add_normalized_country_names(country_code, full_names)


    def _add_normalized_country_names(self, country_code, names):
        """ Add already normalized names for the given country to the
            search index.
        """
        word_tokens = set((' ' + name for name in names))
        with self.conn.cursor() as cur:
            # Get existing names
            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            if word_tokens:
                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                                 search_name_count)
                               (SELECT nextval('seq_word'), token, '{}', 0
                                FROM unnest(%s) as token)
                            """.format(country_code), (list(word_tokens),))


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            full_names = self._compute_full_names(names)

            token_info.add_names(self.conn, full_names)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self._add_normalized_country_names(country_feature.lower(),
                                                   full_names)

        address = place.get('address')

        if address:
            hnrs = []
            addr_terms = []
            for key, value in address.items():
                if key == 'postcode':
                    self._add_postcode(value)
                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                    hnrs.append(value)
                elif key == 'street':
                    token_info.add_street(self.conn, self.make_standard_word(value))
                elif key == 'place':
                    token_info.add_place(self.conn, self.make_standard_word(value))
                elif not key.startswith('_') and \
                     key not in ('country', 'full'):
                    addr_terms.append((key, self.make_standard_word(value)))

            if hnrs:
                hnrs = self._split_housenumbers(hnrs)
                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

            if addr_terms:
                token_info.add_address_terms(self.conn, addr_terms)

        return token_info.data
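
    # Sketch of the data flow (the field names follow the code above, the concrete
    # values are only an assumption): given a place such as
    #
    #   {'name': {'name': 'Main Street'}, 'address': {'housenumber': '3;5'}}
    #
    # the returned dictionary contains entries like 'names', 'hnr_tokens' and
    # 'hnr', e.g. {'names': '{…}', 'hnr_tokens': '{…}', 'hnr': '3;5'}.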


    def _compute_full_names(self, names):
        """ Return the set of all full name word ids to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
            word = self.make_standard_word(name)
            if word:
                full_names.add(word)

                brace_split = name.split('(', 2)
                if len(brace_split) > 1:
                    word = self.make_standard_word(brace_split[0])
                    if word:
                        full_names.add(word)

        return full_names


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.make_standard_word(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word, word_token, class, type,
                                                     search_name_count)
                                   (SELECT pc, %s, 'place', 'postcode', 0
                                    FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                      (SELECT * FROM word
                                       WHERE word = pc and class='place' and type='postcode'))
                                """, (' ' + term, postcode))
                self._cache.postcodes.add(postcode)


    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
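
    # For instance (assumed behaviour of the splitting logic above):
    # _split_housenumbers(['1;2b', '3']) returns the deduplicated list
    # ['1', '2b', '3'] (order not guaranteed), while ['12'] is returned unchanged.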


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self.cache = cache
        self.data = {}

    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))
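
    # _mk_array() renders token ids as a PostgreSQL array literal,
    # e.g. _mk_array([12, 34]) == '{12,34}'.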


    def add_names(self, conn, names):
        """ Adds token information for the normalised names.
        """
        # Start with all partial names
        terms = set((part for ns in names for part in ns.split()))
        # Add the full names (leading space marks a full-word token)
        terms.update((' ' + n for n in names))

        self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)


    def add_street(self, conn, street):
        """ Add addr:street match terms.
        """
        if not street:
            return

        term = ' ' + street
        tid = self.cache.names.get(term)

        if tid is None:
            with conn.cursor() as cur:
                cur.execute("""SELECT word_id FROM word
                                WHERE word_token = %s
                                      and class is null and type is null""",
                            (term, ))
                if cur.rowcount > 0:
                    tid = cur.fetchone()[0]
                    self.cache.names[term] = tid

        if tid is not None:
            self.data['street'] = '{%d}' % tid


    def add_place(self, conn, place):
        """ Add addr:place search and match terms.
        """
        if not place:
            return

        partial_ids = self.cache.get_term_tokens(conn, place.split())
        tid = self.cache.get_term_tokens(conn, [' ' + place])

        self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
        self.data['place_match'] = '{%s}' % tid[0]


    def add_address_terms(self, conn, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, value in terms:
            if not value:
                continue

            partial_ids = self.cache.get_term_tokens(conn, value.split())
            term = ' ' + value
            tid = self.cache.names.get(term)

            if tid is None:
                with conn.cursor() as cur:
                    cur.execute("""SELECT word_id FROM word
                                    WHERE word_token = %s
                                          and class is null and type is null""",
                                (term, ))
                    if cur.rowcount > 0:
                        tid = cur.fetchone()[0]
                        self.cache.names[term] = tid

            tokens[key] = [self._mk_array(partial_ids),
                           '{%s}' % ('' if tid is None else str(tid))]

        if tokens:
            self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.postcodes = set()
        self.housenumbers = {}


    def get_term_tokens(self, conn, terms):
        """ Get token ids for a list of terms, looking them up in the database
            if necessary.
        """
        tokens = []
        askdb = []

        for term in terms:
            token = self.names.get(term)
            if token is None:
                askdb.append(term)
            elif token != 0:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
                            (askdb, ))
                for term, tid in cur:
                    self.names[term] = tid
                    if tid != 0:
                        tokens.append(tid)

        return tokens


    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary.
        """
        tokens = []
        askdb = []

        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens