Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
from collections import Counter
import itertools
import logging
import re
from textwrap import dedent
from pathlib import Path

import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)
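
# A minimal usage sketch (not part of the original module; the place dict and
# connection settings are assumed): the factory above is the entry point used
# by the Nominatim framework, analyzers are then obtained from the tokenizer.
#
#   tokenizer = create(dsn, data_dir)
#   tokenizer.init_from_project()
#   with tokenizer.name_analyzer() as analyzer:
#       token_info = analyzer.process_place(place)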

class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.naming_rules = None
        self.term_normalization = None
        self.max_word_frequency = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'

        loader = ICURuleLoader(cfgfile)
        self.naming_rules = ICUNameProcessorRules(loader=loader)
        self.term_normalization = config.TERM_NORMALIZATION
        self.max_word_frequency = config.MAX_WORD_FREQUENCY

        self._install_php(config.lib_dir.php)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.naming_rules = ICUNameProcessorRules(conn=conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)

    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)

    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.naming_rules is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

        return None

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

              with tokenizer.name_analyzer() as analyzer:
                  ...

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))

    # pylint: disable=missing-format-attribute
    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
            @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
            @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
            require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
            """.format(self, phpdir)))

    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.naming_rules.save_rules(conn)

            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = Counter()
            name_proc = ICUNameProcessor(self.naming_rules)
            with conn.cursor(name="words") as cur:
                cur.execute(""" SELECT v, count(*) FROM
                                  (SELECT svals(name) as v FROM place)x
                                WHERE length(v) < 75 GROUP BY v""")

                for name, cnt in cur:
                    terms = set()
                    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                        if ' ' in word:
                            terms.update(word.split())
                    for term in terms:
                        words[term] += cnt

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for args in words.items():
                    copystr.add(*args)

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['word_token', 'search_name_count'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null""")

            conn.commit()
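
    # Rough illustration of the precomputation above (assuming the ICU rules
    # leave these names essentially unchanged): if the place table contains
    # 'Main Street' twice and 'Main Road' once, the rows copied into the word
    # table would be approximately
    #
    #   word_token | search_name_count
    #   -----------+------------------
    #   main       | 3
    #   street     | 2
    #   road       | 1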

class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.name_processor = name_proc

        self._cache = _TokenCache()

    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name
            otherwise a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        tokens = {}
        for word in words:
            if word.startswith('#'):
                tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
            else:
                tokens[word] = self.name_processor.get_search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = t.term
                                 and class is null and country_code is null""",
                        (list(tokens.values()), ))
            ids = {r[0]: r[1] for r in cur}

        return [(k, v, ids.get(v, None)) for k, v in tokens.items()]
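
    # Example of the shape returned by get_word_token_info() (the tokens and
    # ids below are illustrative; actual values depend on the configured ICU
    # rules and the database content):
    #
    #   analyzer.get_word_token_info(['#Main Street', 'main'])
    #   -> [('#Main Street', ' main street', 1234), ('main', 'main', 5678)]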

    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
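
    # Worked example of the rule above: surrounding whitespace is removed and
    # letters are upper-cased, so normalize_postcode(' ab1 2cd ') yields
    # 'AB1 2CD'; internal spacing is left untouched.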

    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(
                            postcode,
                            ' ' + self.name_processor.get_search_normalized(postcode),
                            'place', 'postcode', 0)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE class ='place' and type = 'postcode'
                                         and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word', 'word_token', 'class', 'type',
                                          'search_name_count'])

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self.name_processor.get_search_normalized(word)
                if term:
                    copystr.add(word, ' ' + term, cls, typ,
                                oper if oper in ('in', 'near') else None, 0)
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word', 'word_token', 'class', 'type',
                                      'operator', 'search_name_count'])

        return added

    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase set.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            psycopg2.extras.execute_values(
                cursor,
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE word = name and class = in_class and type = in_type
                          and ((op = '-' and operator is null) or op = operator)""",
                to_delete)

        return len(to_delete)

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            if name:
                word_tokens.add(' ' + self.name_processor.get_search_normalized(name))

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            if word_tokens:
                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                                 search_name_count)
                               (SELECT nextval('seq_word'), token, '{}', 0
                                FROM unnest(%s) as token)
                            """.format(country_code), (list(word_tokens),))

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), names)

        address = place.get('address')

        if address:
            self._process_place_address(token_info, address)

        return token_info.data
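
    # Illustrative sketch of the data flow through process_place(). The token
    # ids are invented and the keys come from _TokenInfo below; the postcode
    # is written to the word table as a side effect rather than returned:
    #
    #   place = {'name': {'name': 'Main Street'},
    #            'address': {'housenumber': '3', 'postcode': '12345'}}
    #   analyzer.process_place(place)
    #   -> {'names': '{1,2,3}', 'hnr_tokens': '{42}', 'hnr': '3'}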

    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        for key, value in address.items():
            if key == 'postcode':
                self._add_postcode(value)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(*self._compute_name_tokens({'name': value}))
            elif key == 'place':
                token_info.add_place(*self._compute_name_tokens({'name': value}))
            elif not key.startswith('_') and \
                 key not in ('country', 'full'):
                addr_terms.append((key, *self._compute_name_tokens({'name': value})))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)

    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full names to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
            if name:
                full_names.add(name)

                brace_idx = name.find('(')
                if brace_idx >= 0:
                    full_names.add(name[:brace_idx].strip())

        return full_names
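
    # For example, given the splitting on ';' and ',' and the bracket handling
    # above, {'name': 'Main Street;Broadway (old)'} yields the set
    # {'Main Street', 'Broadway (old)', 'Broadway'}.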

    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.name_processor.get_search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word, word_token, class, type,
                                                     search_name_count)
                                   (SELECT pc, %s, 'place', 'postcode', 0
                                    FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE word = pc and class='place' and type='postcode'))
                                """, (' ' + term, postcode))
                self._cache.postcodes.add(postcode)

    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
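
    # For example, ['3;5a', '7'] is flattened to ['3', '5a', '7'] (the set
    # used for de-duplication means the order of the result is not defined).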
533 """ Collect token information to be sent back to the database.
535 def __init__(self, cache):
540 def _mk_array(tokens):
541 return '{%s}' % ','.join((str(s) for s in tokens))
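
    # _mk_array() renders token ids as a PostgreSQL array literal, e.g.
    # _mk_array([1, 2, 3]) returns '{1,2,3}'.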

    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, fulls, _):
        """ Add addr:street match terms.
        """
        if fulls:
            self.data['street'] = self._mk_array(fulls)

    def add_place(self, fulls, partials):
        """ Add addr:place search and match terms.
        """
        if fulls:
            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
            self.data['place_match'] = self._mk_array(fulls)

    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, fulls, partials in terms:
            if fulls:
                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
                               self._mk_array(fulls)]

        if tokens:
            self.data['addr'] = tokens
588 """ Cache for token information to avoid repeated database queries.
590 This cache is not thread-safe and needs to be instantiated per
595 self.postcodes = set()
596 self.housenumbers = {}

    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary.
        """
        tokens = []
        askdb = []

        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens