"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import json
import logging
import re
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()
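
# Token types used in the 'word' table by this tokenizer:
#   'W' - full name, 'w' - partial word, 'P' - postcode,
#   'S' - special phrase, 'C' - country name.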

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None
        self.term_normalization = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self.term_normalization = config.TERM_NORMALIZATION

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)

    def finalize_import(self, _):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project(config)

        if self.term_normalization is None:
            return "Configuration for tokenizer 'icu' is missing."

        return None

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.process_place(...)
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())

    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))

    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = self._count_partial_terms(conn)

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for term, cnt in words.items():
                    copystr.add('w', term, json.dumps({'count': cnt}))

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['type', 'word_token', 'info'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null and type = 'w'""")

            conn.commit()

    def _count_partial_terms(self, conn):
        """ Count the partial terms from the names in the place table.
        """
        words = Counter()
        name_proc = self.loader.make_token_analysis()

        with conn.cursor(name="words") as cur:
            cur.execute(""" SELECT v, count(*) FROM
                              (SELECT svals(name) as v FROM place)x
                            WHERE length(v) < 75 GROUP BY v""")

            for name, cnt in cur:
                terms = set()
                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                    if ' ' in word:
                        terms.update(word.split())
                for term in terms:
                    words[term] += cnt

        return words


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()
202 """ Free all resources used by the analyzer.

    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.get_search_normalized(name)

    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.get_normalized(name)

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
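        # Sort the words into full names (prefixed with '#') and partial
        # words and compute their search-normalized tokens.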
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]

    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
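        # Trim surrounding whitespace and upper-case,
        # e.g. ' ab1 2cd ' becomes 'AB1 2CD'.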
        return postcode.strip().upper()

    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self._search_normalized(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")
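
            # A row with pc null refers to a word entry whose postcode has
            # disappeared from location_postcode; a row with word null is a
            # postcode that has no word entry yet.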

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases will be
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
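            # A phrase without an explicit operator is recorded as '-' so it
            # can be compared against the normalised input phrases.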
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added

    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase set.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0])

    def _add_country_full_names(self, country_code, names):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names.
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)
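
        # The sanitizer splits the place's tags into name and address items
        # before any tokens are computed.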
        names, address = self.sanitizer.process_names(place)

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.data

    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        for item in address:
            if item.kind == 'postcode':
                self._add_postcode(item.name)
            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(item.name)
            elif item.kind == 'street':
                token_info.add_street(self._compute_partial_tokens(item.name))
            elif item.kind == 'place':
                token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and \
                 item.kind not in ('country', 'full'):
                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)

    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
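        # The term is transliterated and split into words; each word is
        # looked up in the local cache first and only unknown words are
        # created in the database.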
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens

    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            list of names.
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            norm_name = self._normalized(name.name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
                variants = self.token_analysis.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self._search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)

    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
556 """ Collect token information to be sent back to the database.
558 def __init__(self, cache):

    @staticmethod
    def _mk_array(tokens):
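        # Render the tokens as a PostgreSQL array literal, e.g. '{1,2,3}'.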
        return '{%s}' % ','.join((str(s) for s in tokens))

    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        if tokens:
            self.data['street'] = self._mk_array(tokens)

    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        if tokens:
            self.data['place'] = self._mk_array(tokens)

    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {key: self._mk_array(partials)
                  for key, partials in terms if partials}

        if tokens:
            self.data['addr'] = tokens
606 """ Cache for token information to avoid repeated database queries.
608 This cache is not thread-safe and needs to be instantiated per
614 self.postcodes = set()
615 self.housenumbers = {}

    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
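        # Serve known housenumbers from the cache; only unknown ones are
        # looked up (and created) in the database.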
        tokens = []
        askdb = []
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens