""" Tokenizer implementing normalisation as used before Nominatim 4 but using
    libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import json
import logging
import re
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()
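
# Token types used in the word table throughout this module:
#   'W' - full names, 'w' - partial name words, 'P' - postcodes,
#   'S' - special phrases, 'C' - country names.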


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None
        self.term_normalization = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)
        self.term_normalization = config.TERM_NORMALIZATION

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)
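
    # Sketch of the intended call sequence (variable names are illustrative):
    #
    #   tokenizer = create(dsn, data_dir)
    #   tokenizer.init_new_db(config)         # fresh import: save config, set up tables
    #   ...
    #   tokenizer = create(dsn, data_dir)
    #   tokenizer.init_from_project(config)   # later runs: reload the saved config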

    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)

    def finalize_import(self, _):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project(config)

        if self.term_normalization is None:
            return "Configuration for tokenizer 'icu' is missing."

        return None

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis())

    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
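
    # The constants written above are read by the PHP side of Nominatim; the
    # normalization and transliteration rules handed over here are meant to
    # match the rules saved to the database so that PHP and Python tokenize
    # queries the same way.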

    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = self._count_partial_terms(conn)

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for term, cnt in words.items():
                    copystr.add('w', term, json.dumps({'count': cnt}))

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['type', 'word_token', 'info'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null and type = 'w'""")

            conn.commit()
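
    # Note on _init_db_tables(): the COPY above fills only type/word_token/info,
    # so word_id is assigned afterwards from the seq_word sequence for the
    # freshly inserted partial ('w') rows.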

    def _count_partial_terms(self, conn):
        """ Count the partial terms from the names in the place table.
        """
        words = Counter()
        name_proc = self.loader.make_token_analysis()

        with conn.cursor(name="words") as cur:
            cur.execute(""" SELECT v, count(*) FROM
                              (SELECT svals(name) as v FROM place)x
                            WHERE length(v) < 75 GROUP BY v""")

            for name, cnt in cur:
                terms = set()
                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                    if ' ' in word:
                        terms.update(word.split())
                for term in terms:
                    words[term] += cnt

        return words
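
    # Only name variants that contain a space contribute partial words above;
    # single-word names are already complete tokens and are (presumably for
    # that reason) not counted here.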


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """
    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.name_processor = name_proc
        self._cache = _TokenCache()

    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
            else:
                partial_tokens[word] = self.name_processor.get_search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                           FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
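
    # Usage sketch (the exact word tokens depend on the configured ICU rules):
    #
    #   analyzer.get_word_token_info(['#Main Street', 'main'])
    #   -> [('#Main Street', <token>, <word_id or None>),
    #       ('main', <token>, <word_id or None>)]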

    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()

    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self.name_processor.get_search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])
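
    # The diff query above returns postcodes that are missing from the word
    # table (word is null) and word entries with no matching postcode
    # (pc is null); the former are copied in, the latter deleted.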

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases
            will be completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self.name_processor.get_search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added

    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase set.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)
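
    # The '-' placeholder mirrors how existing phrases are read in
    # update_special_phrases(): a missing operator is stored as '-' so the
    # tuples compare equal, and the DELETE above maps '-' back to a NULL op.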

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            norm_name = self.name_processor.get_search_normalized(name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names.
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.name
        if names:
            fulls, partials = self._compute_name_tokens(names)
            token_info.add_names(fulls, partials)

            if place.is_country():
                self.add_country_names(place.country_code, names)

        address = place.address
        if address:
            self._process_place_address(token_info, address)

        return token_info.data
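
    # The returned dict is what goes into the token_info field. Depending on
    # the place it may contain the keys set by _TokenInfo below: 'names',
    # 'hnr_tokens', 'hnr', 'street', 'place' and 'addr'.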

    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        for key, value in address.items():
            if key == 'postcode':
                self._add_postcode(value)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(self._compute_partial_tokens(value))
            elif key == 'place':
                token_info.add_place(self._compute_partial_tokens(value))
            elif not key.startswith('_') and \
                 key not in ('country', 'full'):
                addr_terms.append((key, self._compute_partial_tokens(value)))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)

    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self.name_processor.get_search_normalized(name)
        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))
                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens
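
    # Partial-word tokens are cached on the analyzer; only words not seen
    # before go through the getorcreate_partial_word() SQL function, which is
    # expected to create missing word table entries on the fly.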

    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens
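
    # getorcreate_full_word() is expected to return a pair of (full name token,
    # array of partial tokens) for the normalised name and its transliterated
    # variants; the pair is cached so repeated names hit the database only once.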

    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full names to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
            if name:
                full_names.add(name)

                brace_idx = name.find('(')
                if brace_idx >= 0:
                    full_names.add(name[:brace_idx].strip())

        return full_names

    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.name_processor.get_search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)
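
    # Postcodes containing ':', ',' or ';' are skipped above; such values are
    # most likely lists or otherwise malformed and do not get a word table
    # entry.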

    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}

    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))

    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        if tokens:
            self.data['street'] = self._mk_array(tokens)

    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        if tokens:
            self.data['place'] = self._mk_array(tokens)

    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {key: self._mk_array(partials)
                  for key, partials in terms if partials}
        if tokens:
            self.data['addr'] = tokens
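
# Note: _TokenInfo._mk_array() produces a PostgreSQL array literal, e.g. the
# tokens 1, 2 and 3 become the string '{1,2,3}'.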


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.partials = {}
        self.postcodes = set()
        self.housenumbers = {}

    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens