nominatim/tokenizer/icu_tokenizer.py

   1 # SPDX-License-Identifier: GPL-2.0-only
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2022 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Tokenizer implementing normalisation as used before Nominatim 4 but using
   9 libICU instead of the PostgreSQL module.
  10 """
  11 import itertools
  12 import json
  13 import logging
  14 from textwrap import dedent
  15
  16 from nominatim.db.connection import connect
  17 from nominatim.db.utils import CopyBuffer
  18 from nominatim.db.sql_preprocessor import SQLPreprocessor
  19 from nominatim.data.place_info import PlaceInfo
  20 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  21 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  22
  23 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
  24
  25 LOG = logging.getLogger()
  26
  27 def create(dsn, data_dir):
  28     """ Create a new instance of the tokenizer provided by this module.
  29     """
  30     return LegacyICUTokenizer(dsn, data_dir)
  31
  32
  33 class LegacyICUTokenizer(AbstractTokenizer):
  34     """ This tokenizer uses libICU to covert names and queries to ASCII.
  35         Otherwise it uses the same algorithms and data structures as the
  36         normalization routines in Nominatim 3.
  37     """
  38
  39     def __init__(self, dsn, data_dir):
  40         self.dsn = dsn
  41         self.data_dir = data_dir
  42         self.loader = None
  43
  44
  45     def init_new_db(self, config, init_db=True):
  46         """ Set up a new tokenizer for the database.
  47
  48             This copies all necessary data in the project directory to make
  49             sure the tokenizer remains stable even over updates.
  50         """
  51         self.loader = ICURuleLoader(config)
  52
  53         self._install_php(config.lib_dir.php, overwrite=True)
  54         self._save_config()
  55
  56         if init_db:
  57             self.update_sql_functions(config)
  58             self._init_db_tables(config)
  59
  60
  61     def init_from_project(self, config):
  62         """ Initialise the tokenizer from the project directory.
  63         """
  64         self.loader = ICURuleLoader(config)
  65
  66         with connect(self.dsn) as conn:
  67             self.loader.load_config_from_db(conn)
  68
  69         self._install_php(config.lib_dir.php, overwrite=False)
  70
  71
  72     def finalize_import(self, config):
  73         """ Do any required postprocessing to make the tokenizer data ready
  74             for use.
  75         """
  76         with connect(self.dsn) as conn:
  77             sqlp = SQLPreprocessor(conn, config)
  78             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
  79
  80
  81     def update_sql_functions(self, config):
  82         """ Reimport the SQL functions for this tokenizer.
  83         """
  84         with connect(self.dsn) as conn:
  85             sqlp = SQLPreprocessor(conn, config)
  86             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  87
  88
  89     def check_database(self, config):
  90         """ Check that the tokenizer is set up correctly.
  91         """
  92         # Will throw an error if there is an issue.
  93         self.init_from_project(config)
  94
  95
  96     def update_statistics(self):
  97         """ Recompute frequencies for all name words.
  98         """
  99         with connect(self.dsn) as conn:
 100             if conn.table_exists('search_name'):
 101                 with conn.cursor() as cur:
 102                     cur.drop_table("word_frequencies")
 103                     LOG.info("Computing word frequencies")
 104                     cur.execute("""CREATE TEMP TABLE word_frequencies AS
 105                                      SELECT unnest(name_vector) as id, count(*)
 106                                      FROM search_name GROUP BY id""")
 107                     cur.execute("CREATE INDEX ON word_frequencies(id)")
 108                     LOG.info("Update word table with recomputed frequencies")
 109                     cur.execute("""UPDATE word
 110                                    SET info = info || jsonb_build_object('count', count)
 111                                    FROM word_frequencies WHERE word_id = id""")
 112                     cur.drop_table("word_frequencies")
 113             conn.commit()
 114
 115
 116     def _cleanup_housenumbers(self):
 117         """ Remove unused house numbers.
 118         """
 119         with connect(self.dsn) as conn:
 120             if not conn.table_exists('search_name'):
 121                 return
 122             with conn.cursor(name="hnr_counter") as cur:
 123                 cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
 124                                FROM word
 125                                WHERE type = 'H'
 126                                  AND NOT EXISTS(SELECT * FROM search_name
 127                                                 WHERE ARRAY[word.word_id] && name_vector)
 128                                  AND (char_length(coalesce(word, word_token)) > 6
 129                                       OR coalesce(word, word_token) not similar to '\\d+')
 130                             """)
 131                 candidates = {token: wid for wid, token in cur}
 132             with conn.cursor(name="hnr_counter") as cur:
 133                 cur.execute("""SELECT housenumber FROM placex
 134                                WHERE housenumber is not null
 135                                      AND (char_length(housenumber) > 6
 136                                           OR housenumber not similar to '\\d+')
 137                             """)
 138                 for row in cur:
 139                     for hnr in row[0].split(';'):
 140                         candidates.pop(hnr, None)
 141             LOG.info("There are %s outdated housenumbers.", len(candidates))
 142             LOG.debug("Outdated housenumbers: %s", candidates.keys())
 143             if candidates:
 144                 with conn.cursor() as cur:
 145                     cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
 146                                 (list(candidates.values()), ))
 147                 conn.commit()
 148
 149
 150
 151     def update_word_tokens(self):
 152         """ Remove unused tokens.
 153         """
 154         LOG.warning("Cleaning up housenumber tokens.")
 155         self._cleanup_housenumbers()
 156         LOG.warning("Tokenizer house-keeping done.")
 157
 158
 159     def name_analyzer(self):
 160         """ Create a new analyzer for tokenizing names and queries
 161             using this tokinzer. Analyzers are context managers and should
 162             be used accordingly:
 163
 164             ```
 165             with tokenizer.name_analyzer() as analyzer:
 166                 analyser.tokenize()
 167             ```
 168
 169             When used outside the with construct, the caller must ensure to
 170             call the close() function before destructing the analyzer.
 171
 172             Analyzers are not thread-safe. You need to instantiate one per thread.
 173         """
 174         return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
 175                                      self.loader.make_token_analysis())
 176
 177
 178     def _install_php(self, phpdir, overwrite=True):
 179         """ Install the php script for the tokenizer.
 180         """
 181         php_file = self.data_dir / "tokenizer.php"
 182
 183         if not php_file.exists() or overwrite:
 184             php_file.write_text(dedent(f"""\
 185                 <?php
 186                 @define('CONST_Max_Word_Frequency', 10000000);
 187                 @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
 188                 @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
 189                 require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
 190
 191
 192     def _save_config(self):
 193         """ Save the configuration that needs to remain stable for the given
 194             database as database properties.
 195         """
 196         with connect(self.dsn) as conn:
 197             self.loader.save_config_to_db(conn)
 198
 199
 200     def _init_db_tables(self, config):
 201         """ Set up the word table and fill it with pre-computed word
 202             frequencies.
 203         """
 204         with connect(self.dsn) as conn:
 205             sqlp = SQLPreprocessor(conn, config)
 206             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
 207             conn.commit()
 208
 209
 210 class LegacyICUNameAnalyzer(AbstractAnalyzer):
 211     """ The legacy analyzer uses the ICU library for splitting names.
 212
 213         Each instance opens a connection to the database to request the
 214         normalization.
 215     """
 216
 217     def __init__(self, dsn, sanitizer, token_analysis):
 218         self.conn = connect(dsn).connection
 219         self.conn.autocommit = True
 220         self.sanitizer = sanitizer
 221         self.token_analysis = token_analysis
 222
 223         self._cache = _TokenCache()
 224
 225
 226     def close(self):
 227         """ Free all resources used by the analyzer.
 228         """
 229         if self.conn:
 230             self.conn.close()
 231             self.conn = None
 232
 233
 234     def _search_normalized(self, name):
 235         """ Return the search token transliteration of the given name.
 236         """
 237         return self.token_analysis.search.transliterate(name).strip()
 238
 239
 240     def _normalized(self, name):
 241         """ Return the normalized version of the given name with all
 242             non-relevant information removed.
 243         """
 244         return self.token_analysis.normalizer.transliterate(name).strip()
 245
 246
 247     def get_word_token_info(self, words):
 248         """ Return token information for the given list of words.
 249             If a word starts with # it is assumed to be a full name
 250             otherwise is a partial name.
 251
 252             The function returns a list of tuples with
 253             (original word, word token, word id).
 254
 255             The function is used for testing and debugging only
 256             and not necessarily efficient.
 257         """
 258         full_tokens = {}
 259         partial_tokens = {}
 260         for word in words:
 261             if word.startswith('#'):
 262                 full_tokens[word] = self._search_normalized(word[1:])
 263             else:
 264                 partial_tokens[word] = self._search_normalized(word)
 265
 266         with self.conn.cursor() as cur:
 267             cur.execute("""SELECT word_token, word_id
 268                             FROM word WHERE word_token = ANY(%s) and type = 'W'
 269                         """, (list(full_tokens.values()),))
 270             full_ids = {r[0]: r[1] for r in cur}
 271             cur.execute("""SELECT word_token, word_id
 272                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
 273                         (list(partial_tokens.values()),))
 274             part_ids = {r[0]: r[1] for r in cur}
 275
 276         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
 277                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
 278
 279
 280     def normalize_postcode(self, postcode):
 281         """ Convert the postcode to a standardized form.
 282
 283             This function must yield exactly the same result as the SQL function
 284             'token_normalized_postcode()'.
 285         """
 286         return postcode.strip().upper()
 287
 288
 289     def update_postcodes_from_db(self):
 290         """ Update postcode tokens in the word table from the location_postcode
 291             table.
 292         """
 293         analyzer = self.token_analysis.analysis.get('@postcode')
 294
 295         with self.conn.cursor() as cur:
 296             # First get all postcode names currently in the word table.
 297             cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
 298             word_entries = set((entry[0] for entry in cur))
 299
 300             # Then compute the required postcode names from the postcode table.
 301             needed_entries = set()
 302             cur.execute("SELECT country_code, postcode FROM location_postcode")
 303             for cc, postcode in cur:
 304                 info = PlaceInfo({'country_code': cc,
 305                                   'class': 'place', 'type': 'postcode',
 306                                   'address': {'postcode': postcode}})
 307                 address = self.sanitizer.process_names(info)[1]
 308                 for place in address:
 309                     if place.kind == 'postcode':
 310                         if analyzer is None:
 311                             postcode_name = place.name.strip().upper()
 312                             variant_base = None
 313                         else:
 314                             postcode_name = analyzer.normalize(place.name)
 315                             variant_base = place.get_attr("variant")
 316
 317                         if variant_base:
 318                             needed_entries.add(f'{postcode_name}@{variant_base}')
 319                         else:
 320                             needed_entries.add(postcode_name)
 321                         break
 322
 323         # Now update the word table.
 324         self._delete_unused_postcode_words(word_entries - needed_entries)
 325         self._add_missing_postcode_words(needed_entries - word_entries)
 326
 327     def _delete_unused_postcode_words(self, tokens):
 328         if tokens:
 329             with self.conn.cursor() as cur:
 330                 cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
 331                             (list(tokens), ))
 332
 333     def _add_missing_postcode_words(self, tokens):
 334         if not tokens:
 335             return
 336
 337         analyzer = self.token_analysis.analysis.get('@postcode')
 338         terms = []
 339
 340         for postcode_name in tokens:
 341             if '@' in postcode_name:
 342                 term, variant = postcode_name.split('@', 2)
 343                 term = self._search_normalized(term)
 344                 variants = {term}
 345                 if analyzer is not None:
 346                     variants.update(analyzer.get_variants_ascii(variant))
 347                     variants = list(variants)
 348             else:
 349                 variants = [self._search_normalized(postcode_name)]
 350             terms.append((postcode_name, variants))
 351
 352         if terms:
 353             with self.conn.cursor() as cur:
 354                 cur.execute_values("""SELECT create_postcode_word(pc, var)
 355                                       FROM (VALUES %s) AS v(pc, var)""",
 356                                    terms)
 357
 358
 359
 360
 361     def update_special_phrases(self, phrases, should_replace):
 362         """ Replace the search index for special phrases with the new phrases.
 363             If `should_replace` is True, then the previous set of will be
 364             completely replaced. Otherwise the phrases are added to the
 365             already existing ones.
 366         """
 367         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
 368                             for p in phrases))
 369
 370         with self.conn.cursor() as cur:
 371             # Get the old phrases.
 372             existing_phrases = set()
 373             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
 374             for word, info in cur:
 375                 existing_phrases.add((word, info['class'], info['type'],
 376                                       info.get('op') or '-'))
 377
 378             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
 379             if should_replace:
 380                 deleted = self._remove_special_phrases(cur, norm_phrases,
 381                                                        existing_phrases)
 382             else:
 383                 deleted = 0
 384
 385         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 386                  len(norm_phrases), added, deleted)
 387
 388
 389     def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
 390         """ Add all phrases to the database that are not yet there.
 391         """
 392         to_add = new_phrases - existing_phrases
 393
 394         added = 0
 395         with CopyBuffer() as copystr:
 396             for word, cls, typ, oper in to_add:
 397                 term = self._search_normalized(word)
 398                 if term:
 399                     copystr.add(term, 'S', word,
 400                                 json.dumps({'class': cls, 'type': typ,
 401                                             'op': oper if oper in ('in', 'near') else None}))
 402                     added += 1
 403
 404             copystr.copy_out(cursor, 'word',
 405                              columns=['word_token', 'type', 'word', 'info'])
 406
 407         return added
 408
 409
 410     @staticmethod
 411     def _remove_special_phrases(cursor, new_phrases, existing_phrases):
 412         """ Remove all phrases from the databse that are no longer in the
 413             new phrase list.
 414         """
 415         to_delete = existing_phrases - new_phrases
 416
 417         if to_delete:
 418             cursor.execute_values(
 419                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 420                     WHERE type = 'S' and word = name
 421                           and info->>'class' = in_class and info->>'type' = in_type
 422                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
 423                 """, to_delete)
 424
 425         return len(to_delete)
 426
 427
 428     def add_country_names(self, country_code, names):
 429         """ Add default names for the given country to the search index.
 430         """
 431         # Make sure any name preprocessing for country names applies.
 432         info = PlaceInfo({'name': names, 'country_code': country_code,
 433                           'rank_address': 4, 'class': 'boundary',
 434                           'type': 'administrative'})
 435         self._add_country_full_names(country_code,
 436                                      self.sanitizer.process_names(info)[0],
 437                                      internal=True)
 438
 439
 440     def _add_country_full_names(self, country_code, names, internal=False):
 441         """ Add names for the given country from an already sanitized
 442             name list.
 443         """
 444         word_tokens = set()
 445         for name in names:
 446             norm_name = self._search_normalized(name.name)
 447             if norm_name:
 448                 word_tokens.add(norm_name)
 449
 450         with self.conn.cursor() as cur:
 451             # Get existing names
 452             cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
 453                              FROM word
 454                              WHERE type = 'C' and word = %s""",
 455                         (country_code, ))
 456             existing_tokens = {True: set(), False: set()} # internal/external names
 457             for word in cur:
 458                 existing_tokens[word[1]].add(word[0])
 459
 460             # Delete names that no longer exist.
 461             gone_tokens = existing_tokens[internal] - word_tokens
 462             if internal:
 463                 gone_tokens.update(existing_tokens[False] & word_tokens)
 464             if gone_tokens:
 465                 cur.execute("""DELETE FROM word
 466                                USING unnest(%s) as token
 467                                WHERE type = 'C' and word = %s
 468                                      and word_token = token""",
 469                             (list(gone_tokens), country_code))
 470
 471             # Only add those names that are not yet in the list.
 472             new_tokens = word_tokens - existing_tokens[True]
 473             if not internal:
 474                 new_tokens -= existing_tokens[False]
 475             if new_tokens:
 476                 if internal:
 477                     sql = """INSERT INTO word (word_token, type, word, info)
 478                                (SELECT token, 'C', %s, '{"internal": "yes"}'
 479                                   FROM unnest(%s) as token)
 480                            """
 481                 else:
 482                     sql = """INSERT INTO word (word_token, type, word)
 483                                    (SELECT token, 'C', %s
 484                                     FROM unnest(%s) as token)
 485                           """
 486                 cur.execute(sql, (country_code, list(new_tokens)))
 487
 488
 489     def process_place(self, place):
 490         """ Determine tokenizer information about the given place.
 491
 492             Returns a JSON-serializable structure that will be handed into
 493             the database via the token_info field.
 494         """
 495         token_info = _TokenInfo()
 496
 497         names, address = self.sanitizer.process_names(place)
 498
 499         if names:
 500             token_info.set_names(*self._compute_name_tokens(names))
 501
 502             if place.is_country():
 503                 self._add_country_full_names(place.country_code, names)
 504
 505         if address:
 506             self._process_place_address(token_info, address)
 507
 508         return token_info.to_dict()
 509
 510
 511     def _process_place_address(self, token_info, address):
 512         for item in address:
 513             if item.kind == 'postcode':
 514                 token_info.set_postcode(self._add_postcode(item))
 515             elif item.kind == 'housenumber':
 516                 token_info.add_housenumber(*self._compute_housenumber_token(item))
 517             elif item.kind == 'street':
 518                 token_info.add_street(self._retrieve_full_tokens(item.name))
 519             elif item.kind == 'place':
 520                 if not item.suffix:
 521                     token_info.add_place(self._compute_partial_tokens(item.name))
 522             elif not item.kind.startswith('_') and not item.suffix and \
 523                  item.kind not in ('country', 'full', 'inclusion'):
 524                 token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
 525
 526
 527     def _compute_housenumber_token(self, hnr):
 528         """ Normalize the housenumber and return the word token and the
 529             canonical form.
 530         """
 531         analyzer = self.token_analysis.analysis.get('@housenumber')
 532         result = None, None
 533
 534         if analyzer is None:
 535             # When no custom analyzer is set, simply normalize and transliterate
 536             norm_name = self._search_normalized(hnr.name)
 537             if norm_name:
 538                 result = self._cache.housenumbers.get(norm_name, result)
 539                 if result[0] is None:
 540                     with self.conn.cursor() as cur:
 541                         cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
 542                         result = cur.fetchone()[0], norm_name
 543                         self._cache.housenumbers[norm_name] = result
 544         else:
 545             # Otherwise use the analyzer to determine the canonical name.
 546             # Per convention we use the first variant as the 'lookup name', the
 547             # name that gets saved in the housenumber field of the place.
 548             norm_name = analyzer.normalize(hnr.name)
 549             if norm_name:
 550                 result = self._cache.housenumbers.get(norm_name, result)
 551                 if result[0] is None:
 552                     variants = analyzer.get_variants_ascii(norm_name)
 553                     if variants:
 554                         with self.conn.cursor() as cur:
 555                             cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
 556                                         (norm_name, list(variants)))
 557                             result = cur.fetchone()[0], variants[0]
 558                             self._cache.housenumbers[norm_name] = result
 559
 560         return result
 561
 562
 563     def _compute_partial_tokens(self, name):
 564         """ Normalize the given term, split it into partial words and return
 565             then token list for them.
 566         """
 567         norm_name = self._search_normalized(name)
 568
 569         tokens = []
 570         need_lookup = []
 571         for partial in norm_name.split():
 572             token = self._cache.partials.get(partial)
 573             if token:
 574                 tokens.append(token)
 575             else:
 576                 need_lookup.append(partial)
 577
 578         if need_lookup:
 579             with self.conn.cursor() as cur:
 580                 cur.execute("""SELECT word, getorcreate_partial_word(word)
 581                                FROM unnest(%s) word""",
 582                             (need_lookup, ))
 583
 584                 for partial, token in cur:
 585                     tokens.append(token)
 586                     self._cache.partials[partial] = token
 587
 588         return tokens
 589
 590
 591     def _retrieve_full_tokens(self, name):
 592         """ Get the full name token for the given name, if it exists.
 593             The name is only retrived for the standard analyser.
 594         """
 595         norm_name = self._search_normalized(name)
 596
 597         # return cached if possible
 598         if norm_name in self._cache.fulls:
 599             return self._cache.fulls[norm_name]
 600
 601         with self.conn.cursor() as cur:
 602             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
 603                         (norm_name, ))
 604             full = [row[0] for row in cur]
 605
 606         self._cache.fulls[norm_name] = full
 607
 608         return full
 609
 610
 611     def _compute_name_tokens(self, names):
 612         """ Computes the full name and partial name tokens for the given
 613             dictionary of names.
 614         """
 615         full_tokens = set()
 616         partial_tokens = set()
 617
 618         for name in names:
 619             analyzer_id = name.get_attr('analyzer')
 620             analyzer = self.token_analysis.get_analyzer(analyzer_id)
 621             norm_name = analyzer.normalize(name.name)
 622             if analyzer_id is None:
 623                 token_id = norm_name
 624             else:
 625                 token_id = f'{norm_name}@{analyzer_id}'
 626
 627             full, part = self._cache.names.get(token_id, (None, None))
 628             if full is None:
 629                 variants = analyzer.get_variants_ascii(norm_name)
 630                 if not variants:
 631                     continue
 632
 633                 with self.conn.cursor() as cur:
 634                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
 635                                 (token_id, variants))
 636                     full, part = cur.fetchone()
 637
 638                 self._cache.names[token_id] = (full, part)
 639
 640             full_tokens.add(full)
 641             partial_tokens.update(part)
 642
 643         return full_tokens, partial_tokens
 644
 645
 646     def _add_postcode(self, item):
 647         """ Make sure the normalized postcode is present in the word table.
 648         """
 649         analyzer = self.token_analysis.analysis.get('@postcode')
 650
 651         if analyzer is None:
 652             postcode_name = item.name.strip().upper()
 653             variant_base = None
 654         else:
 655             postcode_name = analyzer.normalize(item.name)
 656             variant_base = item.get_attr("variant")
 657
 658         if variant_base:
 659             postcode = f'{postcode_name}@{variant_base}'
 660         else:
 661             postcode = postcode_name
 662
 663         if postcode not in self._cache.postcodes:
 664             term = self._search_normalized(postcode_name)
 665             if not term:
 666                 return None
 667
 668             variants = {term}
 669             if analyzer is not None and variant_base:
 670                 variants.update(analyzer.get_variants_ascii(variant_base))
 671
 672             with self.conn.cursor() as cur:
 673                 cur.execute("SELECT create_postcode_word(%s, %s)",
 674                             (postcode, list(variants)))
 675             self._cache.postcodes.add(postcode)
 676
 677         return postcode_name
 678
 679
 680 class _TokenInfo:
 681     """ Collect token information to be sent back to the database.
 682     """
 683     def __init__(self):
 684         self.names = None
 685         self.housenumbers = set()
 686         self.housenumber_tokens = set()
 687         self.street_tokens = set()
 688         self.place_tokens = set()
 689         self.address_tokens = {}
 690         self.postcode = None
 691
 692
 693     @staticmethod
 694     def _mk_array(tokens):
 695         return f"{{{','.join((str(s) for s in tokens))}}}"
 696
 697
 698     def to_dict(self):
 699         """ Return the token information in database importable format.
 700         """
 701         out = {}
 702
 703         if self.names:
 704             out['names'] = self.names
 705
 706         if self.housenumbers:
 707             out['hnr'] = ';'.join(self.housenumbers)
 708             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
 709
 710         if self.street_tokens:
 711             out['street'] = self._mk_array(self.street_tokens)
 712
 713         if self.place_tokens:
 714             out['place'] = self._mk_array(self.place_tokens)
 715
 716         if self.address_tokens:
 717             out['addr'] = self.address_tokens
 718
 719         if self.postcode:
 720             out['postcode'] = self.postcode
 721
 722         return out
 723
 724
 725     def set_names(self, fulls, partials):
 726         """ Adds token information for the normalised names.
 727         """
 728         self.names = self._mk_array(itertools.chain(fulls, partials))
 729
 730
 731     def add_housenumber(self, token, hnr):
 732         """ Extract housenumber information from a list of normalised
 733             housenumbers.
 734         """
 735         if token:
 736             self.housenumbers.add(hnr)
 737             self.housenumber_tokens.add(token)
 738
 739
 740     def add_street(self, tokens):
 741         """ Add addr:street match terms.
 742         """
 743         self.street_tokens.update(tokens)
 744
 745
 746     def add_place(self, tokens):
 747         """ Add addr:place search and match terms.
 748         """
 749         self.place_tokens.update(tokens)
 750
 751
 752     def add_address_term(self, key, partials):
 753         """ Add additional address terms.
 754         """
 755         if partials:
 756             self.address_tokens[key] = self._mk_array(partials)
 757
 758     def set_postcode(self, postcode):
 759         """ Set the postcode to the given one.
 760         """
 761         self.postcode = postcode
 762
 763
 764 class _TokenCache:
 765     """ Cache for token information to avoid repeated database queries.
 766
 767         This cache is not thread-safe and needs to be instantiated per
 768         analyzer.
 769     """
 770     def __init__(self):
 771         self.names = {}
 772         self.partials = {}
 773         self.fulls = {}
 774         self.postcodes = set()
 775         self.housenumbers = {}