# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
import itertools
import json
import logging
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php, overwrite=True)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

        self._install_php(config.lib_dir.php, overwrite=False)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)


    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
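                    # The recomputed count is merged into the JSONB 'info'
                    # column, so any other attributes stored there are kept.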
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()


    def _cleanup_housenumbers(self):
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return
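            # Candidates for removal are housenumber tokens that are no longer
            # referenced from search_name. Plain numeric housenumbers of up to
            # six characters are never cleaned up.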
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
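            # Drop all candidates that still appear as housenumbers in placex.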
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()



    def update_word_tokens(self):
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must make sure
            to call close() before the analyzer is discarded.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())


    def _install_php(self, phpdir, overwrite=True):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"

        if not php_file.exists() or overwrite:
            php_file.write_text(dedent(f"""\
                <?php
                @define('CONST_Max_Word_Frequency', 10000000);
                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')


    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()


    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with #, it is assumed to be a full name;
            otherwise it is treated as a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is meant for testing and debugging only
            and is not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

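        # In the word table, type 'W' marks full-name tokens and type 'w'
        # marks partial-word tokens.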
        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

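        # A word id of None in the result means that the token is not present
        # in the word table.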
        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]


    def normalize_postcode(self, postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

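        # Each phrase is handled as a tuple of
        # (normalized label, class, type, operator).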
        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)


    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added


    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)


    def add_country_names(self, country_code, names):
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)


    def _add_country_full_names(self, country_code, names, internal=False):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            existing_tokens = {True: set(), False: set()} # internal/external names
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                                   (SELECT token, 'C', %s
                                    FROM unnest(%s) as token)
                          """
                cur.execute(sql, (country_code, list(new_tokens)))


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
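        # For illustration only: the resulting dictionary may contain the keys
        # 'names', 'hnr', 'hnr_tokens', 'street', 'place' and 'addr'
        # (see _TokenInfo.to_dict() below), roughly like
        #   {'names': '{1,2,3}', 'hnr': '4;4a', 'hnr_tokens': '{5,6}'}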
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()


    def _process_place_address(self, token_info, address):
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full', 'inclusion'):
                token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))


    def _compute_housenumber_token(self, hnr):
        """ Normalize the housenumber and return the word token and the
            canonical form.
        """
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result = None, None
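        # 'result' is a pair of (word token id, canonical housenumber form).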

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    with self.conn.cursor() as cur:
                        cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
                        result = cur.fetchone()[0], norm_name
                        self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            norm_name = analyzer.normalize(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    variants = analyzer.get_variants_ascii(norm_name)
                    if variants:
                        with self.conn.cursor() as cur:
                            cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
                                        (norm_name, list(variants)))
                            result = cur.fetchone()[0], variants[0]
                            self._cache.housenumbers[norm_name] = result

        return result


    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens


    def _retrieve_full_tokens(self, name):
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyzer.
        """
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full


    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            norm_name = analyzer.normalize(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

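            # Names handled by a non-default analyzer carry the analyzer id in
            # the key so that their variants do not collide with those from
            # other analyzers.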
            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    def _add_postcode(self, item):
        """ Make sure the normalized postcode is present in the word table.
        """
        analyzer = self.token_analysis.get_analyzer('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.normalize(item.name)
            variant_base = item.get_attr("variant")

        if variant_base is not None:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

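        # Postcodes with a variant base are keyed as '<name>@<variant>' both in
        # the cache and in the call to create_postcode_word() below.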
        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return

            variants = {term}
            if analyzer is not None and variant_base is not None:
                variants.update(analyzer.get_variants_ascii(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self):
        self.names = None
        self.housenumbers = set()
        self.housenumber_tokens = set()
        self.street_tokens = set()
        self.place_tokens = set()
        self.address_tokens = {}
        self.postcode = None


    @staticmethod
    def _mk_array(tokens):
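        """ Format the given tokens as a PostgreSQL array literal,
            for example '{1,2,3}'.
        """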
        return f"{{{','.join((str(s) for s in tokens))}}}"


    def to_dict(self):
        """ Return the token information in database importable format.
        """
        out = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        return out


    def set_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumber(self, token, hnr):
        """ Add the token and canonical form of a single normalised
            housenumber.
        """
        if token:
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)


    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        self.street_tokens.update(tokens)


    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)


    def add_address_term(self, key, partials):
        """ Add additional address terms.
        """
        if partials:
            self.address_tokens[key] = self._mk_array(partials)

    def set_postcode(self, postcode):
        """ Set the postcode to the given one.
        """
        self.postcode = postcode


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.partials = {}
        self.fulls = {}
        self.postcodes = set()
        self.housenumbers = {}