nominatim/tokenizer/icu_tokenizer.py

   1 # SPDX-License-Identifier: GPL-2.0-only
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2022 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Tokenizer implementing normalisation as used before Nominatim 4 but using
   9 libICU instead of the PostgreSQL module.
  10 """
  11 import itertools
  12 import json
  13 import logging
  14 import re
  15 from textwrap import dedent
  16
  17 from nominatim.db.connection import connect
  18 from nominatim.db.utils import CopyBuffer
  19 from nominatim.db.sql_preprocessor import SQLPreprocessor
  20 from nominatim.indexer.place_info import PlaceInfo
  21 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  22 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  23
# Presumably the name of a database property used by the tokenizer. It is
# not referenced by the code in this part of the file -- TODO confirm usage.
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

# Logger used for all messages of this module. getLogger() without a name
# returns the root logger.
LOG = logging.getLogger()
  27
  28 def create(dsn, data_dir):
  29     """ Create a new instance of the tokenizer provided by this module.
  30     """
  31     return LegacyICUTokenizer(dsn, data_dir)
  32
  33
  34 class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
  36         Otherwise it uses the same algorithms and data structures as the
  37         normalization routines in Nominatim 3.
  38     """
  39
  40     def __init__(self, dsn, data_dir):
  41         self.dsn = dsn
  42         self.data_dir = data_dir
  43         self.loader = None
  44
  45
  46     def init_new_db(self, config, init_db=True):
  47         """ Set up a new tokenizer for the database.
  48
  49             This copies all necessary data in the project directory to make
  50             sure the tokenizer remains stable even over updates.
  51         """
  52         self.loader = ICURuleLoader(config)
  53
  54         self._install_php(config.lib_dir.php, overwrite=True)
  55         self._save_config()
  56
  57         if init_db:
  58             self.update_sql_functions(config)
  59             self._init_db_tables(config)
  60
  61
  62     def init_from_project(self, config):
  63         """ Initialise the tokenizer from the project directory.
  64         """
  65         self.loader = ICURuleLoader(config)
  66
  67         with connect(self.dsn) as conn:
  68             self.loader.load_config_from_db(conn)
  69
  70         self._install_php(config.lib_dir.php, overwrite=False)
  71
  72
  73     def finalize_import(self, config):
  74         """ Do any required postprocessing to make the tokenizer data ready
  75             for use.
  76         """
  77         with connect(self.dsn) as conn:
  78             sqlp = SQLPreprocessor(conn, config)
  79             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
  80
  81
  82     def update_sql_functions(self, config):
  83         """ Reimport the SQL functions for this tokenizer.
  84         """
  85         with connect(self.dsn) as conn:
  86             sqlp = SQLPreprocessor(conn, config)
  87             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  88
  89
  90     def check_database(self, config):
  91         """ Check that the tokenizer is set up correctly.
  92         """
  93         # Will throw an error if there is an issue.
  94         self.init_from_project(config)
  95
  96
  97     def update_statistics(self):
  98         """ Recompute frequencies for all name words.
  99         """
 100         with connect(self.dsn) as conn:
 101             if conn.table_exists('search_name'):
 102                 with conn.cursor() as cur:
 103                     cur.drop_table("word_frequencies")
 104                     LOG.info("Computing word frequencies")
 105                     cur.execute("""CREATE TEMP TABLE word_frequencies AS
 106                                      SELECT unnest(name_vector) as id, count(*)
 107                                      FROM search_name GROUP BY id""")
 108                     cur.execute("CREATE INDEX ON word_frequencies(id)")
 109                     LOG.info("Update word table with recomputed frequencies")
 110                     cur.execute("""UPDATE word
 111                                    SET info = info || jsonb_build_object('count', count)
 112                                    FROM word_frequencies WHERE word_id = id""")
 113                     cur.drop_table("word_frequencies")
 114             conn.commit()
 115
 116
 117     def _cleanup_housenumbers(self):
 118         """ Remove unused house numbers.
 119         """
 120         with connect(self.dsn) as conn:
 121             if not conn.table_exists('search_name'):
 122                 return
 123             with conn.cursor(name="hnr_counter") as cur:
 124                 cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
 125                                FROM word
 126                                WHERE type = 'H'
 127                                  AND NOT EXISTS(SELECT * FROM search_name
 128                                                 WHERE ARRAY[word.word_id] && name_vector)
 129                                  AND (char_length(coalesce(word, word_token)) > 6
 130                                       OR coalesce(word, word_token) not similar to '\\d+')
 131                             """)
 132                 candidates = {token: wid for wid, token in cur}
 133             with conn.cursor(name="hnr_counter") as cur:
 134                 cur.execute("""SELECT housenumber FROM placex
 135                                WHERE housenumber is not null
 136                                      AND (char_length(housenumber) > 6
 137                                           OR housenumber not similar to '\\d+')
 138                             """)
 139                 for row in cur:
 140                     for hnr in row[0].split(';'):
 141                         candidates.pop(hnr, None)
 142             LOG.info("There are %s outdated housenumbers.", len(candidates))
 143             LOG.debug("Outdated housenumbers: %s", candidates.keys())
 144             if candidates:
 145                 with conn.cursor() as cur:
 146                     cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
 147                                 (list(candidates.values()), ))
 148                 conn.commit()
 149
 150
 151
    def update_word_tokens(self):
        """ Remove unused tokens.

            The only clean-up done here is the removal of unused
            housenumber tokens via `_cleanup_housenumbers()`.
        """
        # Progress is reported at warning level.
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")
 158
 159
 160     def name_analyzer(self):
 161         """ Create a new analyzer for tokenizing names and queries
 162             using this tokinzer. Analyzers are context managers and should
 163             be used accordingly:
 164
 165             ```
 166             with tokenizer.name_analyzer() as analyzer:
 167                 analyser.tokenize()
 168             ```
 169
 170             When used outside the with construct, the caller must ensure to
 171             call the close() function before destructing the analyzer.
 172
 173             Analyzers are not thread-safe. You need to instantiate one per thread.
 174         """
 175         return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
 176                                      self.loader.make_token_analysis())
 177
 178
 179     def _install_php(self, phpdir, overwrite=True):
 180         """ Install the php script for the tokenizer.
 181         """
 182         php_file = self.data_dir / "tokenizer.php"
 183
 184         if not php_file.exists() or overwrite:
 185             php_file.write_text(dedent(f"""\
 186                 <?php
 187                 @define('CONST_Max_Word_Frequency', 10000000);
 188                 @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
 189                 @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
 190                 require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
 191
 192
 193     def _save_config(self):
 194         """ Save the configuration that needs to remain stable for the given
 195             database as database properties.
 196         """
 197         with connect(self.dsn) as conn:
 198             self.loader.save_config_to_db(conn)
 199
 200
 201     def _init_db_tables(self, config):
 202         """ Set up the word table and fill it with pre-computed word
 203             frequencies.
 204         """
 205         with connect(self.dsn) as conn:
 206             sqlp = SQLPreprocessor(conn, config)
 207             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
 208             conn.commit()
 209
 210
 211 class LegacyICUNameAnalyzer(AbstractAnalyzer):
 212     """ The legacy analyzer uses the ICU library for splitting names.
 213
 214         Each instance opens a connection to the database to request the
 215         normalization.
 216     """
 217
 218     def __init__(self, dsn, sanitizer, token_analysis):
 219         self.conn = connect(dsn).connection
 220         self.conn.autocommit = True
 221         self.sanitizer = sanitizer
 222         self.token_analysis = token_analysis
 223
 224         self._cache = _TokenCache()
 225
 226
 227     def close(self):
 228         """ Free all resources used by the analyzer.
 229         """
 230         if self.conn:
 231             self.conn.close()
 232             self.conn = None
 233
 234
 235     def _search_normalized(self, name):
 236         """ Return the search token transliteration of the given name.
 237         """
 238         return self.token_analysis.search.transliterate(name).strip()
 239
 240
 241     def _normalized(self, name):
 242         """ Return the normalized version of the given name with all
 243             non-relevant information removed.
 244         """
 245         return self.token_analysis.normalizer.transliterate(name).strip()
 246
 247
 248     def get_word_token_info(self, words):
 249         """ Return token information for the given list of words.
 250             If a word starts with # it is assumed to be a full name
 251             otherwise is a partial name.
 252
 253             The function returns a list of tuples with
 254             (original word, word token, word id).
 255
 256             The function is used for testing and debugging only
 257             and not necessarily efficient.
 258         """
 259         full_tokens = {}
 260         partial_tokens = {}
 261         for word in words:
 262             if word.startswith('#'):
 263                 full_tokens[word] = self._search_normalized(word[1:])
 264             else:
 265                 partial_tokens[word] = self._search_normalized(word)
 266
 267         with self.conn.cursor() as cur:
 268             cur.execute("""SELECT word_token, word_id
 269                             FROM word WHERE word_token = ANY(%s) and type = 'W'
 270                         """, (list(full_tokens.values()),))
 271             full_ids = {r[0]: r[1] for r in cur}
 272             cur.execute("""SELECT word_token, word_id
 273                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
 274                         (list(partial_tokens.values()),))
 275             part_ids = {r[0]: r[1] for r in cur}
 276
 277         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
 278                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
 279
 280
 281     @staticmethod
 282     def normalize_postcode(postcode):
 283         """ Convert the postcode to a standardized form.
 284
 285             This function must yield exactly the same result as the SQL function
 286             'token_normalized_postcode()'.
 287         """
 288         return postcode.strip().upper()
 289
 290
 291     def update_postcodes_from_db(self):
 292         """ Update postcode tokens in the word table from the location_postcode
 293             table.
 294         """
 295         to_delete = []
 296         with self.conn.cursor() as cur:
 297             # This finds us the rows in location_postcode and word that are
 298             # missing in the other table.
 299             cur.execute("""SELECT * FROM
 300                             (SELECT pc, word FROM
 301                               (SELECT distinct(postcode) as pc FROM location_postcode) p
 302                               FULL JOIN
 303                               (SELECT word FROM word WHERE type = 'P') w
 304                               ON pc = word) x
 305                            WHERE pc is null or word is null""")
 306
 307             with CopyBuffer() as copystr:
 308                 for postcode, word in cur:
 309                     if postcode is None:
 310                         to_delete.append(word)
 311                     else:
 312                         copystr.add(self._search_normalized(postcode),
 313                                     'P', postcode)
 314
 315                 if to_delete:
 316                     cur.execute("""DELETE FROM WORD
 317                                    WHERE type ='P' and word = any(%s)
 318                                 """, (to_delete, ))
 319
 320                 copystr.copy_out(cur, 'word',
 321                                  columns=['word_token', 'type', 'word'])
 322
 323
 324     def update_special_phrases(self, phrases, should_replace):
 325         """ Replace the search index for special phrases with the new phrases.
 326             If `should_replace` is True, then the previous set of will be
 327             completely replaced. Otherwise the phrases are added to the
 328             already existing ones.
 329         """
 330         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
 331                             for p in phrases))
 332
 333         with self.conn.cursor() as cur:
 334             # Get the old phrases.
 335             existing_phrases = set()
 336             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
 337             for word, info in cur:
 338                 existing_phrases.add((word, info['class'], info['type'],
 339                                       info.get('op') or '-'))
 340
 341             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
 342             if should_replace:
 343                 deleted = self._remove_special_phrases(cur, norm_phrases,
 344                                                        existing_phrases)
 345             else:
 346                 deleted = 0
 347
 348         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 349                  len(norm_phrases), added, deleted)
 350
 351
 352     def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
 353         """ Add all phrases to the database that are not yet there.
 354         """
 355         to_add = new_phrases - existing_phrases
 356
 357         added = 0
 358         with CopyBuffer() as copystr:
 359             for word, cls, typ, oper in to_add:
 360                 term = self._search_normalized(word)
 361                 if term:
 362                     copystr.add(term, 'S', word,
 363                                 json.dumps({'class': cls, 'type': typ,
 364                                             'op': oper if oper in ('in', 'near') else None}))
 365                     added += 1
 366
 367             copystr.copy_out(cursor, 'word',
 368                              columns=['word_token', 'type', 'word', 'info'])
 369
 370         return added
 371
 372
 373     @staticmethod
 374     def _remove_special_phrases(cursor, new_phrases, existing_phrases):
 375         """ Remove all phrases from the databse that are no longer in the
 376             new phrase list.
 377         """
 378         to_delete = existing_phrases - new_phrases
 379
 380         if to_delete:
 381             cursor.execute_values(
 382                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 383                     WHERE type = 'S' and word = name
 384                           and info->>'class' = in_class and info->>'type' = in_type
 385                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
 386                 """, to_delete)
 387
 388         return len(to_delete)
 389
 390
 391     def add_country_names(self, country_code, names):
 392         """ Add default names for the given country to the search index.
 393         """
 394         # Make sure any name preprocessing for country names applies.
 395         info = PlaceInfo({'name': names, 'country_code': country_code,
 396                           'rank_address': 4, 'class': 'boundary',
 397                           'type': 'administrative'})
 398         self._add_country_full_names(country_code,
 399                                      self.sanitizer.process_names(info)[0],
 400                                      internal=True)
 401
 402
 403     def _add_country_full_names(self, country_code, names, internal=False):
 404         """ Add names for the given country from an already sanitized
 405             name list.
 406         """
 407         word_tokens = set()
 408         for name in names:
 409             norm_name = self._search_normalized(name.name)
 410             if norm_name:
 411                 word_tokens.add(norm_name)
 412
 413         with self.conn.cursor() as cur:
 414             # Get existing names
 415             cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
 416                              FROM word
 417                              WHERE type = 'C' and word = %s""",
 418                         (country_code, ))
 419             existing_tokens = {True: set(), False: set()} # internal/external names
 420             for word in cur:
 421                 existing_tokens[word[1]].add(word[0])
 422
 423             # Delete names that no longer exist.
 424             gone_tokens = existing_tokens[internal] - word_tokens
 425             if internal:
 426                 gone_tokens.update(existing_tokens[False] & word_tokens)
 427             if gone_tokens:
 428                 cur.execute("""DELETE FROM word
 429                                USING unnest(%s) as token
 430                                WHERE type = 'C' and word = %s
 431                                      and word_token = token""",
 432                             (list(gone_tokens), country_code))
 433
 434             # Only add those names that are not yet in the list.
 435             new_tokens = word_tokens - existing_tokens[True]
 436             if not internal:
 437                 new_tokens -= existing_tokens[False]
 438             if new_tokens:
 439                 if internal:
 440                     sql = """INSERT INTO word (word_token, type, word, info)
 441                                (SELECT token, 'C', %s, '{"internal": "yes"}'
 442                                   FROM unnest(%s) as token)
 443                            """
 444                 else:
 445                     sql = """INSERT INTO word (word_token, type, word)
 446                                    (SELECT token, 'C', %s
 447                                     FROM unnest(%s) as token)
 448                           """
 449                 cur.execute(sql, (country_code, list(new_tokens)))
 450
 451
 452     def process_place(self, place):
 453         """ Determine tokenizer information about the given place.
 454
 455             Returns a JSON-serializable structure that will be handed into
 456             the database via the token_info field.
 457         """
 458         token_info = _TokenInfo()
 459
 460         names, address = self.sanitizer.process_names(place)
 461
 462         if names:
 463             token_info.set_names(*self._compute_name_tokens(names))
 464
 465             if place.is_country():
 466                 self._add_country_full_names(place.country_code, names)
 467
 468         if address:
 469             self._process_place_address(token_info, address)
 470
 471         return token_info.to_dict()
 472
 473
 474     def _process_place_address(self, token_info, address):
 475         for item in address:
 476             if item.kind == 'postcode':
 477                 self._add_postcode(item.name)
 478             elif item.kind == 'housenumber':
 479                 token_info.add_housenumber(*self._compute_housenumber_token(item))
 480             elif item.kind == 'street':
 481                 token_info.add_street(self._retrieve_full_tokens(item.name))
 482             elif item.kind == 'place':
 483                 if not item.suffix:
 484                     token_info.add_place(self._compute_partial_tokens(item.name))
 485             elif not item.kind.startswith('_') and not item.suffix and \
 486                  item.kind not in ('country', 'full'):
 487                 token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
 488
 489
 490     def _compute_housenumber_token(self, hnr):
 491         """ Normalize the housenumber and return the word token and the
 492             canonical form.
 493         """
 494         analyzer = self.token_analysis.analysis.get('@housenumber')
 495         result = None, None
 496
 497         if analyzer is None:
 498             # When no custom analyzer is set, simply normalize and transliterate
 499             norm_name = self._search_normalized(hnr.name)
 500             if norm_name:
 501                 result = self._cache.housenumbers.get(norm_name, result)
 502                 if result[0] is None:
 503                     with self.conn.cursor() as cur:
 504                         cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
 505                         result = cur.fetchone()[0], norm_name
 506                         self._cache.housenumbers[norm_name] = result
 507         else:
 508             # Otherwise use the analyzer to determine the canonical name.
 509             # Per convention we use the first variant as the 'lookup name', the
 510             # name that gets saved in the housenumber field of the place.
 511             norm_name = analyzer.normalize(hnr.name)
 512             if norm_name:
 513                 result = self._cache.housenumbers.get(norm_name, result)
 514                 if result[0] is None:
 515                     variants = analyzer.get_variants_ascii(norm_name)
 516                     if variants:
 517                         with self.conn.cursor() as cur:
 518                             cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
 519                                         (norm_name, list(variants)))
 520                             result = cur.fetchone()[0], variants[0]
 521                             self._cache.housenumbers[norm_name] = result
 522
 523         return result
 524
 525
 526     def _compute_partial_tokens(self, name):
 527         """ Normalize the given term, split it into partial words and return
 528             then token list for them.
 529         """
 530         norm_name = self._search_normalized(name)
 531
 532         tokens = []
 533         need_lookup = []
 534         for partial in norm_name.split():
 535             token = self._cache.partials.get(partial)
 536             if token:
 537                 tokens.append(token)
 538             else:
 539                 need_lookup.append(partial)
 540
 541         if need_lookup:
 542             with self.conn.cursor() as cur:
 543                 cur.execute("""SELECT word, getorcreate_partial_word(word)
 544                                FROM unnest(%s) word""",
 545                             (need_lookup, ))
 546
 547                 for partial, token in cur:
 548                     tokens.append(token)
 549                     self._cache.partials[partial] = token
 550
 551         return tokens
 552
 553
 554     def _retrieve_full_tokens(self, name):
 555         """ Get the full name token for the given name, if it exists.
 556             The name is only retrived for the standard analyser.
 557         """
 558         norm_name = self._search_normalized(name)
 559
 560         # return cached if possible
 561         if norm_name in self._cache.fulls:
 562             return self._cache.fulls[norm_name]
 563
 564         with self.conn.cursor() as cur:
 565             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
 566                         (norm_name, ))
 567             full = [row[0] for row in cur]
 568
 569         self._cache.fulls[norm_name] = full
 570
 571         return full
 572
 573
 574     def _compute_name_tokens(self, names):
 575         """ Computes the full name and partial name tokens for the given
 576             dictionary of names.
 577         """
 578         full_tokens = set()
 579         partial_tokens = set()
 580
 581         for name in names:
 582             analyzer_id = name.get_attr('analyzer')
 583             analyzer = self.token_analysis.get_analyzer(analyzer_id)
 584             norm_name = analyzer.normalize(name.name)
 585             if analyzer_id is None:
 586                 token_id = norm_name
 587             else:
 588                 token_id = f'{norm_name}@{analyzer_id}'
 589
 590             full, part = self._cache.names.get(token_id, (None, None))
 591             if full is None:
 592                 variants = analyzer.get_variants_ascii(norm_name)
 593                 if not variants:
 594                     continue
 595
 596                 with self.conn.cursor() as cur:
 597                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
 598                                 (token_id, variants))
 599                     full, part = cur.fetchone()
 600
 601                 self._cache.names[token_id] = (full, part)
 602
 603             full_tokens.add(full)
 604             partial_tokens.update(part)
 605
 606         return full_tokens, partial_tokens
 607
 608
 609     def _add_postcode(self, postcode):
 610         """ Make sure the normalized postcode is present in the word table.
 611         """
 612         if re.search(r'[:,;]', postcode) is None:
 613             postcode = self.normalize_postcode(postcode)
 614
 615             if postcode not in self._cache.postcodes:
 616                 term = self._search_normalized(postcode)
 617                 if not term:
 618                     return
 619
 620                 with self.conn.cursor() as cur:
 621                     # no word_id needed for postcodes
 622                     cur.execute("""INSERT INTO word (word_token, type, word)
 623                                    (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
 624                                     WHERE NOT EXISTS
 625                                      (SELECT * FROM word
 626                                       WHERE type = 'P' and word = pc))
 627                                 """, (term, postcode))
 628                 self._cache.postcodes.add(postcode)
 629
 630
 631 class _TokenInfo:
 632     """ Collect token information to be sent back to the database.
 633     """
 634     def __init__(self):
 635         self.names = None
 636         self.housenumbers = set()
 637         self.housenumber_tokens = set()
 638         self.street_tokens = set()
 639         self.place_tokens = set()
 640         self.address_tokens = {}
 641
 642
 643     @staticmethod
 644     def _mk_array(tokens):
 645         return f"{{{','.join((str(s) for s in tokens))}}}"
 646
 647
 648     def to_dict(self):
 649         """ Return the token information in database importable format.
 650         """
 651         out = {}
 652
 653         if self.names:
 654             out['names'] = self.names
 655
 656         if self.housenumbers:
 657             out['hnr'] = ';'.join(self.housenumbers)
 658             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
 659
 660         if self.street_tokens:
 661             out['street'] = self._mk_array(self.street_tokens)
 662
 663         if self.place_tokens:
 664             out['place'] = self._mk_array(self.place_tokens)
 665
 666         if self.address_tokens:
 667             out['addr'] = self.address_tokens
 668
 669         return out
 670
 671
 672     def set_names(self, fulls, partials):
 673         """ Adds token information for the normalised names.
 674         """
 675         self.names = self._mk_array(itertools.chain(fulls, partials))
 676
 677
 678     def add_housenumber(self, token, hnr):
 679         """ Extract housenumber information from a list of normalised
 680             housenumbers.
 681         """
 682         if token:
 683             self.housenumbers.add(hnr)
 684             self.housenumber_tokens.add(token)
 685
 686
 687     def add_street(self, tokens):
 688         """ Add addr:street match terms.
 689         """
 690         self.street_tokens.update(tokens)
 691
 692
 693     def add_place(self, tokens):
 694         """ Add addr:place search and match terms.
 695         """
 696         self.place_tokens.update(tokens)
 697
 698
 699     def add_address_term(self, key, partials):
 700         """ Add additional address terms.
 701         """
 702         if partials:
 703             self.address_tokens[key] = self._mk_array(partials)
 704
 705
class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        # token id of a normalized name -> (full token, partial tokens)
        self.names = {}
        # partial term -> token
        self.partials = {}
        # normalized name -> list of ids of the matching full words
        self.fulls = {}
        # normalized postcodes that were already added to the word table
        self.postcodes = set()
        # normalized housenumber -> (token, canonical form)
        self.housenumbers = {}