src/nominatim_db/tokenizer/legacy_tokenizer.py

   1 # SPDX-License-Identifier: GPL-3.0-or-later
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2024 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Tokenizer implementing normalisation as used before Nominatim 4.
   9 """
  10 from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, \
  11                    cast, Dict, Set, Iterable
  12 from collections import OrderedDict
  13 import logging
  14 from pathlib import Path
  15 import re
  16 import shutil
  17 from textwrap import dedent
  18
  19 from icu import Transliterator
  20 import psycopg
  21 from psycopg import sql as pysql
  22
  23 from ..errors import UsageError
  24 from ..db.connection import connect, Connection, drop_tables, table_exists,\
  25                             execute_scalar, register_hstore
  26 from ..config import Configuration
  27 from ..db import properties
  28 from ..db import utils as db_utils
  29 from ..db.sql_preprocessor import SQLPreprocessor
  30 from ..data.place_info import PlaceInfo
  31 from .base import AbstractAnalyzer, AbstractTokenizer
  32
# Names of the database properties under which the tokenizer stores the
# configuration that it saves with properties.set_property().
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

# getLogger() is called without a name, so this is the root logger.
LOG = logging.getLogger()
  37
  38 def create(dsn: str, data_dir: Path) -> 'LegacyTokenizer':
  39     """ Create a new instance of the tokenizer provided by this module.
  40     """
  41     LOG.warning('WARNING: the legacy tokenizer is deprecated '
  42                 'and will be removed in Nominatim 5.0.')
  43     return LegacyTokenizer(dsn, data_dir)
  44
  45
  46 def _install_module(config_module_path: str, src_dir: Optional[Path], module_dir: Path) -> str:
  47     """ Copies the PostgreSQL normalisation module into the project
  48         directory if necessary. For historical reasons the module is
  49         saved in the '/module' subdirectory and not with the other tokenizer
  50         data.
  51
  52         The function detects when the installation is run from the
  53         build directory. It doesn't touch the module in that case.
  54     """
  55     # Custom module locations are simply used as is.
  56     if config_module_path:
  57         LOG.info("Using custom path for database module at '%s'", config_module_path)
  58         return config_module_path
  59
  60     # Otherwise a source dir must be given.
  61     if src_dir is None:
  62         raise UsageError("The legacy tokenizer cannot be used with the Nominatim pip module.")
  63
  64     # Compatibility mode for builddir installations.
  65     if module_dir.exists() and src_dir.samefile(module_dir):
  66         LOG.info('Running from build directory. Leaving database module as is.')
  67         return str(module_dir)
  68
  69     # In any other case install the module in the project directory.
  70     if not module_dir.exists():
  71         module_dir.mkdir()
  72
  73     destfile = module_dir / 'nominatim.so'
  74     shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
  75     destfile.chmod(0o755)
  76
  77     LOG.info('Database module installed at %s', str(destfile))
  78
  79     return str(module_dir)
  80
  81
  82 def _check_module(module_dir: str, conn: Connection) -> None:
  83     """ Try to use the PostgreSQL module to confirm that it is correctly
  84         installed and accessible from PostgreSQL.
  85     """
  86     with conn.cursor() as cur:
  87         try:
  88             cur.execute(pysql.SQL("""CREATE FUNCTION nominatim_test_import_func(text)
  89                                      RETURNS text AS {}, 'transliteration'
  90                                      LANGUAGE c IMMUTABLE STRICT;
  91                                      DROP FUNCTION nominatim_test_import_func(text)
  92                                  """).format(pysql.Literal(f'{module_dir}/nominatim.so')))
  93         except psycopg.DatabaseError as err:
  94             LOG.fatal("Error accessing database module: %s", err)
  95             raise UsageError("Database module cannot be accessed.") from err
  96
  97
  98 class LegacyTokenizer(AbstractTokenizer):
  99     """ The legacy tokenizer uses a special PostgreSQL module to normalize
 100         names and queries. The tokenizer thus implements normalization through
 101         calls to the database.
 102     """
 103
 104     def __init__(self, dsn: str, data_dir: Path) -> None:
 105         self.dsn = dsn
 106         self.data_dir = data_dir
 107         self.normalization: Optional[str] = None
 108
 109
 110     def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
 111         """ Set up a new tokenizer for the database.
 112
 113             This copies all necessary data in the project directory to make
 114             sure the tokenizer remains stable even over updates.
 115         """
 116         assert config.project_dir is not None
 117         module_dir = _install_module(config.DATABASE_MODULE_PATH,
 118                                      config.lib_dir.module,
 119                                      config.project_dir / 'module')
 120
 121         self.normalization = config.TERM_NORMALIZATION
 122
 123         self._install_php(config, overwrite=True)
 124
 125         with connect(self.dsn) as conn:
 126             _check_module(module_dir, conn)
 127             self._save_config(conn, config)
 128             conn.commit()
 129
 130         if init_db:
 131             self.update_sql_functions(config)
 132             self._init_db_tables(config)
 133
 134
 135     def init_from_project(self, config: Configuration) -> None:
 136         """ Initialise the tokenizer from the project directory.
 137         """
 138         assert config.project_dir is not None
 139
 140         with connect(self.dsn) as conn:
 141             self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
 142
 143         if not (config.project_dir / 'module' / 'nominatim.so').exists():
 144             _install_module(config.DATABASE_MODULE_PATH,
 145                             config.lib_dir.module,
 146                             config.project_dir / 'module')
 147
 148         self._install_php(config, overwrite=False)
 149
 150     def finalize_import(self, config: Configuration) -> None:
 151         """ Do any required postprocessing to make the tokenizer data ready
 152             for use.
 153         """
 154         with connect(self.dsn) as conn:
 155             sqlp = SQLPreprocessor(conn, config)
 156             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
 157
 158
 159     def update_sql_functions(self, config: Configuration) -> None:
 160         """ Reimport the SQL functions for this tokenizer.
 161         """
 162         assert config.project_dir is not None
 163
 164         with connect(self.dsn) as conn:
 165             max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
 166             modulepath = config.DATABASE_MODULE_PATH or \
 167                          str((config.project_dir / 'module').resolve())
 168             sqlp = SQLPreprocessor(conn, config)
 169             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
 170                               max_word_freq=max_word_freq,
 171                               modulepath=modulepath)
 172
 173
 174     def check_database(self, _: Configuration) -> Optional[str]:
 175         """ Check that the tokenizer is set up correctly.
 176         """
 177         hint = """\
 178              The Postgresql extension nominatim.so was not correctly loaded.
 179
 180              Error: {error}
 181
 182              Hints:
 183              * Check the output of the CMmake/make installation step
 184              * Does nominatim.so exist?
 185              * Does nominatim.so exist on the database server?
 186              * Can nominatim.so be accessed by the database user?
 187              """
 188         with connect(self.dsn) as conn:
 189             try:
 190                 out = execute_scalar(conn, "SELECT make_standard_name('a')")
 191             except psycopg.Error as err:
 192                 return hint.format(error=str(err))
 193
 194         if out != 'a':
 195             return hint.format(error='Unexpected result for make_standard_name()')
 196
 197         return None
 198
 199
 200     def migrate_database(self, config: Configuration) -> None:
 201         """ Initialise the project directory of an existing database for
 202             use with this tokenizer.
 203
 204             This is a special migration function for updating existing databases
 205             to new software versions.
 206         """
 207         assert config.project_dir is not None
 208
 209         self.normalization = config.TERM_NORMALIZATION
 210         module_dir = _install_module(config.DATABASE_MODULE_PATH,
 211                                      config.lib_dir.module,
 212                                      config.project_dir / 'module')
 213
 214         with connect(self.dsn) as conn:
 215             _check_module(module_dir, conn)
 216             self._save_config(conn, config)
 217
 218
 219     def update_statistics(self, config: Configuration, threads: int = 1) -> None:
 220         """ Recompute the frequency of full words.
 221         """
 222         with connect(self.dsn) as conn:
 223             if table_exists(conn, 'search_name'):
 224                 drop_tables(conn, "word_frequencies")
 225                 with conn.cursor() as cur:
 226                     LOG.info("Computing word frequencies")
 227                     cur.execute("""CREATE TEMP TABLE word_frequencies AS
 228                                      SELECT unnest(name_vector) as id, count(*)
 229                                      FROM search_name GROUP BY id""")
 230                     cur.execute("CREATE INDEX ON word_frequencies(id)")
 231                     LOG.info("Update word table with recomputed frequencies")
 232                     cur.execute("""UPDATE word SET search_name_count = count
 233                                    FROM word_frequencies
 234                                    WHERE word_token like ' %' and word_id = id""")
 235                 drop_tables(conn, "word_frequencies")
 236             conn.commit()
 237
 238
 239     def update_word_tokens(self) -> None:
 240         """ No house-keeping implemented for the legacy tokenizer.
 241         """
 242         LOG.info("No tokenizer clean-up available.")
 243
 244
 245     def name_analyzer(self) -> 'LegacyNameAnalyzer':
 246         """ Create a new analyzer for tokenizing names and queries
 247             using this tokinzer. Analyzers are context managers and should
 248             be used accordingly:
 249
 250             ```
 251             with tokenizer.name_analyzer() as analyzer:
 252                 analyser.tokenize()
 253             ```
 254
 255             When used outside the with construct, the caller must ensure to
 256             call the close() function before destructing the analyzer.
 257
 258             Analyzers are not thread-safe. You need to instantiate one per thread.
 259         """
 260         normalizer = Transliterator.createFromRules("phrase normalizer",
 261                                                     self.normalization)
 262         return LegacyNameAnalyzer(self.dsn, normalizer)
 263
 264
 265     def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
 266         """ Return a list of the `num` most frequent full words
 267             in the database.
 268         """
 269         with conn.cursor() as cur:
 270             cur.execute(""" SELECT word FROM word WHERE word is not null
 271                               ORDER BY search_name_count DESC LIMIT %s""", (num,))
 272             return list(s[0] for s in cur)
 273
 274
 275     def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
 276         """ Install the php script for the tokenizer.
 277         """
 278         if config.lib_dir.php is not None:
 279             php_file = self.data_dir / "tokenizer.php"
 280
 281             if not php_file.exists() or overwrite:
 282                 php_file.write_text(dedent(f"""\
 283                     <?php
 284                     @define('CONST_Max_Word_Frequency', {config.MAX_WORD_FREQUENCY});
 285                     @define('CONST_Term_Normalization_Rules', "{config.TERM_NORMALIZATION}");
 286                     require_once('{config.lib_dir.php}/tokenizer/legacy_tokenizer.php');
 287                     """), encoding='utf-8')
 288
 289
 290     def _init_db_tables(self, config: Configuration) -> None:
 291         """ Set up the word table and fill it with pre-computed word
 292             frequencies.
 293         """
 294         with connect(self.dsn) as conn:
 295             sqlp = SQLPreprocessor(conn, config)
 296             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
 297             conn.commit()
 298
 299         LOG.warning("Precomputing word tokens")
 300         db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
 301
 302
 303     def _save_config(self, conn: Connection, config: Configuration) -> None:
 304         """ Save the configuration that needs to remain stable for the given
 305             database as database properties.
 306         """
 307         assert self.normalization is not None
 308
 309         properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
 310         properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
 311
 312
 313 class LegacyNameAnalyzer(AbstractAnalyzer):
 314     """ The legacy analyzer uses the special Postgresql module for
 315         splitting names.
 316
 317         Each instance opens a connection to the database to request the
 318         normalization.
 319     """
 320
 321     def __init__(self, dsn: str, normalizer: Any):
 322         self.conn: Optional[Connection] = connect(dsn)
 323         self.conn.autocommit = True
 324         self.normalizer = normalizer
 325         register_hstore(self.conn)
 326
 327         self._cache = _TokenCache(self.conn)
 328
 329
 330     def close(self) -> None:
 331         """ Free all resources used by the analyzer.
 332         """
 333         if self.conn:
 334             self.conn.close()
 335             self.conn = None
 336
 337
 338     def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
 339         """ Return token information for the given list of words.
 340             If a word starts with # it is assumed to be a full name
 341             otherwise is a partial name.
 342
 343             The function returns a list of tuples with
 344             (original word, word token, word id).
 345
 346             The function is used for testing and debugging only
 347             and not necessarily efficient.
 348         """
 349         assert self.conn is not None
 350         with self.conn.cursor() as cur:
 351             cur.execute("""SELECT t.term, word_token, word_id
 352                            FROM word, (SELECT unnest(%s::TEXT[]) as term) t
 353                            WHERE word_token = (CASE
 354                                    WHEN left(t.term, 1) = '#' THEN
 355                                      ' ' || make_standard_name(substring(t.term from 2))
 356                                    ELSE
 357                                      make_standard_name(t.term)
 358                                    END)
 359                                  and class is null and country_code is null""",
 360                         (words, ))
 361
 362             return [(r[0], r[1], r[2]) for r in cur]
 363
 364
 365     def normalize(self, phrase: str) -> str:
 366         """ Normalize the given phrase, i.e. remove all properties that
 367             are irrelevant for search.
 368         """
 369         return cast(str, self.normalizer.transliterate(phrase))
 370
 371
 372     def normalize_postcode(self, postcode: str) -> str:
 373         """ Convert the postcode to a standardized form.
 374
 375             This function must yield exactly the same result as the SQL function
 376             'token_normalized_postcode()'.
 377         """
 378         return postcode.strip().upper()
 379
 380
 381     def update_postcodes_from_db(self) -> None:
 382         """ Update postcode tokens in the word table from the location_postcode
 383             table.
 384         """
 385         assert self.conn is not None
 386
 387         with self.conn.cursor() as cur:
 388             # This finds us the rows in location_postcode and word that are
 389             # missing in the other table.
 390             cur.execute("""SELECT * FROM
 391                             (SELECT pc, word FROM
 392                               (SELECT distinct(postcode) as pc FROM location_postcode) p
 393                               FULL JOIN
 394                               (SELECT word FROM word
 395                                 WHERE class ='place' and type = 'postcode') w
 396                               ON pc = word) x
 397                            WHERE pc is null or word is null""")
 398
 399             to_delete = []
 400             to_add = []
 401
 402             for postcode, word in cur:
 403                 if postcode is None:
 404                     to_delete.append(word)
 405                 else:
 406                     to_add.append(postcode)
 407
 408             if to_delete:
 409                 cur.execute("""DELETE FROM WORD
 410                                WHERE class ='place' and type = 'postcode'
 411                                      and word = any(%s)
 412                             """, (to_delete, ))
 413             if to_add:
 414                 cur.execute("""SELECT count(create_postcode_id(pc))
 415                                FROM unnest(%s::text[]) as pc
 416                             """, (to_add, ))
 417
 418
 419
 420     def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
 421                                should_replace: bool) -> None:
 422         """ Replace the search index for special phrases with the new phrases.
 423         """
 424         assert self.conn is not None
 425
 426         norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
 427                             for p in phrases))
 428
 429         with self.conn.cursor() as cur:
 430             # Get the old phrases.
 431             existing_phrases = set()
 432             cur.execute("""SELECT word, class as cls, type, operator FROM word
 433                            WHERE class != 'place'
 434                                  OR (type != 'house' AND type != 'postcode')""")
 435             for label, cls, typ, oper in cur:
 436                 existing_phrases.add((label, cls, typ, oper or '-'))
 437
 438             to_add = norm_phrases - existing_phrases
 439             to_delete = existing_phrases - norm_phrases
 440
 441             if to_add:
 442                 cur.executemany(
 443                     """ INSERT INTO word (word_id, word_token, word, class, type,
 444                                           search_name_count, operator)
 445                         (SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
 446                                 class, type, 0,
 447                                 CASE WHEN op in ('in', 'near') THEN op ELSE null END
 448                            FROM (VALUES (%s, %s, %s, %s)) as v(name, class, type, op))""",
 449                     to_add)
 450
 451             if to_delete and should_replace:
 452                 cur.executemany(
 453                     """ DELETE FROM word
 454                           USING (VALUES (%s, %s, %s, %s)) as v(name, in_class, in_type, op)
 455                         WHERE word = name and class = in_class and type = in_type
 456                               and ((op = '-' and operator is null) or op = operator)""",
 457                     to_delete)
 458
 459         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 460                  len(norm_phrases), len(to_add), len(to_delete))
 461
 462
 463     def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
 464         """ Add names for the given country to the search index.
 465         """
 466         assert self.conn is not None
 467
 468         with self.conn.cursor() as cur:
 469             cur.execute(
 470                 """INSERT INTO word (word_id, word_token, country_code)
 471                    (SELECT nextval('seq_word'), lookup_token, %s
 472                       FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
 473                             FROM unnest(%s::TEXT[])n) y
 474                       WHERE NOT EXISTS(SELECT * FROM word
 475                                        WHERE word_token = lookup_token and country_code = %s))
 476                 """, (country_code, list(names.values()), country_code))
 477
 478
 479     def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
 480         """ Determine tokenizer information about the given place.
 481
 482             Returns a JSON-serialisable structure that will be handed into
 483             the database via the token_info field.
 484         """
 485         assert self.conn is not None
 486
 487         token_info = _TokenInfo(self._cache)
 488
 489         names = place.name
 490
 491         if names:
 492             token_info.add_names(self.conn, names)
 493
 494             if place.is_country():
 495                 assert place.country_code is not None
 496                 self.add_country_names(place.country_code, names)
 497
 498         address = place.address
 499         if address:
 500             self._process_place_address(token_info, address)
 501
 502         return token_info.data
 503
 504
 505     def _process_place_address(self, token_info: '_TokenInfo', address: Mapping[str, str]) -> None:
 506         assert self.conn is not None
 507         hnrs = []
 508         addr_terms = []
 509
 510         for key, value in address.items():
 511             if key == 'postcode':
 512                 # Make sure the normalized postcode is present in the word table.
 513                 if re.search(r'[:,;]', value) is None:
 514                     norm_pc = self.normalize_postcode(value)
 515                     token_info.set_postcode(norm_pc)
 516                     self._cache.add_postcode(self.conn, norm_pc)
 517             elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
 518                 hnrs.append(value)
 519             elif key == 'street':
 520                 token_info.add_street(self.conn, value)
 521             elif key == 'place':
 522                 token_info.add_place(self.conn, value)
 523             elif not key.startswith('_') \
 524                  and key not in ('country', 'full', 'inclusion'):
 525                 addr_terms.append((key, value))
 526
 527         if hnrs:
 528             token_info.add_housenumbers(self.conn, hnrs)
 529
 530         if addr_terms:
 531             token_info.add_address_terms(self.conn, addr_terms)
 532
 533
 534
 535 class _TokenInfo:
 536     """ Collect token information to be sent back to the database.
 537     """
 538     def __init__(self, cache: '_TokenCache') -> None:
 539         self.cache = cache
 540         self.data: Dict[str, Any] = {}
 541
 542
 543     def add_names(self, conn: Connection, names: Mapping[str, str]) -> None:
 544         """ Add token information for the names of the place.
 545         """
 546         # Create the token IDs for all names.
 547         self.data['names'] = execute_scalar(conn, "SELECT make_keywords(%s)::text",
 548                                             (names, ))
 549
 550
 551     def add_housenumbers(self, conn: Connection, hnrs: Sequence[str]) -> None:
 552         """ Extract housenumber information from the address.
 553         """
 554         if len(hnrs) == 1:
 555             token = self.cache.get_housenumber(hnrs[0])
 556             if token is not None:
 557                 self.data['hnr_tokens'] = token
 558                 self.data['hnr'] = hnrs[0]
 559                 return
 560
 561         # split numbers if necessary
 562         simple_list: List[str] = []
 563         for hnr in hnrs:
 564             simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
 565
 566         if len(simple_list) > 1:
 567             simple_list = list(set(simple_list))
 568
 569         with conn.cursor() as cur:
 570             cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
 571             result = cur.fetchone()
 572             assert result is not None
 573             self.data['hnr_tokens'], self.data['hnr'] = result
 574
 575
 576     def set_postcode(self, postcode: str) -> None:
 577         """ Set or replace the postcode token with the given value.
 578         """
 579         self.data['postcode'] = postcode
 580
 581     def add_street(self, conn: Connection, street: str) -> None:
 582         """ Add addr:street match terms.
 583         """
 584         def _get_street(name: str) -> Optional[str]:
 585             return cast(Optional[str],
 586                         execute_scalar(conn, "SELECT word_ids_from_name(%s)::text", (name, )))
 587
 588         tokens = self.cache.streets.get(street, _get_street)
 589         self.data['street'] = tokens or '{}'
 590
 591
 592     def add_place(self, conn: Connection, place: str) -> None:
 593         """ Add addr:place search and match terms.
 594         """
 595         def _get_place(name: str) -> Tuple[List[int], List[int]]:
 596             with conn.cursor() as cur:
 597                 cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
 598                                       word_ids_from_name(%s)::text""",
 599                             (name, name))
 600                 return cast(Tuple[List[int], List[int]], cur.fetchone())
 601
 602         self.data['place_search'], self.data['place_match'] = \
 603             self.cache.places.get(place, _get_place)
 604
 605
 606     def add_address_terms(self, conn: Connection, terms: Sequence[Tuple[str, str]]) -> None:
 607         """ Add additional address terms.
 608         """
 609         def _get_address_term(name: str) -> Tuple[List[int], List[int]]:
 610             with conn.cursor() as cur:
 611                 cur.execute("""SELECT addr_ids_from_name(%s)::text,
 612                                       word_ids_from_name(%s)::text""",
 613                             (name, name))
 614                 return cast(Tuple[List[int], List[int]], cur.fetchone())
 615
 616         tokens = {}
 617         for key, value in terms:
 618             items = self.cache.address_terms.get(value, _get_address_term)
 619             if items[0] or items[1]:
 620                 tokens[key] = items
 621
 622         if tokens:
 623             self.data['addr'] = tokens
 624
 625
 626 class _LRU:
 627     """ Least recently used cache that accepts a generator function to
 628         produce the item when there is a cache miss.
 629     """
 630
 631     def __init__(self, maxsize: int = 128):
 632         self.data: 'OrderedDict[str, Any]' = OrderedDict()
 633         self.maxsize = maxsize
 634
 635
 636     def get(self, key: str, generator: Callable[[str], Any]) -> Any:
 637         """ Get the item with the given key from the cache. If nothing
 638             is found in the cache, generate the value through the
 639             generator function and store it in the cache.
 640         """
 641         value = self.data.get(key)
 642         if value is not None:
 643             self.data.move_to_end(key)
 644         else:
 645             value = generator(key)
 646             if len(self.data) >= self.maxsize:
 647                 self.data.popitem(last=False)
 648             self.data[key] = value
 649
 650         return value
 651
 652
 653 class _TokenCache:
 654     """ Cache for token information to avoid repeated database queries.
 655
 656         This cache is not thread-safe and needs to be instantiated per
 657         analyzer.
 658     """
 659     def __init__(self, conn: Connection):
 660         # various LRU caches
 661         self.streets = _LRU(maxsize=256)
 662         self.places = _LRU(maxsize=128)
 663         self.address_terms = _LRU(maxsize=1024)
 664
 665         # Lookup houseunumbers up to 100 and cache them
 666         with conn.cursor() as cur:
 667             cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
 668                            FROM generate_series(1, 100) as i""")
 669             self._cached_housenumbers: Dict[str, str] = {str(r[0]): r[1] for r in cur}
 670
 671         # For postcodes remember the ones that have already been added
 672         self.postcodes: Set[str] = set()
 673
 674     def get_housenumber(self, number: str) -> Optional[str]:
 675         """ Get a housenumber token from the cache.
 676         """
 677         return self._cached_housenumbers.get(number)
 678
 679
 680     def add_postcode(self, conn: Connection, postcode: str) -> None:
 681         """ Make sure the given postcode is in the database.
 682         """
 683         if postcode not in self.postcodes:
 684             with conn.cursor() as cur:
 685                 cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
 686             self.postcodes.add(postcode)