Merge pull request #3422 from lonvia/drop-non-parented-interpolations

[nominatim.git] / nominatim / tokenizer / legacy_tokenizer.py
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py

index 97ce6d16644cff6a19369c9db5a7a12af8387078..93808cc39f3407458bb2d570d2a8740128f2c168 100644 (file)
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -7,8 +7,11 @@
  """
  Tokenizer implementing normalisation as used before Nominatim 4.
  """
  """
  Tokenizer implementing normalisation as used before Nominatim 4.
  """
+from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, \
+                   cast, Dict, Set, Iterable
  from collections import OrderedDict
  import logging
  from collections import OrderedDict
  import logging
+from pathlib import Path
  import re
  import shutil
  from textwrap import dedent
  import re
  import shutil
  from textwrap import dedent
@@ -17,10 +20,12 @@ from icu import Transliterator
  import psycopg2
  import psycopg2.extras
  
  import psycopg2
  import psycopg2.extras
  
-from nominatim.db.connection import connect
+from nominatim.db.connection import connect, Connection
+from nominatim.config import Configuration
  from nominatim.db import properties
  from nominatim.db import utils as db_utils
  from nominatim.db.sql_preprocessor import SQLPreprocessor
  from nominatim.db import properties
  from nominatim.db import utils as db_utils
  from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.data.place_info import PlaceInfo
  from nominatim.errors import UsageError
  from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  
  from nominatim.errors import UsageError
  from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  
@@ -29,13 +34,13 @@ DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
  
  LOG = logging.getLogger()
  
  
  LOG = logging.getLogger()
  
-def create(dsn, data_dir):
+def create(dsn: str, data_dir: Path) -> 'LegacyTokenizer':
      """ Create a new instance of the tokenizer provided by this module.
      """
      return LegacyTokenizer(dsn, data_dir)
  
  
      """ Create a new instance of the tokenizer provided by this module.
      """
      return LegacyTokenizer(dsn, data_dir)
  
  
-def _install_module(config_module_path, src_dir, module_dir):
+def _install_module(config_module_path: str, src_dir: Path, module_dir: Path) -> str:
      """ Copies the PostgreSQL normalisation module into the project
          directory if necessary. For historical reasons the module is
          saved in the '/module' subdirectory and not with the other tokenizer
      """ Copies the PostgreSQL normalisation module into the project
          directory if necessary. For historical reasons the module is
          saved in the '/module' subdirectory and not with the other tokenizer
@@ -52,7 +57,7 @@ def _install_module(config_module_path, src_dir, module_dir):
      # Compatibility mode for builddir installations.
      if module_dir.exists() and src_dir.samefile(module_dir):
          LOG.info('Running from build directory. Leaving database module as is.')
      # Compatibility mode for builddir installations.
      if module_dir.exists() and src_dir.samefile(module_dir):
          LOG.info('Running from build directory. Leaving database module as is.')
-        return module_dir
+        return str(module_dir)
  
      # In any other case install the module in the project directory.
      if not module_dir.exists():
  
      # In any other case install the module in the project directory.
      if not module_dir.exists():
@@ -64,20 +69,20 @@ def _install_module(config_module_path, src_dir, module_dir):
  
      LOG.info('Database module installed at %s', str(destfile))
  
  
      LOG.info('Database module installed at %s', str(destfile))
  
-    return module_dir
+    return str(module_dir)
  
  
  
  
-def _check_module(module_dir, conn):
+def _check_module(module_dir: str, conn: Connection) -> None:
      """ Try to use the PostgreSQL module to confirm that it is correctly
          installed and accessible from PostgreSQL.
      """
      with conn.cursor() as cur:
          try:
              cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
      """ Try to use the PostgreSQL module to confirm that it is correctly
          installed and accessible from PostgreSQL.
      """
      with conn.cursor() as cur:
          try:
              cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
-                           RETURNS text AS '{}/nominatim.so', 'transliteration'
+                           RETURNS text AS %s, 'transliteration'
                             LANGUAGE c IMMUTABLE STRICT;
                             DROP FUNCTION nominatim_test_import_func(text)
                             LANGUAGE c IMMUTABLE STRICT;
                             DROP FUNCTION nominatim_test_import_func(text)
-                        """.format(module_dir))
+                        """, (f'{module_dir}/nominatim.so', ))
          except psycopg2.DatabaseError as err:
              LOG.fatal("Error accessing database module: %s", err)
              raise UsageError("Database module cannot be accessed.") from err
          except psycopg2.DatabaseError as err:
              LOG.fatal("Error accessing database module: %s", err)
              raise UsageError("Database module cannot be accessed.") from err
@@ -89,18 +94,19 @@ class LegacyTokenizer(AbstractTokenizer):
          calls to the database.
      """
  
          calls to the database.
      """
  
-    def __init__(self, dsn, data_dir):
+    def __init__(self, dsn: str, data_dir: Path) -> None:
          self.dsn = dsn
          self.data_dir = data_dir
          self.dsn = dsn
          self.data_dir = data_dir
-        self.normalization = None
+        self.normalization: Optional[str] = None
  
  
  
  
-    def init_new_db(self, config, init_db=True):
+    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
          """ Set up a new tokenizer for the database.
  
              This copies all necessary data in the project directory to make
              sure the tokenizer remains stable even over updates.
          """
          """ Set up a new tokenizer for the database.
  
              This copies all necessary data in the project directory to make
              sure the tokenizer remains stable even over updates.
          """
+        assert config.project_dir is not None
          module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                       config.lib_dir.module,
                                       config.project_dir / 'module')
          module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                       config.lib_dir.module,
                                       config.project_dir / 'module')
@@ -119,9 +125,11 @@ class LegacyTokenizer(AbstractTokenizer):
              self._init_db_tables(config)
  
  
              self._init_db_tables(config)
  
  
-    def init_from_project(self, config):
+    def init_from_project(self, config: Configuration) -> None:
          """ Initialise the tokenizer from the project directory.
          """
          """ Initialise the tokenizer from the project directory.
          """
+        assert config.project_dir is not None
+
          with connect(self.dsn) as conn:
              self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
  
          with connect(self.dsn) as conn:
              self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
  
@@ -132,7 +140,7 @@ class LegacyTokenizer(AbstractTokenizer):
  
          self._install_php(config, overwrite=False)
  
  
          self._install_php(config, overwrite=False)
  
-    def finalize_import(self, config):
+    def finalize_import(self, config: Configuration) -> None:
          """ Do any required postprocessing to make the tokenizer data ready
              for use.
          """
          """ Do any required postprocessing to make the tokenizer data ready
              for use.
          """
@@ -141,9 +149,11 @@ class LegacyTokenizer(AbstractTokenizer):
              sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
  
  
              sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
  
  
-    def update_sql_functions(self, config):
+    def update_sql_functions(self, config: Configuration) -> None:
          """ Reimport the SQL functions for this tokenizer.
          """
          """ Reimport the SQL functions for this tokenizer.
          """
+        assert config.project_dir is not None
+
          with connect(self.dsn) as conn:
              max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
              modulepath = config.DATABASE_MODULE_PATH or \
          with connect(self.dsn) as conn:
              max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
              modulepath = config.DATABASE_MODULE_PATH or \
@@ -154,7 +164,7 @@ class LegacyTokenizer(AbstractTokenizer):
                                modulepath=modulepath)
  
  
                                modulepath=modulepath)
  
  
-    def check_database(self, _):
+    def check_database(self, _: Configuration) -> Optional[str]:
          """ Check that the tokenizer is set up correctly.
          """
          hint = """\
          """ Check that the tokenizer is set up correctly.
          """
          hint = """\
@@ -181,13 +191,15 @@ class LegacyTokenizer(AbstractTokenizer):
          return None
  
  
          return None
  
  
-    def migrate_database(self, config):
+    def migrate_database(self, config: Configuration) -> None:
          """ Initialise the project directory of an existing database for
              use with this tokenizer.
  
              This is a special migration function for updating existing databases
              to new software versions.
          """
          """ Initialise the project directory of an existing database for
              use with this tokenizer.
  
              This is a special migration function for updating existing databases
              to new software versions.
          """
+        assert config.project_dir is not None
+
          self.normalization = config.TERM_NORMALIZATION
          module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                       config.lib_dir.module,
          self.normalization = config.TERM_NORMALIZATION
          module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                       config.lib_dir.module,
@@ -198,7 +210,7 @@ class LegacyTokenizer(AbstractTokenizer):
              self._save_config(conn, config)
  
  
              self._save_config(conn, config)
  
  
-    def update_statistics(self):
+    def update_statistics(self, config: Configuration, threads: int = 1) -> None:
          """ Recompute the frequency of full words.
          """
          with connect(self.dsn) as conn:
          """ Recompute the frequency of full words.
          """
          with connect(self.dsn) as conn:
@@ -218,13 +230,13 @@ class LegacyTokenizer(AbstractTokenizer):
              conn.commit()
  
  
              conn.commit()
  
  
-    def update_word_tokens(self):
+    def update_word_tokens(self) -> None:
          """ No house-keeping implemented for the legacy tokenizer.
          """
          LOG.info("No tokenizer clean-up available.")
  
  
          """ No house-keeping implemented for the legacy tokenizer.
          """
          LOG.info("No tokenizer clean-up available.")
  
  
-    def name_analyzer(self):
+    def name_analyzer(self) -> 'LegacyNameAnalyzer':
          """ Create a new analyzer for tokenizing names and queries
              using this tokenizer. Analyzers are context managers and should
              be used accordingly:
          """ Create a new analyzer for tokenizing names and queries
              using this tokenizer. Analyzers are context managers and should
              be used accordingly:
@@ -244,21 +256,32 @@ class LegacyTokenizer(AbstractTokenizer):
          return LegacyNameAnalyzer(self.dsn, normalizer)
  
  
          return LegacyNameAnalyzer(self.dsn, normalizer)
  
  
-    def _install_php(self, config, overwrite=True):
+    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
+        """ Return a list of the `num` most frequent full words
+            in the database.
+        """
+        with conn.cursor() as cur:
+            cur.execute(""" SELECT word FROM word WHERE word is not null
+                              ORDER BY search_name_count DESC LIMIT %s""", (num,))
+            return list(s[0] for s in cur)
+
+
+    def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
          """ Install the php script for the tokenizer.
          """
          """ Install the php script for the tokenizer.
          """
-        php_file = self.data_dir / "tokenizer.php"
+        if config.lib_dir.php is not None:
+            php_file = self.data_dir / "tokenizer.php"
  
  
-        if not php_file.exists() or overwrite:
-            php_file.write_text(dedent("""\
-                <?php
-                @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
-                @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
-                require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
-                """.format(config)), encoding='utf-8')
+            if not php_file.exists() or overwrite:
+                php_file.write_text(dedent(f"""\
+                    <?php
+                    @define('CONST_Max_Word_Frequency', {config.MAX_WORD_FREQUENCY});
+                    @define('CONST_Term_Normalization_Rules', "{config.TERM_NORMALIZATION}");
+                    require_once('{config.lib_dir.php}/tokenizer/legacy_tokenizer.php');
+                    """), encoding='utf-8')
  
  
  
  
-    def _init_db_tables(self, config):
+    def _init_db_tables(self, config: Configuration) -> None:
          """ Set up the word table and fill it with pre-computed word
              frequencies.
          """
          """ Set up the word table and fill it with pre-computed word
              frequencies.
          """
@@ -271,10 +294,12 @@ class LegacyTokenizer(AbstractTokenizer):
          db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
  
  
          db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
  
  
-    def _save_config(self, conn, config):
+    def _save_config(self, conn: Connection, config: Configuration) -> None:
          """ Save the configuration that needs to remain stable for the given
              database as database properties.
          """
          """ Save the configuration that needs to remain stable for the given
              database as database properties.
          """
+        assert self.normalization is not None
+
          properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
          properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
  
          properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
          properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
  
@@ -287,8 +312,8 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
          normalization.
      """
  
          normalization.
      """
  
-    def __init__(self, dsn, normalizer):
-        self.conn = connect(dsn).connection
+    def __init__(self, dsn: str, normalizer: Any):
+        self.conn: Optional[Connection] = connect(dsn).connection
          self.conn.autocommit = True
          self.normalizer = normalizer
          psycopg2.extras.register_hstore(self.conn)
          self.conn.autocommit = True
          self.normalizer = normalizer
          psycopg2.extras.register_hstore(self.conn)
@@ -296,7 +321,7 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
          self._cache = _TokenCache(self.conn)
  
  
          self._cache = _TokenCache(self.conn)
  
  
-    def close(self):
+    def close(self) -> None:
          """ Free all resources used by the analyzer.
          """
          if self.conn:
          """ Free all resources used by the analyzer.
          """
          if self.conn:
@@ -304,7 +329,7 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
              self.conn = None
  
  
              self.conn = None
  
  
-    def get_word_token_info(self, words):
+    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
          """ Return token information for the given list of words.
              If a word starts with # it is assumed to be a full name
              otherwise is a partial name.
          """ Return token information for the given list of words.
              If a word starts with # it is assumed to be a full name
              otherwise is a partial name.
@@ -315,6 +340,7 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
              The function is used for testing and debugging only
              and not necessarily efficient.
          """
              The function is used for testing and debugging only
              and not necessarily efficient.
          """
+        assert self.conn is not None
          with self.conn.cursor() as cur:
              cur.execute("""SELECT t.term, word_token, word_id
                             FROM word, (SELECT unnest(%s::TEXT[]) as term) t
          with self.conn.cursor() as cur:
              cur.execute("""SELECT t.term, word_token, word_id
                             FROM word, (SELECT unnest(%s::TEXT[]) as term) t
@@ -330,15 +356,14 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
              return [(r[0], r[1], r[2]) for r in cur]
  
  
              return [(r[0], r[1], r[2]) for r in cur]
  
  
-    def normalize(self, phrase):
+    def normalize(self, phrase: str) -> str:
          """ Normalize the given phrase, i.e. remove all properties that
              are irrelevant for search.
          """
          """ Normalize the given phrase, i.e. remove all properties that
              are irrelevant for search.
          """
-        return self.normalizer.transliterate(phrase)
+        return cast(str, self.normalizer.transliterate(phrase))
  
  
  
  
-    @staticmethod
-    def normalize_postcode(postcode):
+    def normalize_postcode(self, postcode: str) -> str:
          """ Convert the postcode to a standardized form.
  
              This function must yield exactly the same result as the SQL function
          """ Convert the postcode to a standardized form.
  
              This function must yield exactly the same result as the SQL function
@@ -347,10 +372,12 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
          return postcode.strip().upper()
  
  
          return postcode.strip().upper()
  
  
-    def update_postcodes_from_db(self):
+    def update_postcodes_from_db(self) -> None:
          """ Update postcode tokens in the word table from the location_postcode
              table.
          """
          """ Update postcode tokens in the word table from the location_postcode
              table.
          """
+        assert self.conn is not None
+
          with self.conn.cursor() as cur:
              # This finds us the rows in location_postcode and word that are
              # missing in the other table.
          with self.conn.cursor() as cur:
              # This finds us the rows in location_postcode and word that are
              # missing in the other table.
@@ -384,9 +411,12 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
  
  
  
  
  
  
-    def update_special_phrases(self, phrases, should_replace):
+    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
+                               should_replace: bool) -> None:
          """ Replace the search index for special phrases with the new phrases.
          """
          """ Replace the search index for special phrases with the new phrases.
          """
+        assert self.conn is not None
+
          norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                              for p in phrases))
  
          norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                              for p in phrases))
  
@@ -423,9 +453,11 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
                   len(norm_phrases), len(to_add), len(to_delete))
  
  
                   len(norm_phrases), len(to_add), len(to_delete))
  
  
-    def add_country_names(self, country_code, names):
+    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
          """ Add names for the given country to the search index.
          """
          """ Add names for the given country to the search index.
          """
+        assert self.conn is not None
+
          with self.conn.cursor() as cur:
              cur.execute(
                  """INSERT INTO word (word_id, word_token, country_code)
          with self.conn.cursor() as cur:
              cur.execute(
                  """INSERT INTO word (word_id, word_token, country_code)
@@ -437,12 +469,14 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
                  """, (country_code, list(names.values()), country_code))
  
  
                  """, (country_code, list(names.values()), country_code))
  
  
-    def process_place(self, place):
+    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
          """ Determine tokenizer information about the given place.
  
              Returns a JSON-serialisable structure that will be handed into
              the database via the token_info field.
          """
          """ Determine tokenizer information about the given place.
  
              Returns a JSON-serialisable structure that will be handed into
              the database via the token_info field.
          """
+        assert self.conn is not None
+
          token_info = _TokenInfo(self._cache)
  
          names = place.name
          token_info = _TokenInfo(self._cache)
  
          names = place.name
@@ -451,6 +485,7 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
              token_info.add_names(self.conn, names)
  
              if place.is_country():
              token_info.add_names(self.conn, names)
  
              if place.is_country():
+                assert place.country_code is not None
                  self.add_country_names(place.country_code, names)
  
          address = place.address
                  self.add_country_names(place.country_code, names)
  
          address = place.address
@@ -460,7 +495,8 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
          return token_info.data
  
  
          return token_info.data
  
  
-    def _process_place_address(self, token_info, address):
+    def _process_place_address(self, token_info: '_TokenInfo', address: Mapping[str, str]) -> None:
+        assert self.conn is not None
          hnrs = []
          addr_terms = []
  
          hnrs = []
          addr_terms = []
  
@@ -468,15 +504,17 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
              if key == 'postcode':
                  # Make sure the normalized postcode is present in the word table.
                  if re.search(r'[:,;]', value) is None:
              if key == 'postcode':
                  # Make sure the normalized postcode is present in the word table.
                  if re.search(r'[:,;]', value) is None:
-                    self._cache.add_postcode(self.conn,
-                                             self.normalize_postcode(value))
+                    norm_pc = self.normalize_postcode(value)
+                    token_info.set_postcode(norm_pc)
+                    self._cache.add_postcode(self.conn, norm_pc)
              elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                  hnrs.append(value)
              elif key == 'street':
                  token_info.add_street(self.conn, value)
              elif key == 'place':
                  token_info.add_place(self.conn, value)
              elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                  hnrs.append(value)
              elif key == 'street':
                  token_info.add_street(self.conn, value)
              elif key == 'place':
                  token_info.add_place(self.conn, value)
-            elif not key.startswith('_') and key not in ('country', 'full'):
+            elif not key.startswith('_') \
+                 and key not in ('country', 'full', 'inclusion'):
                  addr_terms.append((key, value))
  
          if hnrs:
                  addr_terms.append((key, value))
  
          if hnrs:
@@ -490,12 +528,12 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
  class _TokenInfo:
      """ Collect token information to be sent back to the database.
      """
  class _TokenInfo:
      """ Collect token information to be sent back to the database.
      """
-    def __init__(self, cache):
+    def __init__(self, cache: '_TokenCache') -> None:
          self.cache = cache
          self.cache = cache
-        self.data = {}
+        self.data: Dict[str, Any] = {}
  
  
  
  
-    def add_names(self, conn, names):
+    def add_names(self, conn: Connection, names: Mapping[str, str]) -> None:
          """ Add token information for the names of the place.
          """
          with conn.cursor() as cur:
          """ Add token information for the names of the place.
          """
          with conn.cursor() as cur:
@@ -504,7 +542,7 @@ class _TokenInfo:
                                              (names, ))
  
  
                                              (names, ))
  
  
-    def add_housenumbers(self, conn, hnrs):
+    def add_housenumbers(self, conn: Connection, hnrs: Sequence[str]) -> None:
          """ Extract housenumber information from the address.
          """
          if len(hnrs) == 1:
          """ Extract housenumber information from the address.
          """
          if len(hnrs) == 1:
@@ -515,7 +553,7 @@ class _TokenInfo:
                  return
  
          # split numbers if necessary
                  return
  
          # split numbers if necessary
-        simple_list = []
+        simple_list: List[str] = []
          for hnr in hnrs:
              simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
  
          for hnr in hnrs:
              simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
  
@@ -524,44 +562,51 @@ class _TokenInfo:
  
          with conn.cursor() as cur:
              cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
  
          with conn.cursor() as cur:
              cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
-            self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
+            result = cur.fetchone()
+            assert result is not None
+            self.data['hnr_tokens'], self.data['hnr'] = result
  
  
  
  
-    def add_street(self, conn, street):
+    def set_postcode(self, postcode: str) -> None:
+        """ Set or replace the postcode token with the given value.
+        """
+        self.data['postcode'] = postcode
+
+    def add_street(self, conn: Connection, street: str) -> None:
          """ Add addr:street match terms.
          """
          """ Add addr:street match terms.
          """
-        def _get_street(name):
+        def _get_street(name: str) -> Optional[str]:
              with conn.cursor() as cur:
              with conn.cursor() as cur:
-                return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))
+                return cast(Optional[str],
+                            cur.scalar("SELECT word_ids_from_name(%s)::text", (name, )))
  
          tokens = self.cache.streets.get(street, _get_street)
  
          tokens = self.cache.streets.get(street, _get_street)
-        if tokens:
-            self.data['street'] = tokens
+        self.data['street'] = tokens or '{}'
  
  
  
  
-    def add_place(self, conn, place):
+    def add_place(self, conn: Connection, place: str) -> None:
          """ Add addr:place search and match terms.
          """
          """ Add addr:place search and match terms.
          """
-        def _get_place(name):
+        def _get_place(name: str) -> Tuple[List[int], List[int]]:
              with conn.cursor() as cur:
                  cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
                                        word_ids_from_name(%s)::text""",
                              (name, name))
              with conn.cursor() as cur:
                  cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
                                        word_ids_from_name(%s)::text""",
                              (name, name))
-                return cur.fetchone()
+                return cast(Tuple[List[int], List[int]], cur.fetchone())
  
          self.data['place_search'], self.data['place_match'] = \
              self.cache.places.get(place, _get_place)
  
  
  
          self.data['place_search'], self.data['place_match'] = \
              self.cache.places.get(place, _get_place)
  
  
-    def add_address_terms(self, conn, terms):
+    def add_address_terms(self, conn: Connection, terms: Sequence[Tuple[str, str]]) -> None:
          """ Add additional address terms.
          """
          """ Add additional address terms.
          """
-        def _get_address_term(name):
+        def _get_address_term(name: str) -> Tuple[List[int], List[int]]:
              with conn.cursor() as cur:
                  cur.execute("""SELECT addr_ids_from_name(%s)::text,
                                        word_ids_from_name(%s)::text""",
                              (name, name))
              with conn.cursor() as cur:
                  cur.execute("""SELECT addr_ids_from_name(%s)::text,
                                        word_ids_from_name(%s)::text""",
                              (name, name))
-                return cur.fetchone()
+                return cast(Tuple[List[int], List[int]], cur.fetchone())
  
          tokens = {}
          for key, value in terms:
  
          tokens = {}
          for key, value in terms:
@@ -578,13 +623,12 @@ class _LRU:
          produce the item when there is a cache miss.
      """
  
          produce the item when there is a cache miss.
      """
  
-    def __init__(self, maxsize=128, init_data=None):
-        self.data = init_data or OrderedDict()
+    def __init__(self, maxsize: int = 128):
+        self.data: 'OrderedDict[str, Any]' = OrderedDict()
          self.maxsize = maxsize
          self.maxsize = maxsize
-        if init_data is not None and len(init_data) > maxsize:
-            self.maxsize = len(init_data)
  
  
-    def get(self, key, generator):
+
+    def get(self, key: str, generator: Callable[[str], Any]) -> Any:
          """ Get the item with the given key from the cache. If nothing
              is found in the cache, generate the value through the
              generator function and store it in the cache.
          """ Get the item with the given key from the cache. If nothing
              is found in the cache, generate the value through the
              generator function and store it in the cache.
@@ -607,7 +651,7 @@ class _TokenCache:
          This cache is not thread-safe and needs to be instantiated per
          analyzer.
      """
          This cache is not thread-safe and needs to be instantiated per
          analyzer.
      """
-    def __init__(self, conn):
+    def __init__(self, conn: Connection):
          # various LRU caches
          self.streets = _LRU(maxsize=256)
          self.places = _LRU(maxsize=128)
          # various LRU caches
          self.streets = _LRU(maxsize=256)
          self.places = _LRU(maxsize=128)
@@ -617,18 +661,18 @@ class _TokenCache:
          with conn.cursor() as cur:
              cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
                             FROM generate_series(1, 100) as i""")
          with conn.cursor() as cur:
              cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
                             FROM generate_series(1, 100) as i""")
-            self._cached_housenumbers = {str(r[0]): r[1] for r in cur}
+            self._cached_housenumbers: Dict[str, str] = {str(r[0]): r[1] for r in cur}
  
          # For postcodes remember the ones that have already been added
  
          # For postcodes remember the ones that have already been added
-        self.postcodes = set()
+        self.postcodes: Set[str] = set()
  
  
-    def get_housenumber(self, number):
+    def get_housenumber(self, number: str) -> Optional[str]:
          """ Get a housenumber token from the cache.
          """
          return self._cached_housenumbers.get(number)
  
  
          """ Get a housenumber token from the cache.
          """
          return self._cached_housenumbers.get(number)
  
  
-    def add_postcode(self, conn, postcode):
+    def add_postcode(self, conn: Connection, postcode: str) -> None:
          """ Make sure the given postcode is in the database.
          """
          if postcode not in self.postcodes:
          """ Make sure the given postcode is in the database.
          """
          if postcode not in self.postcodes: