Merge pull request #3342 from mtmail/tyops

[nominatim.git] / nominatim / tokenizer / icu_tokenizer.py
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py

index 61263678d811db87e90cc0ab8ed55b885d24a57c..c1821d7edc7b88b2aa1f95797be2ddfce0ee0c85 100644 (file)
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -1,104 +1,188 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
  """
  Tokenizer implementing normalisation as used before Nominatim 4 but using
  libICU instead of the PostgreSQL module.
  """
  """
  Tokenizer implementing normalisation as used before Nominatim 4 but using
  libICU instead of the PostgreSQL module.
  """
-from collections import Counter
+from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
+                   Dict, Set, Iterable
  import itertools
  import json
  import logging
  import itertools
  import json
  import logging
-import re
+from pathlib import Path
  from textwrap import dedent
  
  from textwrap import dedent
  
-from nominatim.db.connection import connect
-from nominatim.db.properties import set_property, get_property
+from nominatim.db.connection import connect, Connection, Cursor
+from nominatim.config import Configuration
  from nominatim.db.utils import CopyBuffer
  from nominatim.db.sql_preprocessor import SQLPreprocessor
  from nominatim.db.utils import CopyBuffer
  from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.data.place_info import PlaceInfo
  from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.data.place_name import PlaceName
+from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
  from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  
  from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  
-DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
  DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
  
  LOG = logging.getLogger()
  
  DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
  
  LOG = logging.getLogger()
  
-def create(dsn, data_dir):
+WORD_TYPES =(('country_names', 'C'),
+             ('postcodes', 'P'),
+             ('full_word', 'W'),
+             ('housenumbers', 'H'))
+
+def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
      """ Create a new instance of the tokenizer provided by this module.
      """
      """ Create a new instance of the tokenizer provided by this module.
      """
-    return LegacyICUTokenizer(dsn, data_dir)
+    return ICUTokenizer(dsn, data_dir)
  
  
  
  
-class LegacyICUTokenizer(AbstractTokenizer):
-    """ This tokenizer uses libICU to covert names and queries to ASCII.
+class ICUTokenizer(AbstractTokenizer):
+    """ This tokenizer uses libICU to convert names and queries to ASCII.
          Otherwise it uses the same algorithms and data structures as the
          normalization routines in Nominatim 3.
      """
  
          Otherwise it uses the same algorithms and data structures as the
          normalization routines in Nominatim 3.
      """
  
-    def __init__(self, dsn, data_dir):
+    def __init__(self, dsn: str, data_dir: Path) -> None:
          self.dsn = dsn
          self.data_dir = data_dir
          self.dsn = dsn
          self.data_dir = data_dir
-        self.naming_rules = None
-        self.term_normalization = None
-        self.max_word_frequency = None
+        self.loader: Optional[ICURuleLoader] = None
  
  
  
  
-    def init_new_db(self, config, init_db=True):
+    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
          """ Set up a new tokenizer for the database.
  
              This copies all necessary data in the project directory to make
              sure the tokenizer remains stable even over updates.
          """
          """ Set up a new tokenizer for the database.
  
              This copies all necessary data in the project directory to make
              sure the tokenizer remains stable even over updates.
          """
-        loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
-                                                             config='TOKENIZER_CONFIG'))
-        self.naming_rules = ICUNameProcessorRules(loader=loader)
-        self.term_normalization = config.TERM_NORMALIZATION
-        self.max_word_frequency = config.MAX_WORD_FREQUENCY
+        self.loader = ICURuleLoader(config)
  
  
-        self._install_php(config.lib_dir.php)
-        self._save_config(config)
+        self._install_php(config.lib_dir.php, overwrite=True)
+        self._save_config()
  
          if init_db:
              self.update_sql_functions(config)
  
          if init_db:
              self.update_sql_functions(config)
-            self._init_db_tables(config)
+            self._setup_db_tables(config)
+            self._create_base_indices(config, 'word')
  
  
  
  
-    def init_from_project(self):
+    def init_from_project(self, config: Configuration) -> None:
          """ Initialise the tokenizer from the project directory.
          """
          """ Initialise the tokenizer from the project directory.
          """
+        self.loader = ICURuleLoader(config)
+
          with connect(self.dsn) as conn:
          with connect(self.dsn) as conn:
-            self.naming_rules = ICUNameProcessorRules(conn=conn)
-            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
-            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
+            self.loader.load_config_from_db(conn)
  
  
+        self._install_php(config.lib_dir.php, overwrite=False)
  
  
-    def finalize_import(self, _):
+
+    def finalize_import(self, config: Configuration) -> None:
          """ Do any required postprocessing to make the tokenizer data ready
              for use.
          """
          """ Do any required postprocessing to make the tokenizer data ready
              for use.
          """
+        self._create_lookup_indices(config, 'word')
  
  
  
  
-    def update_sql_functions(self, config):
+    def update_sql_functions(self, config: Configuration) -> None:
          """ Reimport the SQL functions for this tokenizer.
          """
          with connect(self.dsn) as conn:
          """ Reimport the SQL functions for this tokenizer.
          """
          with connect(self.dsn) as conn:
-            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
              sqlp = SQLPreprocessor(conn, config)
              sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql',
-                              max_word_freq=max_word_freq)
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  
  
  
  
-    def check_database(self):
+    def check_database(self, config: Configuration) -> None:
          """ Check that the tokenizer is set up correctly.
          """
          """ Check that the tokenizer is set up correctly.
          """
-        self.init_from_project()
+        # Will throw an error if there is an issue.
+        self.init_from_project(config)
+
  
  
-        if self.naming_rules is None:
-            return "Configuration for tokenizer 'icu' are missing."
+    def update_statistics(self, config: Configuration) -> None:
+        """ Recompute frequencies for all name words.
+        """
+        with connect(self.dsn) as conn:
+            if not conn.table_exists('search_name'):
+                return
  
  
-        return None
+            with conn.cursor() as cur:
+                LOG.info('Computing word frequencies')
+                cur.drop_table('word_frequencies')
+                cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                 SELECT unnest(name_vector) as id, count(*)
+                                 FROM search_name GROUP BY id""")
+                cur.execute('CREATE INDEX ON word_frequencies(id)')
+                LOG.info('Update word table with recomputed frequencies')
+                cur.drop_table('tmp_word')
+                cur.execute("""CREATE TABLE tmp_word AS
+                                SELECT word_id, word_token, type, word,
+                                       (CASE WHEN wf.count is null THEN info
+                                          ELSE info || jsonb_build_object('count', wf.count)
+                                        END) as info
+                                FROM word LEFT JOIN word_frequencies wf
+                                  ON word.word_id = wf.id""")
+                cur.drop_table('word_frequencies')
  
  
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_string(conn,
+                            'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
+            conn.commit()
+        self._create_base_indices(config, 'tmp_word')
+        self._create_lookup_indices(config, 'tmp_word')
+        self._move_temporary_word_table('tmp_word')
+
+
+
+    def _cleanup_housenumbers(self) -> None:
+        """ Remove unused house numbers.
+        """
+        with connect(self.dsn) as conn:
+            if not conn.table_exists('search_name'):
+                return
+            with conn.cursor(name="hnr_counter") as cur:
+                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
+                               FROM word
+                               WHERE type = 'H'
+                                 AND NOT EXISTS(SELECT * FROM search_name
+                                                WHERE ARRAY[word.word_id] && name_vector)
+                                 AND (char_length(coalesce(word, word_token)) > 6
+                                      OR coalesce(word, word_token) not similar to '\\d+')
+                            """)
+                candidates = {token: wid for wid, token in cur}
+            with conn.cursor(name="hnr_counter") as cur:
+                cur.execute("""SELECT housenumber FROM placex
+                               WHERE housenumber is not null
+                                     AND (char_length(housenumber) > 6
+                                          OR housenumber not similar to '\\d+')
+                            """)
+                for row in cur:
+                    for hnr in row[0].split(';'):
+                        candidates.pop(hnr, None)
+            LOG.info("There are %s outdated housenumbers.", len(candidates))
+            LOG.debug("Outdated housenumbers: %s", candidates.keys())
+            if candidates:
+                with conn.cursor() as cur:
+                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
+                                (list(candidates.values()), ))
+                conn.commit()
  
  
-    def name_analyzer(self):
+
+
+    def update_word_tokens(self) -> None:
+        """ Remove unused tokens.
+        """
+        LOG.warning("Cleaning up housenumber tokens.")
+        self._cleanup_housenumbers()
+        LOG.warning("Tokenizer house-keeping done.")
+
+
+    def name_analyzer(self) -> 'ICUNameAnalyzer':
          """ Create a new analyzer for tokenizing names and queries
              using this tokenizer. Analyzers are context managers and should
              be used accordingly:
          """ Create a new analyzer for tokenizing names and queries
              using this tokenizer. Analyzers are context managers and should
              be used accordingly:
@@ -113,97 +197,144 @@ class LegacyICUTokenizer(AbstractTokenizer):
  
              Analyzers are not thread-safe. You need to instantiate one per thread.
          """
  
              Analyzers are not thread-safe. You need to instantiate one per thread.
          """
-        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
+        assert self.loader is not None
+        return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
+                               self.loader.make_token_analysis())
+
+
+    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
+        """ Return a list of the `num` most frequent full words
+            in the database.
+        """
+        with conn.cursor() as cur:
+            cur.execute("""SELECT word, sum((info->>'count')::int) as count
+                             FROM word WHERE type = 'W'
+                             GROUP BY word
+                             ORDER BY count DESC LIMIT %s""", (num,))
+            return list(s[0].split('@')[0] for s in cur)
  
  
  
  
-    def _install_php(self, phpdir):
+    def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None:
          """ Install the php script for the tokenizer.
          """
          """ Install the php script for the tokenizer.
          """
-        php_file = self.data_dir / "tokenizer.php"
-        php_file.write_text(dedent(f"""\
-            <?php
-            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
-            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
-            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
-            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
+        if phpdir is not None:
+            assert self.loader is not None
+            php_file = self.data_dir / "tokenizer.php"
+
+            if not php_file.exists() or overwrite:
+                php_file.write_text(dedent(f"""\
+                    <?php
+                    @define('CONST_Max_Word_Frequency', 10000000);
+                    @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
+                    @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
+                    require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
  
  
  
  
-    def _save_config(self, config):
+    def _save_config(self) -> None:
          """ Save the configuration that needs to remain stable for the given
              database as database properties.
          """
          """ Save the configuration that needs to remain stable for the given
              database as database properties.
          """
+        assert self.loader is not None
          with connect(self.dsn) as conn:
          with connect(self.dsn) as conn:
-            self.naming_rules.save_rules(conn)
+            self.loader.save_config_to_db(conn)
  
  
-            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
-            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
  
  
-
-    def _init_db_tables(self, config):
+    def _setup_db_tables(self, config: Configuration) -> None:
          """ Drop any existing word table and create an empty one
              together with the seq_word sequence.
          """
          with connect(self.dsn) as conn:
          """ Drop any existing word table and create an empty one
              together with the seq_word sequence.
          """
          with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table('word')
              sqlp = SQLPreprocessor(conn, config)
              sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
+            sqlp.run_string(conn, """
+                CREATE TABLE word (
+                      word_id INTEGER,
+                      word_token text NOT NULL,
+                      type text NOT NULL,
+                      word text,
+                      info jsonb
+                    ) {{db.tablespace.search_data}};
+                GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
+
+                DROP SEQUENCE IF EXISTS seq_word;
+                CREATE SEQUENCE seq_word start 1;
+                GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
+            """)
              conn.commit()
  
              conn.commit()
  
-            LOG.warning("Precomputing word tokens")
  
  
-            # get partial words and their frequencies
-            words = self._count_partial_terms(conn)
-
-            # copy them back into the word table
-            with CopyBuffer() as copystr:
-                for term, cnt in words.items():
-                    copystr.add('w', term, json.dumps({'count': cnt}))
+    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
+        """ Create the basic indexes on the given word table: one on
+            word_token and one partial index on word per word type.
+        """
+        with connect(self.dsn) as conn:
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_string(conn,
+                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
+                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
+                            table_name=table_name)
+            for name, ctype in WORD_TYPES:
+                sqlp.run_string(conn,
+                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
+                                   USING BTREE (word) {{db.tablespace.address_index}}
+                                   WHERE type = '{{column_type}}'
+                                """,
+                                table_name=table_name, idx_name=name,
+                                column_type=ctype)
+            conn.commit()
  
  
-                with conn.cursor() as cur:
-                    copystr.copy_out(cur, 'word',
-                                     columns=['type', 'word_token', 'info'])
-                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-                                   WHERE word_id is null and type = 'w'""")
  
  
+    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
+        """ Create additional indexes used when running the API.
+        """
+        with connect(self.dsn) as conn:
+            sqlp = SQLPreprocessor(conn, config)
+            # Index required for details lookup.
+            sqlp.run_string(conn, """
+                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
+                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
+            """,
+            table_name=table_name)
              conn.commit()
  
              conn.commit()
  
-    def _count_partial_terms(self, conn):
-        """ Count the partial terms from the names in the place table.
-        """
-        words = Counter()
-        name_proc = ICUNameProcessor(self.naming_rules)
  
  
-        with conn.cursor(name="words") as cur:
-            cur.execute(""" SELECT v, count(*) FROM
-                              (SELECT svals(name) as v FROM place)x
-                            WHERE length(v) < 75 GROUP BY v""")
+    def _move_temporary_word_table(self, old: str) -> None:
+        """ Rename all tables and indexes used by the tokenizer.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table('word')
+                cur.execute(f"ALTER TABLE {old} RENAME TO word")
+                for idx in ('word_token', 'word_id'):
+                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
+                                      RENAME TO idx_word_{idx}""")
+                for name, _ in WORD_TYPES:
+                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
+                                    RENAME TO idx_word_{name}""")
+            conn.commit()
  
  
-            for name, cnt in cur:
-                terms = set()
-                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
-                    if ' ' in word:
-                        terms.update(word.split())
-                for term in terms:
-                    words[term] += cnt
  
  
-        return words
  
  
  
  
-class LegacyICUNameAnalyzer(AbstractAnalyzer):
-    """ The legacy analyzer uses the ICU library for splitting names.
+class ICUNameAnalyzer(AbstractAnalyzer):
+    """ The ICU analyzer uses the ICU library for splitting names.
  
          Each instance opens a connection to the database to request the
          normalization.
      """
  
  
          Each instance opens a connection to the database to request the
          normalization.
      """
  
-    def __init__(self, dsn, name_proc):
-        self.conn = connect(dsn).connection
+    def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
+                 token_analysis: ICUTokenAnalysis) -> None:
+        self.conn: Optional[Connection] = connect(dsn).connection
          self.conn.autocommit = True
          self.conn.autocommit = True
-        self.name_processor = name_proc
+        self.sanitizer = sanitizer
+        self.token_analysis = token_analysis
  
          self._cache = _TokenCache()
  
  
  
          self._cache = _TokenCache()
  
  
-    def close(self):
+    def close(self) -> None:
          """ Free all resources used by the analyzer.
          """
          if self.conn:
          """ Free all resources used by the analyzer.
          """
          if self.conn:
@@ -211,7 +342,20 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
              self.conn = None
  
  
              self.conn = None
  
  
-    def get_word_token_info(self, words):
+    def _search_normalized(self, name: str) -> str:
+        """ Return the search token transliteration of the given name.
+        """
+        return cast(str, self.token_analysis.search.transliterate(name)).strip()
+
+
+    def _normalized(self, name: str) -> str:
+        """ Return the normalized version of the given name with all
+            non-relevant information removed.
+        """
+        return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
+
+
+    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
          """ Return token information for the given list of words.
              If a word starts with # it is assumed to be a full name
              otherwise it is a partial name.
          """ Return token information for the given list of words.
              If a word starts with # it is assumed to be a full name
              otherwise it is a partial name.
@@ -222,13 +366,14 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
              The function is used for testing and debugging only
              and not necessarily efficient.
          """
              The function is used for testing and debugging only
              and not necessarily efficient.
          """
+        assert self.conn is not None
          full_tokens = {}
          partial_tokens = {}
          for word in words:
              if word.startswith('#'):
          full_tokens = {}
          partial_tokens = {}
          for word in words:
              if word.startswith('#'):
-                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
+                full_tokens[word] = self._search_normalized(word[1:])
              else:
              else:
-                partial_tokens[word] = self.name_processor.get_search_normalized(word)
+                partial_tokens[word] = self._search_normalized(word)
  
          with self.conn.cursor() as cur:
              cur.execute("""SELECT word_token, word_id
  
          with self.conn.cursor() as cur:
              cur.execute("""SELECT word_token, word_id
@@ -244,8 +389,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                 + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
  
  
                 + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
  
  
-    @staticmethod
-    def normalize_postcode(postcode):
+    def normalize_postcode(self, postcode: str) -> str:
          """ Convert the postcode to a standardized form.
  
              This function must yield exactly the same result as the SQL function
          """ Convert the postcode to a standardized form.
  
              This function must yield exactly the same result as the SQL function
@@ -254,53 +398,92 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          return postcode.strip().upper()
  
  
          return postcode.strip().upper()
  
  
-    def _make_standard_hnr(self, hnr):
-        """ Create a normalised version of a housenumber.
-
-            This function takes minor shortcuts on transliteration.
-        """
-        return self.name_processor.get_search_normalized(hnr)
-
-    def update_postcodes_from_db(self):
+    def update_postcodes_from_db(self) -> None:
          """ Update postcode tokens in the word table from the location_postcode
              table.
          """
          """ Update postcode tokens in the word table from the location_postcode
              table.
          """
-        to_delete = []
+        assert self.conn is not None
+        analyzer = self.token_analysis.analysis.get('@postcode')
+
          with self.conn.cursor() as cur:
          with self.conn.cursor() as cur:
-            # This finds us the rows in location_postcode and word that are
-            # missing in the other table.
-            cur.execute("""SELECT * FROM
-                            (SELECT pc, word FROM
-                              (SELECT distinct(postcode) as pc FROM location_postcode) p
-                              FULL JOIN
-                              (SELECT word FROM word WHERE type = 'P') w
-                              ON pc = word) x
-                           WHERE pc is null or word is null""")
-
-            with CopyBuffer() as copystr:
-                for postcode, word in cur:
-                    if postcode is None:
-                        to_delete.append(word)
-                    else:
-                        copystr.add(self.name_processor.get_search_normalized(postcode),
-                                    'P', postcode)
-
-                if to_delete:
-                    cur.execute("""DELETE FROM WORD
-                                   WHERE type ='P' and word = any(%s)
-                                """, (to_delete, ))
-
-                copystr.copy_out(cur, 'word',
-                                 columns=['word_token', 'type', 'word'])
-
-
-    def update_special_phrases(self, phrases, should_replace):
+            # First get all postcode names currently in the word table.
+            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
+            word_entries = set((entry[0] for entry in cur))
+
+            # Then compute the required postcode names from the postcode table.
+            needed_entries = set()
+            cur.execute("SELECT country_code, postcode FROM location_postcode")
+            for cc, postcode in cur:
+                info = PlaceInfo({'country_code': cc,
+                                  'class': 'place', 'type': 'postcode',
+                                  'address': {'postcode': postcode}})
+                address = self.sanitizer.process_names(info)[1]
+                for place in address:
+                    if place.kind == 'postcode':
+                        if analyzer is None:
+                            postcode_name = place.name.strip().upper()
+                            variant_base = None
+                        else:
+                            postcode_name = analyzer.get_canonical_id(place)
+                            variant_base = place.get_attr("variant")
+
+                        if variant_base:
+                            needed_entries.add(f'{postcode_name}@{variant_base}')
+                        else:
+                            needed_entries.add(postcode_name)
+                        break
+
+        # Now update the word table.
+        self._delete_unused_postcode_words(word_entries - needed_entries)
+        self._add_missing_postcode_words(needed_entries - word_entries)
+
+    def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
+        assert self.conn is not None
+        if tokens:
+            with self.conn.cursor() as cur:
+                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
+                            (list(tokens), ))
+
+    def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
+        assert self.conn is not None
+        if not tokens:
+            return
+
+        analyzer = self.token_analysis.analysis.get('@postcode')
+        terms = []
+
+        for postcode_name in tokens:
+            if '@' in postcode_name:
+                term, variant = postcode_name.split('@', 2)
+                term = self._search_normalized(term)
+                if analyzer is None:
+                    variants = [term]
+                else:
+                    variants = analyzer.compute_variants(variant)
+                    if term not in variants:
+                        variants.append(term)
+            else:
+                variants = [self._search_normalized(postcode_name)]
+            terms.append((postcode_name, variants))
+
+        if terms:
+            with self.conn.cursor() as cur:
+                cur.execute_values("""SELECT create_postcode_word(pc, var)
+                                      FROM (VALUES %s) AS v(pc, var)""",
+                                   terms)
+
+
+
+
+    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
+                               should_replace: bool) -> None:
          """ Replace the search index for special phrases with the new phrases.
              If `should_replace` is True, then the previous set of phrases will be
              completely replaced. Otherwise the phrases are added to the
              already existing ones.
          """
          """ Replace the search index for special phrases with the new phrases.
              If `should_replace` is True, then the previous set of phrases will be
              completely replaced. Otherwise the phrases are added to the
              already existing ones.
          """
-        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
+        assert self.conn is not None
+        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                              for p in phrases))
  
          with self.conn.cursor() as cur:
                              for p in phrases))
  
          with self.conn.cursor() as cur:
@@ -322,7 +505,9 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                   len(norm_phrases), added, deleted)
  
  
                   len(norm_phrases), added, deleted)
  
  
-    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
+    def _add_special_phrases(self, cursor: Cursor,
+                             new_phrases: Set[Tuple[str, str, str, str]],
+                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
          """ Add all phrases to the database that are not yet there.
          """
          to_add = new_phrases - existing_phrases
          """ Add all phrases to the database that are not yet there.
          """
          to_add = new_phrases - existing_phrases
@@ -330,7 +515,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          added = 0
          with CopyBuffer() as copystr:
              for word, cls, typ, oper in to_add:
          added = 0
          with CopyBuffer() as copystr:
              for word, cls, typ, oper in to_add:
-                term = self.name_processor.get_search_normalized(word)
+                term = self._search_normalized(word)
                  if term:
                      copystr.add(term, 'S', word,
                                  json.dumps({'class': cls, 'type': typ,
                  if term:
                      copystr.add(term, 'S', word,
                                  json.dumps({'class': cls, 'type': typ,
@@ -343,9 +528,10 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          return added
  
  
          return added
  
  
-    @staticmethod
-    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
-        """ Remove all phrases from the databse that are no longer in the
+    def _remove_special_phrases(self, cursor: Cursor,
+                             new_phrases: Set[Tuple[str, str, str, str]],
+                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
+        """ Remove all phrases from the database that are no longer in the
              new phrase list.
          """
          to_delete = existing_phrases - new_phrases
              new phrase list.
          """
          to_delete = existing_phrases - new_phrases
@@ -361,106 +547,230 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          return len(to_delete)
  
  
          return len(to_delete)
  
  
-    def add_country_names(self, country_code, names):
-        """ Add names for the given country to the search index.
+    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
+        """ Add default names for the given country to the search index.
+        """
+        # Make sure any name preprocessing for country names applies.
+        info = PlaceInfo({'name': names, 'country_code': country_code,
+                          'rank_address': 4, 'class': 'boundary',
+                          'type': 'administrative'})
+        self._add_country_full_names(country_code,
+                                     self.sanitizer.process_names(info)[0],
+                                     internal=True)
+
+
+    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
+                                internal: bool = False) -> None:
+        """ Add names for the given country from an already sanitized
+            name list.
          """
          """
+        assert self.conn is not None
          word_tokens = set()
          word_tokens = set()
-        for name in self._compute_full_names(names):
-            norm_name = self.name_processor.get_search_normalized(name)
+        for name in names:
+            norm_name = self._search_normalized(name.name)
              if norm_name:
                  word_tokens.add(norm_name)
  
          with self.conn.cursor() as cur:
              # Get existing names
              if norm_name:
                  word_tokens.add(norm_name)
  
          with self.conn.cursor() as cur:
              # Get existing names
-            cur.execute("""SELECT word_token FROM word
-                            WHERE type = 'C' and word = %s""",
+            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
+                             FROM word
+                             WHERE type = 'C' and word = %s""",
                          (country_code, ))
                          (country_code, ))
-            word_tokens.difference_update((t[0] for t in cur))
+            # internal/external names
+            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
+            for word in cur:
+                existing_tokens[word[1]].add(word[0])
+
+            # Delete names that no longer exist.
+            gone_tokens = existing_tokens[internal] - word_tokens
+            if internal:
+                gone_tokens.update(existing_tokens[False] & word_tokens)
+            if gone_tokens:
+                cur.execute("""DELETE FROM word
+                               USING unnest(%s) as token
+                               WHERE type = 'C' and word = %s
+                                     and word_token = token""",
+                            (list(gone_tokens), country_code))
  
              # Only add those names that are not yet in the list.
  
              # Only add those names that are not yet in the list.
-            if word_tokens:
-                cur.execute("""INSERT INTO word (word_token, type, word)
-                               (SELECT token, 'C', %s
-                                FROM unnest(%s) as token)
-                            """, (country_code, list(word_tokens)))
-
-            # No names are deleted at the moment.
-            # If deletion is made possible, then the static names from the
-            # initial 'country_name' table should be kept.
-
-
-    def process_place(self, place):
+            new_tokens = word_tokens - existing_tokens[True]
+            if not internal:
+                new_tokens -= existing_tokens[False]
+            if new_tokens:
+                if internal:
+                    sql = """INSERT INTO word (word_token, type, word, info)
+                               (SELECT token, 'C', %s, '{"internal": "yes"}'
+                                  FROM unnest(%s) as token)
+                           """
+                else:
+                    sql = """INSERT INTO word (word_token, type, word)
+                                   (SELECT token, 'C', %s
+                                    FROM unnest(%s) as token)
+                          """
+                cur.execute(sql, (country_code, list(new_tokens)))
+
+
+    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
          """ Determine tokenizer information about the given place.
  
          """ Determine tokenizer information about the given place.
  
-            Returns a JSON-serialisable structure that will be handed into
+            Returns a JSON-serializable structure that will be handed into
              the database via the token_info field.
          """
              the database via the token_info field.
          """
-        token_info = _TokenInfo(self._cache)
+        token_info = _TokenInfo()
  
  
-        names = place.get('name')
+        names, address = self.sanitizer.process_names(place)
  
          if names:
  
          if names:
-            fulls, partials = self._compute_name_tokens(names)
-
-            token_info.add_names(fulls, partials)
+            token_info.set_names(*self._compute_name_tokens(names))
  
  
-            country_feature = place.get('country_feature')
-            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                self.add_country_names(country_feature.lower(), names)
+            if place.is_country():
+                assert place.country_code is not None
+                self._add_country_full_names(place.country_code, names)
  
  
-        address = place.get('address')
          if address:
              self._process_place_address(token_info, address)
  
          if address:
              self._process_place_address(token_info, address)
  
-        return token_info.data
+        return token_info.to_dict()
+
+
+    def _process_place_address(self, token_info: '_TokenInfo',
+                               address: Sequence[PlaceName]) -> None:
+        for item in address:
+            if item.kind == 'postcode':
+                token_info.set_postcode(self._add_postcode(item))
+            elif item.kind == 'housenumber':
+                token_info.add_housenumber(*self._compute_housenumber_token(item))
+            elif item.kind == 'street':
+                token_info.add_street(self._retrieve_full_tokens(item.name))
+            elif item.kind == 'place':
+                if not item.suffix:
+                    token_info.add_place(self._compute_partial_tokens(item.name))
+            elif not item.kind.startswith('_') and not item.suffix and \
+                 item.kind not in ('country', 'full', 'inclusion'):
+                token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
+
+
+    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
+        """ Normalize the housenumber and return the word token and the
+            canonical form.
+        """
+        assert self.conn is not None
+        analyzer = self.token_analysis.analysis.get('@housenumber')
+        result: Tuple[Optional[int], Optional[str]] = (None, None)
  
  
+        if analyzer is None:
+            # When no custom analyzer is set, simply normalize and transliterate
+            norm_name = self._search_normalized(hnr.name)
+            if norm_name:
+                result = self._cache.housenumbers.get(norm_name, result)
+                if result[0] is None:
+                    with self.conn.cursor() as cur:
+                        hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
+
+                        result = hid, norm_name
+                        self._cache.housenumbers[norm_name] = result
+        else:
+            # Otherwise use the analyzer to determine the canonical name.
+            # Per convention we use the first variant as the 'lookup name', the
+            # name that gets saved in the housenumber field of the place.
+            word_id = analyzer.get_canonical_id(hnr)
+            if word_id:
+                result = self._cache.housenumbers.get(word_id, result)
+                if result[0] is None:
+                    variants = analyzer.compute_variants(word_id)
+                    if variants:
+                        with self.conn.cursor() as cur:
+                            hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
+                                             (word_id, list(variants)))
+                            result = hid, variants[0]
+                            self._cache.housenumbers[word_id] = result
+
+        return result
+
+
+    def _compute_partial_tokens(self, name: str) -> List[int]:
+        """ Normalize the given term, split it into partial words and return
+            the token list for them.
+        """
+        assert self.conn is not None
+        norm_name = self._search_normalized(name)
  
  
-    def _process_place_address(self, token_info, address):
-        hnrs = []
-        addr_terms = []
-        for key, value in address.items():
-            if key == 'postcode':
-                self._add_postcode(value)
-            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                hnrs.append(value)
-            elif key == 'street':
-                token_info.add_street(*self._compute_name_tokens({'name': value}))
-            elif key == 'place':
-                token_info.add_place(*self._compute_name_tokens({'name': value}))
-            elif not key.startswith('_') and \
-                 key not in ('country', 'full'):
-                addr_terms.append((key, *self._compute_name_tokens({'name': value})))
+        tokens = []
+        need_lookup = []
+        for partial in norm_name.split():
+            token = self._cache.partials.get(partial)
+            if token:
+                tokens.append(token)
+            else:
+                need_lookup.append(partial)
  
  
-        if hnrs:
-            hnrs = self._split_housenumbers(hnrs)
-            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
+        if need_lookup:
+            with self.conn.cursor() as cur:
+                cur.execute("""SELECT word, getorcreate_partial_word(word)
+                               FROM unnest(%s) word""",
+                            (need_lookup, ))
  
  
-        if addr_terms:
-            token_info.add_address_terms(addr_terms)
+                for partial, token in cur:
+                    assert token is not None
+                    tokens.append(token)
+                    self._cache.partials[partial] = token
+
+        return tokens
  
  
  
  
-    def _compute_name_tokens(self, names):
+    def _retrieve_full_tokens(self, name: str) -> List[int]:
+        """ Get the full name token for the given name, if it exists.
+            The name is only retrieved for the standard analyser.
+        """
+        assert self.conn is not None
+        norm_name = self._search_normalized(name)
+
+        # return cached if possible
+        if norm_name in self._cache.fulls:
+            return self._cache.fulls[norm_name]
+
+        with self.conn.cursor() as cur:
+            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
+                        (norm_name, ))
+            full = [row[0] for row in cur]
+
+        self._cache.fulls[norm_name] = full
+
+        return full
+
+
+    def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
          """ Computes the full name and partial name tokens for the given
              sequence of names.
          """
          """ Computes the full name and partial name tokens for the given
              sequence of names.
          """
-        full_names = self._compute_full_names(names)
-        full_tokens = set()
-        partial_tokens = set()
+        assert self.conn is not None
+        full_tokens: Set[int] = set()
+        partial_tokens: Set[int] = set()
+
+        for name in names:
+            analyzer_id = name.get_attr('analyzer')
+            analyzer = self.token_analysis.get_analyzer(analyzer_id)
+            word_id = analyzer.get_canonical_id(name)
+            if analyzer_id is None:
+                token_id = word_id
+            else:
+                token_id = f'{word_id}@{analyzer_id}'
  
  
-        for name in full_names:
-            norm_name = self.name_processor.get_normalized(name)
-            full, part = self._cache.names.get(norm_name, (None, None))
+            full, part = self._cache.names.get(token_id, (None, None))
              if full is None:
              if full is None:
-                variants = self.name_processor.get_variants_ascii(norm_name)
+                variants = analyzer.compute_variants(word_id)
                  if not variants:
                      continue
  
                  with self.conn.cursor() as cur:
                  if not variants:
                      continue
  
                  with self.conn.cursor() as cur:
-                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
-                                (norm_name, variants))
-                    full, part = cur.fetchone()
+                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
+                                (token_id, variants))
+                    full, part = cast(Tuple[int, List[int]], cur.fetchone())
+
+                self._cache.names[token_id] = (full, part)
  
  
-                self._cache.names[norm_name] = (full, part)
+            assert part is not None
  
              full_tokens.add(full)
              partial_tokens.update(part)
  
              full_tokens.add(full)
              partial_tokens.update(part)
@@ -468,116 +778,125 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          return full_tokens, partial_tokens
  
  
          return full_tokens, partial_tokens
  
  
-    @staticmethod
-    def _compute_full_names(names):
-        """ Return the set of all full name word ids to be used with the
-            given dictionary of names.
-        """
-        full_names = set()
-        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
-            if name:
-                full_names.add(name)
-
-                brace_idx = name.find('(')
-                if brace_idx >= 0:
-                    full_names.add(name[:brace_idx].strip())
-
-        return full_names
-
-
-    def _add_postcode(self, postcode):
+    def _add_postcode(self, item: PlaceName) -> Optional[str]:
          """ Make sure the normalized postcode is present in the word table.
          """
          """ Make sure the normalized postcode is present in the word table.
          """
-        if re.search(r'[:,;]', postcode) is None:
-            postcode = self.normalize_postcode(postcode)
+        assert self.conn is not None
+        analyzer = self.token_analysis.analysis.get('@postcode')
  
  
-            if postcode not in self._cache.postcodes:
-                term = self.name_processor.get_search_normalized(postcode)
-                if not term:
-                    return
+        if analyzer is None:
+            postcode_name = item.name.strip().upper()
+            variant_base = None
+        else:
+            postcode_name = analyzer.get_canonical_id(item)
+            variant_base = item.get_attr("variant")
  
  
-                with self.conn.cursor() as cur:
-                    # no word_id needed for postcodes
-                    cur.execute("""INSERT INTO word (word_token, type, word)
-                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
-                                    WHERE NOT EXISTS
-                                     (SELECT * FROM word
-                                      WHERE type = 'P' and word = pc))
-                                """, (term, postcode))
-                self._cache.postcodes.add(postcode)
-
-
-    @staticmethod
-    def _split_housenumbers(hnrs):
-        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
-            # split numbers if necessary
-            simple_list = []
-            for hnr in hnrs:
-                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
-
-            if len(simple_list) > 1:
-                hnrs = list(set(simple_list))
-            else:
-                hnrs = simple_list
+        if variant_base:
+            postcode = f'{postcode_name}@{variant_base}'
+        else:
+            postcode = postcode_name
  
  
-        return hnrs
+        if postcode not in self._cache.postcodes:
+            term = self._search_normalized(postcode_name)
+            if not term:
+                return None
  
  
+            variants = {term}
+            if analyzer is not None and variant_base:
+                variants.update(analyzer.compute_variants(variant_base))
  
  
+            with self.conn.cursor() as cur:
+                cur.execute("SELECT create_postcode_word(%s, %s)",
+                            (postcode, list(variants)))
+            self._cache.postcodes.add(postcode)
+
+        return postcode_name
  
  
  class _TokenInfo:
      """ Collect token information to be sent back to the database.
      """
  
  
  class _TokenInfo:
      """ Collect token information to be sent back to the database.
      """
-    def __init__(self, cache):
-        self._cache = cache
-        self.data = {}
+    def __init__(self) -> None:
+        self.names: Optional[str] = None
+        self.housenumbers: Set[str] = set()
+        self.housenumber_tokens: Set[int] = set()
+        self.street_tokens: Optional[Set[int]] = None
+        self.place_tokens: Set[int] = set()
+        self.address_tokens: Dict[str, str] = {}
+        self.postcode: Optional[str] = None
+
+
+    def _mk_array(self, tokens: Iterable[Any]) -> str:
+        return f"{{{','.join((str(s) for s in tokens))}}}"
+
+
+    def to_dict(self) -> Dict[str, Any]:
+        """ Return the token information in database importable format.
+        """
+        out: Dict[str, Any] = {}
  
  
-    @staticmethod
-    def _mk_array(tokens):
-        return '{%s}' % ','.join((str(s) for s in tokens))
+        if self.names:
+            out['names'] = self.names
  
  
+        if self.housenumbers:
+            out['hnr'] = ';'.join(self.housenumbers)
+            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
  
  
-    def add_names(self, fulls, partials):
+        if self.street_tokens is not None:
+            out['street'] = self._mk_array(self.street_tokens)
+
+        if self.place_tokens:
+            out['place'] = self._mk_array(self.place_tokens)
+
+        if self.address_tokens:
+            out['addr'] = self.address_tokens
+
+        if self.postcode:
+            out['postcode'] = self.postcode
+
+        return out
+
+
+    def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
          """ Adds token information for the normalised names.
          """
          """ Adds token information for the normalised names.
          """
-        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
+        self.names = self._mk_array(itertools.chain(fulls, partials))
  
  
  
  
-    def add_housenumbers(self, conn, hnrs):
+    def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
          """ Add a single housenumber together with its token. Nothing is
              added when the token is empty.
          """
          """ Add a single housenumber together with its token. Nothing is
              added when the token is empty.
          """
-        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
-        self.data['hnr'] = ';'.join(hnrs)
+        if token:
+            assert hnr is not None
+            self.housenumbers.add(hnr)
+            self.housenumber_tokens.add(token)
  
  
  
  
-    def add_street(self, fulls, _):
+    def add_street(self, tokens: Iterable[int]) -> None:
          """ Add addr:street match terms.
          """
          """ Add addr:street match terms.
          """
-        if fulls:
-            self.data['street'] = self._mk_array(fulls)
+        if self.street_tokens is None:
+            self.street_tokens = set()
+        self.street_tokens.update(tokens)
  
  
  
  
-    def add_place(self, fulls, partials):
+    def add_place(self, tokens: Iterable[int]) -> None:
          """ Add addr:place search and match terms.
          """
          """ Add addr:place search and match terms.
          """
-        if fulls:
-            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
-            self.data['place_match'] = self._mk_array(fulls)
+        self.place_tokens.update(tokens)
  
  
  
  
-    def add_address_terms(self, terms):
+    def add_address_term(self, key: str, partials: Iterable[int]) -> None:
          """ Add additional address terms.
          """
          """ Add additional address terms.
          """
-        tokens = {}
-
-        for key, fulls, partials in terms:
-            if fulls:
-                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
-                               self._mk_array(fulls)]
+        if partials:
+            self.address_tokens[key] = self._mk_array(partials)
  
  
-        if tokens:
-            self.data['addr'] = tokens
+    def set_postcode(self, postcode: Optional[str]) -> None:
+        """ Set the postcode to the given one.
+        """
+        self.postcode = postcode
  
  
  class _TokenCache:
  
  
  class _TokenCache:
@@ -586,33 +905,9 @@ class _TokenCache:
          This cache is not thread-safe and needs to be instantiated per
          analyzer.
      """
          This cache is not thread-safe and needs to be instantiated per
          analyzer.
      """
-    def __init__(self):
-        self.names = {}
-        self.postcodes = set()
-        self.housenumbers = {}
-
-
-    def get_hnr_tokens(self, conn, terms):
-        """ Get token ids for a list of housenumbers, looking them up in the
-            database if necessary. `terms` is an iterable of normalized
-            housenumbers.
-        """
-        tokens = []
-        askdb = []
-
-        for term in terms:
-            token = self.housenumbers.get(term)
-            if token is None:
-                askdb.append(term)
-            else:
-                tokens.append(token)
-
-        if askdb:
-            with conn.cursor() as cur:
-                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
-                            (askdb, ))
-                for term, tid in cur:
-                    self.housenumbers[term] = tid
-                    tokens.append(tid)
-
-        return tokens
+    def __init__(self) -> None:
+        self.names: Dict[str, Tuple[int, List[int]]] = {}
+        self.partials: Dict[str, int] = {}
+        self.fulls: Dict[str, List[int]] = {}
+        self.postcodes: Set[str] = set()
+        self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}