define formal public Python interface for tokenizer

[nominatim.git] / nominatim / tokenizer / legacy_icu_tokenizer.py
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py

index 2bd22c7207cb3f3cbf2de920a0a5887d67afd04d..44034f842622f08257878b69d392af1f47b00df7 100644 (file)
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -3,8 +3,6 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using
  libICU instead of the PostgreSQL module.
  """
  from collections import Counter
  libICU instead of the PostgreSQL module.
  """
  from collections import Counter
-import functools
-import io
  import itertools
  import json
  import logging
  import itertools
  import json
  import logging
@@ -12,17 +10,16 @@ import re
  from textwrap import dedent
  from pathlib import Path
  
  from textwrap import dedent
  from pathlib import Path
  
-from icu import Transliterator
-import psycopg2.extras
-
  from nominatim.db.connection import connect
  from nominatim.db.properties import set_property, get_property
  from nominatim.db.connection import connect
  from nominatim.db.properties import set_property, get_property
+from nominatim.db.utils import CopyBuffer
  from nominatim.db.sql_preprocessor import SQLPreprocessor
  from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
+from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
+from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  
  
-DBCFG_NORMALIZATION = "tokenizer_normalization"
  DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
  DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
-DBCFG_TRANSLITERATION = "tokenizer_transliteration"
-DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"
+DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
  
  LOG = logging.getLogger()
  
  
  LOG = logging.getLogger()
  
@@ -32,7 +29,7 @@ def create(dsn, data_dir):
      return LegacyICUTokenizer(dsn, data_dir)
  
  
      return LegacyICUTokenizer(dsn, data_dir)
  
  
-class LegacyICUTokenizer:
+class LegacyICUTokenizer(AbstractTokenizer):
      """ This tokenizer uses libICU to covert names and queries to ASCII.
          Otherwise it uses the same algorithms and data structures as the
          normalization routines in Nominatim 3.
      """ This tokenizer uses libICU to covert names and queries to ASCII.
          Otherwise it uses the same algorithms and data structures as the
          normalization routines in Nominatim 3.
@@ -41,9 +38,9 @@ class LegacyICUTokenizer:
      def __init__(self, dsn, data_dir):
          self.dsn = dsn
          self.data_dir = data_dir
      def __init__(self, dsn, data_dir):
          self.dsn = dsn
          self.data_dir = data_dir
-        self.normalization = None
-        self.transliteration = None
-        self.abbreviations = None
+        self.naming_rules = None
+        self.term_normalization = None
+        self.max_word_frequency = None
  
  
      def init_new_db(self, config, init_db=True):
  
  
      def init_new_db(self, config, init_db=True):
@@ -55,14 +52,14 @@ class LegacyICUTokenizer:
          if config.TOKENIZER_CONFIG:
              cfgfile = Path(config.TOKENIZER_CONFIG)
          else:
          if config.TOKENIZER_CONFIG:
              cfgfile = Path(config.TOKENIZER_CONFIG)
          else:
-            cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
+            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
  
  
-        rules = json.loads(cfgfile.read_text())
-        self.transliteration = ';'.join(rules['normalization']) + ';'
-        self.abbreviations = rules["abbreviations"]
-        self.normalization = config.TERM_NORMALIZATION
+        loader = ICURuleLoader(cfgfile)
+        self.naming_rules = ICUNameProcessorRules(loader=loader)
+        self.term_normalization = config.TERM_NORMALIZATION
+        self.max_word_frequency = config.MAX_WORD_FREQUENCY
  
  
-        self._install_php(config)
+        self._install_php(config.lib_dir.php)
          self._save_config(config)
  
          if init_db:
          self._save_config(config)
  
          if init_db:
@@ -74,18 +71,15 @@ class LegacyICUTokenizer:
          """ Initialise the tokenizer from the project directory.
          """
          with connect(self.dsn) as conn:
          """ Initialise the tokenizer from the project directory.
          """
          with connect(self.dsn) as conn:
-            self.normalization = get_property(conn, DBCFG_NORMALIZATION)
-            self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
-            self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))
+            self.naming_rules = ICUNameProcessorRules(conn=conn)
+            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
+            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
  
  
  
  
-    def finalize_import(self, config):
+    def finalize_import(self, _):
          """ Do any required postprocessing to make the tokenizer data ready
              for use.
          """
          """ Do any required postprocessing to make the tokenizer data ready
              for use.
          """
-        with connect(self.dsn) as conn:
-            sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
  
  
      def update_sql_functions(self, config):
  
  
      def update_sql_functions(self, config):
@@ -103,9 +97,7 @@ class LegacyICUTokenizer:
          """
          self.init_from_project()
  
          """
          self.init_from_project()
  
-        if self.normalization is None\
-           or self.transliteration is None\
-           or self.abbreviations is None:
+        if self.naming_rules is None:
              return "Configuration for tokenizer 'legacy_icu' are missing."
  
          return None
              return "Configuration for tokenizer 'legacy_icu' are missing."
  
          return None
@@ -126,26 +118,19 @@ class LegacyICUTokenizer:
  
              Analyzers are not thread-safe. You need to instantiate one per thread.
          """
  
              Analyzers are not thread-safe. You need to instantiate one per thread.
          """
-        norm = Transliterator.createFromRules("normalizer", self.normalization)
-        trans = Transliterator.createFromRules("trans", self.transliteration)
-        return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
+        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
  
  
  
  
-    def _install_php(self, config):
+    def _install_php(self, phpdir):
          """ Install the php script for the tokenizer.
          """
          """ Install the php script for the tokenizer.
          """
-        abbr_inverse = list(zip(*self.abbreviations))
          php_file = self.data_dir / "tokenizer.php"
          php_file = self.data_dir / "tokenizer.php"
-        php_file.write_text(dedent("""\
+        php_file.write_text(dedent(f"""\
              <?php
              <?php
-            @define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
-            @define('CONST_Term_Normalization_Rules', "{0.normalization}");
-            @define('CONST_Transliteration', "{0.transliteration}");
-            @define('CONST_Abbreviations', array(array('{2}'), array('{3}')));
-            require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
-            """.format(self, config,
-                       "','".join(abbr_inverse[0]),
-                       "','".join(abbr_inverse[1]))))
+            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
+            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
+            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
+            require_once('{phpdir}/tokenizer/legacy_icu_tokenizer.php');"""))
  
  
      def _save_config(self, config):
  
  
      def _save_config(self, config):
@@ -153,10 +138,10 @@ class LegacyICUTokenizer:
              database as database properties.
          """
          with connect(self.dsn) as conn:
              database as database properties.
          """
          with connect(self.dsn) as conn:
-            set_property(conn, DBCFG_NORMALIZATION, self.normalization)
+            self.naming_rules.save_rules(conn)
+
              set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
              set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
-            set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
-            set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))
+            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
  
  
      def _init_db_tables(self, config):
  
  
      def _init_db_tables(self, config):
@@ -165,61 +150,64 @@ class LegacyICUTokenizer:
          """
          with connect(self.dsn) as conn:
              sqlp = SQLPreprocessor(conn, config)
          """
          with connect(self.dsn) as conn:
              sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
              conn.commit()
  
              LOG.warning("Precomputing word tokens")
  
              # get partial words and their frequencies
              conn.commit()
  
              LOG.warning("Precomputing word tokens")
  
              # get partial words and their frequencies
-            words = Counter()
-            with self.name_analyzer() as analyzer:
-                with conn.cursor(name="words") as cur:
-                    cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
-
-                    for name, cnt in cur:
-                        term = analyzer.make_standard_word(name)
-                        if term:
-                            for word in term.split():
-                                words[word] += cnt
+            words = self._count_partial_terms(conn)
  
              # copy them back into the word table
  
              # copy them back into the word table
-            copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
-
+            with CopyBuffer() as copystr:
+                for term, cnt in words.items():
+                    copystr.add('w', term, json.dumps({'count': cnt}))
  
  
-            with conn.cursor() as cur:
-                copystr.seek(0)
-                cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
-                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-                               WHERE word_id is null""")
+                with conn.cursor() as cur:
+                    copystr.copy_out(cur, 'word',
+                                     columns=['type', 'word_token', 'info'])
+                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
+                                   WHERE word_id is null and type = 'w'""")
  
              conn.commit()
  
  
              conn.commit()
  
+    def _count_partial_terms(self, conn):
+        """ Count the partial terms from the names in the place table.
+        """
+        words = Counter()
+        name_proc = ICUNameProcessor(self.naming_rules)
+
+        with conn.cursor(name="words") as cur:
+            cur.execute(""" SELECT v, count(*) FROM
+                              (SELECT svals(name) as v FROM place)x
+                            WHERE length(v) < 75 GROUP BY v""")
+
+            for name, cnt in cur:
+                terms = set()
+                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
+                    if ' ' in word:
+                        terms.update(word.split())
+                for term in terms:
+                    words[term] += cnt
  
  
-class LegacyICUNameAnalyzer:
+        return words
+
+
+class LegacyICUNameAnalyzer(AbstractAnalyzer):
      """ The legacy analyzer uses the ICU library for splitting names.
  
          Each instance opens a connection to the database to request the
          normalization.
      """
  
      """ The legacy analyzer uses the ICU library for splitting names.
  
          Each instance opens a connection to the database to request the
          normalization.
      """
  
-    def __init__(self, dsn, normalizer, transliterator, abbreviations):
+    def __init__(self, dsn, name_proc):
          self.conn = connect(dsn).connection
          self.conn.autocommit = True
          self.conn = connect(dsn).connection
          self.conn.autocommit = True
-        self.normalizer = normalizer
-        self.transliterator = transliterator
-        self.abbreviations = abbreviations
+        self.name_processor = name_proc
  
          self._cache = _TokenCache()
  
  
  
          self._cache = _TokenCache()
  
  
-    def __enter__(self):
-        return self
-
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.close()
-
-
      def close(self):
          """ Free all resources used by the analyzer.
          """
      def close(self):
          """ Free all resources used by the analyzer.
          """
@@ -228,7 +216,7 @@ class LegacyICUNameAnalyzer:
              self.conn = None
  
  
              self.conn = None
  
  
-    def get_word_token_info(self, conn, words):
+    def get_word_token_info(self, words):
          """ Return token information for the given list of words.
              If a word starts with # it is assumed to be a full name
              otherwise is a partial name.
          """ Return token information for the given list of words.
              If a word starts with # it is assumed to be a full name
              otherwise is a partial name.
@@ -239,30 +227,28 @@ class LegacyICUNameAnalyzer:
              The function is used for testing and debugging only
              and not necessarily efficient.
          """
              The function is used for testing and debugging only
              and not necessarily efficient.
          """
-        tokens = {}
+        full_tokens = {}
+        partial_tokens = {}
          for word in words:
              if word.startswith('#'):
          for word in words:
              if word.startswith('#'):
-                tokens[word] = ' ' + self.make_standard_word(word[1:])
+                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
              else:
              else:
-                tokens[word] = self.make_standard_word(word)
+                partial_tokens[word] = self.name_processor.get_search_normalized(word)
  
  
-        with conn.cursor() as cur:
+        with self.conn.cursor() as cur:
              cur.execute("""SELECT word_token, word_id
              cur.execute("""SELECT word_token, word_id
-                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
-                           WHERE word_token = t.term
-                                 and class is null and country_code is null""",
-                        (list(tokens.values()), ))
-            ids = {r[0]: r[1] for r in cur}
+                            FROM word WHERE word_token = ANY(%s) and type = 'W'
+                        """, (list(full_tokens.values()),))
+            full_ids = {r[0]: r[1] for r in cur}
+            cur.execute("""SELECT word_token, word_id
+                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
+                        (list(partial_tokens.values()),))
+            part_ids = {r[0]: r[1] for r in cur}
  
  
-        return [(k, v, ids[v]) for k, v in tokens.items()]
+        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
+               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
  
  
  
  
-    def normalize(self, phrase):
-        """ Normalize the given phrase, i.e. remove all properties that
-            are irrelevant for search.
-        """
-        return self.normalizer.transliterate(phrase)
-
      @staticmethod
      def normalize_postcode(postcode):
          """ Convert the postcode to a standardized form.
      @staticmethod
      def normalize_postcode(postcode):
          """ Convert the postcode to a standardized form.
@@ -273,34 +259,18 @@ class LegacyICUNameAnalyzer:
          return postcode.strip().upper()
  
  
          return postcode.strip().upper()
  
  
-    @functools.lru_cache(maxsize=1024)
-    def make_standard_word(self, name):
-        """ Create the normalised version of the input.
-        """
-        norm = ' ' + self.transliterator.transliterate(name) + ' '
-        for full, abbr in self.abbreviations:
-            if full in norm:
-                norm = norm.replace(full, abbr)
-
-        return norm.strip()
-
-
      def _make_standard_hnr(self, hnr):
          """ Create a normalised version of a housenumber.
  
              This function takes minor shortcuts on transliteration.
          """
      def _make_standard_hnr(self, hnr):
          """ Create a normalised version of a housenumber.
  
              This function takes minor shortcuts on transliteration.
          """
-        if hnr.isdigit():
-            return hnr
-
-        return self.transliterator.transliterate(hnr)
+        return self.name_processor.get_search_normalized(hnr)
  
      def update_postcodes_from_db(self):
          """ Update postcode tokens in the word table from the location_postcode
              table.
          """
          to_delete = []
  
      def update_postcodes_from_db(self):
          """ Update postcode tokens in the word table from the location_postcode
              table.
          """
          to_delete = []
-        copystr = io.StringIO()
          with self.conn.cursor() as cur:
              # This finds us the rows in location_postcode and word that are
              # missing in the other table.
          with self.conn.cursor() as cur:
              # This finds us the rows in location_postcode and word that are
              # missing in the other table.
@@ -308,108 +278,120 @@ class LegacyICUNameAnalyzer:
                              (SELECT pc, word FROM
                                (SELECT distinct(postcode) as pc FROM location_postcode) p
                                FULL JOIN
                              (SELECT pc, word FROM
                                (SELECT distinct(postcode) as pc FROM location_postcode) p
                                FULL JOIN
-                              (SELECT word FROM word
-                                WHERE class ='place' and type = 'postcode') w
+                              (SELECT word FROM word WHERE type = 'P') w
                                ON pc = word) x
                             WHERE pc is null or word is null""")
  
                                ON pc = word) x
                             WHERE pc is null or word is null""")
  
-            for postcode, word in cur:
-                if postcode is None:
-                    to_delete.append(word)
-                else:
-                    copystr.write(postcode)
-                    copystr.write('\t ')
-                    copystr.write(self.transliterator.transliterate(postcode))
-                    copystr.write('\tplace\tpostcode\t0\n')
+            with CopyBuffer() as copystr:
+                for postcode, word in cur:
+                    if postcode is None:
+                        to_delete.append(word)
+                    else:
+                        copystr.add(self.name_processor.get_search_normalized(postcode),
+                                    'P', postcode)
  
  
-            if to_delete:
-                cur.execute("""DELETE FROM WORD
-                               WHERE class ='place' and type = 'postcode'
-                                     and word = any(%s)
-                            """, (to_delete, ))
+                if to_delete:
+                    cur.execute("""DELETE FROM WORD
+                                   WHERE type ='P' and word = any(%s)
+                                """, (to_delete, ))
  
  
-            if copystr.getvalue():
-                copystr.seek(0)
-                cur.copy_from(copystr, 'word',
-                              columns=['word', 'word_token', 'class', 'type',
-                                       'search_name_count'])
+                copystr.copy_out(cur, 'word',
+                                 columns=['word_token', 'type', 'word'])
  
  
      def update_special_phrases(self, phrases, should_replace):
          """ Replace the search index for special phrases with the new phrases.
  
  
      def update_special_phrases(self, phrases, should_replace):
          """ Replace the search index for special phrases with the new phrases.
+            If `should_replace` is True, then the previous set of phrases will be
+            completely replaced. Otherwise the phrases are added to the
+            already existing ones.
          """
          """
-        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                              for p in phrases))
  
          with self.conn.cursor() as cur:
              # Get the old phrases.
              existing_phrases = set()
                              for p in phrases))
  
          with self.conn.cursor() as cur:
              # Get the old phrases.
              existing_phrases = set()
-            cur.execute("""SELECT word, class, type, operator FROM word
-                           WHERE class != 'place'
-                                 OR (type != 'house' AND type != 'postcode')""")
-            for label, cls, typ, oper in cur:
-                existing_phrases.add((label, cls, typ, oper or '-'))
-
-            to_add = norm_phrases - existing_phrases
-            to_delete = existing_phrases - norm_phrases
-
-            if to_add:
-                copystr = io.StringIO()
-                for word, cls, typ, oper in to_add:
-                    term = self.make_standard_word(word)
-                    if term:
-                        copystr.write(word)
-                        copystr.write('\t ')
-                        copystr.write(term)
-                        copystr.write('\t')
-                        copystr.write(cls)
-                        copystr.write('\t')
-                        copystr.write(typ)
-                        copystr.write('\t')
-                        copystr.write(oper if oper in ('in', 'near')  else '\\N')
-                        copystr.write('\t0\n')
-
-                copystr.seek(0)
-                cur.copy_from(copystr, 'word',
-                              columns=['word', 'word_token', 'class', 'type',
-                                       'operator', 'search_name_count'])
-
-            if to_delete and should_replace:
-                psycopg2.extras.execute_values(
-                    cur,
-                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
-                        WHERE word = name and class = in_class and type = in_type
-                              and ((op = '-' and operator is null) or op = operator)""",
-                    to_delete)
+            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
+            for word, info in cur:
+                existing_phrases.add((word, info['class'], info['type'],
+                                      info.get('op') or '-'))
+
+            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
+            if should_replace:
+                deleted = self._remove_special_phrases(cur, norm_phrases,
+                                                       existing_phrases)
+            else:
+                deleted = 0
  
          LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
  
          LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
-                 len(norm_phrases), len(to_add), len(to_delete))
+                 len(norm_phrases), added, deleted)
  
  
  
  
-    def add_country_names(self, country_code, names):
-        """ Add names for the given country to the search index.
+    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
+        """ Add all phrases to the database that are not yet there.
+        """
+        to_add = new_phrases - existing_phrases
+
+        added = 0
+        with CopyBuffer() as copystr:
+            for word, cls, typ, oper in to_add:
+                term = self.name_processor.get_search_normalized(word)
+                if term:
+                    copystr.add(term, 'S', word,
+                                json.dumps({'class': cls, 'type': typ,
+                                            'op': oper if oper in ('in', 'near') else None}))
+                    added += 1
+
+            copystr.copy_out(cursor, 'word',
+                             columns=['word_token', 'type', 'word', 'info'])
+
+        return added
+
+
+    @staticmethod
+    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
+        """ Remove all phrases from the database that are no longer in the
+            new phrase list.
          """
          """
-        full_names = set((self.make_standard_word(n) for n in names))
-        full_names.discard('')
-        self._add_normalized_country_names(country_code, full_names)
+        to_delete = existing_phrases - new_phrases
+
+        if to_delete:
+            cursor.execute_values(
+                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
+                    WHERE type = 'S' and word = name
+                          and info->>'class' = in_class and info->>'type' = in_type
+                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
+                """, to_delete)
+
+        return len(to_delete)
  
  
  
  
-    def _add_normalized_country_names(self, country_code, names):
+    def add_country_names(self, country_code, names):
          """ Add names for the given country to the search index.
          """
          """ Add names for the given country to the search index.
          """
-        word_tokens = set((' ' + name for name in names))
+        word_tokens = set()
+        for name in self._compute_full_names(names):
+            norm_name = self.name_processor.get_search_normalized(name)
+            if norm_name:
+                word_tokens.add(norm_name)
+
          with self.conn.cursor() as cur:
              # Get existing names
          with self.conn.cursor() as cur:
              # Get existing names
-            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
+            cur.execute("""SELECT word_token FROM word
+                            WHERE type = 'C' and word = %s""",
                          (country_code, ))
              word_tokens.difference_update((t[0] for t in cur))
  
                          (country_code, ))
              word_tokens.difference_update((t[0] for t in cur))
  
+            # Only add those names that are not yet in the list.
              if word_tokens:
              if word_tokens:
-                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
-                                                 search_name_count)
-                               (SELECT nextval('seq_word'), token, '{}', 0
+                cur.execute("""INSERT INTO word (word_token, type, word)
+                               (SELECT token, 'C', %s
                                  FROM unnest(%s) as token)
                                  FROM unnest(%s) as token)
-                            """.format(country_code), (list(word_tokens),))
+                            """, (country_code, list(word_tokens)))
+
+            # No names are deleted at the moment.
+            # If deletion is made possible, then the static names from the
+            # initial 'country_name' table should be kept.
  
  
      def process_place(self, place):
  
  
      def process_place(self, place):
@@ -423,58 +405,87 @@ class LegacyICUNameAnalyzer:
          names = place.get('name')
  
          if names:
          names = place.get('name')
  
          if names:
-            full_names = self._compute_full_names(names)
+            fulls, partials = self._compute_name_tokens(names)
  
  
-            token_info.add_names(self.conn, full_names)
+            token_info.add_names(fulls, partials)
  
              country_feature = place.get('country_feature')
              if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
  
              country_feature = place.get('country_feature')
              if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                self._add_normalized_country_names(country_feature.lower(),
-                                                   full_names)
+                self.add_country_names(country_feature.lower(), names)
  
          address = place.get('address')
  
          address = place.get('address')
-
          if address:
          if address:
-            hnrs = []
-            addr_terms = []
-            for key, value in address.items():
-                if key == 'postcode':
-                    self._add_postcode(value)
-                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                    hnrs.append(value)
-                elif key == 'street':
-                    token_info.add_street(self.conn, self.make_standard_word(value))
-                elif key == 'place':
-                    token_info.add_place(self.conn, self.make_standard_word(value))
-                elif not key.startswith('_') and \
-                     key not in ('country', 'full'):
-                    addr_terms.append((key, self.make_standard_word(value)))
-
-            if hnrs:
-                hnrs = self._split_housenumbers(hnrs)
-                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
-
-            if addr_terms:
-                token_info.add_address_terms(self.conn, addr_terms)
+            self._process_place_address(token_info, address)
  
          return token_info.data
  
  
  
          return token_info.data
  
  
-    def _compute_full_names(self, names):
+    def _process_place_address(self, token_info, address):
+        hnrs = []
+        addr_terms = []
+        for key, value in address.items():
+            if key == 'postcode':
+                self._add_postcode(value)
+            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                hnrs.append(value)
+            elif key == 'street':
+                token_info.add_street(*self._compute_name_tokens({'name': value}))
+            elif key == 'place':
+                token_info.add_place(*self._compute_name_tokens({'name': value}))
+            elif not key.startswith('_') and \
+                 key not in ('country', 'full'):
+                addr_terms.append((key, *self._compute_name_tokens({'name': value})))
+
+        if hnrs:
+            hnrs = self._split_housenumbers(hnrs)
+            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
+
+        if addr_terms:
+            token_info.add_address_terms(addr_terms)
+
+
+    def _compute_name_tokens(self, names):
+        """ Computes the full name and partial name tokens for the given
+            dictionary of names.
+        """
+        full_names = self._compute_full_names(names)
+        full_tokens = set()
+        partial_tokens = set()
+
+        for name in full_names:
+            norm_name = self.name_processor.get_normalized(name)
+            full, part = self._cache.names.get(norm_name, (None, None))
+            if full is None:
+                variants = self.name_processor.get_variants_ascii(norm_name)
+                if not variants:
+                    continue
+
+                with self.conn.cursor() as cur:
+                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
+                                (norm_name, variants))
+                    full, part = cur.fetchone()
+
+                self._cache.names[norm_name] = (full, part)
+
+            full_tokens.add(full)
+            partial_tokens.update(part)
+
+        return full_tokens, partial_tokens
+
+
+    @staticmethod
+    def _compute_full_names(names):
          """ Return the set of all full name word ids to be used with the
              given dictionary of names.
          """
          full_names = set()
          """ Return the set of all full name word ids to be used with the
              given dictionary of names.
          """
          full_names = set()
-        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
-            word = self.make_standard_word(name)
-            if word:
-                full_names.add(word)
+        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
+            if name:
+                full_names.add(name)
  
  
-                brace_split = name.split('(', 2)
-                if len(brace_split) > 1:
-                    word = self.make_standard_word(brace_split[0])
-                    if word:
-                        full_names.add(word)
+                brace_idx = name.find('(')
+                if brace_idx >= 0:
+                    full_names.add(name[:brace_idx].strip())
  
          return full_names
  
  
          return full_names
  
@@ -486,22 +497,21 @@ class LegacyICUNameAnalyzer:
              postcode = self.normalize_postcode(postcode)
  
              if postcode not in self._cache.postcodes:
-                term = self.make_standard_word(postcode)
+                term = self.name_processor.get_search_normalized(postcode)
                  if not term:
                      return
  
                  with self.conn.cursor() as cur:
                      # no word_id needed for postcodes
-                    cur.execute("""INSERT INTO word (word, word_token, class, type,
-                                                     search_name_count)
-                                   (SELECT pc, %s, 'place', 'postcode', 0
-                                    FROM (VALUES (%s)) as v(pc)
+                    cur.execute("""INSERT INTO word (word_token, type, word)
+                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                      WHERE NOT EXISTS
                                       (SELECT * FROM word
-                                      WHERE word = pc and class='place' and type='postcode'))
-                                """, (' ' + term, postcode))
+                                      WHERE type = 'P' and word = pc))
+                                """, (term, postcode))
                  self._cache.postcodes.add(postcode)
  
+
      @staticmethod
      def _split_housenumbers(hnrs):
          if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
@@ -524,7 +534,7 @@ class _TokenInfo:
      """ Collect token information to be sent back to the database.
      """
      def __init__(self, cache):
      """ Collect token information to be sent back to the database.
      """
      def __init__(self, cache):
-        self.cache = cache
+        self._cache = cache
          self.data = {}
  
      @staticmethod
@@ -532,86 +542,44 @@ class _TokenInfo:
          return '{%s}' % ','.join((str(s) for s in tokens))
  
  
-    def add_names(self, conn, names):
+    def add_names(self, fulls, partials):
          """ Adds token information for the normalised names.
          """
          """ Adds token information for the normalised names.
          """
-        # Start with all partial names
-        terms = set((part for ns in names for part in ns.split()))
-        # Add the full names
-        terms.update((' ' + n for n in names))
-
-        self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))
+        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
  
  
      def add_housenumbers(self, conn, hnrs):
          """ Extract housenumber information from a list of normalised
              housenumbers.
          """
-        self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
+        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
          self.data['hnr'] = ';'.join(hnrs)
  
  
-    def add_street(self, conn, street):
+    def add_street(self, fulls, _):
          """ Add addr:street match terms.
          """
          """ Add addr:street match terms.
          """
-        if not street:
-            return
-
-        term = ' ' + street
-
-        tid = self.cache.names.get(term)
-
-        if tid is None:
-            with conn.cursor() as cur:
-                cur.execute("""SELECT word_id FROM word
-                                WHERE word_token = %s
-                                      and class is null and type is null""",
-                            (term, ))
-                if cur.rowcount > 0:
-                    tid = cur.fetchone()[0]
-                    self.cache.names[term] = tid
-
-        if tid is not None:
-            self.data['street'] = '{%d}' % tid
+        if fulls:
+            self.data['street'] = self._mk_array(fulls)
  
  
-    def add_place(self, conn, place):
+    def add_place(self, fulls, partials):
          """ Add addr:place search and match terms.
          """
          """ Add addr:place search and match terms.
          """
-        if not place:
-            return
-
-        partial_ids = self.cache.get_term_tokens(conn, place.split())
-        tid = self.cache.get_term_tokens(conn, [' ' + place])
+        if fulls:
+            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
+            self.data['place_match'] = self._mk_array(fulls)
  
-        self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
-        self.data['place_match'] = '{%s}' % tid[0]
  
-
-    def add_address_terms(self, conn, terms):
+    def add_address_terms(self, terms):
          """ Add additional address terms.
          """
          tokens = {}
  
          """ Add additional address terms.
          """
          tokens = {}
  
-        for key, value in terms:
-            if not value:
-                continue
-            partial_ids = self.cache.get_term_tokens(conn, value.split())
-            term = ' ' + value
-            tid = self.cache.names.get(term)
-
-            if tid is None:
-                with conn.cursor() as cur:
-                    cur.execute("""SELECT word_id FROM word
-                                    WHERE word_token = %s
-                                          and class is null and type is null""",
-                                (term, ))
-                    if cur.rowcount > 0:
-                        tid = cur.fetchone()[0]
-                        self.cache.names[term] = tid
-
-            tokens[key] = [self._mk_array(partial_ids),
-                           '{%s}' % ('' if tid is None else str(tid))]
+        for key, fulls, partials in terms:
+            if fulls:
+                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
+                               self._mk_array(fulls)]
  
          if tokens:
              self.data['addr'] = tokens
@@ -629,35 +597,10 @@ class _TokenCache:
          self.housenumbers = {}
  
  
-    def get_term_tokens(self, conn, terms):
-        """ Get token ids for a list of terms, looking them up in the database
-            if necessary.
-        """
-        tokens = []
-        askdb = []
-
-        for term in terms:
-            token = self.names.get(term)
-            if token is None:
-                askdb.append(term)
-            elif token != 0:
-                tokens.append(token)
-
-        if askdb:
-            with conn.cursor() as cur:
-                cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
-                            (askdb, ))
-                for term, tid in cur:
-                    self.names[term] = tid
-                    if tid != 0:
-                        tokens.append(tid)
-
-        return tokens
-
-
      def get_hnr_tokens(self, conn, terms):
          """ Get token ids for a list of housenumbers, looking them up in the
-            database if necessary.
+            database if necessary. `terms` is an iterable of normalized
+            housenumbers.
          """
          tokens = []
          askdb = []
          """
          tokens = []
          askdb = []