git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/legacy_icu_tokenizer.py
define formal public Python interface for tokenizer
[nominatim.git] / nominatim / tokenizer / legacy_icu_tokenizer.py
index 2bd22c7207cb3f3cbf2de920a0a5887d67afd04d..44034f842622f08257878b69d392af1f47b00df7 100644 (file)
@@ -3,8 +3,6 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
 from collections import Counter
 libICU instead of the PostgreSQL module.
 """
 from collections import Counter
-import functools
-import io
 import itertools
 import json
 import logging
 import itertools
 import json
 import logging
@@ -12,17 +10,16 @@ import re
 from textwrap import dedent
 from pathlib import Path
 
 from textwrap import dedent
 from pathlib import Path
 
-from icu import Transliterator
-import psycopg2.extras
-
 from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
+from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
+from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
+from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
 
-DBCFG_NORMALIZATION = "tokenizer_normalization"
 DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
 DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
-DBCFG_TRANSLITERATION = "tokenizer_transliteration"
-DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"
+DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
 
 LOG = logging.getLogger()
 
 
 LOG = logging.getLogger()
 
@@ -32,7 +29,7 @@ def create(dsn, data_dir):
     return LegacyICUTokenizer(dsn, data_dir)
 
 
     return LegacyICUTokenizer(dsn, data_dir)
 
 
-class LegacyICUTokenizer:
+class LegacyICUTokenizer(AbstractTokenizer):
     """ This tokenizer uses libICU to convert names and queries to ASCII.
         Otherwise it uses the same algorithms and data structures as the
         normalization routines in Nominatim 3.
     """ This tokenizer uses libICU to convert names and queries to ASCII.
         Otherwise it uses the same algorithms and data structures as the
         normalization routines in Nominatim 3.
@@ -41,9 +38,9 @@ class LegacyICUTokenizer:
     def __init__(self, dsn, data_dir):
         self.dsn = dsn
         self.data_dir = data_dir
     def __init__(self, dsn, data_dir):
         self.dsn = dsn
         self.data_dir = data_dir
-        self.normalization = None
-        self.transliteration = None
-        self.abbreviations = None
+        self.naming_rules = None
+        self.term_normalization = None
+        self.max_word_frequency = None
 
 
     def init_new_db(self, config, init_db=True):
 
 
     def init_new_db(self, config, init_db=True):
@@ -55,14 +52,14 @@ class LegacyICUTokenizer:
         if config.TOKENIZER_CONFIG:
             cfgfile = Path(config.TOKENIZER_CONFIG)
         else:
         if config.TOKENIZER_CONFIG:
             cfgfile = Path(config.TOKENIZER_CONFIG)
         else:
-            cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
+            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
 
 
-        rules = json.loads(cfgfile.read_text())
-        self.transliteration = ';'.join(rules['normalization']) + ';'
-        self.abbreviations = rules["abbreviations"]
-        self.normalization = config.TERM_NORMALIZATION
+        loader = ICURuleLoader(cfgfile)
+        self.naming_rules = ICUNameProcessorRules(loader=loader)
+        self.term_normalization = config.TERM_NORMALIZATION
+        self.max_word_frequency = config.MAX_WORD_FREQUENCY
 
 
-        self._install_php(config)
+        self._install_php(config.lib_dir.php)
         self._save_config(config)
 
         if init_db:
         self._save_config(config)
 
         if init_db:
@@ -74,18 +71,15 @@ class LegacyICUTokenizer:
         """ Initialise the tokenizer from the project directory.
         """
         with connect(self.dsn) as conn:
         """ Initialise the tokenizer from the project directory.
         """
         with connect(self.dsn) as conn:
-            self.normalization = get_property(conn, DBCFG_NORMALIZATION)
-            self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
-            self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))
+            self.naming_rules = ICUNameProcessorRules(conn=conn)
+            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
+            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
 
 
 
 
-    def finalize_import(self, config):
+    def finalize_import(self, _):
         """ Do any required postprocessing to make the tokenizer data ready
             for use.
         """
         """ Do any required postprocessing to make the tokenizer data ready
             for use.
         """
-        with connect(self.dsn) as conn:
-            sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
 
 
     def update_sql_functions(self, config):
 
 
     def update_sql_functions(self, config):
@@ -103,9 +97,7 @@ class LegacyICUTokenizer:
         """
         self.init_from_project()
 
         """
         self.init_from_project()
 
-        if self.normalization is None\
-           or self.transliteration is None\
-           or self.abbreviations is None:
+        if self.naming_rules is None:
             return "Configuration for tokenizer 'legacy_icu' are missing."
 
         return None
             return "Configuration for tokenizer 'legacy_icu' are missing."
 
         return None
@@ -126,26 +118,19 @@ class LegacyICUTokenizer:
 
             Analyzers are not thread-safe. You need to instantiate one per thread.
         """
 
             Analyzers are not thread-safe. You need to instantiate one per thread.
         """
-        norm = Transliterator.createFromRules("normalizer", self.normalization)
-        trans = Transliterator.createFromRules("trans", self.transliteration)
-        return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
+        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
 
 
 
 
-    def _install_php(self, config):
+    def _install_php(self, phpdir):
         """ Install the php script for the tokenizer.
         """
         """ Install the php script for the tokenizer.
         """
-        abbr_inverse = list(zip(*self.abbreviations))
         php_file = self.data_dir / "tokenizer.php"
         php_file = self.data_dir / "tokenizer.php"
-        php_file.write_text(dedent("""\
+        php_file.write_text(dedent(f"""\
             <?php
             <?php
-            @define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
-            @define('CONST_Term_Normalization_Rules', "{0.normalization}");
-            @define('CONST_Transliteration', "{0.transliteration}");
-            @define('CONST_Abbreviations', array(array('{2}'), array('{3}')));
-            require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
-            """.format(self, config,
-                       "','".join(abbr_inverse[0]),
-                       "','".join(abbr_inverse[1]))))
+            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
+            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
+            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
+            require_once('{phpdir}/tokenizer/legacy_icu_tokenizer.php');"""))
 
 
     def _save_config(self, config):
 
 
     def _save_config(self, config):
@@ -153,10 +138,10 @@ class LegacyICUTokenizer:
             database as database properties.
         """
         with connect(self.dsn) as conn:
             database as database properties.
         """
         with connect(self.dsn) as conn:
-            set_property(conn, DBCFG_NORMALIZATION, self.normalization)
+            self.naming_rules.save_rules(conn)
+
             set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
             set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
-            set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
-            set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))
+            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
 
 
     def _init_db_tables(self, config):
 
 
     def _init_db_tables(self, config):
@@ -165,61 +150,64 @@ class LegacyICUTokenizer:
         """
         with connect(self.dsn) as conn:
             sqlp = SQLPreprocessor(conn, config)
         """
         with connect(self.dsn) as conn:
             sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
             conn.commit()
 
             LOG.warning("Precomputing word tokens")
 
             # get partial words and their frequencies
             conn.commit()
 
             LOG.warning("Precomputing word tokens")
 
             # get partial words and their frequencies
-            words = Counter()
-            with self.name_analyzer() as analyzer:
-                with conn.cursor(name="words") as cur:
-                    cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
-
-                    for name, cnt in cur:
-                        term = analyzer.make_standard_word(name)
-                        if term:
-                            for word in term.split():
-                                words[word] += cnt
+            words = self._count_partial_terms(conn)
 
             # copy them back into the word table
 
             # copy them back into the word table
-            copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
-
+            with CopyBuffer() as copystr:
+                for term, cnt in words.items():
+                    copystr.add('w', term, json.dumps({'count': cnt}))
 
 
-            with conn.cursor() as cur:
-                copystr.seek(0)
-                cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
-                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-                               WHERE word_id is null""")
+                with conn.cursor() as cur:
+                    copystr.copy_out(cur, 'word',
+                                     columns=['type', 'word_token', 'info'])
+                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
+                                   WHERE word_id is null and type = 'w'""")
 
             conn.commit()
 
 
             conn.commit()
 
+    def _count_partial_terms(self, conn):
+        """ Count the partial terms from the names in the place table.
+        """
+        words = Counter()
+        name_proc = ICUNameProcessor(self.naming_rules)
+
+        with conn.cursor(name="words") as cur:
+            cur.execute(""" SELECT v, count(*) FROM
+                              (SELECT svals(name) as v FROM place)x
+                            WHERE length(v) < 75 GROUP BY v""")
+
+            for name, cnt in cur:
+                terms = set()
+                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
+                    if ' ' in word:
+                        terms.update(word.split())
+                for term in terms:
+                    words[term] += cnt
 
 
-class LegacyICUNameAnalyzer:
+        return words
+
+
+class LegacyICUNameAnalyzer(AbstractAnalyzer):
     """ The legacy analyzer uses the ICU library for splitting names.
 
         Each instance opens a connection to the database to request the
         normalization.
     """
 
     """ The legacy analyzer uses the ICU library for splitting names.
 
         Each instance opens a connection to the database to request the
         normalization.
     """
 
-    def __init__(self, dsn, normalizer, transliterator, abbreviations):
+    def __init__(self, dsn, name_proc):
         self.conn = connect(dsn).connection
         self.conn.autocommit = True
         self.conn = connect(dsn).connection
         self.conn.autocommit = True
-        self.normalizer = normalizer
-        self.transliterator = transliterator
-        self.abbreviations = abbreviations
+        self.name_processor = name_proc
 
         self._cache = _TokenCache()
 
 
 
         self._cache = _TokenCache()
 
 
-    def __enter__(self):
-        return self
-
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.close()
-
-
     def close(self):
         """ Free all resources used by the analyzer.
         """
     def close(self):
         """ Free all resources used by the analyzer.
         """
@@ -228,7 +216,7 @@ class LegacyICUNameAnalyzer:
             self.conn = None
 
 
             self.conn = None
 
 
-    def get_word_token_info(self, conn, words):
+    def get_word_token_info(self, words):
         """ Return token information for the given list of words.
             If a word starts with # it is assumed to be a full name
             otherwise is a partial name.
         """ Return token information for the given list of words.
             If a word starts with # it is assumed to be a full name
             otherwise is a partial name.
@@ -239,30 +227,28 @@ class LegacyICUNameAnalyzer:
             The function is used for testing and debugging only
             and not necessarily efficient.
         """
             The function is used for testing and debugging only
             and not necessarily efficient.
         """
-        tokens = {}
+        full_tokens = {}
+        partial_tokens = {}
         for word in words:
             if word.startswith('#'):
         for word in words:
             if word.startswith('#'):
-                tokens[word] = ' ' + self.make_standard_word(word[1:])
+                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
             else:
             else:
-                tokens[word] = self.make_standard_word(word)
+                partial_tokens[word] = self.name_processor.get_search_normalized(word)
 
 
-        with conn.cursor() as cur:
+        with self.conn.cursor() as cur:
             cur.execute("""SELECT word_token, word_id
             cur.execute("""SELECT word_token, word_id
-                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
-                           WHERE word_token = t.term
-                                 and class is null and country_code is null""",
-                        (list(tokens.values()), ))
-            ids = {r[0]: r[1] for r in cur}
+                            FROM word WHERE word_token = ANY(%s) and type = 'W'
+                        """, (list(full_tokens.values()),))
+            full_ids = {r[0]: r[1] for r in cur}
+            cur.execute("""SELECT word_token, word_id
+                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
+                        (list(partial_tokens.values()),))
+            part_ids = {r[0]: r[1] for r in cur}
 
 
-        return [(k, v, ids[v]) for k, v in tokens.items()]
+        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
+               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
 
 
 
 
-    def normalize(self, phrase):
-        """ Normalize the given phrase, i.e. remove all properties that
-            are irrelevant for search.
-        """
-        return self.normalizer.transliterate(phrase)
-
     @staticmethod
     def normalize_postcode(postcode):
         """ Convert the postcode to a standardized form.
     @staticmethod
     def normalize_postcode(postcode):
         """ Convert the postcode to a standardized form.
@@ -273,34 +259,18 @@ class LegacyICUNameAnalyzer:
         return postcode.strip().upper()
 
 
         return postcode.strip().upper()
 
 
-    @functools.lru_cache(maxsize=1024)
-    def make_standard_word(self, name):
-        """ Create the normalised version of the input.
-        """
-        norm = ' ' + self.transliterator.transliterate(name) + ' '
-        for full, abbr in self.abbreviations:
-            if full in norm:
-                norm = norm.replace(full, abbr)
-
-        return norm.strip()
-
-
     def _make_standard_hnr(self, hnr):
         """ Create a normalised version of a housenumber.
 
             This function takes minor shortcuts on transliteration.
         """
     def _make_standard_hnr(self, hnr):
         """ Create a normalised version of a housenumber.
 
             This function takes minor shortcuts on transliteration.
         """
-        if hnr.isdigit():
-            return hnr
-
-        return self.transliterator.transliterate(hnr)
+        return self.name_processor.get_search_normalized(hnr)
 
     def update_postcodes_from_db(self):
         """ Update postcode tokens in the word table from the location_postcode
             table.
         """
         to_delete = []
 
     def update_postcodes_from_db(self):
         """ Update postcode tokens in the word table from the location_postcode
             table.
         """
         to_delete = []
-        copystr = io.StringIO()
         with self.conn.cursor() as cur:
             # This finds us the rows in location_postcode and word that are
             # missing in the other table.
         with self.conn.cursor() as cur:
             # This finds us the rows in location_postcode and word that are
             # missing in the other table.
@@ -308,108 +278,120 @@ class LegacyICUNameAnalyzer:
                             (SELECT pc, word FROM
                               (SELECT distinct(postcode) as pc FROM location_postcode) p
                               FULL JOIN
                             (SELECT pc, word FROM
                               (SELECT distinct(postcode) as pc FROM location_postcode) p
                               FULL JOIN
-                              (SELECT word FROM word
-                                WHERE class ='place' and type = 'postcode') w
+                              (SELECT word FROM word WHERE type = 'P') w
                               ON pc = word) x
                            WHERE pc is null or word is null""")
 
                               ON pc = word) x
                            WHERE pc is null or word is null""")
 
-            for postcode, word in cur:
-                if postcode is None:
-                    to_delete.append(word)
-                else:
-                    copystr.write(postcode)
-                    copystr.write('\t ')
-                    copystr.write(self.transliterator.transliterate(postcode))
-                    copystr.write('\tplace\tpostcode\t0\n')
+            with CopyBuffer() as copystr:
+                for postcode, word in cur:
+                    if postcode is None:
+                        to_delete.append(word)
+                    else:
+                        copystr.add(self.name_processor.get_search_normalized(postcode),
+                                    'P', postcode)
 
 
-            if to_delete:
-                cur.execute("""DELETE FROM WORD
-                               WHERE class ='place' and type = 'postcode'
-                                     and word = any(%s)
-                            """, (to_delete, ))
+                if to_delete:
+                    cur.execute("""DELETE FROM WORD
+                                   WHERE type ='P' and word = any(%s)
+                                """, (to_delete, ))
 
 
-            if copystr.getvalue():
-                copystr.seek(0)
-                cur.copy_from(copystr, 'word',
-                              columns=['word', 'word_token', 'class', 'type',
-                                       'search_name_count'])
+                copystr.copy_out(cur, 'word',
+                                 columns=['word_token', 'type', 'word'])
 
 
     def update_special_phrases(self, phrases, should_replace):
         """ Replace the search index for special phrases with the new phrases.
 
 
     def update_special_phrases(self, phrases, should_replace):
         """ Replace the search index for special phrases with the new phrases.
+            If `should_replace` is True, then the previous set of phrases will be
+            completely replaced. Otherwise the phrases are added to the
+            already existing ones.
         """
         """
-        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                             for p in phrases))
 
         with self.conn.cursor() as cur:
             # Get the old phrases.
             existing_phrases = set()
                             for p in phrases))
 
         with self.conn.cursor() as cur:
             # Get the old phrases.
             existing_phrases = set()
-            cur.execute("""SELECT word, class, type, operator FROM word
-                           WHERE class != 'place'
-                                 OR (type != 'house' AND type != 'postcode')""")
-            for label, cls, typ, oper in cur:
-                existing_phrases.add((label, cls, typ, oper or '-'))
-
-            to_add = norm_phrases - existing_phrases
-            to_delete = existing_phrases - norm_phrases
-
-            if to_add:
-                copystr = io.StringIO()
-                for word, cls, typ, oper in to_add:
-                    term = self.make_standard_word(word)
-                    if term:
-                        copystr.write(word)
-                        copystr.write('\t ')
-                        copystr.write(term)
-                        copystr.write('\t')
-                        copystr.write(cls)
-                        copystr.write('\t')
-                        copystr.write(typ)
-                        copystr.write('\t')
-                        copystr.write(oper if oper in ('in', 'near')  else '\\N')
-                        copystr.write('\t0\n')
-
-                copystr.seek(0)
-                cur.copy_from(copystr, 'word',
-                              columns=['word', 'word_token', 'class', 'type',
-                                       'operator', 'search_name_count'])
-
-            if to_delete and should_replace:
-                psycopg2.extras.execute_values(
-                    cur,
-                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
-                        WHERE word = name and class = in_class and type = in_type
-                              and ((op = '-' and operator is null) or op = operator)""",
-                    to_delete)
+            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
+            for word, info in cur:
+                existing_phrases.add((word, info['class'], info['type'],
+                                      info.get('op') or '-'))
+
+            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
+            if should_replace:
+                deleted = self._remove_special_phrases(cur, norm_phrases,
+                                                       existing_phrases)
+            else:
+                deleted = 0
 
         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 
         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
-                 len(norm_phrases), len(to_add), len(to_delete))
+                 len(norm_phrases), added, deleted)
 
 
 
 
-    def add_country_names(self, country_code, names):
-        """ Add names for the given country to the search index.
+    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
+        """ Add all phrases to the database that are not yet there.
+        """
+        to_add = new_phrases - existing_phrases
+
+        added = 0
+        with CopyBuffer() as copystr:
+            for word, cls, typ, oper in to_add:
+                term = self.name_processor.get_search_normalized(word)
+                if term:
+                    copystr.add(term, 'S', word,
+                                json.dumps({'class': cls, 'type': typ,
+                                            'op': oper if oper in ('in', 'near') else None}))
+                    added += 1
+
+            copystr.copy_out(cursor, 'word',
+                             columns=['word_token', 'type', 'word', 'info'])
+
+        return added
+
+
+    @staticmethod
+    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
+        """ Remove all phrases from the database that are no longer in the
+            new phrase list.
         """
         """
-        full_names = set((self.make_standard_word(n) for n in names))
-        full_names.discard('')
-        self._add_normalized_country_names(country_code, full_names)
+        to_delete = existing_phrases - new_phrases
+
+        if to_delete:
+            cursor.execute_values(
+                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
+                    WHERE type = 'S' and word = name
+                          and info->>'class' = in_class and info->>'type' = in_type
+                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
+                """, to_delete)
+
+        return len(to_delete)
 
 
 
 
-    def _add_normalized_country_names(self, country_code, names):
+    def add_country_names(self, country_code, names):
         """ Add names for the given country to the search index.
         """
         """ Add names for the given country to the search index.
         """
-        word_tokens = set((' ' + name for name in names))
+        word_tokens = set()
+        for name in self._compute_full_names(names):
+            norm_name = self.name_processor.get_search_normalized(name)
+            if norm_name:
+                word_tokens.add(norm_name)
+
         with self.conn.cursor() as cur:
             # Get existing names
         with self.conn.cursor() as cur:
             # Get existing names
-            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
+            cur.execute("""SELECT word_token FROM word
+                            WHERE type = 'C' and word = %s""",
                         (country_code, ))
             word_tokens.difference_update((t[0] for t in cur))
 
                         (country_code, ))
             word_tokens.difference_update((t[0] for t in cur))
 
+            # Only add those names that are not yet in the list.
             if word_tokens:
             if word_tokens:
-                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
-                                                 search_name_count)
-                               (SELECT nextval('seq_word'), token, '{}', 0
+                cur.execute("""INSERT INTO word (word_token, type, word)
+                               (SELECT token, 'C', %s
                                 FROM unnest(%s) as token)
                                 FROM unnest(%s) as token)
-                            """.format(country_code), (list(word_tokens),))
+                            """, (country_code, list(word_tokens)))
+
+            # No names are deleted at the moment.
+            # If deletion is made possible, then the static names from the
+            # initial 'country_name' table should be kept.
 
 
     def process_place(self, place):
 
 
     def process_place(self, place):
@@ -423,58 +405,87 @@ class LegacyICUNameAnalyzer:
         names = place.get('name')
 
         if names:
         names = place.get('name')
 
         if names:
-            full_names = self._compute_full_names(names)
+            fulls, partials = self._compute_name_tokens(names)
 
 
-            token_info.add_names(self.conn, full_names)
+            token_info.add_names(fulls, partials)
 
             country_feature = place.get('country_feature')
             if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
 
             country_feature = place.get('country_feature')
             if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                self._add_normalized_country_names(country_feature.lower(),
-                                                   full_names)
+                self.add_country_names(country_feature.lower(), names)
 
         address = place.get('address')
 
         address = place.get('address')
-
         if address:
         if address:
-            hnrs = []
-            addr_terms = []
-            for key, value in address.items():
-                if key == 'postcode':
-                    self._add_postcode(value)
-                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                    hnrs.append(value)
-                elif key == 'street':
-                    token_info.add_street(self.conn, self.make_standard_word(value))
-                elif key == 'place':
-                    token_info.add_place(self.conn, self.make_standard_word(value))
-                elif not key.startswith('_') and \
-                     key not in ('country', 'full'):
-                    addr_terms.append((key, self.make_standard_word(value)))
-
-            if hnrs:
-                hnrs = self._split_housenumbers(hnrs)
-                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
-
-            if addr_terms:
-                token_info.add_address_terms(self.conn, addr_terms)
+            self._process_place_address(token_info, address)
 
         return token_info.data
 
 
 
         return token_info.data
 
 
-    def _compute_full_names(self, names):
+    def _process_place_address(self, token_info, address):
+        hnrs = []
+        addr_terms = []
+        for key, value in address.items():
+            if key == 'postcode':
+                self._add_postcode(value)
+            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                hnrs.append(value)
+            elif key == 'street':
+                token_info.add_street(*self._compute_name_tokens({'name': value}))
+            elif key == 'place':
+                token_info.add_place(*self._compute_name_tokens({'name': value}))
+            elif not key.startswith('_') and \
+                 key not in ('country', 'full'):
+                addr_terms.append((key, *self._compute_name_tokens({'name': value})))
+
+        if hnrs:
+            hnrs = self._split_housenumbers(hnrs)
+            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
+
+        if addr_terms:
+            token_info.add_address_terms(addr_terms)
+
+
+    def _compute_name_tokens(self, names):
+        """ Computes the full name and partial name tokens for the given
+            dictionary of names.
+        """
+        full_names = self._compute_full_names(names)
+        full_tokens = set()
+        partial_tokens = set()
+
+        for name in full_names:
+            norm_name = self.name_processor.get_normalized(name)
+            full, part = self._cache.names.get(norm_name, (None, None))
+            if full is None:
+                variants = self.name_processor.get_variants_ascii(norm_name)
+                if not variants:
+                    continue
+
+                with self.conn.cursor() as cur:
+                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
+                                (norm_name, variants))
+                    full, part = cur.fetchone()
+
+                self._cache.names[norm_name] = (full, part)
+
+            full_tokens.add(full)
+            partial_tokens.update(part)
+
+        return full_tokens, partial_tokens
+
+
+    @staticmethod
+    def _compute_full_names(names):
         """ Return the set of all full name word ids to be used with the
             given dictionary of names.
         """
         full_names = set()
         """ Return the set of all full name word ids to be used with the
             given dictionary of names.
         """
         full_names = set()
-        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
-            word = self.make_standard_word(name)
-            if word:
-                full_names.add(word)
+        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
+            if name:
+                full_names.add(name)
 
 
-                brace_split = name.split('(', 2)
-                if len(brace_split) > 1:
-                    word = self.make_standard_word(brace_split[0])
-                    if word:
-                        full_names.add(word)
+                brace_idx = name.find('(')
+                if brace_idx >= 0:
+                    full_names.add(name[:brace_idx].strip())
 
         return full_names
 
 
         return full_names
 
@@ -486,22 +497,21 @@ class LegacyICUNameAnalyzer:
             postcode = self.normalize_postcode(postcode)
 
             if postcode not in self._cache.postcodes:
             postcode = self.normalize_postcode(postcode)
 
             if postcode not in self._cache.postcodes:
-                term = self.make_standard_word(postcode)
+                term = self.name_processor.get_search_normalized(postcode)
                 if not term:
                     return
 
                 with self.conn.cursor() as cur:
                     # no word_id needed for postcodes
                 if not term:
                     return
 
                 with self.conn.cursor() as cur:
                     # no word_id needed for postcodes
-                    cur.execute("""INSERT INTO word (word, word_token, class, type,
-                                                     search_name_count)
-                                   (SELECT pc, %s, 'place', 'postcode', 0
-                                    FROM (VALUES (%s)) as v(pc)
+                    cur.execute("""INSERT INTO word (word_token, type, word)
+                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                     WHERE NOT EXISTS
                                      (SELECT * FROM word
                                     WHERE NOT EXISTS
                                      (SELECT * FROM word
-                                      WHERE word = pc and class='place' and type='postcode'))
-                                """, (' ' + term, postcode))
+                                      WHERE type = 'P' and word = pc))
+                                """, (term, postcode))
                 self._cache.postcodes.add(postcode)
 
                 self._cache.postcodes.add(postcode)
 
+
     @staticmethod
     def _split_housenumbers(hnrs):
         if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
     @staticmethod
     def _split_housenumbers(hnrs):
         if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
@@ -524,7 +534,7 @@ class _TokenInfo:
     """ Collect token information to be sent back to the database.
     """
     def __init__(self, cache):
     """ Collect token information to be sent back to the database.
     """
     def __init__(self, cache):
-        self.cache = cache
+        self._cache = cache
         self.data = {}
 
     @staticmethod
         self.data = {}
 
     @staticmethod
@@ -532,86 +542,44 @@ class _TokenInfo:
         return '{%s}' % ','.join((str(s) for s in tokens))
 
 
         return '{%s}' % ','.join((str(s) for s in tokens))
 
 
-    def add_names(self, conn, names):
+    def add_names(self, fulls, partials):
         """ Adds token information for the normalised names.
         """
         """ Adds token information for the normalised names.
         """
-        # Start with all partial names
-        terms = set((part for ns in names for part in ns.split()))
-        # Add the full names
-        terms.update((' ' + n for n in names))
-
-        self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))
+        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
 
 
     def add_housenumbers(self, conn, hnrs):
         """ Extract housenumber information from a list of normalised
             housenumbers.
         """
 
 
     def add_housenumbers(self, conn, hnrs):
         """ Extract housenumber information from a list of normalised
             housenumbers.
         """
-        self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
+        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
         self.data['hnr'] = ';'.join(hnrs)
 
 
         self.data['hnr'] = ';'.join(hnrs)
 
 
-    def add_street(self, conn, street):
+    def add_street(self, fulls, _):
         """ Add addr:street match terms.
         """
         """ Add addr:street match terms.
         """
-        if not street:
-            return
-
-        term = ' ' + street
-
-        tid = self.cache.names.get(term)
-
-        if tid is None:
-            with conn.cursor() as cur:
-                cur.execute("""SELECT word_id FROM word
-                                WHERE word_token = %s
-                                      and class is null and type is null""",
-                            (term, ))
-                if cur.rowcount > 0:
-                    tid = cur.fetchone()[0]
-                    self.cache.names[term] = tid
-
-        if tid is not None:
-            self.data['street'] = '{%d}' % tid
+        if fulls:
+            self.data['street'] = self._mk_array(fulls)
 
 
 
 
-    def add_place(self, conn, place):
+    def add_place(self, fulls, partials):
         """ Add addr:place search and match terms.
         """
         """ Add addr:place search and match terms.
         """
-        if not place:
-            return
-
-        partial_ids = self.cache.get_term_tokens(conn, place.split())
-        tid = self.cache.get_term_tokens(conn, [' ' + place])
+        if fulls:
+            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
+            self.data['place_match'] = self._mk_array(fulls)
 
 
-        self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
-        self.data['place_match'] = '{%s}' % tid[0]
 
 
-
-    def add_address_terms(self, conn, terms):
+    def add_address_terms(self, terms):
         """ Add additional address terms.
         """
         tokens = {}
 
         """ Add additional address terms.
         """
         tokens = {}
 
-        for key, value in terms:
-            if not value:
-                continue
-            partial_ids = self.cache.get_term_tokens(conn, value.split())
-            term = ' ' + value
-            tid = self.cache.names.get(term)
-
-            if tid is None:
-                with conn.cursor() as cur:
-                    cur.execute("""SELECT word_id FROM word
-                                    WHERE word_token = %s
-                                          and class is null and type is null""",
-                                (term, ))
-                    if cur.rowcount > 0:
-                        tid = cur.fetchone()[0]
-                        self.cache.names[term] = tid
-
-            tokens[key] = [self._mk_array(partial_ids),
-                           '{%s}' % ('' if tid is None else str(tid))]
+        for key, fulls, partials in terms:
+            if fulls:
+                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
+                               self._mk_array(fulls)]
 
         if tokens:
             self.data['addr'] = tokens
 
         if tokens:
             self.data['addr'] = tokens
@@ -629,35 +597,10 @@ class _TokenCache:
         self.housenumbers = {}
 
 
         self.housenumbers = {}
 
 
-    def get_term_tokens(self, conn, terms):
-        """ Get token ids for a list of terms, looking them up in the database
-            if necessary.
-        """
-        tokens = []
-        askdb = []
-
-        for term in terms:
-            token = self.names.get(term)
-            if token is None:
-                askdb.append(term)
-            elif token != 0:
-                tokens.append(token)
-
-        if askdb:
-            with conn.cursor() as cur:
-                cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
-                            (askdb, ))
-                for term, tid in cur:
-                    self.names[term] = tid
-                    if tid != 0:
-                        tokens.append(tid)
-
-        return tokens
-
-
     def get_hnr_tokens(self, conn, terms):
         """ Get token ids for a list of housenumbers, looking them up in the
     def get_hnr_tokens(self, conn, terms):
         """ Get token ids for a list of housenumbers, looking them up in the
-            database if necessary.
+            database if necessary. `terms` is an iterable of normalized
+            housenumbers.
         """
         tokens = []
         askdb = []
         """
         tokens = []
         askdb = []