git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/legacy_icu_tokenizer.py
switch word tokens to new word table layout
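The word table moves from the old word / word_token / class / type / operator / search_name_count columns to a layout keyed by a one-letter type code plus a JSON 'info' column. Going only by how this patch writes and queries the rows (the table itself is created by tokenizer/icu_tokenizer_tables.sql, which is not part of this diff), the row types are roughly:

    'w'  partial word token   info = {"count": <term frequency>}
    'W'  full word token      ids assigned via the SQL function getorcreate_full_word()
    'P'  postcode             info = {"postcode": <normalized postcode>}
    'S'  special phrase       info = {"word": ..., "class": ..., "type": ..., "op": ...}
    'C'  country name         info = {"cc": <country code>}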
[nominatim.git] / nominatim / tokenizer / legacy_icu_tokenizer.py
index 065fdb03a27041eb79bb435db387fdfd316d6801..14fa5b609456c51ee4a7f9a35f6e5bf9908636c7 100644 (file)
@@ -3,26 +3,21 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
 from collections import Counter
-import functools
-import io
 import itertools
-import json
 import logging
 import re
 from textwrap import dedent
 from pathlib import Path
 
-from icu import Transliterator
-import psycopg2.extras
-
 from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
+from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
+from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
 
-DBCFG_NORMALIZATION = "tokenizer_normalization"
 DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
-DBCFG_TRANSLITERATION = "tokenizer_transliteration"
-DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"
+DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
 
 LOG = logging.getLogger()
 
@@ -41,9 +36,9 @@ class LegacyICUTokenizer:
     def __init__(self, dsn, data_dir):
         self.dsn = dsn
         self.data_dir = data_dir
-        self.normalization = None
-        self.transliteration = None
-        self.abbreviations = None
+        self.naming_rules = None
+        self.term_normalization = None
+        self.max_word_frequency = None
 
 
     def init_new_db(self, config, init_db=True):
@@ -55,14 +50,14 @@ class LegacyICUTokenizer:
         if config.TOKENIZER_CONFIG:
             cfgfile = Path(config.TOKENIZER_CONFIG)
         else:
-            cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
+            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
 
-        rules = json.loads(cfgfile.read_text())
-        self.transliteration = ';'.join(rules['normalization']) + ';'
-        self.abbreviations = rules["abbreviations"]
-        self.normalization = config.TERM_NORMALIZATION
+        loader = ICURuleLoader(cfgfile)
+        self.naming_rules = ICUNameProcessorRules(loader=loader)
+        self.term_normalization = config.TERM_NORMALIZATION
+        self.max_word_frequency = config.MAX_WORD_FREQUENCY
 
-        self._install_php(config)
+        self._install_php(config.lib_dir.php)
         self._save_config(config)
 
         if init_db:
@@ -74,18 +69,16 @@ class LegacyICUTokenizer:
         """ Initialise the tokenizer from the project directory.
         """
         with connect(self.dsn) as conn:
-            self.normalization = get_property(conn, DBCFG_NORMALIZATION)
-            self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
-            self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))
+            self.naming_rules = ICUNameProcessorRules(conn=conn)
+            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
+            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
 
 
-    def finalize_import(self, config):
+    def finalize_import(self, _):
         """ Do any required postprocessing to make the tokenizer data ready
             for use.
         """
-        with connect(self.dsn) as conn:
-            sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
+        pass
 
 
     def update_sql_functions(self, config):
@@ -103,9 +96,7 @@ class LegacyICUTokenizer:
         """
         self.init_from_project()
 
-        if self.normalization is None\
-           or self.transliteration is None\
-           or self.abbreviations is None:
+        if self.naming_rules is None:
             return "Configuration for tokenizer 'legacy_icu' are missing."
 
         return None
@@ -126,26 +117,19 @@ class LegacyICUTokenizer:
 
             Analyzers are not thread-safe. You need to instantiate one per thread.
         """
-        norm = Transliterator.createFromRules("normalizer", self.normalization)
-        trans = Transliterator.createFromRules("trans", self.transliteration)
-        return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
+        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
 
 
-    def _install_php(self, config):
+    def _install_php(self, phpdir):
         """ Install the php script for the tokenizer.
         """
-        abbr_inverse = list(zip(*self.abbreviations))
         php_file = self.data_dir / "tokenizer.php"
-        php_file.write_text(dedent("""\
+        php_file.write_text(dedent(f"""\
             <?php
-            @define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
-            @define('CONST_Term_Normalization_Rules', "{0.normalization}");
-            @define('CONST_Transliteration', "{0.transliteration}");
-            @define('CONST_Abbreviations', array(array('{2}'), array('{3}')));
-            require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
-            """.format(self, config,
-                       "','".join(abbr_inverse[0]),
-                       "','".join(abbr_inverse[1]))))
+            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
+            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
+            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
+            require_once('{phpdir}/tokenizer/legacy_icu_tokenizer.php');"""))
 
 
     def _save_config(self, config):
@@ -153,10 +137,10 @@ class LegacyICUTokenizer:
             database as database properties.
         """
         with connect(self.dsn) as conn:
-            set_property(conn, DBCFG_NORMALIZATION, self.normalization)
+            self.naming_rules.save_rules(conn)
+
             set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
-            set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
-            set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))
+            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
 
 
     def _init_db_tables(self, config):
@@ -165,32 +149,37 @@ class LegacyICUTokenizer:
         """
         with connect(self.dsn) as conn:
             sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
             conn.commit()
 
             LOG.warning("Precomputing word tokens")
 
             # get partial words and their frequencies
             words = Counter()
-            with self.name_analyzer() as analyzer:
-                with conn.cursor(name="words") as cur:
-                    cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
-
-                    for name, cnt in cur:
-                        term = analyzer.make_standard_word(name)
-                        if term:
-                            for word in term.split():
-                                words[word] += cnt
+            name_proc = ICUNameProcessor(self.naming_rules)
+            with conn.cursor(name="words") as cur:
+                cur.execute(""" SELECT v, count(*) FROM
+                                  (SELECT svals(name) as v FROM place)x
+                                WHERE length(v) < 75 GROUP BY v""")
+
+                for name, cnt in cur:
+                    terms = set()
+                    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
+                        if ' ' in word:
+                            terms.update(word.split())
+                    for term in terms:
+                        words[term] += cnt
 
             # copy them back into the word table
-            copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
+            with CopyBuffer() as copystr:
+                for k, v in words.items():
+                    copystr.add('w', k, {'count': v})
 
-
-            with conn.cursor() as cur:
-                copystr.seek(0)
-                cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
-                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-                               WHERE word_id is null""")
+                with conn.cursor() as cur:
+                    copystr.copy_out(cur, 'word',
+                                     columns=['type', 'word_token', 'info'])
+                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
+                                   WHERE word_id is null and type = 'w'""")
 
             conn.commit()
 
@@ -202,12 +191,10 @@ class LegacyICUNameAnalyzer:
         normalization.
     """
 
-    def __init__(self, dsn, normalizer, transliterator, abbreviations):
+    def __init__(self, dsn, name_proc):
         self.conn = connect(dsn).connection
         self.conn.autocommit = True
-        self.normalizer = normalizer
-        self.transliterator = transliterator
-        self.abbreviations = abbreviations
+        self.name_processor = name_proc
 
         self._cache = _TokenCache()
 
@@ -228,7 +215,7 @@ class LegacyICUNameAnalyzer:
             self.conn = None
 
 
-    def get_word_token_info(self, conn, words):
+    def get_word_token_info(self, words):
         """ Return token information for the given list of words.
             If a word starts with # it is assumed to be a full name
             otherwise is a partial name.
@@ -239,40 +226,36 @@ class LegacyICUNameAnalyzer:
             The function is used for testing and debugging only
             and not necessarily efficient.
         """
-        tokens = {}
+        full_tokens = {}
+        partial_tokens = {}
         for word in words:
             if word.startswith('#'):
-                tokens[word] = ' ' + self.make_standard_word(word[1:])
+                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
             else:
-                tokens[word] = self.make_standard_word(word)
-
-        with conn.cursor() as cur:
-            cur.execute("""SELECT word_token, word_id
-                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
-                           WHERE word_token = t.term
-                                 and class is null and country_code is null""",
-                        (list(tokens.values()), ))
+                partial_tokens[word] = self.name_processor.get_search_normalized(word)
+
+        with self.conn.cursor() as cur:
+            cur.execute("""(SELECT word_token, word_id
+                            FROM word WHERE word_token = ANY(%s) and type = 'W')
+                           UNION
+                           (SELECT word_token, word_id
+                            FROM word WHERE word_token = ANY(%s) and type = 'w')""",
+                        (list(full_tokens.values()),
+                         list(partial_tokens.values())))
             ids = {r[0]: r[1] for r in cur}
 
-        return [(k, v, ids[v]) for k, v in tokens.items()]
+        return [(k, v, ids.get(v, None)) for k, v in full_tokens.items()] \
+               + [(k, v, ids.get(v, None)) for k, v in partial_tokens.items()]
 
 
-    def normalize(self, phrase):
-        """ Normalize the given phrase, i.e. remove all properties that
-            are irrelevant for search.
-        """
-        return self.normalizer.transliterate(phrase)
+    @staticmethod
+    def normalize_postcode(postcode):
+        """ Convert the postcode to a standardized form.
 
-    @functools.lru_cache(maxsize=1024)
-    def make_standard_word(self, name):
-        """ Create the normalised version of the input.
+            This function must yield exactly the same result as the SQL function
+            'token_normalized_postcode()'.
         """
-        norm = ' ' + self.transliterator.transliterate(name) + ' '
-        for full, abbr in self.abbreviations:
-            if full in norm:
-                norm = norm.replace(full, abbr)
-
-        return norm.strip()
+        return postcode.strip().upper()
 
 
     def _make_standard_hnr(self, hnr):
@@ -280,107 +263,134 @@ class LegacyICUNameAnalyzer:
 
             This function takes minor shortcuts on transliteration.
         """
-        if hnr.isdigit():
-            return hnr
-
-        return self.transliterator.transliterate(hnr)
+        return self.name_processor.get_search_normalized(hnr)
 
-    def add_postcodes_from_db(self):
-        """ Add postcodes from the location_postcode table to the word table.
+    def update_postcodes_from_db(self):
+        """ Update postcode tokens in the word table from the location_postcode
+            table.
         """
-        copystr = io.StringIO()
+        to_delete = []
         with self.conn.cursor() as cur:
-            cur.execute("SELECT distinct(postcode) FROM location_postcode")
-            for (postcode, ) in cur:
-                copystr.write(postcode)
-                copystr.write('\t ')
-                copystr.write(self.transliterator.transliterate(postcode))
-                copystr.write('\tplace\tpostcode\t0\n')
-
-            copystr.seek(0)
-            cur.copy_from(copystr, 'word',
-                          columns=['word', 'word_token', 'class', 'type',
-                                   'search_name_count'])
-            # Don't really need an ID for postcodes....
-            # cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-            #                WHERE word_id is null and type = 'postcode'""")
-
-
-    def update_special_phrases(self, phrases):
+            # This finds us the rows in location_postcode and word that are
+            # missing in the other table.
+            cur.execute("""SELECT * FROM
+                            (SELECT pc, word FROM
+                              (SELECT distinct(postcode) as pc FROM location_postcode) p
+                              FULL JOIN
+                              (SELECT info->>'postcode' as word FROM word WHERE type = 'P') w
+                              ON pc = word) x
+                           WHERE pc is null or word is null""")
+
+            with CopyBuffer() as copystr:
+                for postcode, word in cur:
+                    if postcode is None:
+                        to_delete.append(word)
+                    else:
+                        copystr.add(self.name_processor.get_search_normalized(postcode),
+                                    'P', {'postcode': postcode})
+
+                if to_delete:
+                    cur.execute("""DELETE FROM WORD
+                                   WHERE type ='P' and info->>'postcode' = any(%s)
+                                """, (to_delete, ))
+
+                copystr.copy_out(cur, 'word',
+                                 columns=['word_token', 'type', 'info'])
+
+
+    def update_special_phrases(self, phrases, should_replace):
         """ Replace the search index for special phrases with the new phrases.
+            If `should_replace` is True, then the previous set of phrases will be
+            completely replaced. Otherwise the phrases are added to the
+            already existing ones.
         """
-        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                             for p in phrases))
 
         with self.conn.cursor() as cur:
             # Get the old phrases.
             existing_phrases = set()
-            cur.execute("""SELECT word, class, type, operator FROM word
-                           WHERE class != 'place'
-                                 OR (type != 'house' AND type != 'postcode')""")
-            for label, cls, typ, oper in cur:
-                existing_phrases.add((label, cls, typ, oper or '-'))
-
-            to_add = norm_phrases - existing_phrases
-            to_delete = existing_phrases - norm_phrases
-
-            if to_add:
-                copystr = io.StringIO()
-                for word, cls, typ, oper in to_add:
-                    term = self.make_standard_word(word)
-                    if term:
-                        copystr.write(word)
-                        copystr.write('\t ')
-                        copystr.write(term)
-                        copystr.write('\t')
-                        copystr.write(cls)
-                        copystr.write('\t')
-                        copystr.write(typ)
-                        copystr.write('\t')
-                        copystr.write(oper if oper in ('in', 'near')  else '\\N')
-                        copystr.write('\t0\n')
-
-                copystr.seek(0)
-                cur.copy_from(copystr, 'word',
-                              columns=['word', 'word_token', 'class', 'type',
-                                       'operator', 'search_name_count'])
-
-            if to_delete:
-                psycopg2.extras.execute_values(
-                    cur,
-                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
-                        WHERE word = name and class = in_class and type = in_type
-                              and ((op = '-' and operator is null) or op = operator)""",
-                    to_delete)
+            cur.execute("SELECT info FROM word WHERE type = 'S'")
+            for (info, ) in cur:
+                existing_phrases.add((info['word'], info['class'], info['type'],
+                                      info.get('op') or '-'))
+
+            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
+            if should_replace:
+                deleted = self._remove_special_phrases(cur, norm_phrases,
+                                                       existing_phrases)
+            else:
+                deleted = 0
 
         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
-                 len(norm_phrases), len(to_add), len(to_delete))
+                 len(norm_phrases), added, deleted)
 
 
-    def add_country_names(self, country_code, names):
-        """ Add names for the given country to the search index.
+    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
+        """ Add all phrases to the database that are not yet there.
+        """
+        to_add = new_phrases - existing_phrases
+
+        added = 0
+        with CopyBuffer() as copystr:
+            for word, cls, typ, oper in to_add:
+                term = self.name_processor.get_search_normalized(word)
+                if term:
+                    copystr.add(term, 'S',
+                                {'word': word, 'class': cls, 'type': typ,
+                                 'op': oper if oper in ('in', 'near') else None})
+                    added += 1
+
+            copystr.copy_out(cursor, 'word',
+                             columns=['word_token', 'type', 'info'])
+
+        return added
+
+
+    @staticmethod
+    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
+        """ Remove all phrases from the database that are no longer in the
+            new phrase list.
         """
-        full_names = set((self.make_standard_word(n) for n in names))
-        full_names.discard('')
-        self._add_normalized_country_names(country_code, full_names)
+        to_delete = existing_phrases - new_phrases
+
+        if to_delete:
+            cursor.execute_values(
+                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
+                    WHERE info->>'word' = name
+                          and info->>'class' = in_class and info->>'type' = in_type
+                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
+                """, to_delete)
+
+        return len(to_delete)
 
 
-    def _add_normalized_country_names(self, country_code, names):
+    def add_country_names(self, country_code, names):
         """ Add names for the given country to the search index.
         """
-        word_tokens = set((' ' + name for name in names))
+        word_tokens = set()
+        for name in self._compute_full_names(names):
+            norm_name = self.name_processor.get_search_normalized(name)
+            if norm_name:
+                word_tokens.add(norm_name)
+
         with self.conn.cursor() as cur:
             # Get existing names
-            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
+            cur.execute("""SELECT word_token FROM word
+                            WHERE type = 'C' and info->>'cc'= %s""",
                         (country_code, ))
             word_tokens.difference_update((t[0] for t in cur))
 
+            # Only add those names that are not yet in the list.
             if word_tokens:
-                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
-                                                 search_name_count)
-                               (SELECT nextval('seq_word'), token, '{}', 0
+                cur.execute("""INSERT INTO word (word_token, type, info)
+                               (SELECT token, 'C', json_build_object('cc', %s)
                                 FROM unnest(%s) as token)
-                            """.format(country_code), (list(word_tokens),))
+                            """, (country_code, list(word_tokens)))
+
+            # No names are deleted at the moment.
+            # If deletion is made possible, then the static names from the
+            # initial 'country_name' table should be kept.
 
 
     def process_place(self, place):
@@ -394,63 +404,113 @@ class LegacyICUNameAnalyzer:
         names = place.get('name')
 
         if names:
-            full_names = set((self.make_standard_word(name) for name in names.values()))
-            full_names.discard('')
+            fulls, partials = self._compute_name_tokens(names)
 
-            token_info.add_names(self.conn, full_names)
+            token_info.add_names(fulls, partials)
 
             country_feature = place.get('country_feature')
             if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                self._add_normalized_country_names(country_feature.lower(),
-                                                   full_names)
+                self.add_country_names(country_feature.lower(), names)
 
         address = place.get('address')
-
         if address:
-            hnrs = []
-            addr_terms = []
-            for key, value in address.items():
-                if key == 'postcode':
-                    self._add_postcode(value)
-                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                    hnrs.append(value)
-                elif key == 'street':
-                    token_info.add_street(self.conn, self.make_standard_word(value))
-                elif key == 'place':
-                    token_info.add_place(self.conn, self.make_standard_word(value))
-                elif not key.startswith('_') and \
-                     key not in ('country', 'full'):
-                    addr_terms.append((key, self.make_standard_word(value)))
-
-            if hnrs:
-                hnrs = self._split_housenumbers(hnrs)
-                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
-
-            if addr_terms:
-                token_info.add_address_terms(self.conn, addr_terms)
+            self._process_place_address(token_info, address)
 
         return token_info.data
 
 
+    def _process_place_address(self, token_info, address):
+        hnrs = []
+        addr_terms = []
+        for key, value in address.items():
+            if key == 'postcode':
+                self._add_postcode(value)
+            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                hnrs.append(value)
+            elif key == 'street':
+                token_info.add_street(*self._compute_name_tokens({'name': value}))
+            elif key == 'place':
+                token_info.add_place(*self._compute_name_tokens({'name': value}))
+            elif not key.startswith('_') and \
+                 key not in ('country', 'full'):
+                addr_terms.append((key, *self._compute_name_tokens({'name': value})))
+
+        if hnrs:
+            hnrs = self._split_housenumbers(hnrs)
+            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
+
+        if addr_terms:
+            token_info.add_address_terms(addr_terms)
+
+
+    def _compute_name_tokens(self, names):
+        """ Computes the full name and partial name tokens for the given
+            dictionary of names.
+        """
+        full_names = self._compute_full_names(names)
+        full_tokens = set()
+        partial_tokens = set()
+
+        for name in full_names:
+            norm_name = self.name_processor.get_normalized(name)
+            full, part = self._cache.names.get(norm_name, (None, None))
+            if full is None:
+                variants = self.name_processor.get_variants_ascii(norm_name)
+                if not variants:
+                    continue
+
+                with self.conn.cursor() as cur:
+                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
+                                (norm_name, variants))
+                    full, part = cur.fetchone()
+
+                self._cache.names[norm_name] = (full, part)
+
+            full_tokens.add(full)
+            partial_tokens.update(part)
+
+        return full_tokens, partial_tokens
+
+
+    @staticmethod
+    def _compute_full_names(names):
+        """ Return the set of all full names to be used with the
+            given dictionary of names.
+        """
+        full_names = set()
+        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
+            if name:
+                full_names.add(name)
+
+                brace_idx = name.find('(')
+                if brace_idx >= 0:
+                    full_names.add(name[:brace_idx].strip())
+
+        return full_names
+
+
     def _add_postcode(self, postcode):
         """ Make sure the normalized postcode is present in the word table.
         """
-        if re.search(r'[:,;]', postcode) is None and not postcode in self._cache.postcodes:
-            term = self.make_standard_word(postcode)
-            if not term:
-                return
-
-            with self.conn.cursor() as cur:
-                # no word_id needed for postcodes
-                cur.execute("""INSERT INTO word (word, word_token, class, type,
-                                                 search_name_count)
-                               (SELECT pc, %s, 'place', 'postcode', 0
-                                FROM (VALUES (%s)) as v(pc)
-                                WHERE NOT EXISTS
-                                 (SELECT * FROM word
-                                  WHERE word = pc and class='place' and type='postcode'))
-                            """, (' ' + term, postcode))
-            self._cache.postcodes.add(postcode)
+        if re.search(r'[:,;]', postcode) is None:
+            postcode = self.normalize_postcode(postcode)
+
+            if postcode not in self._cache.postcodes:
+                term = self.name_processor.get_search_normalized(postcode)
+                if not term:
+                    return
+
+                with self.conn.cursor() as cur:
+                    # no word_id needed for postcodes
+                    cur.execute("""INSERT INTO word (word_token, type, info)
+                                   (SELECT %s, 'P', json_build_object('postcode', pc)
+                                    FROM (VALUES (%s)) as v(pc)
+                                    WHERE NOT EXISTS
+                                     (SELECT * FROM word
+                                      WHERE type = 'P' and info->>'postcode' = pc))
+                                """, (term, postcode))
+                self._cache.postcodes.add(postcode)
+
 
     @staticmethod
     def _split_housenumbers(hnrs):
@@ -474,7 +534,7 @@ class _TokenInfo:
     """ Collect token information to be sent back to the database.
     """
     def __init__(self, cache):
-        self.cache = cache
+        self._cache = cache
         self.data = {}
 
     @staticmethod
@@ -482,88 +542,44 @@ class _TokenInfo:
         return '{%s}' % ','.join((str(s) for s in tokens))
 
 
-    def add_names(self, conn, names):
+    def add_names(self, fulls, partials):
         """ Adds token information for the normalised names.
         """
-        # Start with all partial names
-        terms = set((part for ns in names for part in ns.split()))
-        # Add partials for the full terms (TO BE REMOVED)
-        terms.update((n for n in names))
-        # Add the full names
-        terms.update((' ' + n for n in names))
-
-        self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))
+        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
 
 
     def add_housenumbers(self, conn, hnrs):
         """ Extract housenumber information from a list of normalised
             housenumbers.
         """
-        self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
+        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
         self.data['hnr'] = ';'.join(hnrs)
 
 
-    def add_street(self, conn, street):
+    def add_street(self, fulls, _):
         """ Add addr:street match terms.
         """
-        if not street:
-            return
-
-        term = ' ' + street
+        if fulls:
+            self.data['street'] = self._mk_array(fulls)
 
-        tid = self.cache.names.get(term)
 
-        if tid is None:
-            with conn.cursor() as cur:
-                cur.execute("""SELECT word_id FROM word
-                                WHERE word_token = %s
-                                      and class is null and type is null""",
-                            (term, ))
-                if cur.rowcount > 0:
-                    tid = cur.fetchone()[0]
-                    self.cache.names[term] = tid
-
-        if tid is not None:
-            self.data['street'] = '{%d}' % tid
-
-
-    def add_place(self, conn, place):
+    def add_place(self, fulls, partials):
         """ Add addr:place search and match terms.
         """
-        if not place:
-            return
-
-        partial_ids = self.cache.get_term_tokens(conn, place.split())
-        tid = self.cache.get_term_tokens(conn, [' ' + place])
+        if fulls:
+            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
+            self.data['place_match'] = self._mk_array(fulls)
 
-        self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
-        self.data['place_match'] = '{%s}' % tid[0]
 
-
-    def add_address_terms(self, conn, terms):
+    def add_address_terms(self, terms):
         """ Add additional address terms.
         """
         tokens = {}
 
-        for key, value in terms:
-            if not value:
-                continue
-            partial_ids = self.cache.get_term_tokens(conn, value.split())
-            term = ' ' + value
-            tid = self.cache.names.get(term)
-
-            if tid is None:
-                with conn.cursor() as cur:
-                    cur.execute("""SELECT word_id FROM word
-                                    WHERE word_token = %s
-                                          and class is null and type is null""",
-                                (term, ))
-                    if cur.rowcount > 0:
-                        tid = cur.fetchone()[0]
-                        self.cache.names[term] = tid
-
-            tokens[key] = [self._mk_array(partial_ids),
-                           '{%s}' % ('' if tid is None else str(tid))]
+        for key, fulls, partials in terms:
+            if fulls:
+                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
+                               self._mk_array(fulls)]
 
         if tokens:
             self.data['addr'] = tokens
@@ -581,35 +597,10 @@ class _TokenCache:
         self.housenumbers = {}
 
 
-    def get_term_tokens(self, conn, terms):
-        """ Get token ids for a list of terms, looking them up in the database
-            if necessary.
-        """
-        tokens = []
-        askdb = []
-
-        for term in terms:
-            token = self.names.get(term)
-            if token is None:
-                askdb.append(term)
-            elif token != 0:
-                tokens.append(token)
-
-        if askdb:
-            with conn.cursor() as cur:
-                cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
-                            (askdb, ))
-                for term, tid in cur:
-                    self.names[term] = tid
-                    if tid != 0:
-                        tokens.append(tid)
-
-        return tokens
-
-
     def get_hnr_tokens(self, conn, terms):
         """ Get token ids for a list of housenumbers, looking them up in the
-            database if necessary.
+            database if necessary. `terms` is an iterable of normalized
+            housenumbers.
         """
         tokens = []
         askdb = []