add actual removal of housenumber tokens

[nominatim.git] / nominatim / tokenizer / icu_tokenizer.py
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py

index cb4112049fb7e8173b835fa1638db0f6ee3a7cc4..0841300a9b92421ac130ab827ff2a9935af507ed 100644 (file)
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -1,24 +1,26 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
  """
  Tokenizer implementing normalisation as used before Nominatim 4 but using
  libICU instead of the PostgreSQL module.
  """
  """
  Tokenizer implementing normalisation as used before Nominatim 4 but using
  libICU instead of the PostgreSQL module.
  """
-from collections import Counter
  import itertools
  import json
  import logging
  import re
  from textwrap import dedent
  import itertools
  import json
  import logging
  import re
  from textwrap import dedent
-from pathlib import Path
  
  from nominatim.db.connection import connect
  
  from nominatim.db.connection import connect
-from nominatim.db.properties import set_property, get_property
  from nominatim.db.utils import CopyBuffer
  from nominatim.db.sql_preprocessor import SQLPreprocessor
  from nominatim.db.utils import CopyBuffer
  from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.indexer.place_info import PlaceInfo
  from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
  from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  
  from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  
-DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
  DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
  
  LOG = logging.getLogger()
  DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
  
  LOG = logging.getLogger()
@@ -38,9 +40,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
      def __init__(self, dsn, data_dir):
          self.dsn = dsn
          self.data_dir = data_dir
      def __init__(self, dsn, data_dir):
          self.dsn = dsn
          self.data_dir = data_dir
-        self.naming_rules = None
-        self.term_normalization = None
-        self.max_word_frequency = None
+        self.loader = None
  
  
      def init_new_db(self, config, init_db=True):
  
  
      def init_new_db(self, config, init_db=True):
@@ -49,58 +49,106 @@ class LegacyICUTokenizer(AbstractTokenizer):
              This copies all necessary data in the project directory to make
              sure the tokenizer remains stable even over updates.
          """
              This copies all necessary data in the project directory to make
              sure the tokenizer remains stable even over updates.
          """
-        if config.TOKENIZER_CONFIG:
-            cfgfile = Path(config.TOKENIZER_CONFIG)
-        else:
-            cfgfile = config.config_dir / 'icu_tokenizer.yaml'
-
-        loader = ICURuleLoader(cfgfile)
-        self.naming_rules = ICUNameProcessorRules(loader=loader)
-        self.term_normalization = config.TERM_NORMALIZATION
-        self.max_word_frequency = config.MAX_WORD_FREQUENCY
+        self.loader = ICURuleLoader(config)
  
          self._install_php(config.lib_dir.php)
  
          self._install_php(config.lib_dir.php)
-        self._save_config(config)
+        self._save_config()
  
          if init_db:
              self.update_sql_functions(config)
              self._init_db_tables(config)
  
  
  
          if init_db:
              self.update_sql_functions(config)
              self._init_db_tables(config)
  
  
-    def init_from_project(self):
+    def init_from_project(self, config):
          """ Initialise the tokenizer from the project directory.
          """
          """ Initialise the tokenizer from the project directory.
          """
+        self.loader = ICURuleLoader(config)
+
          with connect(self.dsn) as conn:
          with connect(self.dsn) as conn:
-            self.naming_rules = ICUNameProcessorRules(conn=conn)
-            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
-            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
+            self.loader.load_config_from_db(conn)
  
  
  
  
-    def finalize_import(self, _):
+    def finalize_import(self, config):
          """ Do any required postprocessing to make the tokenizer data ready
              for use.
          """
          """ Do any required postprocessing to make the tokenizer data ready
              for use.
          """
+        with connect(self.dsn) as conn:
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
  
  
      def update_sql_functions(self, config):
          """ Reimport the SQL functions for this tokenizer.
          """
          with connect(self.dsn) as conn:
  
  
      def update_sql_functions(self, config):
          """ Reimport the SQL functions for this tokenizer.
          """
          with connect(self.dsn) as conn:
-            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
              sqlp = SQLPreprocessor(conn, config)
              sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql',
-                              max_word_freq=max_word_freq)
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  
  
  
  
-    def check_database(self):
+    def check_database(self, config):
          """ Check that the tokenizer is set up correctly.
          """
          """ Check that the tokenizer is set up correctly.
          """
-        self.init_from_project()
+        # Will throw an error if there is an issue.
+        self.init_from_project(config)
+
+
+    def update_statistics(self):
+        """ Recompute frequencies for all name words.
+
+            Counts how often each word id occurs in search_name.name_vector
+            and stores the result under the 'count' key of word.info.
+            When the search_name table does not exist, nothing is computed.
+        """
+        with connect(self.dsn) as conn:
+            if conn.table_exists('search_name'):
+                with conn.cursor() as cur:
+                    # Get rid of a table of the same name before recreating it.
+                    cur.drop_table("word_frequencies")
+                    LOG.info("Computing word frequencies")
+                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                     SELECT unnest(name_vector) as id, count(*)
+                                     FROM search_name GROUP BY id""")
+                    cur.execute("CREATE INDEX ON word_frequencies(id)")
+                    LOG.info("Update word table with recomputed frequencies")
+                    # Merge the new count into the existing info JSON.
+                    cur.execute("""UPDATE word
+                                   SET info = info || jsonb_build_object('count', count)
+                                   FROM word_frequencies WHERE word_id = id""")
+                    cur.drop_table("word_frequencies")
+            conn.commit()
+
+
+    def _cleanup_housenumbers(self):
+        """ Remove unused house numbers.
+
+            Collects housenumber tokens (word type 'H') that are not
+            referenced by any search_name.name_vector, drops from that set
+            every housenumber still present in placex and deletes the
+            remaining entries from the word table. Tokens of at most
+            6 characters that match the SQL pattern '\\d+' are never
+            candidates for removal.
+        """
+        with connect(self.dsn) as conn:
+            # Step 1: housenumber tokens that no search_name row refers to.
+            # Note: the backslash is escaped so that Python passes the
+            # pattern '\d+' on to SQL without an invalid-escape warning.
+            with conn.cursor(name="hnr_counter") as cur:
+                cur.execute("""SELECT word_id, word_token FROM word
+                               WHERE type = 'H'
+                                 AND NOT EXISTS(SELECT * FROM search_name
+                                                WHERE ARRAY[word.word_id] && name_vector)
+                                 AND (char_length(word_token) > 6
+                                      OR word_token not similar to '\\d+')
+                            """)
+                candidates = {token: wid for wid, token in cur}
+            # Step 2: discard candidates that still appear in placex.
+            # NOTE(review): assumes placex.housenumber holds the same
+            # normalized form as word_token, joined by ';' - confirm.
+            with conn.cursor(name="hnr_counter") as cur:
+                cur.execute("""SELECT housenumber FROM placex
+                               WHERE housenumber is not null
+                                     AND (char_length(housenumber) > 6
+                                          OR housenumber not similar to '\\d+')
+                            """)
+                for row in cur:
+                    for hnr in row[0].split(';'):
+                        candidates.pop(hnr, None)
+            LOG.info("There are %s outdated housenumbers.", len(candidates))
+            # Step 3: delete whatever is left, in a single statement.
+            if candidates:
+                with conn.cursor() as cur:
+                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
+                                (list(candidates.values()), ))
+                conn.commit()
  
  
-        if self.naming_rules is None:
-            return "Configuration for tokenizer 'icu' are missing."
  
  
-        return None
+
+    def update_word_tokens(self):
+        """ Remove unused tokens.
+
+            Currently this only cleans up housenumber tokens via
+            _cleanup_housenumbers(). Progress is logged at warning level.
+        """
+        # Logger.warn() is a deprecated alias; use warning() instead.
+        LOG.warning("Cleaning up housenumber tokens.")
+        self._cleanup_housenumbers()
+        LOG.warning("Tokenizer house-keeping done.")
  
  
      def name_analyzer(self):
  
  
      def name_analyzer(self):
@@ -118,7 +166,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
  
              Analyzers are not thread-safe. You need to instantiate one per thread.
          """
  
              Analyzers are not thread-safe. You need to instantiate one per thread.
          """
-        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
+        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
+                                     self.loader.make_token_analysis())
  
  
      def _install_php(self, phpdir):
  
  
      def _install_php(self, phpdir):
@@ -127,21 +176,18 @@ class LegacyICUTokenizer(AbstractTokenizer):
          php_file = self.data_dir / "tokenizer.php"
          php_file.write_text(dedent(f"""\
              <?php
          php_file = self.data_dir / "tokenizer.php"
          php_file.write_text(dedent(f"""\
              <?php
-            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
-            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
-            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
+            @define('CONST_Max_Word_Frequency', 10000000);
+            @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
+            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
              require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
  
  
              require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
  
  
-    def _save_config(self, config):
+    def _save_config(self):
          """ Save the configuration that needs to remain stable for the given
              database as database properties.
          """
          with connect(self.dsn) as conn:
          """ Save the configuration that needs to remain stable for the given
              database as database properties.
          """
          with connect(self.dsn) as conn:
-            self.naming_rules.save_rules(conn)
-
-            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
-            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
+            self.loader.save_config_to_db(conn)
  
  
      def _init_db_tables(self, config):
  
  
      def _init_db_tables(self, config):
@@ -153,45 +199,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
              sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
              conn.commit()
  
              sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
              conn.commit()
  
-            LOG.warning("Precomputing word tokens")
-
-            # get partial words and their frequencies
-            words = self._count_partial_terms(conn)
-
-            # copy them back into the word table
-            with CopyBuffer() as copystr:
-                for term, cnt in words.items():
-                    copystr.add('w', term, json.dumps({'count': cnt}))
-
-                with conn.cursor() as cur:
-                    copystr.copy_out(cur, 'word',
-                                     columns=['type', 'word_token', 'info'])
-                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-                                   WHERE word_id is null and type = 'w'""")
-
-            conn.commit()
-
-    def _count_partial_terms(self, conn):
-        """ Count the partial terms from the names in the place table.
-        """
-        words = Counter()
-        name_proc = ICUNameProcessor(self.naming_rules)
-
-        with conn.cursor(name="words") as cur:
-            cur.execute(""" SELECT v, count(*) FROM
-                              (SELECT svals(name) as v FROM place)x
-                            WHERE length(v) < 75 GROUP BY v""")
-
-            for name, cnt in cur:
-                terms = set()
-                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
-                    if ' ' in word:
-                        terms.update(word.split())
-                for term in terms:
-                    words[term] += cnt
-
-        return words
-
  
  class LegacyICUNameAnalyzer(AbstractAnalyzer):
      """ The legacy analyzer uses the ICU library for splitting names.
  
  class LegacyICUNameAnalyzer(AbstractAnalyzer):
      """ The legacy analyzer uses the ICU library for splitting names.
@@ -200,10 +207,11 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          normalization.
      """
  
          normalization.
      """
  
-    def __init__(self, dsn, name_proc):
+    def __init__(self, dsn, sanitizer, token_analysis):
          self.conn = connect(dsn).connection
          self.conn.autocommit = True
          self.conn = connect(dsn).connection
          self.conn.autocommit = True
-        self.name_processor = name_proc
+        self.sanitizer = sanitizer
+        self.token_analysis = token_analysis
  
          self._cache = _TokenCache()
  
  
          self._cache = _TokenCache()
  
@@ -216,6 +224,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
              self.conn = None
  
  
              self.conn = None
  
  
+    def _search_normalized(self, name):
+        """ Return the search token transliteration of the given name.
+
+            The name is run through the 'search' transliterator of the
+            token analysis and surrounding whitespace is stripped.
+        """
+        return self.token_analysis.search.transliterate(name).strip()
+
+
+    def _normalized(self, name):
+        """ Return the normalized version of the given name with all
+            non-relevant information removed.
+
+            The name is run through the 'normalizer' of the token analysis
+            and surrounding whitespace is stripped.
+        """
+        return self.token_analysis.normalizer.transliterate(name).strip()
+
+
      def get_word_token_info(self, words):
          """ Return token information for the given list of words.
              If a word starts with # it is assumed to be a full name
      def get_word_token_info(self, words):
          """ Return token information for the given list of words.
              If a word starts with # it is assumed to be a full name
@@ -231,9 +252,9 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          partial_tokens = {}
          for word in words:
              if word.startswith('#'):
          partial_tokens = {}
          for word in words:
              if word.startswith('#'):
-                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
+                full_tokens[word] = self._search_normalized(word[1:])
              else:
              else:
-                partial_tokens[word] = self.name_processor.get_search_normalized(word)
+                partial_tokens[word] = self._search_normalized(word)
  
          with self.conn.cursor() as cur:
              cur.execute("""SELECT word_token, word_id
  
          with self.conn.cursor() as cur:
              cur.execute("""SELECT word_token, word_id
@@ -264,7 +285,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
  
              This function takes minor shortcuts on transliteration.
          """
  
              This function takes minor shortcuts on transliteration.
          """
-        return self.name_processor.get_search_normalized(hnr)
+        return self._search_normalized(hnr)
  
      def update_postcodes_from_db(self):
          """ Update postcode tokens in the word table from the location_postcode
  
      def update_postcodes_from_db(self):
          """ Update postcode tokens in the word table from the location_postcode
@@ -287,7 +308,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                      if postcode is None:
                          to_delete.append(word)
                      else:
                      if postcode is None:
                          to_delete.append(word)
                      else:
-                        copystr.add(self.name_processor.get_search_normalized(postcode),
+                        copystr.add(self._search_normalized(postcode),
                                      'P', postcode)
  
                  if to_delete:
                                      'P', postcode)
  
                  if to_delete:
@@ -305,7 +326,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
              completely replaced. Otherwise the phrases are added to the
              already existing ones.
          """
              completely replaced. Otherwise the phrases are added to the
              already existing ones.
          """
-        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                              for p in phrases))
  
          with self.conn.cursor() as cur:
                              for p in phrases))
  
          with self.conn.cursor() as cur:
@@ -335,7 +356,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          added = 0
          with CopyBuffer() as copystr:
              for word, cls, typ, oper in to_add:
          added = 0
          with CopyBuffer() as copystr:
              for word, cls, typ, oper in to_add:
-                term = self.name_processor.get_search_normalized(word)
+                term = self._search_normalized(word)
                  if term:
                      copystr.add(term, 'S', word,
                                  json.dumps({'class': cls, 'type': typ,
                  if term:
                      copystr.add(term, 'S', word,
                                  json.dumps({'class': cls, 'type': typ,
@@ -369,9 +390,21 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
      def add_country_names(self, country_code, names):
          """ Add names for the given country to the search index.
          """
      def add_country_names(self, country_code, names):
          """ Add names for the given country to the search index.
          """
+        # Make sure any name preprocessing for country names applies.
+        info = PlaceInfo({'name': names, 'country_code': country_code,
+                          'rank_address': 4, 'class': 'boundary',
+                          'type': 'administrative'})
+        self._add_country_full_names(country_code,
+                                     self.sanitizer.process_names(info)[0])
+
+
+    def _add_country_full_names(self, country_code, names):
+        """ Add names for the given country from an already sanitized
+            name list.
+        """
          word_tokens = set()
          word_tokens = set()
-        for name in self._compute_full_names(names):
-            norm_name = self.name_processor.get_search_normalized(name)
+        for name in names:
+            norm_name = self._search_normalized(name.name)
              if norm_name:
                  word_tokens.add(norm_name)
  
              if norm_name:
                  word_tokens.add(norm_name)
  
@@ -397,23 +430,21 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
      def process_place(self, place):
          """ Determine tokenizer information about the given place.
  
      def process_place(self, place):
          """ Determine tokenizer information about the given place.
  
-            Returns a JSON-serialisable structure that will be handed into
+            Returns a JSON-serializable structure that will be handed into
              the database via the token_info field.
          """
          token_info = _TokenInfo(self._cache)
  
              the database via the token_info field.
          """
          token_info = _TokenInfo(self._cache)
  
-        names = place.get('name')
+        names, address = self.sanitizer.process_names(place)
  
          if names:
              fulls, partials = self._compute_name_tokens(names)
  
              token_info.add_names(fulls, partials)
  
  
          if names:
              fulls, partials = self._compute_name_tokens(names)
  
              token_info.add_names(fulls, partials)
  
-            country_feature = place.get('country_feature')
-            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                self.add_country_names(country_feature.lower(), names)
+            if place.is_country():
+                self._add_country_full_names(place.country_code, names)
  
  
-        address = place.get('address')
          if address:
              self._process_place_address(token_info, address)
  
          if address:
              self._process_place_address(token_info, address)
  
@@ -421,51 +452,110 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
  
  
      def _process_place_address(self, token_info, address):
  
  
      def _process_place_address(self, token_info, address):
-        hnrs = []
+        hnrs = set()
          addr_terms = []
          addr_terms = []
-        for key, value in address.items():
-            if key == 'postcode':
-                self._add_postcode(value)
-            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                hnrs.append(value)
-            elif key == 'street':
-                token_info.add_street(*self._compute_name_tokens({'name': value}))
-            elif key == 'place':
-                token_info.add_place(*self._compute_name_tokens({'name': value}))
-            elif not key.startswith('_') and \
-                 key not in ('country', 'full'):
-                addr_terms.append((key, *self._compute_name_tokens({'name': value})))
+        streets = []
+        for item in address:
+            if item.kind == 'postcode':
+                self._add_postcode(item.name)
+            elif item.kind == 'housenumber':
+                norm_name = self._make_standard_hnr(item.name)
+                if norm_name:
+                    hnrs.add(norm_name)
+            elif item.kind == 'street':
+                streets.extend(self._retrieve_full_tokens(item.name))
+            elif item.kind == 'place':
+                if not item.suffix:
+                    token_info.add_place(self._compute_partial_tokens(item.name))
+            elif not item.kind.startswith('_') and not item.suffix and \
+                 item.kind not in ('country', 'full'):
+                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
  
          if hnrs:
  
          if hnrs:
-            hnrs = self._split_housenumbers(hnrs)
-            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
+            token_info.add_housenumbers(self.conn, hnrs)
  
          if addr_terms:
              token_info.add_address_terms(addr_terms)
  
  
          if addr_terms:
              token_info.add_address_terms(addr_terms)
  
+        if streets:
+            token_info.add_street(streets)
+
+
+    def _compute_partial_tokens(self, name):
+        """ Normalize the given term, split it into partial words and return
+            the token list for them.
+
+            Tokens found in the partials cache are reused. All other words
+            are resolved with the SQL function getorcreate_partial_word()
+            and added to the cache afterwards. Cached tokens come first in
+            the returned list, followed by the freshly looked-up ones.
+        """
+        norm_name = self._search_normalized(name)
+
+        tokens = []
+        need_lookup = []
+        # Serve what is possible from the cache, collect the rest so that
+        # a single query is sufficient.
+        for partial in norm_name.split():
+            token = self._cache.partials.get(partial)
+            if token:
+                tokens.append(token)
+            else:
+                need_lookup.append(partial)
+
+        if need_lookup:
+            with self.conn.cursor() as cur:
+                cur.execute("""SELECT word, getorcreate_partial_word(word)
+                               FROM unnest(%s) word""",
+                            (need_lookup, ))
+
+                # Remember the result for later calls.
+                for partial, token in cur:
+                    tokens.append(token)
+                    self._cache.partials[partial] = token
+
+        return tokens
+
+
+    def _retrieve_full_tokens(self, name):
+        """ Get the full name token for the given name, if it exists.
+            The name is only retrieved for the standard analyser.
+
+            Returns a list of word ids of type 'W' whose word_token equals
+            the search-normalized name. The list is empty when there is no
+            such word. No new words are created. Results are cached,
+            including empty ones.
+        """
+        norm_name = self._search_normalized(name)
+
+        # return cached if possible
+        if norm_name in self._cache.fulls:
+            return self._cache.fulls[norm_name]
+
+        with self.conn.cursor() as cur:
+            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
+                        (norm_name, ))
+            full = [row[0] for row in cur]
+
+        # The same list object is handed out on later cache hits.
+        self._cache.fulls[norm_name] = full
+
+        return full
+
  
      def _compute_name_tokens(self, names):
          """ Computes the full name and partial name tokens for the given
              dictionary of names.
          """
  
      def _compute_name_tokens(self, names):
          """ Computes the full name and partial name tokens for the given
              dictionary of names.
          """
-        full_names = self._compute_full_names(names)
          full_tokens = set()
          partial_tokens = set()
  
          full_tokens = set()
          partial_tokens = set()
  
-        for name in full_names:
-            norm_name = self.name_processor.get_normalized(name)
-            full, part = self._cache.names.get(norm_name, (None, None))
+        for name in names:
+            analyzer_id = name.get_attr('analyzer')
+            norm_name = self._normalized(name.name)
+            if analyzer_id is None:
+                token_id = norm_name
+            else:
+                token_id = f'{norm_name}@{analyzer_id}'
+
+            full, part = self._cache.names.get(token_id, (None, None))
              if full is None:
              if full is None:
-                variants = self.name_processor.get_variants_ascii(norm_name)
+                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
                  if not variants:
                      continue
  
                  with self.conn.cursor() as cur:
                      cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                  if not variants:
                      continue
  
                  with self.conn.cursor() as cur:
                      cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
-                                (norm_name, variants))
+                                (token_id, variants))
                      full, part = cur.fetchone()
  
                      full, part = cur.fetchone()
  
-                self._cache.names[norm_name] = (full, part)
+                self._cache.names[token_id] = (full, part)
  
              full_tokens.add(full)
              partial_tokens.update(part)
  
              full_tokens.add(full)
              partial_tokens.update(part)
@@ -473,23 +563,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          return full_tokens, partial_tokens
  
  
          return full_tokens, partial_tokens
  
  
-    @staticmethod
-    def _compute_full_names(names):
-        """ Return the set of all full name word ids to be used with the
-            given dictionary of names.
-        """
-        full_names = set()
-        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
-            if name:
-                full_names.add(name)
-
-                brace_idx = name.find('(')
-                if brace_idx >= 0:
-                    full_names.add(name[:brace_idx].strip())
-
-        return full_names
-
-
      def _add_postcode(self, postcode):
          """ Make sure the normalized postcode is present in the word table.
          """
      def _add_postcode(self, postcode):
          """ Make sure the normalized postcode is present in the word table.
          """
@@ -497,7 +570,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
              postcode = self.normalize_postcode(postcode)
  
              if postcode not in self._cache.postcodes:
              postcode = self.normalize_postcode(postcode)
  
              if postcode not in self._cache.postcodes:
-                term = self.name_processor.get_search_normalized(postcode)
+                term = self._search_normalized(postcode)
                  if not term:
                      return
  
                  if not term:
                      return
  
@@ -512,24 +585,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                  self._cache.postcodes.add(postcode)
  
  
                  self._cache.postcodes.add(postcode)
  
  
-    @staticmethod
-    def _split_housenumbers(hnrs):
-        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
-            # split numbers if necessary
-            simple_list = []
-            for hnr in hnrs:
-                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
-
-            if len(simple_list) > 1:
-                hnrs = list(set(simple_list))
-            else:
-                hnrs = simple_list
-
-        return hnrs
-
-
-
-
  class _TokenInfo:
      """ Collect token information to be sent back to the database.
      """
  class _TokenInfo:
      """ Collect token information to be sent back to the database.
      """
@@ -556,30 +611,24 @@ class _TokenInfo:
          self.data['hnr'] = ';'.join(hnrs)
  
  
          self.data['hnr'] = ';'.join(hnrs)
  
  
-    def add_street(self, fulls, _):
+    def add_street(self, tokens):
          """ Add addr:street match terms.
          """
          """ Add addr:street match terms.
          """
-        if fulls:
-            self.data['street'] = self._mk_array(fulls)
+        self.data['street'] = self._mk_array(tokens)
  
  
  
  
-    def add_place(self, fulls, partials):
+    def add_place(self, tokens):
          """ Add addr:place search and match terms.
          """
          """ Add addr:place search and match terms.
          """
-        if fulls:
-            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
-            self.data['place_match'] = self._mk_array(fulls)
+        if tokens:
+            self.data['place'] = self._mk_array(tokens)
  
  
      def add_address_terms(self, terms):
          """ Add additional address terms.
          """
  
  
      def add_address_terms(self, terms):
          """ Add additional address terms.
          """
-        tokens = {}
-
-        for key, fulls, partials in terms:
-            if fulls:
-                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
-                               self._mk_array(fulls)]
+        tokens = {key: self._mk_array(partials)
+                  for key, partials in terms if partials}
  
          if tokens:
              self.data['addr'] = tokens
  
          if tokens:
              self.data['addr'] = tokens
@@ -593,6 +642,8 @@ class _TokenCache:
      """
      def __init__(self):
          self.names = {}
      """
      def __init__(self):
          self.names = {}
+        self.partials = {}
+        self.fulls = {}
          self.postcodes = set()
          self.housenumbers = {}
  
          self.postcodes = set()
          self.housenumbers = {}