]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/legacy_icu_tokenizer.py
complete tests for rule loader
[nominatim.git] / nominatim / tokenizer / legacy_icu_tokenizer.py
index 696004f5136d74a51a6b79985b2baf561f4a6371..960148890d0fe0a7ba465706d00a3097dbaf2d74 100644 (file)
@@ -3,26 +3,23 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
 from collections import Counter
 libICU instead of the PostgreSQL module.
 """
 from collections import Counter
-import functools
-import io
 import itertools
 import itertools
-import json
 import logging
 import re
 from textwrap import dedent
 from pathlib import Path
 
 import logging
 import re
 from textwrap import dedent
 from pathlib import Path
 
-from icu import Transliterator
 import psycopg2.extras
 
 from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
 import psycopg2.extras
 
 from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
+from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
+from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
 
 
-DBCFG_NORMALIZATION = "tokenizer_normalization"
 DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
 DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
-DBCFG_TRANSLITERATION = "tokenizer_transliteration"
-DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"
+DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
 
 LOG = logging.getLogger()
 
 
 LOG = logging.getLogger()
 
@@ -41,9 +38,9 @@ class LegacyICUTokenizer:
     def __init__(self, dsn, data_dir):
         self.dsn = dsn
         self.data_dir = data_dir
     def __init__(self, dsn, data_dir):
         self.dsn = dsn
         self.data_dir = data_dir
-        self.normalization = None
-        self.transliteration = None
-        self.abbreviations = None
+        self.naming_rules = None
+        self.term_normalization = None
+        self.max_word_frequency = None
 
 
     def init_new_db(self, config, init_db=True):
 
 
     def init_new_db(self, config, init_db=True):
@@ -55,14 +52,14 @@ class LegacyICUTokenizer:
         if config.TOKENIZER_CONFIG:
             cfgfile = Path(config.TOKENIZER_CONFIG)
         else:
         if config.TOKENIZER_CONFIG:
             cfgfile = Path(config.TOKENIZER_CONFIG)
         else:
-            cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
+            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
 
 
-        rules = json.loads(cfgfile.read_text())
-        self.transliteration = ';'.join(rules['normalization']) + ';'
-        self.abbreviations = rules["abbreviations"]
-        self.normalization = config.TERM_NORMALIZATION
+        loader = ICURuleLoader(cfgfile)
+        self.naming_rules = ICUNameProcessorRules(loader=loader)
+        self.term_normalization = config.TERM_NORMALIZATION
+        self.max_word_frequency = config.MAX_WORD_FREQUENCY
 
 
-        self._install_php(config)
+        self._install_php(config.lib_dir.php)
         self._save_config(config)
 
         if init_db:
         self._save_config(config)
 
         if init_db:
@@ -74,9 +71,9 @@ class LegacyICUTokenizer:
         """ Initialise the tokenizer from the project directory.
         """
         with connect(self.dsn) as conn:
         """ Initialise the tokenizer from the project directory.
         """
         with connect(self.dsn) as conn:
-            self.normalization = get_property(conn, DBCFG_NORMALIZATION)
-            self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
-            self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))
+            self.naming_rules = ICUNameProcessorRules(conn=conn)
+            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
+            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
 
 
     def finalize_import(self, config):
 
 
     def finalize_import(self, config):
@@ -103,9 +100,7 @@ class LegacyICUTokenizer:
         """
         self.init_from_project()
 
         """
         self.init_from_project()
 
-        if self.normalization is None\
-           or self.transliteration is None\
-           or self.abbreviations is None:
+        if self.naming_rules is None:
             return "Configuration for tokenizer 'legacy_icu' are missing."
 
         return None
             return "Configuration for tokenizer 'legacy_icu' are missing."
 
         return None
@@ -126,26 +121,20 @@ class LegacyICUTokenizer:
 
             Analyzers are not thread-safe. You need to instantiate one per thread.
         """
 
             Analyzers are not thread-safe. You need to instantiate one per thread.
         """
-        norm = Transliterator.createFromRules("normalizer", self.normalization)
-        trans = Transliterator.createFromRules("normalizer", self.transliteration)
-        return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
+        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
 
 
 
 
-    def _install_php(self, config):
+    def _install_php(self, phpdir):
         """ Install the php script for the tokenizer.
         """
         """ Install the php script for the tokenizer.
         """
-        abbr_inverse = list(zip(*self.abbreviations))
         php_file = self.data_dir / "tokenizer.php"
         php_file.write_text(dedent("""\
             <?php
         php_file = self.data_dir / "tokenizer.php"
         php_file.write_text(dedent("""\
             <?php
-            @define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
-            @define('CONST_Term_Normalization_Rules', "{0.normalization}");
-            @define('CONST_Transliteration', "{0.transliteration}");
-            @define('CONST_Abbreviations', array(array('{2}'), array('{3}')));
-            require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
-            """.format(self, config,
-                       "','".join(abbr_inverse[0]),
-                       "','".join(abbr_inverse[1]))))
+            @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
+            @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
+            @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
+            require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
+            """.format(self, phpdir))) # pylint: disable=missing-format-attribute
 
 
     def _save_config(self, config):
 
 
     def _save_config(self, config):
@@ -153,10 +142,10 @@ class LegacyICUTokenizer:
             database as database properties.
         """
         with connect(self.dsn) as conn:
             database as database properties.
         """
         with connect(self.dsn) as conn:
-            set_property(conn, DBCFG_NORMALIZATION, self.normalization)
+            self.naming_rules.save_rules(conn)
+
             set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
             set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
-            set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
-            set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))
+            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
 
 
     def _init_db_tables(self, config):
 
 
     def _init_db_tables(self, config):
@@ -172,25 +161,25 @@ class LegacyICUTokenizer:
 
             # get partial words and their frequencies
             words = Counter()
 
             # get partial words and their frequencies
             words = Counter()
-            with self.name_analyzer() as analyzer:
-                with conn.cursor(name="words") as cur:
-                    cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
+            name_proc = ICUNameProcessor(self.naming_rules)
+            with conn.cursor(name="words") as cur:
+                cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
 
 
-                    for name, cnt in cur:
-                        term = analyzer.make_standard_word(name)
-                        if term:
-                            for word in term.split():
-                                words[word] += cnt
+                for name, cnt in cur:
+                    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
+                        for term in word.split():
+                            words[term] += cnt
 
             # copy them back into the word table
 
             # copy them back into the word table
-            copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
-
+            with CopyBuffer() as copystr:
+                for args in words.items():
+                    copystr.add(*args)
 
 
-            with conn.cursor() as cur:
-                copystr.seek(0)
-                cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
-                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-                               WHERE word_id is null""")
+                with conn.cursor() as cur:
+                    copystr.copy_out(cur, 'word',
+                                     columns=['word_token', 'search_name_count'])
+                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
+                                   WHERE word_id is null""")
 
             conn.commit()
 
 
             conn.commit()
 
@@ -202,12 +191,10 @@ class LegacyICUNameAnalyzer:
         normalization.
     """
 
         normalization.
     """
 
-    def __init__(self, dsn, normalizer, transliterator, abbreviations):
+    def __init__(self, dsn, name_proc):
         self.conn = connect(dsn).connection
         self.conn.autocommit = True
         self.conn = connect(dsn).connection
         self.conn.autocommit = True
-        self.normalizer = normalizer
-        self.transliterator = transliterator
-        self.abbreviations = abbreviations
+        self.name_processor = name_proc
 
         self._cache = _TokenCache()
 
 
         self._cache = _TokenCache()
 
@@ -228,7 +215,7 @@ class LegacyICUNameAnalyzer:
             self.conn = None
 
 
             self.conn = None
 
 
-    def get_word_token_info(self, conn, words):
+    def get_word_token_info(self, words):
         """ Return token information for the given list of words.
             If a word starts with # it is assumed to be a full name
             otherwise is a partial name.
         """ Return token information for the given list of words.
             If a word starts with # it is assumed to be a full name
             otherwise is a partial name.
@@ -242,11 +229,11 @@ class LegacyICUNameAnalyzer:
         tokens = {}
         for word in words:
             if word.startswith('#'):
         tokens = {}
         for word in words:
             if word.startswith('#'):
-                tokens[word] = ' ' + self.make_standard_word(word[1:])
+                tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
             else:
             else:
-                tokens[word] = self.make_standard_word(word)
+                tokens[word] = self.name_processor.get_search_normalized(word)
 
 
-        with conn.cursor() as cur:
+        with self.conn.cursor() as cur:
             cur.execute("""SELECT word_token, word_id
                            FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                            WHERE word_token = t.term
             cur.execute("""SELECT word_token, word_id
                            FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                            WHERE word_token = t.term
@@ -254,25 +241,17 @@ class LegacyICUNameAnalyzer:
                         (list(tokens.values()), ))
             ids = {r[0]: r[1] for r in cur}
 
                         (list(tokens.values()), ))
             ids = {r[0]: r[1] for r in cur}
 
-        return [(k, v, ids[v]) for k, v in tokens.items()]
+        return [(k, v, ids.get(v, None)) for k, v in tokens.items()]
 
 
 
 
-    def normalize(self, phrase):
-        """ Normalize the given phrase, i.e. remove all properties that
-            are irrelevant for search.
-        """
-        return self.normalizer.transliterate(phrase)
+    @staticmethod
+    def normalize_postcode(postcode):
+        """ Convert the postcode to a standardized form.
 
 
-    @functools.lru_cache(maxsize=1024)
-    def make_standard_word(self, name):
-        """ Create the normalised version of the input.
+            This function must yield exactly the same result as the SQL function
+            'token_normalized_postcode()'.
         """
         """
-        norm = ' ' + self.transliterator.transliterate(name) + ' '
-        for full, abbr in self.abbreviations:
-            if full in norm:
-                norm = norm.replace(full, abbr)
-
-        return norm.strip()
+        return postcode.strip().upper()
 
 
     def _make_standard_hnr(self, hnr):
 
 
     def _make_standard_hnr(self, hnr):
@@ -280,36 +259,50 @@ class LegacyICUNameAnalyzer:
 
             This function takes minor shortcuts on transliteration.
         """
 
             This function takes minor shortcuts on transliteration.
         """
-        if hnr.isdigit():
-            return hnr
-
-        return self.transliterator.transliterate(hnr)
+        return self.name_processor.get_search_normalized(hnr)
 
 
-    def add_postcodes_from_db(self):
-        """ Add postcodes from the location_postcode table to the word table.
+    def update_postcodes_from_db(self):
+        """ Update postcode tokens in the word table from the location_postcode
+            table.
         """
         """
-        copystr = io.StringIO()
+        to_delete = []
         with self.conn.cursor() as cur:
         with self.conn.cursor() as cur:
-            cur.execute("SELECT distinct(postcode) FROM location_postcode")
-            for (postcode, ) in cur:
-                copystr.write(postcode)
-                copystr.write('\t ')
-                copystr.write(self.transliterator.transliterate(postcode))
-                copystr.write('\tplace\tpostcode\t0\n')
-
-            copystr.seek(0)
-            cur.copy_from(copystr, 'word',
-                          columns=['word', 'word_token', 'class', 'type',
-                                   'search_name_count'])
-            # Don't really need an ID for postcodes....
-            # cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-            #                WHERE word_id is null and type = 'postcode'""")
-
-
-    def update_special_phrases(self, phrases):
+            # This finds us the rows in location_postcode and word that are
+            # missing in the other table.
+            cur.execute("""SELECT * FROM
+                            (SELECT pc, word FROM
+                              (SELECT distinct(postcode) as pc FROM location_postcode) p
+                              FULL JOIN
+                              (SELECT word FROM word
+                                WHERE class ='place' and type = 'postcode') w
+                              ON pc = word) x
+                           WHERE pc is null or word is null""")
+
+            with CopyBuffer() as copystr:
+                for postcode, word in cur:
+                    if postcode is None:
+                        to_delete.append(word)
+                    else:
+                        copystr.add(
+                            postcode,
+                            ' ' + self.name_processor.get_search_normalized(postcode),
+                            'place', 'postcode', 0)
+
+                if to_delete:
+                    cur.execute("""DELETE FROM WORD
+                                   WHERE class ='place' and type = 'postcode'
+                                         and word = any(%s)
+                                """, (to_delete, ))
+
+                copystr.copy_out(cur, 'word',
+                                 columns=['word', 'word_token', 'class', 'type',
+                                          'search_name_count'])
+
+
+    def update_special_phrases(self, phrases, should_replace):
         """ Replace the search index for special phrases with the new phrases.
         """
         """ Replace the search index for special phrases with the new phrases.
         """
-        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                             for p in phrases))
 
         with self.conn.cursor() as cur:
                             for p in phrases))
 
         with self.conn.cursor() as cur:
@@ -321,54 +314,64 @@ class LegacyICUNameAnalyzer:
             for label, cls, typ, oper in cur:
                 existing_phrases.add((label, cls, typ, oper or '-'))
 
             for label, cls, typ, oper in cur:
                 existing_phrases.add((label, cls, typ, oper or '-'))
 
-            to_add = norm_phrases - existing_phrases
-            to_delete = existing_phrases - norm_phrases
-
-            if to_add:
-                copystr = io.StringIO()
-                for word, cls, typ, oper in to_add:
-                    term = self.make_standard_word(word)
-                    if term:
-                        copystr.write(word)
-                        copystr.write('\t ')
-                        copystr.write(term)
-                        copystr.write('\t')
-                        copystr.write(cls)
-                        copystr.write('\t')
-                        copystr.write(typ)
-                        copystr.write('\t')
-                        copystr.write(oper if oper in ('in', 'near')  else '\\N')
-                        copystr.write('\t0\n')
-
-                copystr.seek(0)
-                cur.copy_from(copystr, 'word',
-                              columns=['word', 'word_token', 'class', 'type',
-                                       'operator', 'search_name_count'])
-
-            if to_delete:
-                psycopg2.extras.execute_values(
-                    cur,
-                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
-                        WHERE word = name and class = in_class and type = in_type
-                              and ((op = '-' and operator is null) or op = operator)""",
-                    to_delete)
+            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
+            if should_replace:
+                deleted = self._remove_special_phrases(cur, norm_phrases,
+                                                       existing_phrases)
+            else:
+                deleted = 0
 
         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 
         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
-                 len(norm_phrases), len(to_add), len(to_delete))
+                 len(norm_phrases), added, deleted)
 
 
 
 
-    def add_country_names(self, country_code, names):
-        """ Add names for the given country to the search index.
+    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
+        """ Add all phrases to the database that are not yet there.
         """
         """
-        full_names = set((self.make_standard_word(n) for n in names))
-        full_names.discard('')
-        self._add_normalized_country_names(country_code, full_names)
+        to_add = new_phrases - existing_phrases
+
+        added = 0
+        with CopyBuffer() as copystr:
+            for word, cls, typ, oper in to_add:
+                term = self.name_processor.get_search_normalized(word)
+                if term:
+                    copystr.add(word, term, cls, typ,
+                                oper if oper in ('in', 'near')  else None, 0)
+                    added += 1
 
 
+            copystr.copy_out(cursor, 'word',
+                             columns=['word', 'word_token', 'class', 'type',
+                                      'operator', 'search_name_count'])
 
 
-    def _add_normalized_country_names(self, country_code, names):
+        return added
+
+
+    @staticmethod
+    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
+        """ Remove all phrases from the databse that are no longer in the
+            new phrase list.
+        """
+        to_delete = existing_phrases - new_phrases
+
+        if to_delete:
+            psycopg2.extras.execute_values(
+                cursor,
+                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
+                    WHERE word = name and class = in_class and type = in_type
+                          and ((op = '-' and operator is null) or op = operator)""",
+                to_delete)
+
+        return len(to_delete)
+
+
+    def add_country_names(self, country_code, names):
         """ Add names for the given country to the search index.
         """
         """ Add names for the given country to the search index.
         """
-        word_tokens = set((' ' + name for name in names))
+        word_tokens = set()
+        for name in self._compute_full_names(names):
+            if name:
+                word_tokens.add(' ' + self.name_processor.get_search_normalized(name))
+
         with self.conn.cursor() as cur:
             # Get existing names
             cur.execute("SELECT word_token FROM word WHERE country_code = %s",
         with self.conn.cursor() as cur:
             # Get existing names
             cur.execute("SELECT word_token FROM word WHERE country_code = %s",
@@ -394,15 +397,13 @@ class LegacyICUNameAnalyzer:
         names = place.get('name')
 
         if names:
         names = place.get('name')
 
         if names:
-            full_names = set((self.make_standard_word(name) for name in names.values()))
-            full_names.discard('')
+            fulls, partials = self._compute_name_tokens(names)
 
 
-            token_info.add_names(self.conn, full_names)
+            token_info.add_names(fulls, partials)
 
             country_feature = place.get('country_feature')
             if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
 
             country_feature = place.get('country_feature')
             if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                self._add_normalized_country_names(country_feature.lower(),
-                                                   full_names)
+                self.add_country_names(country_feature.lower(), names)
 
         address = place.get('address')
 
 
         address = place.get('address')
 
@@ -415,42 +416,88 @@ class LegacyICUNameAnalyzer:
                 elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                     hnrs.append(value)
                 elif key == 'street':
                 elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                     hnrs.append(value)
                 elif key == 'street':
-                    token_info.add_street(self.conn, self.make_standard_word(value))
+                    token_info.add_street(*self._compute_name_tokens({'name': value}))
                 elif key == 'place':
                 elif key == 'place':
-                    token_info.add_place(self.conn, self.make_standard_word(value))
+                    token_info.add_place(*self._compute_name_tokens({'name': value}))
                 elif not key.startswith('_') and \
                      key not in ('country', 'full'):
                 elif not key.startswith('_') and \
                      key not in ('country', 'full'):
-                    addr_terms.append((key, self.make_standard_word(value)))
+                    addr_terms.append((key, *self._compute_name_tokens({'name': value})))
 
             if hnrs:
                 hnrs = self._split_housenumbers(hnrs)
                 token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
 
             if addr_terms:
 
             if hnrs:
                 hnrs = self._split_housenumbers(hnrs)
                 token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
 
             if addr_terms:
-                token_info.add_address_terms(self.conn, addr_terms)
+                token_info.add_address_terms(addr_terms)
 
         return token_info.data
 
 
 
         return token_info.data
 
 
+    def _compute_name_tokens(self, names):
+        """ Computes the full name and partial name tokens for the given
+            dictionary of names.
+        """
+        full_names = self._compute_full_names(names)
+        full_tokens = set()
+        partial_tokens = set()
+
+        for name in full_names:
+            norm_name = self.name_processor.get_normalized(name)
+            full, part = self._cache.names.get(norm_name, (None, None))
+            if full is None:
+                variants = self.name_processor.get_variants_ascii(norm_name)
+                with self.conn.cursor() as cur:
+                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
+                                (norm_name, variants))
+                    full, part = cur.fetchone()
+
+                self._cache.names[norm_name] = (full, part)
+
+            full_tokens.add(full)
+            partial_tokens.update(part)
+
+        return full_tokens, partial_tokens
+
+
+    @staticmethod
+    def _compute_full_names(names):
+        """ Return the set of all full name word ids to be used with the
+            given dictionary of names.
+        """
+        full_names = set()
+        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
+            full_names.add(name.strip())
+
+            brace_idx = name.find('(')
+            if brace_idx >= 0:
+                full_names.add(name[:brace_idx].strip())
+
+        return full_names
+
+
     def _add_postcode(self, postcode):
         """ Make sure the normalized postcode is present in the word table.
         """
     def _add_postcode(self, postcode):
         """ Make sure the normalized postcode is present in the word table.
         """
-        if re.search(r'[:,;]', postcode) is None and not postcode in self._cache.postcodes:
-            term = self.make_standard_word(postcode)
-            if not term:
-                return
-
-            with self.conn.cursor() as cur:
-                # no word_id needed for postcodes
-                cur.execute("""INSERT INTO word (word, word_token, class, type,
-                                                 search_name_count)
-                               (SELECT pc, %s, 'place', 'postcode', 0
-                                FROM (VALUES (%s)) as v(pc)
-                                WHERE NOT EXISTS
-                                 (SELECT * FROM word
-                                  WHERE word = pc and class='place' and type='postcode'))
-                            """, (' ' + term, postcode))
-            self._cache.postcodes.add(postcode)
+        if re.search(r'[:,;]', postcode) is None:
+            postcode = self.normalize_postcode(postcode)
+
+            if postcode not in self._cache.postcodes:
+                term = self.name_processor.get_search_normalized(postcode)
+                if not term:
+                    return
+
+                with self.conn.cursor() as cur:
+                    # no word_id needed for postcodes
+                    cur.execute("""INSERT INTO word (word, word_token, class, type,
+                                                     search_name_count)
+                                   (SELECT pc, %s, 'place', 'postcode', 0
+                                    FROM (VALUES (%s)) as v(pc)
+                                    WHERE NOT EXISTS
+                                     (SELECT * FROM word
+                                      WHERE word = pc and class='place' and type='postcode'))
+                                """, (' ' + term, postcode))
+                self._cache.postcodes.add(postcode)
+
 
     @staticmethod
     def _split_housenumbers(hnrs):
 
     @staticmethod
     def _split_housenumbers(hnrs):
@@ -474,7 +521,7 @@ class _TokenInfo:
     """ Collect token information to be sent back to the database.
     """
     def __init__(self, cache):
     """ Collect token information to be sent back to the database.
     """
     def __init__(self, cache):
-        self.cache = cache
+        self._cache = cache
         self.data = {}
 
     @staticmethod
         self.data = {}
 
     @staticmethod
@@ -482,88 +529,44 @@ class _TokenInfo:
         return '{%s}' % ','.join((str(s) for s in tokens))
 
 
         return '{%s}' % ','.join((str(s) for s in tokens))
 
 
-    def add_names(self, conn, names):
+    def add_names(self, fulls, partials):
         """ Adds token information for the normalised names.
         """
         """ Adds token information for the normalised names.
         """
-        # Start with all partial names
-        terms = set((part for ns in names for part in ns.split()))
-        # Add partials for the full terms (TO BE REMOVED)
-        terms.update((n for n in names))
-        # Add the full names
-        terms.update((' ' + n for n in names))
-
-        self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))
+        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
 
 
     def add_housenumbers(self, conn, hnrs):
         """ Extract housenumber information from a list of normalised
             housenumbers.
         """
 
 
     def add_housenumbers(self, conn, hnrs):
         """ Extract housenumber information from a list of normalised
             housenumbers.
         """
-        self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
+        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
         self.data['hnr'] = ';'.join(hnrs)
 
 
         self.data['hnr'] = ';'.join(hnrs)
 
 
-    def add_street(self, conn, street):
+    def add_street(self, fulls, _):
         """ Add addr:street match terms.
         """
         """ Add addr:street match terms.
         """
-        if not street:
-            return
-
-        term = ' ' + street
+        if fulls:
+            self.data['street'] = self._mk_array(fulls)
 
 
-        tid = self.cache.names.get(term)
-
-        if tid is None:
-            with conn.cursor() as cur:
-                cur.execute("""SELECT word_id FROM word
-                                WHERE word_token = %s
-                                      and class is null and type is null""",
-                            (term, ))
-                if cur.rowcount > 0:
-                    tid = cur.fetchone()[0]
-                    self.cache.names[term] = tid
 
 
-        if tid is not None:
-            self.data['street'] = '{%d}' % tid
-
-
-    def add_place(self, conn, place):
+    def add_place(self, fulls, partials):
         """ Add addr:place search and match terms.
         """
         """ Add addr:place search and match terms.
         """
-        if not place:
-            return
-
-        partial_ids = self.cache.get_term_tokens(conn, place.split())
-        tid = self.cache.get_term_tokens(conn, [' ' + place])
-
-        self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
-        self.data['place_match'] = '{%s}' % tid[0]
+        if fulls:
+            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
+            self.data['place_match'] = self._mk_array(fulls)
 
 
 
 
-    def add_address_terms(self, conn, terms):
+    def add_address_terms(self, terms):
         """ Add additional address terms.
         """
         tokens = {}
 
         """ Add additional address terms.
         """
         tokens = {}
 
-        for key, value in terms:
-            if not value:
-                continue
-            partial_ids = self.cache.get_term_tokens(conn, value.split())
-            term = ' ' + value
-            tid = self.cache.names.get(term)
-
-            if tid is None:
-                with conn.cursor() as cur:
-                    cur.execute("""SELECT word_id FROM word
-                                    WHERE word_token = %s
-                                          and class is null and type is null""",
-                                (term, ))
-                    if cur.rowcount > 0:
-                        tid = cur.fetchone()[0]
-                        self.cache.names[term] = tid
-
-            tokens[key] = [self._mk_array(partial_ids),
-                           '{%s}' % ('' if tid is None else str(tid))]
+        for key, fulls, partials in terms:
+            if fulls:
+                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
+                               self._mk_array(fulls)]
 
         if tokens:
             self.data['addr'] = tokens
 
         if tokens:
             self.data['addr'] = tokens
@@ -581,32 +584,6 @@ class _TokenCache:
         self.housenumbers = {}
 
 
         self.housenumbers = {}
 
 
-    def get_term_tokens(self, conn, terms):
-        """ Get token ids for a list of terms, looking them up in the database
-            if necessary.
-        """
-        tokens = []
-        askdb = []
-
-        for term in terms:
-            token = self.names.get(term)
-            if token is None:
-                askdb.append(term)
-            elif token != 0:
-                tokens.append(token)
-
-        if askdb:
-            with conn.cursor() as cur:
-                cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
-                            (askdb, ))
-                for term, tid in cur:
-                    self.names[term] = tid
-                    if tid != 0:
-                        tokens.append(tid)
-
-        return tokens
-
-
     def get_hnr_tokens(self, conn, terms):
         """ Get token ids for a list of housenumbers, looking them up in the
             database if necessary.
     def get_hnr_tokens(self, conn, terms):
         """ Get token ids for a list of housenumbers, looking them up in the
             database if necessary.