private function makeStandardWord($sTerm)
{
- $sNorm = ' '.$this->oTransliterator->transliterate($sTerm).' ';
-
- return trim(str_replace(CONST_Abbreviations[0], CONST_Abbreviations[1], $sNorm));
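+ // Abbreviation expansion is now folded into the ICU transliteration rules
+ // (CONST_Transliteration), so no extra replacement pass is needed here.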
+ return trim($this->oTransliterator->transliterate(' '.$sTerm.' '));
}
foreach ($aPhrases as $iPhrase => $oPhrase) {
$sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
$sPhrase = $this->makeStandardWord($oPhrase->getPhrase());
+ Debug::printVar('Phrase', $sPhrase);
if (strlen($sPhrase) > 0) {
$aWords = explode(' ', $sPhrase);
Tokenizer::addTokens($aTokens, $aWords);
--------------- private functions ----------------------------------------------
-CREATE OR REPLACE FUNCTION getorcreate_term_id(lookup_term TEXT)
- RETURNS INTEGER
+CREATE OR REPLACE FUNCTION getorcreate_full_word(norm_term TEXT, lookup_terms TEXT[],
+ OUT full_token INT,
+ OUT partial_tokens INT[])
AS $$
DECLARE
- return_id INTEGER;
+ partial_terms TEXT[] = '{}'::TEXT[];
+ term TEXT;
+ term_id INTEGER;
term_count INTEGER;
BEGIN
- SELECT min(word_id), max(search_name_count) INTO return_id, term_count
- FROM word WHERE word_token = lookup_term and class is null and type is null;
+ SELECT min(word_id) INTO full_token
+ FROM word WHERE word = norm_term and class is null and country_code is null;
- IF return_id IS NULL THEN
- return_id := nextval('seq_word');
- INSERT INTO word (word_id, word_token, search_name_count)
- VALUES (return_id, lookup_term, 0);
- ELSEIF left(lookup_term, 1) = ' ' and term_count > {{ max_word_freq }} THEN
- return_id := 0;
+ IF full_token IS NULL THEN
+ full_token := nextval('seq_word');
+ INSERT INTO word (word_id, word_token, word, search_name_count)
+ SELECT full_token, ' ' || lookup_term, norm_term, 0 FROM unnest(lookup_terms) as lookup_term;
END IF;
- RETURN return_id;
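+ -- Collect the distinct partial words contained in the lookup variants.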
+ FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP
+ term := trim(term);
+ IF NOT (ARRAY[term] <@ partial_terms) THEN
+ partial_terms := partial_terms || term;
+ END IF;
+ END LOOP;
+
+ partial_tokens := '{}'::INT[];
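+ -- Look up (or create) a token for each partial word; partials that occur
+ -- more often than {{ max_word_freq }} are not returned.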
+ FOR term IN SELECT unnest(partial_terms) LOOP
+ SELECT min(word_id), max(search_name_count) INTO term_id, term_count
+ FROM word WHERE word_token = term and class is null and country_code is null;
+
+ IF term_id IS NULL THEN
+ term_id := nextval('seq_word');
+ term_count := 0;
+ INSERT INTO word (word_id, word_token, search_name_count)
+ VALUES (term_id, term, 0);
+ END IF;
+
+ IF term_count < {{ max_word_freq }} THEN
+ partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
+ END IF;
+ END LOOP;
END;
$$
LANGUAGE plpgsql;
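+-- Illustrative call (arguments are made up for the example):
+--   SELECT (getorcreate_full_word('bauweg straße', ARRAY['bauweg strasse', 'bauweg str'])).*;
+-- returns the token of the full name plus partial tokens for 'bauweg', 'strasse' and 'str'.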
--- /dev/null
+"""
+Processor for names that are imported into the database based on the
+ICU library.
+"""
+import json
+import itertools
+
+from icu import Transliterator
+import datrie
+
+from nominatim.db.properties import set_property, get_property
+
+DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
+DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
+DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements"
+DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization"
+
+
+class ICUNameProcessorRules:
+ """ Data object that saves the rules needed for the name processor.
+
+ The rules can either be initialised through an ICURuleLoader or
+ be loaded from a database when a connection is given.
+ """
+ def __init__(self, loader=None, conn=None):
+ if loader is not None:
+ self.norm_rules = loader.get_normalization_rules()
+ self.trans_rules = loader.get_transliteration_rules()
+ self.replacements = loader.get_replacement_pairs()
+ self.search_rules = loader.get_search_rules()
+ elif conn is not None:
+ self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
+ self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
+ self.replacements = json.loads(get_property(conn, DBCFG_IMPORT_REPLACEMENTS))
+ self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES)
+ else:
+ assert False, "Parameter loader or conn required."
+
+ # Compute the set of characters used in the replacement list.
+ # We need this later when computing the tree.
+ chars = set()
+ for full, repl in self.replacements:
+ chars.update(full)
+ for word in repl:
+ chars.update(word)
+ self.replacement_charset = ''.join(chars)
+
+
+ def save_rules(self, conn):
+ """ Save the rules in the property table of the given database.
+ The rules can be loaded again by passing a connection to the
+ constructor of the class.
+ """
+ set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules)
+ set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules)
+ set_property(conn, DBCFG_IMPORT_REPLACEMENTS, json.dumps(self.replacements))
+ set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules)
+
+
+class ICUNameProcessor:
+
+ def __init__(self, rules):
+ self.normalizer = Transliterator.createFromRules("icu_normalization",
+ rules.norm_rules)
+ self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
+ rules.trans_rules)
+ self.search = Transliterator.createFromRules("icu_search",
+ rules.search_rules)
+
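+ # A trie over the replacement keys allows longest-prefix lookups at any
+ # position of a name when computing spelling variants.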
+ self.replacements = datrie.Trie(rules.replacement_charset)
+ for full, repl in rules.replacements:
+ self.replacements[full] = repl
+
+
+ def get_normalized(self, name):
+ """ Normalize the given name, i.e. remove all elements not relevant
+ for search.
+ """
+ return self.normalizer.transliterate(name)
+
+ def get_variants_ascii(self, norm_name):
+ """ Compute the spelling variants for the given normalized name
+ and transliterate the result.
+ """
+ baseform = ' ' + norm_name + ' '
+ variants = ['']
+
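+ # Greedily scan the padded name: at each position apply the longest
+ # matching replacement. Every replacement offers several spellings, so
+ # the variant list grows as the cross product of all choices made.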
+ startpos = 0
+ pos = 0
+ while pos < len(baseform):
+ full, repl = self.replacements.longest_prefix_item(baseform[pos:],
+ (None, None))
+ if full is not None:
+ done = baseform[startpos:pos]
+ variants = [v + done + r for v, r in itertools.product(variants, repl)]
+ startpos = pos + len(full)
+ pos = startpos
+ else:
+ pos += 1
+
+ if startpos == 0:
+ return [self.to_ascii.transliterate(norm_name)]
+
+ return [self.to_ascii.transliterate(v + baseform[startpos:pos]).strip() for v in variants]
+
+
+ def get_search_normalized(self, name):
+ """ Return the normalized version of the name (including transliteration)
+ to be applied at search time.
+ """
+ return self.search.transliterate(name)
--- /dev/null
+"""
+Helper class to create ICU rules from a configuration file.
+"""
+import io
+import itertools
+import logging
+from collections import defaultdict
+
+import yaml
+
+from icu import Transliterator
+
+from nominatim.errors import UsageError
+
+LOG = logging.getLogger()
+
+
+class ICURuleLoader:
+ """ Compiler for ICU rules from a tokenizer configuration file.
+ """
+
+ def __init__(self, configfile):
+ self.configfile = configfile
+
+ if configfile.suffix == '.yaml':
+ self._load_from_yaml()
+ else:
+ raise UsageError("Unknown format of tokenizer configuration.")
+
+
+ def get_search_rules(self):
+ """ Returns the ICU rules to be used during search.
+ The rules combine normalization, compound decomposition (including
+ abbreviated compounds) and transliteration.
+ """
+ # First apply the normalization rules.
+ rules = io.StringIO()
+ rules.write(self.normalization_rules)
+
+ # For all compound suffixes: add them in their full and any abbreviated form.
+ suffixes = set()
+ for suffix in self.compound_suffixes:
+ suffixes.add(suffix)
+ suffixes.update(self.abbreviations.get(suffix, []))
+
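+ # One rule per suffix, e.g. "'strasse ' > ' strasse ';", which splits
+ # "baumstrasse " into "baum strasse " at search time.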
+ for suffix in sorted(suffixes, key=lambda x:len(x), reverse=True):
+ rules.write("'{0} ' > ' {0} ';".format(suffix))
+
+ # Finally add transliteration.
+ rules.write(self.transliteration_rules)
+ return rules.getvalue()
+
+ def get_normalization_rules(self):
+ """ Return rules for normalisation of a term.
+ """
+ return self.normalization_rules
+
+ def get_transliteration_rules(self):
+ """ Return the rules for converting a string into its asciii representation.
+ """
+ return self.transliteration_rules
+
+ def get_replacement_pairs(self):
+ """ Returns the list of possible compound decompositions with
+ application of abbreviations included.
+ The result is a list of pairs: the first item is the sequence to
+ replace, the second is a list of replacements.
+ """
+ synonyms = defaultdict(set)
+
+ for full, abbr in self.abbreviations.items():
+ key = ' ' + full + ' '
+ # Entries in the abbreviation list always apply to full words:
+ synonyms[key].update((' ' + a + ' ' for a in abbr))
+ # Replacements are optional, so add a noop
+ synonyms[key].add(key)
+
+ # Entries in the compound list expand to themselves and to
+ # abbreviations.
+ for suffix in self.compound_suffixes:
+ keyset = synonyms[suffix + ' ']
+ keyset.add(' ' + suffix + ' ')
+ keyset.update((' ' + a + ' ' for a in self.abbreviations.get(suffix, [])))
+ # The terms the entries are shortened to need to be decompounded as well.
+ for abbr in self.abbreviations.get(suffix, []):
+ synonyms[abbr + ' '].add(' ' + abbr + ' ')
+
+ # Sort the resulting list by descending length (longer matches are preferred).
+ sorted_keys = sorted(synonyms.keys(), key=lambda x: len(x), reverse=True)
+
+ return [(k, list(synonyms[k])) for k in sorted_keys]
+
+
+ def _load_from_yaml(self):
+ rules = yaml.safe_load(self.configfile.read_text())
+
+ self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
+ self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
+ self._parse_compound_suffix_list(self._get_section(rules, 'compound_suffixes'))
+ self._parse_abbreviation_list(self._get_section(rules, 'abbreviations'))
+
+
+ def _get_section(self, rules, section):
+ """ Get the section named 'section' from the rules. If the section does
+ not exist, raise a usage error with a meaningful message.
+ """
+ if section not in rules:
+ LOG.fatal("Section '%s' not found in tokenizer config '%s'.",
+ section, str(self.configfile))
+ raise UsageError("Syntax error in tokenizer configuration file.")
+
+ return rules[section]
+
+
+ def _cfg_to_icu_rules(self, rules, section):
+ """ Load an ICU ruleset from the given section. If the section is a
+ simple string, it is interpreted as a file name and the rules are
+ loaded verbatim from the given file. The filename is expected to be
+ relative to the tokenizer rule file. If the section is a list then
+ each line is assumed to be a rule. All rules are concatenated and returned.
+ """
+ content = self._get_section(rules, section)
+
+ if isinstance(content, str):
+ return (self.configfile.parent / content).read_text().replace('\n', ' ')
+
+ return ';'.join(content) + ';'
+
+
+ def _parse_compound_suffix_list(self, rules):
+ if not rules:
+ self.compound_suffixes = set()
+ return
+
+ norm = Transliterator.createFromRules("rule_loader_normalization",
+ self.normalization_rules)
+
+ # Make sure all suffixes are in their normalised form.
+ self.compound_suffixes = set((norm.transliterate(s) for s in rules))
+
+
+ def _parse_abbreviation_list(self, rules):
+ self.abbreviations = defaultdict(list)
+
+ if not rules:
+ return
+
+ norm = Transliterator.createFromRules("rule_loader_normalization",
+ self.normalization_rules)
+
+ for rule in rules:
+ parts = rule.split('=>')
+ if len(parts) != 2:
+ LOG.fatal("Syntax error in abbreviation section, line: %s", rule)
+ raise UsageError("Syntax error in tokenizer configuration file.")
+
+ # Make sure all terms match the normalised version.
+ fullterms = (norm.transliterate(t.strip()) for t in parts[0].split(','))
+ abbrterms = (norm.transliterate(t.strip()) for t in parts[1].split(','))
+
+ for full, abbr in itertools.product(fullterms, abbrterms):
+ self.abbreviations[full].append(abbr)
from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
+from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
-DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
-DBCFG_TRANSLITERATION = "tokenizer_transliteration"
-DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"
+DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
LOG = logging.getLogger()
def __init__(self, dsn, data_dir):
self.dsn = dsn
self.data_dir = data_dir
- self.normalization = None
- self.transliteration = None
- self.abbreviations = None
+ self.naming_rules = None
+ self.term_normalization = None
+ self.max_word_frequency = None
def init_new_db(self, config, init_db=True):
if config.TOKENIZER_CONFIG:
cfgfile = Path(config.TOKENIZER_CONFIG)
else:
- cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
+ cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
- rules = json.loads(cfgfile.read_text())
- self._load_transliteration(rules['normalization'], cfgfile.parent)
- self.abbreviations = rules["abbreviations"]
- self.normalization = config.TERM_NORMALIZATION
+ loader = ICURuleLoader(cfgfile)
+ self.naming_rules = ICUNameProcessorRules(loader=loader)
+ self.term_normalization = config.TERM_NORMALIZATION
+ self.max_word_frequency = config.MAX_WORD_FREQUENCY
- self._install_php(config)
+ self._install_php(config.lib_dir.php)
self._save_config(config)
if init_db:
self._init_db_tables(config)
- def _load_transliteration(self, rules, cfg_path):
- if isinstance(rules, str):
- self.transliteration = (cfg_path / rules).read_text().replace('\n', ' ')
- else:
- self.transliteration = ';'.join(rules) + ';'
-
def init_from_project(self):
""" Initialise the tokenizer from the project directory.
"""
with connect(self.dsn) as conn:
- self.normalization = get_property(conn, DBCFG_NORMALIZATION)
- self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
- self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))
+ self.naming_rules = ICUNameProcessorRules(conn=conn)
+ self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
+ self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
def finalize_import(self, config):
Analyzers are not thread-safe. You need to instantiate one per thread.
"""
- norm = Transliterator.createFromRules("normalizer", self.normalization)
- trans = Transliterator.createFromRules("trans", self.transliteration)
- return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
+ return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
- def _install_php(self, config):
+ def _install_php(self, phpdir):
""" Install the php script for the tokenizer.
"""
- abbr_inverse = list(zip(*self.abbreviations))
php_file = self.data_dir / "tokenizer.php"
php_file.write_text(dedent("""\
<?php
- @define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
- @define('CONST_Term_Normalization_Rules', "{0.normalization}");
- @define('CONST_Transliteration', "{0.transliteration}");
- @define('CONST_Abbreviations', array(array('{2}'), array('{3}')));
- require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
- """.format(self, config,
- "','".join(abbr_inverse[0]),
- "','".join(abbr_inverse[1]))))
+ @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
+ @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
+ @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
+ require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
+ """.format(self, phpdir)))
def _save_config(self, config):
database as database properties.
"""
with connect(self.dsn) as conn:
- set_property(conn, DBCFG_NORMALIZATION, self.normalization)
+ self.naming_rules.save_rules(conn)
+
set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
- set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
- set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))
+ set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
def _init_db_tables(self, config):
# get partial words and their frequencies
words = Counter()
- with self.name_analyzer() as analyzer:
- with conn.cursor(name="words") as cur:
- cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
+ name_proc = ICUNameProcessor(self.naming_rules)
+ with conn.cursor(name="words") as cur:
+ cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
- for name, cnt in cur:
- term = analyzer.make_standard_word(name)
- if term:
- for word in term.split():
- words[word] += cnt
+ for name, cnt in cur:
+ for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
+ for term in word.split():
+ words[term] += cnt
# copy them back into the word table
copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
normalization.
"""
- def __init__(self, dsn, normalizer, transliterator, abbreviations):
+ def __init__(self, dsn, name_proc):
self.conn = connect(dsn).connection
self.conn.autocommit = True
- self.normalizer = normalizer
- self.transliterator = transliterator
- self.abbreviations = abbreviations
+ self.name_processor = name_proc
self._cache = _TokenCache()
tokens = {}
for word in words:
if word.startswith('#'):
- tokens[word] = ' ' + self.make_standard_word(word[1:])
+ tokens[word] = ' ' + self.name_processor.get_normalized(word[1:])
else:
- tokens[word] = self.make_standard_word(word)
+ tokens[word] = self.name_processor.get_normalized(word)
with conn.cursor() as cur:
cur.execute("""SELECT word_token, word_id
return [(k, v, ids[v]) for k, v in tokens.items()]
- def normalize(self, phrase):
- """ Normalize the given phrase, i.e. remove all properties that
- are irrelevant for search.
- """
- return self.normalizer.transliterate(phrase)
-
@staticmethod
def normalize_postcode(postcode):
""" Convert the postcode to a standardized form.
return postcode.strip().upper()
- @functools.lru_cache(maxsize=1024)
- def make_standard_word(self, name):
- """ Create the normalised version of the input.
- """
- norm = ' ' + self.transliterator.transliterate(name) + ' '
- for full, abbr in self.abbreviations:
- if full in norm:
- norm = norm.replace(full, abbr)
-
- return norm.strip()
-
-
def _make_standard_hnr(self, hnr):
""" Create a normalised version of a housenumber.
This function takes minor shortcuts on transliteration.
"""
- if hnr.isdigit():
- return hnr
-
- return self.transliterator.transliterate(hnr)
+ return self.name_processor.get_search_normalized(hnr)
def update_postcodes_from_db(self):
""" Update postcode tokens in the word table from the location_postcode
else:
copystr.write(postcode)
copystr.write('\t ')
- copystr.write(self.transliterator.transliterate(postcode))
+ copystr.write(self.name_processor.get_search_normalized(postcode))
copystr.write('\tplace\tpostcode\t0\n')
if to_delete:
def update_special_phrases(self, phrases, should_replace):
""" Replace the search index for special phrases with the new phrases.
"""
- norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
+ norm_phrases = set(((self.name_processor.get_search_normalized(p[0]), p[1], p[2], p[3])
for p in phrases))
with self.conn.cursor() as cur:
if to_add:
copystr = io.StringIO()
for word, cls, typ, oper in to_add:
- term = self.make_standard_word(word)
+ term = self.name_processor.get_search_normalized(word)
if term:
copystr.write(word)
copystr.write('\t ')
def add_country_names(self, country_code, names):
""" Add names for the given country to the search index.
"""
- full_names = set((self.make_standard_word(n) for n in names))
- full_names.discard('')
- self._add_normalized_country_names(country_code, full_names)
-
+ word_tokens = set()
+ for name in self._compute_full_names(names):
+ if name:
+ word_tokens.add(' ' + self.name_processor.get_search_normalized(name))
- def _add_normalized_country_names(self, country_code, names):
- """ Add names for the given country to the search index.
- """
- word_tokens = set((' ' + name for name in names))
with self.conn.cursor() as cur:
# Get existing names
cur.execute("SELECT word_token FROM word WHERE country_code = %s",
names = place.get('name')
if names:
- full_names = self._compute_full_names(names)
+ fulls, partials = self._compute_name_tokens(names)
- token_info.add_names(self.conn, full_names)
+ token_info.add_names(fulls, partials)
country_feature = place.get('country_feature')
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
- self._add_normalized_country_names(country_feature.lower(),
- full_names)
+ self.add_country_names(country_feature.lower(), names)
address = place.get('address')
elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
hnrs.append(value)
elif key == 'street':
- token_info.add_street(self.conn, self.make_standard_word(value))
+ token_info.add_street(*self._compute_name_tokens({'name': value}))
elif key == 'place':
- token_info.add_place(self.conn, self.make_standard_word(value))
+ token_info.add_place(*self._compute_name_tokens({'name': value}))
elif not key.startswith('_') and \
key not in ('country', 'full'):
- addr_terms.append((key, self.make_standard_word(value)))
+ addr_terms.append((key, *self._compute_name_tokens({'name': value})))
if hnrs:
hnrs = self._split_housenumbers(hnrs)
token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
if addr_terms:
- token_info.add_address_terms(self.conn, addr_terms)
+ token_info.add_address_terms(addr_terms)
return token_info.data
+ def _compute_name_tokens(self, names):
+ """ Computes the full name and partial name tokens for the given
+ dictionary of names.
+ """
+ full_names = self._compute_full_names(names)
+ full_tokens = set()
+ partial_tokens = set()
+
+ for name in full_names:
+ norm_name = self.name_processor.get_normalized(name)
+ full, part = self._cache.names.get(norm_name, (None, None))
+ if full is None:
+ variants = self.name_processor.get_variants_ascii(norm_name)
+ with self.conn.cursor() as cur:
+ cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
+ (norm_name, variants))
+ full, part = cur.fetchone()
+
+ self._cache.names[norm_name] = (full, part)
+
+ full_tokens.add(full)
+ partial_tokens.update(part)
+
+ return full_tokens, partial_tokens
+
+
def _compute_full_names(self, names):
""" Return the set of all full name word ids to be used with the
given dictionary of names.
"""
full_names = set()
for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
- word = self.make_standard_word(name)
- if word:
- full_names.add(word)
+ full_names.add(name.strip())
- brace_split = name.split('(', 2)
- if len(brace_split) > 1:
- word = self.make_standard_word(brace_split[0])
- if word:
- full_names.add(word)
+ brace_idx = name.find('(')
+ if brace_idx >= 0:
+ full_names.add(name[:brace_idx].strip())
return full_names
postcode = self.normalize_postcode(postcode)
if postcode not in self._cache.postcodes:
- term = self.make_standard_word(postcode)
+ term = self.name_processor.get_search_normalized(postcode)
if not term:
return
""", (' ' + term, postcode))
self._cache.postcodes.add(postcode)
+
@staticmethod
def _split_housenumbers(hnrs):
if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
""" Collect token information to be sent back to the database.
"""
def __init__(self, cache):
- self.cache = cache
+ self._cache = cache
self.data = {}
@staticmethod
return '{%s}' % ','.join((str(s) for s in tokens))
- def add_names(self, conn, names):
+ def add_names(self, fulls, partials):
""" Adds token information for the normalised names.
"""
- # Start with all partial names
- terms = set((part for ns in names for part in ns.split()))
- # Add the full names
- terms.update((' ' + n for n in names))
-
- self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))
+ self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
def add_housenumbers(self, conn, hnrs):
""" Extract housenumber information from a list of normalised
housenumbers.
"""
- self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
+ self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
self.data['hnr'] = ';'.join(hnrs)
- def add_street(self, conn, street):
+ def add_street(self, fulls, partials):
""" Add addr:street match terms.
"""
- if not street:
- return
-
- term = ' ' + street
+ if fulls:
+ self.data['street'] = self._mk_array(fulls)
- tid = self.cache.names.get(term)
- if tid is None:
- with conn.cursor() as cur:
- cur.execute("""SELECT word_id FROM word
- WHERE word_token = %s
- and class is null and type is null""",
- (term, ))
- if cur.rowcount > 0:
- tid = cur.fetchone()[0]
- self.cache.names[term] = tid
-
- if tid is not None:
- self.data['street'] = '{%d}' % tid
-
-
- def add_place(self, conn, place):
+ def add_place(self, fulls, partials):
""" Add addr:place search and match terms.
"""
- if not place:
- return
-
- partial_ids = self.cache.get_term_tokens(conn, place.split())
- tid = self.cache.get_term_tokens(conn, [' ' + place])
-
- self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
- self.data['place_match'] = '{%s}' % tid[0]
+ if fulls:
+ self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
+ self.data['place_match'] = self._mk_array(fulls)
- def add_address_terms(self, conn, terms):
+ def add_address_terms(self, terms):
""" Add additional address terms.
"""
tokens = {}
- for key, value in terms:
- if not value:
- continue
- partial_ids = self.cache.get_term_tokens(conn, value.split())
- term = ' ' + value
- tid = self.cache.names.get(term)
-
- if tid is None:
- with conn.cursor() as cur:
- cur.execute("""SELECT word_id FROM word
- WHERE word_token = %s
- and class is null and type is null""",
- (term, ))
- if cur.rowcount > 0:
- tid = cur.fetchone()[0]
- self.cache.names[term] = tid
-
- tokens[key] = [self._mk_array(partial_ids),
- '{%s}' % ('' if tid is None else str(tid))]
+ for key, fulls, partials in terms:
+ if fulls:
+ tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
+ self._mk_array(fulls)]
if tokens:
self.data['addr'] = tokens
self.housenumbers = {}
- def get_term_tokens(self, conn, terms):
- """ Get token ids for a list of terms, looking them up in the database
- if necessary.
- """
- tokens = []
- askdb = []
-
- for term in terms:
- token = self.names.get(term)
- if token is None:
- askdb.append(term)
- elif token != 0:
- tokens.append(token)
-
- if askdb:
- with conn.cursor() as cur:
- cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
- (askdb, ))
- for term, tid in cur:
- self.names[term] = tid
- if tid != 0:
- tokens.append(tid)
-
- return tokens
-
-
def get_hnr_tokens(self, conn, terms):
""" Get token ids for a list of housenumbers, looking them up in the
database if necessary.
FROM unnest(%s)n) y
WHERE NOT EXISTS(SELECT * FROM word
WHERE word_token = lookup_token and country_code = %s))
- """, (country_code, names, country_code))
+ """, (country_code, list(names.values()), country_code))
def process_place(self, place):
country_feature = place.get('country_feature')
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
- self.add_country_names(country_feature.lower(), list(names.values()))
+ self.add_country_names(country_feature.lower(), names)
address = place.get('address')
with tokenizer.name_analyzer() as analyzer:
for code, name in cur:
- names = [code]
+ names = {'countrycode' : code}
if code == 'gb':
- names.append('UK')
+ names['short_name'] = 'UK'
if code == 'us':
- names.append('United States')
+ names['short_name'] = 'United States'
# country names (only in languages as provided)
if name:
- names.extend((v for k, v in name.items() if _include_key(k)))
+ names.update(((k, v) for k, v in name.items() if _include_key(k)))
analyzer.add_country_names(code, names)
--- /dev/null
+normalization:
+ - ":: NFD ()"
+ - "[[:Nonspacing Mark:] [:Cf:]] >"
+ - ":: lower ()"
+ - "ß > 'ss'" # German szet is unimbigiously equal to double ss
+ - "[[:Punctuation:][:Space:]]+ > ' '"
+ - ":: NFC ()"
+transliteration: icu_transliteration.rules
+compound_suffixes:
+ # Danish
+ - hal
+ - hallen
+ - hallerne
+ # German
+ - berg
+ - brücke
+ - fabrik
+ - gasse
+ - graben
+ - haus
+ - höhle
+ - hütte
+ - kapelle
+ - kogel
+ - pfad
+ - platz
+ - quelle
+ - spitze
+ - stiege
+ - strasse
+ - teich
+ - universität
+ - wald
+ - weg
+ - wiese
+ # Dutch
+ - gracht
+ - laan
+ - markt
+ - plein
+ - straat
+ - vliet
+ - weg
+ # Norwegian
+ - vei
+ - veien
+ - veg
+ - vegen
+ - gate
+ - gaten
+ - gata
+ - plass
+ - plassen
+ - sving
+ - svingen
+ # Finnish
+ - alue
+ - asema
+ - aukio
+ - kaari
+ - katu
+ - kuja
+ - kylä
+ - penger
+ - polku
+ - puistikko
+ - puisto
+ - raitti
+ - ranta
+ - rinne
+ - taival
+ - tie
+ - tori
+ - väylä
+ # Swedish
+ - väg
+ - vägen
+ - gatan
+ - gata
+ - gränd
+ - gränden
+ - stig
+ - stigen
+ - plats
+ - platsen
+abbreviations:
+ # German
+ - am => a
+ - an der => a d
+ - allgemeines krankenhaus => akh
+ - altstoffsammelzentrum => asz
+ - auf der => a d
+ - bach => b
+ - bad => b
+ - bahnhof => bhf,bf
+ - berg => bg
+ - bezirk => bez
+ - brücke => br
+ - burg => bg
+ - chaussee => ch
+ - deutsche,deutscher,deutsches => dt
+ - dorf => df
+ - doktor => dr
+ - fachhochschule => fh
+ - Freiwillige Feuerwehr => ff
+ - sankt => st
+ - strasse => str
+ - weg => wg
+ # English
+ - alley => al
+ - beach => bch
+ - street => st
+ - road => rd
+ - bridge => brdg
--- /dev/null
+"""
+Tests for import name normalisation and variant generation.
+"""
+from textwrap import dedent
+
+import pytest
+
+from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
+from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
+
+from nominatim.errors import UsageError
+
+@pytest.fixture
+def cfgfile(tmp_path, suffix='.yaml'):
+ def _create_config(suffixes, abbr):
+ content = dedent("""\
+ normalization:
+ - ":: NFD ()"
+ - "[[:Nonspacing Mark:] [:Cf:]] >"
+ - ":: lower ()"
+ - "[[:Punctuation:][:Space:]]+ > ' '"
+ - ":: NFC ()"
+ transliteration:
+ - ":: Latin ()"
+ """)
+ content += "compound_suffixes:\n"
+ content += '\n'.join((" - " + s for s in suffixes)) + '\n'
+ content += "abbreviations:\n"
+ content += '\n'.join((" - " + s for s in abbr)) + '\n'
+ fpath = tmp_path / ('test_config' + suffix)
+ fpath.write_text(dedent(content))
+ return fpath
+
+ return _create_config
+
+
+def test_simple_variants(cfgfile):
+ fpath = cfgfile(['strasse', 'straße', 'weg'],
+ ['strasse,straße => str',
+ 'prospekt => pr'])
+
+ rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
+ proc = ICUNameProcessor(rules)
+
+ assert set(proc.get_normalized_variants("Bauwegstraße")) \
+ == {'bauweg straße', 'bauweg str'}
+ assert proc.get_normalized_variants("Bauwegstr") == ['bauweg str']
+ assert proc.get_normalized_variants("holzweg") == ['holz weg']
+ assert proc.get_normalized_variants("hallo") == ['hallo']
+
+
+def test_multiple_replacements(cfgfile):
+ fpath = cfgfile([], ['saint => s,st', 'street => st'])
+
+ rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
+ proc = ICUNameProcessor(rules)
+
+ assert set(proc.get_normalized_variants("Saint Johns Street")) == \
+ {'saint johns street', 's johns street', 'st johns street',
+ 'saint johns st', 's johns st', 'st johns st'}
--- /dev/null
+"""
+Tests for converting a config file to ICU rules.
+"""
+import pytest
+from textwrap import dedent
+
+from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
+from nominatim.errors import UsageError
+
+from icu import Transliterator
+
+@pytest.fixture
+def cfgfile(tmp_path, suffix='.yaml'):
+ def _create_config(suffixes, abbr):
+ content = dedent("""\
+ normalization:
+ - ":: NFD ()"
+ - "[[:Nonspacing Mark:] [:Cf:]] >"
+ - ":: lower ()"
+ - "[[:Punctuation:][:Space:]]+ > ' '"
+ - ":: NFC ()"
+ transliteration:
+ - ":: Latin ()"
+ """)
+ content += "compound_suffixes:\n"
+ content += '\n'.join((" - " + s for s in suffixes)) + '\n'
+ content += "abbreviations:\n"
+ content += '\n'.join((" - " + s for s in abbr)) + '\n'
+ fpath = tmp_path / ('test_config' + suffix)
+ fpath.write_text(dedent(content))
+ return fpath
+
+ return _create_config
+
+def test_missing_normalization(tmp_path):
+ fpath = tmp_path / ('test_config.yaml')
+ fpath.write_text(dedent("""\
+ normalizatio:
+ - ":: NFD ()"
+ """))
+
+ with pytest.raises(UsageError):
+ ICURuleLoader(fpath)
+
+
+def test_get_search_rules(cfgfile):
+ fpath = cfgfile(['strasse', 'straße', 'weg'],
+ ['strasse,straße => str',
+ 'prospekt => pr'])
+
+ loader = ICURuleLoader(fpath)
+
+ rules = loader.get_search_rules()
+ trans = Transliterator.createFromRules("test", rules)
+
+ assert trans.transliterate(" Baumstraße ") == " baum straße "
+ assert trans.transliterate(" Baumstrasse ") == " baum strasse "
+ assert trans.transliterate(" Baumstr ") == " baum str "
+ assert trans.transliterate(" Baumwegstr ") == " baumweg str "
+ assert trans.transliterate(" Αθήνα ") == " athēna "
+ assert trans.transliterate(" проспект ") == " prospekt "
+
+
+def test_get_synonym_pairs(cfgfile):
+ fpath = cfgfile(['Weg', 'Strasse'],
+ ['Strasse => str,st'])
+
+ loader = ICURuleLoader(fpath)
+
+ repl = loader.get_replacement_pairs()
+
+ assert sorted((k, sorted(v)) for k, v in repl) == \
+ sorted([(' strasse ', [' st ', ' str ', ' strasse ']),
+ ('strasse ', [' st ', ' str ', ' strasse ']),
+ ('str ', [' str ']),
+ ('st ', [' st ']),
+ ('weg ', [' weg '])])
+