git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/legacy_icu_tokenizer.py
Merge remote-tracking branch 'upstream/master'
index eb8502377f1c9e0a65a5b7d2808e3e6c9ab226dd..c585c5afe0bf28bfa24590ed05cb165f6fd2dd01 100644
@@ -3,20 +3,17 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
 from collections import Counter
-import functools
-import io
 import itertools
-import json
 import logging
 import re
 from textwrap import dedent
 from pathlib import Path
 
-from icu import Transliterator
 import psycopg2.extras
 
 from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
+from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
@@ -103,9 +100,7 @@ class LegacyICUTokenizer:
         """
         self.init_from_project()
 
-        if self.normalization is None\
-           or self.transliteration is None\
-           or self.abbreviations is None:
+        if self.naming_rules is None:
             return "Configuration for tokenizer 'legacy_icu' are missing."
 
         return None
@@ -128,7 +123,7 @@ class LegacyICUTokenizer:
         """
         return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
 
-
+    # pylint: disable=missing-format-attribute
     def _install_php(self, phpdir):
         """ Install the php script for the tokenizer.
         """
@@ -168,22 +163,28 @@ class LegacyICUTokenizer:
             words = Counter()
             name_proc = ICUNameProcessor(self.naming_rules)
             with conn.cursor(name="words") as cur:
-                cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
+                cur.execute(""" SELECT v, count(*) FROM
+                                  (SELECT svals(name) as v FROM place)x
+                                WHERE length(v) < 75 GROUP BY v""")
 
                 for name, cnt in cur:
+                    terms = set()
                     for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
-                        for term in word.split():
-                            words[term] += cnt
+                        if ' ' in word:
+                            terms.update(word.split())
+                    for term in terms:
+                        words[term] += cnt
 
             # copy them back into the word table
-            copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
-
+            with CopyBuffer() as copystr:
+                for args in words.items():
+                    copystr.add(*args)
 
-            with conn.cursor() as cur:
-                copystr.seek(0)
-                cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
-                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-                               WHERE word_id is null""")
+                with conn.cursor() as cur:
+                    copystr.copy_out(cur, 'word',
+                                     columns=['word_token', 'search_name_count'])
+                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
+                                   WHERE word_id is null""")
 
             conn.commit()
 
@@ -219,7 +220,7 @@ class LegacyICUNameAnalyzer:
             self.conn = None
 
 
-    def get_word_token_info(self, conn, words):
+    def get_word_token_info(self, words):
         """ Return token information for the given list of words.
             If a word starts with # it is assumed to be a full name
             otherwise is a partial name.
@@ -233,11 +234,11 @@ class LegacyICUNameAnalyzer:
         tokens = {}
         for word in words:
             if word.startswith('#'):
-                tokens[word] = ' ' + self.name_processor.get_normalized(word[1:])
+                tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
             else:
-                tokens[word] = self.name_processor.get_normalized(word)
+                tokens[word] = self.name_processor.get_search_normalized(word)
 
-        with conn.cursor() as cur:
+        with self.conn.cursor() as cur:
             cur.execute("""SELECT word_token, word_id
                            FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                            WHERE word_token = t.term
@@ -245,7 +246,7 @@ class LegacyICUNameAnalyzer:
                         (list(tokens.values()), ))
             ids = {r[0]: r[1] for r in cur}
 
-        return [(k, v, ids[v]) for k, v in tokens.items()]
+        return [(k, v, ids.get(v, None)) for k, v in tokens.items()]
 
 
     @staticmethod
@@ -270,7 +271,6 @@ class LegacyICUNameAnalyzer:
             table.
         """
         to_delete = []
-        copystr = io.StringIO()
         with self.conn.cursor() as cur:
             # This finds us the rows in location_postcode and word that are
             # missing in the other table.
@@ -283,32 +283,31 @@ class LegacyICUNameAnalyzer:
                               ON pc = word) x
                            WHERE pc is null or word is null""")
 
-            for postcode, word in cur:
-                if postcode is None:
-                    to_delete.append(word)
-                else:
-                    copystr.write(postcode)
-                    copystr.write('\t ')
-                    copystr.write(self.name_processor.get_search_normalized(postcode))
-                    copystr.write('\tplace\tpostcode\t0\n')
+            with CopyBuffer() as copystr:
+                for postcode, word in cur:
+                    if postcode is None:
+                        to_delete.append(word)
+                    else:
+                        copystr.add(
+                            postcode,
+                            ' ' + self.name_processor.get_search_normalized(postcode),
+                            'place', 'postcode', 0)
 
-            if to_delete:
-                cur.execute("""DELETE FROM WORD
-                               WHERE class ='place' and type = 'postcode'
-                                     and word = any(%s)
-                            """, (to_delete, ))
+                if to_delete:
+                    cur.execute("""DELETE FROM WORD
+                                   WHERE class ='place' and type = 'postcode'
+                                         and word = any(%s)
+                                """, (to_delete, ))
 
-            if copystr.getvalue():
-                copystr.seek(0)
-                cur.copy_from(copystr, 'word',
-                              columns=['word', 'word_token', 'class', 'type',
-                                       'search_name_count'])
+                copystr.copy_out(cur, 'word',
+                                 columns=['word', 'word_token', 'class', 'type',
+                                          'search_name_count'])
 
 
     def update_special_phrases(self, phrases, should_replace):
         """ Replace the search index for special phrases with the new phrases.
         """
-        norm_phrases = set(((self.name_processor.get_search_normalized(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                             for p in phrases))
 
         with self.conn.cursor() as cur:
@@ -320,40 +319,54 @@ class LegacyICUNameAnalyzer:
             for label, cls, typ, oper in cur:
                 existing_phrases.add((label, cls, typ, oper or '-'))
 
-            to_add = norm_phrases - existing_phrases
-            to_delete = existing_phrases - norm_phrases
-
-            if to_add:
-                copystr = io.StringIO()
-                for word, cls, typ, oper in to_add:
-                    term = self.name_processor.get_search_normalized(word)
-                    if term:
-                        copystr.write(word)
-                        copystr.write('\t ')
-                        copystr.write(term)
-                        copystr.write('\t')
-                        copystr.write(cls)
-                        copystr.write('\t')
-                        copystr.write(typ)
-                        copystr.write('\t')
-                        copystr.write(oper if oper in ('in', 'near')  else '\\N')
-                        copystr.write('\t0\n')
-
-                copystr.seek(0)
-                cur.copy_from(copystr, 'word',
-                              columns=['word', 'word_token', 'class', 'type',
-                                       'operator', 'search_name_count'])
-
-            if to_delete and should_replace:
-                psycopg2.extras.execute_values(
-                    cur,
-                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
-                        WHERE word = name and class = in_class and type = in_type
-                              and ((op = '-' and operator is null) or op = operator)""",
-                    to_delete)
+            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
+            if should_replace:
+                deleted = self._remove_special_phrases(cur, norm_phrases,
+                                                       existing_phrases)
+            else:
+                deleted = 0
 
         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
-                 len(norm_phrases), len(to_add), len(to_delete))
+                 len(norm_phrases), added, deleted)
+
+
+    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
+        """ Add all phrases to the database that are not yet there.
+        """
+        to_add = new_phrases - existing_phrases
+
+        added = 0
+        with CopyBuffer() as copystr:
+            for word, cls, typ, oper in to_add:
+                term = self.name_processor.get_search_normalized(word)
+                if term:
+                    copystr.add(word, ' ' + term, cls, typ,
+                                oper if oper in ('in', 'near')  else None, 0)
+                    added += 1
+
+            copystr.copy_out(cursor, 'word',
+                             columns=['word', 'word_token', 'class', 'type',
+                                      'operator', 'search_name_count'])
+
+        return added
+
+
+    @staticmethod
+    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
+        """ Remove all phrases from the database that are no longer in the
+            new phrase list.
+        """
+        to_delete = existing_phrases - new_phrases
+
+        if to_delete:
+            psycopg2.extras.execute_values(
+                cursor,
+                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
+                    WHERE word = name and class = in_class and type = in_type
+                          and ((op = '-' and operator is null) or op = operator)""",
+                to_delete)
+
+        return len(to_delete)
 
 
     def add_country_names(self, country_code, names):
@@ -438,6 +451,9 @@ class LegacyICUNameAnalyzer:
             full, part = self._cache.names.get(norm_name, (None, None))
             if full is None:
                 variants = self.name_processor.get_variants_ascii(norm_name)
+                if not variants:
+                    continue
+
                 with self.conn.cursor() as cur:
                     cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                 (norm_name, variants))
@@ -451,17 +467,19 @@ class LegacyICUNameAnalyzer:
         return full_tokens, partial_tokens
 
 
-    def _compute_full_names(self, names):
+    @staticmethod
+    def _compute_full_names(names):
         """ Return the set of all full name word ids to be used with the
             given dictionary of names.
         """
         full_names = set()
-        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
-            full_names.add(name.strip())
+        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
+            if name:
+                full_names.add(name)
 
-            brace_idx = name.find('(')
-            if brace_idx >= 0:
-                full_names.add(name[:brace_idx].strip())
+                brace_idx = name.find('(')
+                if brace_idx >= 0:
+                    full_names.add(name[:brace_idx].strip())
 
         return full_names
 
@@ -534,7 +552,7 @@ class _TokenInfo:
         self.data['hnr'] = ';'.join(hnrs)
 
 
-    def add_street(self, fulls, partials):
+    def add_street(self, fulls, _):
         """ Add addr:street match terms.
         """
         if fulls:
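
The recurring change in this diff is to replace the hand-rolled io.StringIO plus cursor.copy_from() pattern with the CopyBuffer helper imported from nominatim.db.utils, used as a context manager that queues rows with add(*columns) and flushes them with copy_out(cursor, table, columns=...). The following is a minimal sketch of such a helper, inferred only from the calls visible above; it is not the actual nominatim.db.utils implementation, the class name CopyBufferSketch is made up for illustration, and column values are assumed to contain no tabs or newlines.

import io

class CopyBufferSketch:
    """ Queue rows in COPY text format and send them with cursor.copy_from().
    """

    def __enter__(self):
        self.buffer = io.StringIO()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.buffer.close()

    def add(self, *data):
        # Join the values tab-separated; None becomes \N, the default null
        # marker understood by COPY ... FROM in text format.
        self.buffer.write('\t'.join('\\N' if col is None else str(col) for col in data))
        self.buffer.write('\n')

    def copy_out(self, cur, table, columns=None):
        # Issue the COPY only if at least one row was queued.
        if self.buffer.tell() > 0:
            self.buffer.seek(0)
            cur.copy_from(self.buffer, table, columns=columns)

The hunks above already show the intended usage: rows such as (word_token, search_name_count) pairs are queued inside "with CopyBuffer() as copystr:" and flushed into the word table with copy_out() before conn.commit().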