Merge remote-tracking branch 'upstream/master'

[nominatim.git] / nominatim / tokenizer / legacy_icu_tokenizer.py
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py

index f3eb7b4ef4fd9fae8bfcf6f7ef538ded91cfa08e..c585c5afe0bf28bfa24590ed05cb165f6fd2dd01 100644 (file)
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -3,7 +3,6 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using
  libICU instead of the PostgreSQL module.
  """
  from collections import Counter
-import io
  import itertools
  import logging
  import re
@@ -14,6 +13,7 @@ import psycopg2.extras
  
  from nominatim.db.connection import connect
  from nominatim.db.properties import set_property, get_property
+from nominatim.db.utils import CopyBuffer
  from nominatim.db.sql_preprocessor import SQLPreprocessor
  from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
@@ -123,7 +123,7 @@ class LegacyICUTokenizer:
          """
          return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
  
-
+    # pylint: disable=missing-format-attribute
      def _install_php(self, phpdir):
          """ Install the php script for the tokenizer.
          """
@@ -163,22 +163,28 @@ class LegacyICUTokenizer:
              words = Counter()
              name_proc = ICUNameProcessor(self.naming_rules)
              with conn.cursor(name="words") as cur:
-                cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
+                cur.execute(""" SELECT v, count(*) FROM
+                                  (SELECT svals(name) as v FROM place)x
+                                WHERE length(v) < 75 GROUP BY v""")
  
                  for name, cnt in cur:
+                    terms = set()
                      for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
-                        for term in word.split():
-                            words[term] += cnt
+                        if ' ' in word:
+                            terms.update(word.split())
+                    for term in terms:
+                        words[term] += cnt
  
              # copy them back into the word table
-            copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
-
+            with CopyBuffer() as copystr:
+                for args in words.items():
+                    copystr.add(*args)
  
-            with conn.cursor() as cur:
-                copystr.seek(0)
-                cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
-                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-                               WHERE word_id is null""")
+                with conn.cursor() as cur:
+                    copystr.copy_out(cur, 'word',
+                                     columns=['word_token', 'search_name_count'])
+                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
+                                   WHERE word_id is null""")
  
              conn.commit()
  
@@ -265,7 +271,6 @@ class LegacyICUNameAnalyzer:
              table.
          """
          to_delete = []
-        copystr = io.StringIO()
          with self.conn.cursor() as cur:
              # This finds us the rows in location_postcode and word that are
              # missing in the other table.
@@ -278,26 +283,25 @@ class LegacyICUNameAnalyzer:
                                ON pc = word) x
                             WHERE pc is null or word is null""")
  
-            for postcode, word in cur:
-                if postcode is None:
-                    to_delete.append(word)
-                else:
-                    copystr.write(postcode)
-                    copystr.write('\t ')
-                    copystr.write(self.name_processor.get_search_normalized(postcode))
-                    copystr.write('\tplace\tpostcode\t0\n')
+            with CopyBuffer() as copystr:
+                for postcode, word in cur:
+                    if postcode is None:
+                        to_delete.append(word)
+                    else:
+                        copystr.add(
+                            postcode,
+                            ' ' + self.name_processor.get_search_normalized(postcode),
+                            'place', 'postcode', 0)
  
-            if to_delete:
-                cur.execute("""DELETE FROM WORD
-                               WHERE class ='place' and type = 'postcode'
-                                     and word = any(%s)
-                            """, (to_delete, ))
+                if to_delete:
+                    cur.execute("""DELETE FROM WORD
+                                   WHERE class ='place' and type = 'postcode'
+                                         and word = any(%s)
+                                """, (to_delete, ))
  
-            if copystr.getvalue():
-                copystr.seek(0)
-                cur.copy_from(copystr, 'word',
-                              columns=['word', 'word_token', 'class', 'type',
-                                       'search_name_count'])
+                copystr.copy_out(cur, 'word',
+                                 columns=['word', 'word_token', 'class', 'type',
+                                          'search_name_count'])
  
  
      def update_special_phrases(self, phrases, should_replace):
@@ -331,34 +335,24 @@ class LegacyICUNameAnalyzer:
          """
          to_add = new_phrases - existing_phrases
  
-        copystr = io.StringIO()
          added = 0
-        for word, cls, typ, oper in to_add:
-            term = self.name_processor.get_search_normalized(word)
-            if term:
-                copystr.write(word)
-                copystr.write('\t ')
-                copystr.write(term)
-                copystr.write('\t')
-                copystr.write(cls)
-                copystr.write('\t')
-                copystr.write(typ)
-                copystr.write('\t')
-                copystr.write(oper if oper in ('in', 'near')  else '\\N')
-                copystr.write('\t0\n')
-                added += 1
-
-
-        if copystr.tell() > 0:
-            copystr.seek(0)
-            cursor.copy_from(copystr, 'word',
+        with CopyBuffer() as copystr:
+            for word, cls, typ, oper in to_add:
+                term = self.name_processor.get_search_normalized(word)
+                if term:
+                    copystr.add(word, ' ' + term, cls, typ,
+                                oper if oper in ('in', 'near')  else None, 0)
+                    added += 1
+
+            copystr.copy_out(cursor, 'word',
                               columns=['word', 'word_token', 'class', 'type',
                                        'operator', 'search_name_count'])
  
          return added
  
  
-    def _remove_special_phrases(self, cursor, new_phrases, existing_phrases):
+    @staticmethod
+    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
          """ Remove all phrases from the databse that are no longer in the
              new phrase list.
          """
@@ -457,6 +451,9 @@ class LegacyICUNameAnalyzer:
              full, part = self._cache.names.get(norm_name, (None, None))
              if full is None:
                  variants = self.name_processor.get_variants_ascii(norm_name)
+                if not variants:
+                    continue
+
                  with self.conn.cursor() as cur:
                      cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                  (norm_name, variants))
@@ -476,12 +473,13 @@ class LegacyICUNameAnalyzer:
              given dictionary of names.
          """
          full_names = set()
-        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
-            full_names.add(name.strip())
+        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
+            if name:
+                full_names.add(name)
  
-            brace_idx = name.find('(')
-            if brace_idx >= 0:
-                full_names.add(name[:brace_idx].strip())
+                brace_idx = name.find('(')
+                if brace_idx >= 0:
+                    full_names.add(name[:brace_idx].strip())
  
          return full_names