from textwrap import dedent
from pathlib import Path
-import psycopg2.extras
-
from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
"""
return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
-
+ # pylint: disable=missing-format-attribute
def _install_php(self, phpdir):
""" Install the php script for the tokenizer.
"""
@define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
@define('CONST_Transliteration', "{0.naming_rules.search_rules}");
require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
- """.format(self, phpdir))) # pylint: disable=missing-format-attribute
+ """.format(self, phpdir)))
def _save_config(self, config):
words = Counter()
name_proc = ICUNameProcessor(self.naming_rules)
with conn.cursor(name="words") as cur:
- cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
+ cur.execute(""" SELECT v, count(*) FROM
+ (SELECT svals(name) as v FROM place)x
+ WHERE length(v) < 75 GROUP BY v""")
for name, cnt in cur:
+ terms = set()
for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
- for term in word.split():
- words[term] += cnt
+ if ' ' in word:
+ terms.update(word.split())
+ for term in terms:
+ words[term] += cnt
# copy them back into the word table
with CopyBuffer() as copystr:
for word, cls, typ, oper in to_add:
term = self.name_processor.get_search_normalized(word)
if term:
- copystr.add(word, term, cls, typ,
- oper if oper in ('in', 'near') else None, 0)
+ copystr.add(word, ' ' + term, cls, typ,
+ oper if oper in ('in', 'near') else None, 0)
added += 1
copystr.copy_out(cursor, 'word',
to_delete = existing_phrases - new_phrases
if to_delete:
- psycopg2.extras.execute_values(
- cursor,
+ cursor.execute_values(
""" DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
WHERE word = name and class = in_class and type = in_type
and ((op = '-' and operator is null) or op = operator)""",
if word_tokens:
cur.execute("""INSERT INTO word (word_id, word_token, country_code,
search_name_count)
- (SELECT nextval('seq_word'), token, '{}', 0
+ (SELECT nextval('seq_word'), token, %s, 0
FROM unnest(%s) as token)
- """.format(country_code), (list(word_tokens),))
+ """, (country_code, list(word_tokens)))
def process_place(self, place):
self.add_country_names(country_feature.lower(), names)
address = place.get('address')
-
if address:
- hnrs = []
- addr_terms = []
- for key, value in address.items():
- if key == 'postcode':
- self._add_postcode(value)
- elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
- hnrs.append(value)
- elif key == 'street':
- token_info.add_street(*self._compute_name_tokens({'name': value}))
- elif key == 'place':
- token_info.add_place(*self._compute_name_tokens({'name': value}))
- elif not key.startswith('_') and \
- key not in ('country', 'full'):
- addr_terms.append((key, *self._compute_name_tokens({'name': value})))
-
- if hnrs:
- hnrs = self._split_housenumbers(hnrs)
- token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
-
- if addr_terms:
- token_info.add_address_terms(addr_terms)
+ self._process_place_address(token_info, address)
return token_info.data
+    def _process_place_address(self, token_info, address):
+        """ Extract tokens from the 'address' tags of a place and record
+            them in the given token_info.
+
+            Housenumber-like keys are collected and tokenized in one batch;
+            'postcode', 'street' and 'place' get dedicated handling; every
+            remaining public key becomes a generic (key, tokens) address term.
+        """
+        hnrs = []
+        addr_terms = []
+        for key, value in address.items():
+            if key == 'postcode':
+                self._add_postcode(value)
+            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                # All housenumber-ish variants are merged into one list
+                # and processed together below.
+                hnrs.append(value)
+            elif key == 'street':
+                token_info.add_street(*self._compute_name_tokens({'name': value}))
+            elif key == 'place':
+                token_info.add_place(*self._compute_name_tokens({'name': value}))
+            elif not key.startswith('_') and \
+               key not in ('country', 'full'):
+                # Keys with a leading underscore are internal; 'country' and
+                # 'full' are handled elsewhere — everything else is a plain
+                # address part.
+                addr_terms.append((key, *self._compute_name_tokens({'name': value})))
+
+        if hnrs:
+            # Split multi-value housenumbers first, then normalise each one.
+            hnrs = self._split_housenumbers(hnrs)
+            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
+
+        if addr_terms:
+            token_info.add_address_terms(addr_terms)
+
+
def _compute_name_tokens(self, names):
""" Computes the full name and partial name tokens for the given
dictionary of names.
full, part = self._cache.names.get(norm_name, (None, None))
if full is None:
variants = self.name_processor.get_variants_ascii(norm_name)
+ if not variants:
+ continue
+
with self.conn.cursor() as cur:
cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
(norm_name, variants))
given dictionary of names.
"""
full_names = set()
- for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
- full_names.add(name.strip())
+ for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
+ if name:
+ full_names.add(name)
- brace_idx = name.find('(')
- if brace_idx >= 0:
- full_names.add(name[:brace_idx].strip())
+ brace_idx = name.find('(')
+ if brace_idx >= 0:
+ full_names.add(name[:brace_idx].strip())
return full_names