]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/legacy_icu_tokenizer.py
Merge remote-tracking branch 'upstream/master'
[nominatim.git] / nominatim / tokenizer / legacy_icu_tokenizer.py
index 960148890d0fe0a7ba465706d00a3097dbaf2d74..c585c5afe0bf28bfa24590ed05cb165f6fd2dd01 100644 (file)
@@ -123,7 +123,7 @@ class LegacyICUTokenizer:
         """
         return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
 
-
+    # pylint: disable=missing-format-attribute
     def _install_php(self, phpdir):
         """ Install the php script for the tokenizer.
         """
@@ -134,7 +134,7 @@ class LegacyICUTokenizer:
             @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
             @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
             require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
-            """.format(self, phpdir))) # pylint: disable=missing-format-attribute
+            """.format(self, phpdir)))
 
 
     def _save_config(self, config):
@@ -163,12 +163,17 @@ class LegacyICUTokenizer:
             words = Counter()
             name_proc = ICUNameProcessor(self.naming_rules)
             with conn.cursor(name="words") as cur:
-                cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
+                cur.execute(""" SELECT v, count(*) FROM
+                                  (SELECT svals(name) as v FROM place)x
+                                WHERE length(v) < 75 GROUP BY v""")
 
                 for name, cnt in cur:
+                    terms = set()
                     for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
-                        for term in word.split():
-                            words[term] += cnt
+                        if ' ' in word:
+                            terms.update(word.split())
+                    for term in terms:
+                        words[term] += cnt
 
             # copy them back into the word table
             with CopyBuffer() as copystr:
@@ -335,7 +340,7 @@ class LegacyICUNameAnalyzer:
             for word, cls, typ, oper in to_add:
                 term = self.name_processor.get_search_normalized(word)
                 if term:
-                    copystr.add(word, term, cls, typ,
+                    copystr.add(word, ' ' + term, cls, typ,
                                 oper if oper in ('in', 'near')  else None, 0)
                     added += 1
 
@@ -446,6 +451,9 @@ class LegacyICUNameAnalyzer:
             full, part = self._cache.names.get(norm_name, (None, None))
             if full is None:
                 variants = self.name_processor.get_variants_ascii(norm_name)
+                if not variants:
+                    continue
+
                 with self.conn.cursor() as cur:
                     cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                 (norm_name, variants))
@@ -465,12 +473,13 @@ class LegacyICUNameAnalyzer:
             given dictionary of names.
         """
         full_names = set()
-        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
-            full_names.add(name.strip())
+        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
+            if name:
+                full_names.add(name)
 
-            brace_idx = name.find('(')
-            if brace_idx >= 0:
-                full_names.add(name[:brace_idx].strip())
+                brace_idx = name.find('(')
+                if brace_idx >= 0:
+                    full_names.add(name[:brace_idx].strip())
 
         return full_names