fix Python linitin errors

author Sarah Hoffmann <lonvia@denofr.de>

Sun, 25 Jul 2021 13:30:47 +0000 (15:30 +0200)

committer Sarah Hoffmann <lonvia@denofr.de>

Wed, 28 Jul 2021 09:31:47 +0000 (11:31 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Sun, 25 Jul 2021 13:30:47 +0000 (15:30 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Wed, 28 Jul 2021 09:31:47 +0000 (11:31 +0200)
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py

index 1f8096ff2848df94f326578496e1009666960d6f..a887ae286834e6005a97ddd53da467b4af54f1ff 100644 (file)
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -79,7 +79,6 @@ class LegacyICUTokenizer:
          """ Do any required postprocessing to make the tokenizer data ready
              for use.
          """
-        pass
  
  
      def update_sql_functions(self, config):
@@ -156,25 +155,12 @@ class LegacyICUTokenizer:
              LOG.warning("Precomputing word tokens")
  
              # get partial words and their frequencies
-            words = Counter()
-            name_proc = ICUNameProcessor(self.naming_rules)
-            with conn.cursor(name="words") as cur:
-                cur.execute(""" SELECT v, count(*) FROM
-                                  (SELECT svals(name) as v FROM place)x
-                                WHERE length(v) < 75 GROUP BY v""")
-
-                for name, cnt in cur:
-                    terms = set()
-                    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
-                        if ' ' in word:
-                            terms.update(word.split())
-                    for term in terms:
-                        words[term] += cnt
+            words = self._count_partial_terms(conn)
  
              # copy them back into the word table
              with CopyBuffer() as copystr:
-                for k, v in words.items():
-                    copystr.add('w', k, json.dumps({'count': v}))
+                for term, cnt in words.items():
+                    copystr.add('w', term, json.dumps({'count': cnt}))
  
                  with conn.cursor() as cur:
                      copystr.copy_out(cur, 'word',
@@ -184,6 +170,27 @@ class LegacyICUTokenizer:
  
              conn.commit()
  
+    def _count_partial_terms(self, conn):
+        """ Count the partial terms from the names in the place table.
+        """
+        words = Counter()
+        name_proc = ICUNameProcessor(self.naming_rules)
+
+        with conn.cursor(name="words") as cur:
+            cur.execute(""" SELECT v, count(*) FROM
+                              (SELECT svals(name) as v FROM place)x
+                            WHERE length(v) < 75 GROUP BY v""")
+
+            for name, cnt in cur:
+                terms = set()
+                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
+                    if ' ' in word:
+                        terms.update(word.split())
+                for term in terms:
+                    words[term] += cnt
+
+        return words
+
  
  class LegacyICUNameAnalyzer:
      """ The legacy analyzer uses the ICU library for splitting names.
author	Sarah Hoffmann <lonvia@denofr.de>
	Sun, 25 Jul 2021 13:30:47 +0000 (15:30 +0200)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 28 Jul 2021 09:31:47 +0000 (11:31 +0200)