git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/legacy_icu_tokenizer.py
Merge remote-tracking branch 'upstream/master'
index eb8502377f1c9e0a65a5b7d2808e3e6c9ab226dd..c585c5afe0bf28bfa24590ed05cb165f6fd2dd01 100644
@@ -3,20 +3,17 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
 from collections import Counter
-import functools
-import io
 import itertools
-import json
 import logging
 import re
 from textwrap import dedent
 from pathlib import Path
 
-from icu import Transliterator
 import psycopg2.extras
 
 from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
+from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
@@ -103,9 +100,7 @@ class LegacyICUTokenizer:
         """
         self.init_from_project()
 
-        if self.normalization is None\
-           or self.transliteration is None\
-           or self.abbreviations is None:
+        if self.naming_rules is None:
             return "Configuration for tokenizer 'legacy_icu' are missing."
 
         return None
@@ -128,7 +123,7 @@ class LegacyICUTokenizer:
         """
         return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
 
-
+    # pylint: disable=missing-format-attribute
     def _install_php(self, phpdir):
         """ Install the php script for the tokenizer.
         """
@@ -168,22 +163,28 @@ class LegacyICUTokenizer:
             words = Counter()
             name_proc = ICUNameProcessor(self.naming_rules)
             with conn.cursor(name="words") as cur:
-                cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
+                cur.execute(""" SELECT v, count(*) FROM
+                                  (SELECT svals(name) as v FROM place)x
+                                WHERE length(v) < 75 GROUP BY v""")
 
                 for name, cnt in cur:
+                    terms = set()
                     for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
-                        for term in word.split():
-                            words[term] += cnt
+                        if ' ' in word:
+                            terms.update(word.split())
+                    for term in terms:
+                        words[term] += cnt
 
             # copy them back into the word table
-            copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
-
+            with CopyBuffer() as copystr:
+                for args in words.items():
+                    copystr.add(*args)
 
-            with conn.cursor() as cur:
-                copystr.seek(0)
-                cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
-                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-                               WHERE word_id is null""")
+                with conn.cursor() as cur:
+                    copystr.copy_out(cur, 'word',
+                                     columns=['word_token', 'search_name_count'])
+                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
+                                   WHERE word_id is null""")
 
             conn.commit()
 
@@ -219,7 +220,7 @@ class LegacyICUNameAnalyzer:
             self.conn = None
 
 
-    def get_word_token_info(self, conn, words):
+    def get_word_token_info(self, words):
         """ Return token information for the given list of words.
             If a word starts with # it is assumed to be a full name
             otherwise is a partial name.
@@ -233,11 +234,11 @@ class LegacyICUNameAnalyzer:
         tokens = {}
         for word in words:
             if word.startswith('#'):
-                tokens[word] = ' ' + self.name_processor.get_normalized(word[1:])
+                tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
             else:
-                tokens[word] = self.name_processor.get_normalized(word)
+                tokens[word] = self.name_processor.get_search_normalized(word)
 
-        with conn.cursor() as cur:
+        with self.conn.cursor() as cur:
             cur.execute("""SELECT word_token, word_id
                            FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                            WHERE word_token = t.term
@@ -245,7 +246,7 @@ class LegacyICUNameAnalyzer:
                         (list(tokens.values()), ))
             ids = {r[0]: r[1] for r in cur}
 
-        return [(k, v, ids[v]) for k, v in tokens.items()]
+        return [(k, v, ids.get(v, None)) for k, v in tokens.items()]
 
 
     @staticmethod
@@ -270,7 +271,6 @@ class LegacyICUNameAnalyzer:
             table.
         """
         to_delete = []
-        copystr = io.StringIO()
         with self.conn.cursor() as cur:
             # This finds us the rows in location_postcode and word that are
             # missing in the other table.
@@ -283,32 +283,31 @@ class LegacyICUNameAnalyzer:
                               ON pc = word) x
                            WHERE pc is null or word is null""")
 
-            for postcode, word in cur:
-                if postcode is None:
-                    to_delete.append(word)
-                else:
-                    copystr.write(postcode)
-                    copystr.write('\t ')
-                    copystr.write(self.name_processor.get_search_normalized(postcode))
-                    copystr.write('\tplace\tpostcode\t0\n')
+            with CopyBuffer() as copystr:
+                for postcode, word in cur:
+                    if postcode is None:
+                        to_delete.append(word)
+                    else:
+                        copystr.add(
+                            postcode,
+                            ' ' + self.name_processor.get_search_normalized(postcode),
+                            'place', 'postcode', 0)
 
-            if to_delete:
-                cur.execute("""DELETE FROM WORD
-                               WHERE class ='place' and type = 'postcode'
-                                     and word = any(%s)
-                            """, (to_delete, ))
+                if to_delete:
+                    cur.execute("""DELETE FROM WORD
+                                   WHERE class ='place' and type = 'postcode'
+                                         and word = any(%s)
+                                """, (to_delete, ))
 
-            if copystr.getvalue():
-                copystr.seek(0)
-                cur.copy_from(copystr, 'word',
-                              columns=['word', 'word_token', 'class', 'type',
-                                       'search_name_count'])
+                copystr.copy_out(cur, 'word',
+                                 columns=['word', 'word_token', 'class', 'type',
+                                          'search_name_count'])
 
 
     def update_special_phrases(self, phrases, should_replace):
         """ Replace the search index for special phrases with the new phrases.
         """
-        norm_phrases = set(((self.name_processor.get_search_normalized(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                             for p in phrases))
 
         with self.conn.cursor() as cur:
@@ -320,40 +319,54 @@ class LegacyICUNameAnalyzer:
             for label, cls, typ, oper in cur:
                 existing_phrases.add((label, cls, typ, oper or '-'))
 
-            to_add = norm_phrases - existing_phrases
-            to_delete = existing_phrases - norm_phrases
-
-            if to_add:
-                copystr = io.StringIO()
-                for word, cls, typ, oper in to_add:
-                    term = self.name_processor.get_search_normalized(word)
-                    if term:
-                        copystr.write(word)
-                        copystr.write('\t ')
-                        copystr.write(term)
-                        copystr.write('\t')
-                        copystr.write(cls)
-                        copystr.write('\t')
-                        copystr.write(typ)
-                        copystr.write('\t')
-                        copystr.write(oper if oper in ('in', 'near')  else '\\N')
-                        copystr.write('\t0\n')
-
-                copystr.seek(0)
-                cur.copy_from(copystr, 'word',
-                              columns=['word', 'word_token', 'class', 'type',
-                                       'operator', 'search_name_count'])
-
-            if to_delete and should_replace:
-                psycopg2.extras.execute_values(
-                    cur,
-                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
-                        WHERE word = name and class = in_class and type = in_type
-                              and ((op = '-' and operator is null) or op = operator)""",
-                    to_delete)
+            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
+            if should_replace:
+                deleted = self._remove_special_phrases(cur, norm_phrases,
+                                                       existing_phrases)
+            else:
+                deleted = 0
 
         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
-                 len(norm_phrases), len(to_add), len(to_delete))
+                 len(norm_phrases), added, deleted)
+
+
+    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
+        """ Add all phrases to the database that are not yet there.
+        """
+        to_add = new_phrases - existing_phrases
+
+        added = 0
+        with CopyBuffer() as copystr:
+            for word, cls, typ, oper in to_add:
+                term = self.name_processor.get_search_normalized(word)
+                if term:
+                    copystr.add(word, ' ' + term, cls, typ,
+                                oper if oper in ('in', 'near')  else None, 0)
+                    added += 1
+
+            copystr.copy_out(cursor, 'word',
+                             columns=['word', 'word_token', 'class', 'type',
+                                      'operator', 'search_name_count'])
+
+        return added
+
+
+    @staticmethod
+    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
+        """ Remove all phrases from the database that are no longer in the
+            new phrase list.
+        """
+        to_delete = existing_phrases - new_phrases
+
+        if to_delete:
+            psycopg2.extras.execute_values(
+                cursor,
+                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
+                    WHERE word = name and class = in_class and type = in_type
+                          and ((op = '-' and operator is null) or op = operator)""",
+                to_delete)
+
+        return len(to_delete)
 
 
     def add_country_names(self, country_code, names):
@@ -438,6 +451,9 @@ class LegacyICUNameAnalyzer:
             full, part = self._cache.names.get(norm_name, (None, None))
             if full is None:
                 variants = self.name_processor.get_variants_ascii(norm_name)
+                if not variants:
+                    continue
+
                 with self.conn.cursor() as cur:
                     cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                 (norm_name, variants))
@@ -451,17 +467,19 @@ class LegacyICUNameAnalyzer:
         return full_tokens, partial_tokens
 
 
-    def _compute_full_names(self, names):
+    @staticmethod
+    def _compute_full_names(names):
         """ Return the set of all full name word ids to be used with the
             given dictionary of names.
         """
         full_names = set()
-        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
-            full_names.add(name.strip())
+        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
+            if name:
+                full_names.add(name)
 
-            brace_idx = name.find('(')
-            if brace_idx >= 0:
-                full_names.add(name[:brace_idx].strip())
+                brace_idx = name.find('(')
+                if brace_idx >= 0:
+                    full_names.add(name[:brace_idx].strip())
 
         return full_names
 
@@ -534,7 +552,7 @@ class _TokenInfo:
         self.data['hnr'] = ';'.join(hnrs)
 
 
-    def add_street(self, fulls, partials):
+    def add_street(self, fulls, _):
         """ Add addr:street match terms.
         """
         if fulls:
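
The recurring change in this diff is to replace the hand-rolled io.StringIO plus cursor.copy_from() pattern with the CopyBuffer helper imported from nominatim.db.utils, used as a context manager that queues rows with add(*columns) and flushes them with copy_out(cursor, table, columns=...). The following is a minimal sketch of such a helper, inferred only from the calls visible above; it is not the actual nominatim.db.utils implementation, the class name CopyBufferSketch is made up for illustration, and column values are assumed to contain no tabs or newlines.

import io

class CopyBufferSketch:
    """ Queue rows in COPY text format and send them with cursor.copy_from().
    """

    def __enter__(self):
        self.buffer = io.StringIO()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.buffer.close()

    def add(self, *data):
        # Join the values tab-separated; None becomes \N, the default null
        # marker understood by COPY ... FROM in text format.
        self.buffer.write('\t'.join('\\N' if col is None else str(col) for col in data))
        self.buffer.write('\n')

    def copy_out(self, cur, table, columns=None):
        # Issue the COPY only if at least one row was queued.
        if self.buffer.tell() > 0:
            self.buffer.seek(0)
            cur.copy_from(self.buffer, table, columns=columns)

The hunks above already show the intended usage: rows such as (word_token, search_name_count) pairs are queued inside "with CopyBuffer() as copystr:" and flushed into the word table with copy_out() before conn.commit().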