X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/f44af49df9b86c9a35db000f7a42e65a6c4307ba..87907916ff06741bd4627101d30e6e11b8ea1a1a:/nominatim/tokenizer/legacy_icu_tokenizer.py

diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index 09d4059e..2bd22c72 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -3,6 +3,7 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
 from collections import Counter
+import functools
 import io
 import itertools
 import json
@@ -34,7 +35,7 @@ def create(dsn, data_dir):
 class LegacyICUTokenizer:
     """ This tokenizer uses libICU to convert names and queries to ASCII.
         Otherwise it uses the same algorithms and data structures as the
-        normalization routines in Nominatm 3.
+        normalization routines in Nominatim 3.
     """
 
     def __init__(self, dsn, data_dir):
@@ -126,22 +127,25 @@ class LegacyICUTokenizer:
             Analyzers are not thread-safe. You need to instantiate one per thread.
         """
         norm = Transliterator.createFromRules("normalizer", self.normalization)
-        trans = Transliterator.createFromRules("normalizer", self.transliteration)
+        trans = Transliterator.createFromRules("trans", self.transliteration)
         return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
 
 
     def _install_php(self, config):
         """ Install the php script for the tokenizer.
         """
+        abbr_inverse = list(zip(*self.abbreviations))
         php_file = self.data_dir / "tokenizer.php"
         php_file.write_text(dedent("""\
[...]
+                    word = self.make_standard_word(brace_split[0])
+                    if word:
+                        full_names.add(word)
+
+        return full_names
+
+
     def _add_postcode(self, postcode):
         """ Make sure the normalized postcode is present in the word table.
         """
-        if re.search(r'[:,;]', postcode) is None and not postcode in self._cache.postcodes:
-            term = self.make_standard_word(postcode)
-            if not term:
-                return
-
-            with self.conn.cursor() as cur:
-                # no word_id needed for postcodes
-                cur.execute("""INSERT INTO word (word, word_token, class, type,
-                                                 search_name_count)
-                               (SELECT pc, %s, 'place', 'postcode', 0
-                                FROM (VALUES (%s)) as v(pc)
-                                WHERE NOT EXISTS
-                                 (SELECT * FROM word
-                                  WHERE word = pc and class='place' and type='postcode'))
-                               """, (' ' + term, postcode))
-            self._cache.postcodes.add(postcode)
+        if re.search(r'[:,;]', postcode) is None:
+            postcode = self.normalize_postcode(postcode)
+
+            if postcode not in self._cache.postcodes:
+                term = self.make_standard_word(postcode)
+                if not term:
+                    return
+
+                with self.conn.cursor() as cur:
+                    # no word_id needed for postcodes
+                    cur.execute("""INSERT INTO word (word, word_token, class, type,
+                                                     search_name_count)
+                                   (SELECT pc, %s, 'place', 'postcode', 0
+                                    FROM (VALUES (%s)) as v(pc)
+                                    WHERE NOT EXISTS
+                                     (SELECT * FROM word
+                                      WHERE word = pc and class='place' and type='postcode'))
+                                   """, (' ' + term, postcode))
+                self._cache.postcodes.add(postcode)
 
     @staticmethod
     def _split_housenumbers(hnrs):
@@ -449,8 +537,6 @@ class _TokenInfo:
         """
         # Start with all partial names
         terms = set((part for ns in names for part in ns.split()))
-        # Add partials for the full terms (TO BE REMOVED)
-        terms.update((n for n in names))
 
         # Add the full names
         terms.update((' ' + n for n in names))
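
The third hunk touches the analyzer factory, whose docstring warns that analyzers are not thread-safe and must be instantiated once per thread. A minimal usage sketch under that constraint, assuming the factory is exposed as the tokenizer's name_analyzer() method (the method name is not visible in the hunk) and that `tokenizer` is an already initialised LegacyICUTokenizer:

    import threading

    _local = threading.local()

    def get_analyzer(tokenizer):
        # Create one analyzer per thread and cache it in thread-local
        # storage; the ICU Transliterator objects held by the analyzer
        # must never be shared between threads.
        if not hasattr(_local, 'analyzer'):
            _local.analyzer = tokenizer.name_analyzer()
        return _local.analyzer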
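
The added fragment just above the _add_postcode hunk (part of the new full-name computation) splits each name at an opening brace and also indexes the part before the brace as a full name. A toy illustration of that split; the example name is made up:

    name = 'Main Street (north section)'

    brace_split = name.split('(', 2)
    # brace_split == ['Main Street ', 'north section)']

    if len(brace_split) > 1:
        # The text before the brace becomes an additional full name,
        # so a search for just "Main Street" still matches.
        print(brace_split[0].strip())   # -> Main Street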
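
The _add_postcode hunk moves the cache lookup behind a normalize_postcode() call, so equivalent but differently spelled postcodes collapse to one cache entry and one database insert. A sketch of the effect, assuming for illustration only that normalization strips whitespace and uppercases (the real normalize_postcode() may do more):

    def normalize_postcode(postcode):
        # Hypothetical stand-in for the analyzer's normalize_postcode().
        return postcode.strip().upper()

    seen = set()
    for raw in ('ab1 2cd', ' AB1 2CD', 'Ab1 2Cd'):
        pc = normalize_postcode(raw)
        if pc not in seen:
            # Only the first spelling would trigger the INSERT; the
            # WHERE NOT EXISTS clause guards the database side as well.
            seen.add(pc)

    assert seen == {'AB1 2CD'}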
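
The final hunk drops the duplicate update that also fed full names into the partial-term set. What remains is a clean split between partial terms (single words) and full terms (whole names, marked with a leading blank). A small worked example using the two expressions from the hunk:

    names = ['north street', 'n st']

    # Partial terms: every whitespace-separated word.
    terms = set(part for ns in names for part in ns.split())
    assert terms == {'north', 'street', 'n', 'st'}

    # Full terms: the complete name with a leading blank as marker.
    terms.update(' ' + n for n in names)
    assert {' north street', ' n st'} <= terms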