]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/legacy_tokenizer.py
Merge pull request #2384 from lonvia/actions-add-icu-tokenizer
[nominatim.git] / nominatim / tokenizer / legacy_tokenizer.py
index ac26c5cd5d160257ad66f30fe289d6bd9b791e12..6040f88f154bad6eb25e81c9498017d914db3e88 100644 (file)
@@ -5,6 +5,7 @@ from collections import OrderedDict
 import logging
 import re
 import shutil
 import logging
 import re
 import shutil
+from textwrap import dedent
 
 from icu import Transliterator
 import psycopg2
 
 from icu import Transliterator
 import psycopg2
@@ -87,7 +88,7 @@ class LegacyTokenizer:
         self.normalization = None
 
 
         self.normalization = None
 
 
-    def init_new_db(self, config):
+    def init_new_db(self, config, init_db=True):
         """ Set up a new tokenizer for the database.
 
             This copies all necessary data in the project directory to make
         """ Set up a new tokenizer for the database.
 
             This copies all necessary data in the project directory to make
@@ -99,13 +100,16 @@ class LegacyTokenizer:
 
         self.normalization = config.TERM_NORMALIZATION
 
 
         self.normalization = config.TERM_NORMALIZATION
 
+        self._install_php(config)
+
         with connect(self.dsn) as conn:
             _check_module(module_dir, conn)
             self._save_config(conn, config)
             conn.commit()
 
         with connect(self.dsn) as conn:
             _check_module(module_dir, conn)
             self._save_config(conn, config)
             conn.commit()
 
-        self.update_sql_functions(config)
-        self._init_db_tables(config)
+        if init_db:
+            self.update_sql_functions(config)
+            self._init_db_tables(config)
 
 
     def init_from_project(self):
 
 
     def init_from_project(self):
@@ -115,6 +119,15 @@ class LegacyTokenizer:
             self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
 
 
             self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
 
 
+    def finalize_import(self, config):
+        """ Do any required postprocessing to make the tokenizer data ready
+            for use.
+        """
+        with connect(self.dsn) as conn:
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
+
+
     def update_sql_functions(self, config):
         """ Reimport the SQL functions for this tokenizer.
         """
     def update_sql_functions(self, config):
         """ Reimport the SQL functions for this tokenizer.
         """
@@ -128,6 +141,33 @@ class LegacyTokenizer:
                               modulepath=modulepath)
 
 
                               modulepath=modulepath)
 
 
+    def check_database(self):
+        """ Check that the tokenizer is set up correctly.
+        """
+        hint = """\
+             The Postgresql extension nominatim.so was not correctly loaded.
+
+             Error: {error}
+
+             Hints:
+             * Check the output of the CMmake/make installation step
+             * Does nominatim.so exist?
+             * Does nominatim.so exist on the database server?
+             * Can nominatim.so be accessed by the database user?
+             """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                try:
+                    out = cur.scalar("SELECT make_standard_name('a')")
+                except psycopg2.Error as err:
+                    return hint.format(error=str(err))
+
+        if out != 'a':
+            return hint.format(error='Unexpected result for make_standard_name()')
+
+        return None
+
+
     def migrate_database(self, config):
         """ Initialise the project directory of an existing database for
             use with this tokenizer.
     def migrate_database(self, config):
         """ Initialise the project directory of an existing database for
             use with this tokenizer.
@@ -165,6 +205,18 @@ class LegacyTokenizer:
         return LegacyNameAnalyzer(self.dsn, normalizer)
 
 
         return LegacyNameAnalyzer(self.dsn, normalizer)
 
 
+    def _install_php(self, config):
+        """ Install the php script for the tokenizer.
+        """
+        php_file = self.data_dir / "tokenizer.php"
+        php_file.write_text(dedent("""\
+            <?php
+            @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
+            @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
+            require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
+            """.format(config)))
+
+
     def _init_db_tables(self, config):
         """ Set up the word table and fill it with pre-computed word
             frequencies.
     def _init_db_tables(self, config):
         """ Set up the word table and fill it with pre-computed word
             frequencies.
@@ -219,6 +271,32 @@ class LegacyNameAnalyzer:
             self.conn = None
 
 
             self.conn = None
 
 
+    def get_word_token_info(self, words):
+        """ Return token information for the given list of words.
+            If a word starts with # it is assumed to be a full name
+            otherwise is a partial name.
+
+            The function returns a list of tuples with
+            (original word, word token, word id).
+
+            The function is used for testing and debugging only
+            and not necessarily efficient.
+        """
+        with self.conn.cursor() as cur:
+            cur.execute("""SELECT t.term, word_token, word_id
+                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
+                           WHERE word_token = (CASE
+                                   WHEN left(t.term, 1) = '#' THEN
+                                     ' ' || make_standard_name(substring(t.term from 2))
+                                   ELSE
+                                     make_standard_name(t.term)
+                                   END)
+                                 and class is null and country_code is null""",
+                        (words, ))
+
+            return [(r[0], r[1], r[2]) for r in cur]
+
+
     def normalize(self, phrase):
         """ Normalize the given phrase, i.e. remove all properties that
             are irrelevant for search.
     def normalize(self, phrase):
         """ Normalize the given phrase, i.e. remove all properties that
             are irrelevant for search.
@@ -226,16 +304,54 @@ class LegacyNameAnalyzer:
         return self.normalizer.transliterate(phrase)
 
 
         return self.normalizer.transliterate(phrase)
 
 
-    def add_postcodes_from_db(self):
-        """ Add postcodes from the location_postcode table to the word table.
+    @staticmethod
+    def normalize_postcode(postcode):
+        """ Convert the postcode to a standardized form.
+
+            This function must yield exactly the same result as the SQL function
+            'token_normalized_postcode()'.
+        """
+        return postcode.strip().upper()
+
+
+    def update_postcodes_from_db(self):
+        """ Update postcode tokens in the word table from the location_postcode
+            table.
         """
         with self.conn.cursor() as cur:
         """
         with self.conn.cursor() as cur:
-            cur.execute("""SELECT count(create_postcode_id(pc))
-                           FROM (SELECT distinct(postcode) as pc
-                                 FROM location_postcode) x""")
+            # This finds us the rows in location_postcode and word that are
+            # missing in the other table.
+            cur.execute("""SELECT * FROM
+                            (SELECT pc, word FROM
+                              (SELECT distinct(postcode) as pc FROM location_postcode) p
+                              FULL JOIN
+                              (SELECT word FROM word
+                                WHERE class ='place' and type = 'postcode') w
+                              ON pc = word) x
+                           WHERE pc is null or word is null""")
+
+            to_delete = []
+            to_add = []
+
+            for postcode, word in cur:
+                if postcode is None:
+                    to_delete.append(word)
+                else:
+                    to_add.append(postcode)
+
+            if to_delete:
+                cur.execute("""DELETE FROM WORD
+                               WHERE class ='place' and type = 'postcode'
+                                     and word = any(%s)
+                            """, (to_delete, ))
+            if to_add:
+                cur.execute("""SELECT count(create_postcode_id(pc))
+                               FROM unnest(%s) as pc
+                            """, (to_add, ))
+
 
 
 
 
-    def update_special_phrases(self, phrases):
+    def update_special_phrases(self, phrases, should_replace):
         """ Replace the search index for special phrases with the new phrases.
         """
         norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
         """ Replace the search index for special phrases with the new phrases.
         """
         norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
@@ -258,13 +374,13 @@ class LegacyNameAnalyzer:
                     cur,
                     """ INSERT INTO word (word_id, word_token, word, class, type,
                                           search_name_count, operator)
                     cur,
                     """ INSERT INTO word (word_id, word_token, word, class, type,
                                           search_name_count, operator)
-                        (SELECT nextval('seq_word'), make_standard_name(name), name,
+                        (SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
                                 class, type, 0,
                                 CASE WHEN op in ('in', 'near') THEN op ELSE null END
                            FROM (VALUES %s) as v(name, class, type, op))""",
                     to_add)
 
                                 class, type, 0,
                                 CASE WHEN op in ('in', 'near') THEN op ELSE null END
                            FROM (VALUES %s) as v(name, class, type, op))""",
                     to_add)
 
-            if to_delete:
+            if to_delete and should_replace:
                 psycopg2.extras.execute_values(
                     cur,
                     """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                 psycopg2.extras.execute_values(
                     cur,
                     """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
@@ -283,11 +399,11 @@ class LegacyNameAnalyzer:
             cur.execute(
                 """INSERT INTO word (word_id, word_token, country_code)
                    (SELECT nextval('seq_word'), lookup_token, %s
             cur.execute(
                 """INSERT INTO word (word_id, word_token, country_code)
                    (SELECT nextval('seq_word'), lookup_token, %s
-                      FROM (SELECT ' ' || make_standard_name(n) as lookup_token
+                      FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
                             FROM unnest(%s)n) y
                       WHERE NOT EXISTS(SELECT * FROM word
                                        WHERE word_token = lookup_token and country_code = %s))
                             FROM unnest(%s)n) y
                       WHERE NOT EXISTS(SELECT * FROM word
                                        WHERE word_token = lookup_token and country_code = %s))
-                """, (country_code, names, country_code))
+                """, (country_code, list(names.values()), country_code))
 
 
     def process_place(self, place):
 
 
     def process_place(self, place):
@@ -305,7 +421,7 @@ class LegacyNameAnalyzer:
 
             country_feature = place.get('country_feature')
             if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
 
             country_feature = place.get('country_feature')
             if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                self.add_country_names(country_feature.lower(), list(names.values()))
+                self.add_country_names(country_feature.lower(), names)
 
         address = place.get('address')
 
 
         address = place.get('address')
 
@@ -337,12 +453,8 @@ class LegacyNameAnalyzer:
     def _add_postcode(self, postcode):
         """ Make sure the normalized postcode is present in the word table.
         """
     def _add_postcode(self, postcode):
         """ Make sure the normalized postcode is present in the word table.
         """
-        def _create_postcode_from_db(pcode):
-            with self.conn.cursor() as cur:
-                cur.execute('SELECT create_postcode_id(%s)', (pcode, ))
-
         if re.search(r'[:,;]', postcode) is None:
         if re.search(r'[:,;]', postcode) is None:
-            self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)
+            self._cache.add_postcode(self.conn, self.normalize_postcode(postcode))
 
 
 class _TokenInfo:
 
 
 class _TokenInfo:
@@ -400,10 +512,9 @@ class _TokenInfo:
         """
         def _get_place(name):
             with conn.cursor() as cur:
         """
         def _get_place(name):
             with conn.cursor() as cur:
-                cur.execute("""SELECT (addr_ids_from_name(%s)
-                                       || getorcreate_name_id(make_standard_name(%s), ''))::text,
+                cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
                                       word_ids_from_name(%s)::text""",
                                       word_ids_from_name(%s)::text""",
-                            (name, name, name))
+                            (name, name))
                 return cur.fetchone()
 
         self.data['place_search'], self.data['place_match'] = \
                 return cur.fetchone()
 
         self.data['place_search'], self.data['place_match'] = \
@@ -473,16 +584,19 @@ class _TokenCache:
                            FROM generate_series(1, 100) as i""")
             self._cached_housenumbers = {str(r[0]) : r[1] for r in cur}
 
                            FROM generate_series(1, 100) as i""")
             self._cached_housenumbers = {str(r[0]) : r[1] for r in cur}
 
-        # Get postcodes that are already saved
-        postcodes = OrderedDict()
-        with conn.cursor() as cur:
-            cur.execute("""SELECT word FROM word
-                           WHERE class ='place' and type = 'postcode'""")
-            for row in cur:
-                postcodes[row[0]] = None
-        self.postcodes = _LRU(maxsize=32, init_data=postcodes)
+        # For postcodes remember the ones that have already been added
+        self.postcodes = set()
 
     def get_housenumber(self, number):
         """ Get a housenumber token from the cache.
         """
         return self._cached_housenumbers.get(number)
 
     def get_housenumber(self, number):
         """ Get a housenumber token from the cache.
         """
         return self._cached_housenumbers.get(number)
+
+
+    def add_postcode(self, conn, postcode):
+        """ Make sure the given postcode is in the database.
+        """
+        if postcode not in self.postcodes:
+            with conn.cursor() as cur:
+                cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
+            self.postcodes.add(postcode)