]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/legacy_tokenizer.py
add PHP part for new ICU-base tokenizer
[nominatim.git] / nominatim / tokenizer / legacy_tokenizer.py
index 0aacb57f536f17cc1a07e10b74943991e281f71c..2f060b84aa98761b653f15a41f68566dc31cb2d8 100644 (file)
@@ -5,7 +5,9 @@ from collections import OrderedDict
 import logging
 import re
 import shutil
+from textwrap import dedent
 
+from icu import Transliterator
 import psycopg2
 import psycopg2.extras
 
@@ -86,7 +88,7 @@ class LegacyTokenizer:
         self.normalization = None
 
 
-    def init_new_db(self, config):
+    def init_new_db(self, config, init_db=True):
         """ Set up a new tokenizer for the database.
 
             This copies all necessary data in the project directory to make
@@ -98,13 +100,16 @@ class LegacyTokenizer:
 
         self.normalization = config.TERM_NORMALIZATION
 
+        self._install_php(config)
+
         with connect(self.dsn) as conn:
             _check_module(module_dir, conn)
             self._save_config(conn, config)
             conn.commit()
 
-        self.update_sql_functions(config)
-        self._init_db_tables(config)
+        if init_db:
+            self.update_sql_functions(config)
+            self._init_db_tables(config)
 
 
     def init_from_project(self):
@@ -114,6 +119,15 @@ class LegacyTokenizer:
             self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
 
 
+    def finalize_import(self, config):
+        """ Do any required postprocessing to make the tokenizer data ready
+            for use.
+        """
+        with connect(self.dsn) as conn:
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
+
+
     def update_sql_functions(self, config):
         """ Reimport the SQL functions for this tokenizer.
         """
@@ -127,6 +141,33 @@ class LegacyTokenizer:
                               modulepath=modulepath)
 
 
+    def check_database(self):
+        """ Check that the tokenizer is set up correctly.
+        """
+        hint = """\
+             The Postgresql extension nominatim.so was not correctly loaded.
+
+             Error: {error}
+
+             Hints:
+             * Check the output of the CMmake/make installation step
+             * Does nominatim.so exist?
+             * Does nominatim.so exist on the database server?
+             * Can nominatim.so be accessed by the database user?
+             """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                try:
+                    out = cur.scalar("SELECT make_standard_name('a')")
+                except psycopg2.Error as err:
+                    return hint.format(error=str(err))
+
+        if out != 'a':
+            return hint.format(error='Unexpected result for make_standard_name()')
+
+        return None
+
+
     def migrate_database(self, config):
         """ Initialise the project directory of an existing database for
             use with this tokenizer.
@@ -134,6 +175,7 @@ class LegacyTokenizer:
             This is a special migration function for updating existing databases
             to new software versions.
         """
+        self.normalization = config.TERM_NORMALIZATION
         module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                      config.lib_dir.module,
                                      config.project_dir / 'module')
@@ -158,7 +200,21 @@ class LegacyTokenizer:
 
             Analyzers are not thread-safe. You need to instantiate one per thread.
         """
-        return LegacyNameAnalyzer(self.dsn)
+        normalizer = Transliterator.createFromRules("phrase normalizer",
+                                                    self.normalization)
+        return LegacyNameAnalyzer(self.dsn, normalizer)
+
+
+    def _install_php(self, config):
+        """ Install the php script for the tokenizer.
+        """
+        php_file = self.data_dir / "tokenizer.php"
+        php_file.write_text(dedent("""\
+            <?php
+            @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
+            @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
+            require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
+            """.format(config)))
 
 
     def _init_db_tables(self, config):
@@ -182,7 +238,6 @@ class LegacyTokenizer:
         properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
 
 
-
 class LegacyNameAnalyzer:
     """ The legacy analyzer uses the special Postgresql module for
         splitting names.
@@ -191,9 +246,10 @@ class LegacyNameAnalyzer:
         normalization.
     """
 
-    def __init__(self, dsn):
+    def __init__(self, dsn, normalizer):
         self.conn = connect(dsn).connection
         self.conn.autocommit = True
+        self.normalizer = normalizer
         psycopg2.extras.register_hstore(self.conn)
 
         self._cache = _TokenCache(self.conn)
@@ -215,6 +271,13 @@ class LegacyNameAnalyzer:
             self.conn = None
 
 
+    def normalize(self, phrase):
+        """ Normalize the given phrase, i.e. remove all properties that
+            are irrelevant for search.
+        """
+        return self.normalizer.transliterate(phrase)
+
+
     def add_postcodes_from_db(self):
         """ Add postcodes from the location_postcode table to the word table.
         """
@@ -223,6 +286,62 @@ class LegacyNameAnalyzer:
                            FROM (SELECT distinct(postcode) as pc
                                  FROM location_postcode) x""")
 
+
+    def update_special_phrases(self, phrases):
+        """ Replace the search index for special phrases with the new phrases.
+        """
+        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
+                            for p in phrases))
+
+        with self.conn.cursor() as cur:
+            # Get the old phrases.
+            existing_phrases = set()
+            cur.execute("""SELECT word, class, type, operator FROM word
+                           WHERE class != 'place'
+                                 OR (type != 'house' AND type != 'postcode')""")
+            for label, cls, typ, oper in cur:
+                existing_phrases.add((label, cls, typ, oper or '-'))
+
+            to_add = norm_phrases - existing_phrases
+            to_delete = existing_phrases - norm_phrases
+
+            if to_add:
+                psycopg2.extras.execute_values(
+                    cur,
+                    """ INSERT INTO word (word_id, word_token, word, class, type,
+                                          search_name_count, operator)
+                        (SELECT nextval('seq_word'), make_standard_name(name), name,
+                                class, type, 0,
+                                CASE WHEN op in ('in', 'near') THEN op ELSE null END
+                           FROM (VALUES %s) as v(name, class, type, op))""",
+                    to_add)
+
+            if to_delete:
+                psycopg2.extras.execute_values(
+                    cur,
+                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
+                        WHERE word = name and class = in_class and type = in_type
+                              and ((op = '-' and operator is null) or op = operator)""",
+                    to_delete)
+
+        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
+                 len(norm_phrases), len(to_add), len(to_delete))
+
+
+    def add_country_names(self, country_code, names):
+        """ Add names for the given country to the search index.
+        """
+        with self.conn.cursor() as cur:
+            cur.execute(
+                """INSERT INTO word (word_id, word_token, country_code)
+                   (SELECT nextval('seq_word'), lookup_token, %s
+                      FROM (SELECT ' ' || make_standard_name(n) as lookup_token
+                            FROM unnest(%s)n) y
+                      WHERE NOT EXISTS(SELECT * FROM word
+                                       WHERE word_token = lookup_token and country_code = %s))
+                """, (country_code, names, country_code))
+
+
     def process_place(self, place):
         """ Determine tokenizer information about the given place.
 
@@ -231,16 +350,38 @@ class LegacyNameAnalyzer:
         """
         token_info = _TokenInfo(self._cache)
 
-        token_info.add_names(self.conn, place.get('name'), place.get('country_feature'))
+        names = place.get('name')
+
+        if names:
+            token_info.add_names(self.conn, names)
+
+            country_feature = place.get('country_feature')
+            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
+                self.add_country_names(country_feature.lower(), list(names.values()))
 
         address = place.get('address')
 
         if address:
-            self._add_postcode(address.get('postcode'))
-            token_info.add_housenumbers(self.conn, address)
-            token_info.add_address_parent(self.conn, address.get('street'),
-                                          address.get('place'))
-            token_info.add_address_parts(self.conn, address)
+            hnrs = []
+            addr_terms = []
+            for key, value in address.items():
+                if key == 'postcode':
+                    self._add_postcode(value)
+                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                    hnrs.append(value)
+                elif key == 'street':
+                    token_info.add_street(self.conn, value)
+                elif key == 'place':
+                    token_info.add_place(self.conn, value)
+                elif not key.startswith('_') and \
+                     key not in ('country', 'full'):
+                    addr_terms.append((key, value))
+
+            if hnrs:
+                token_info.add_housenumbers(self.conn, hnrs)
+
+            if addr_terms:
+                token_info.add_address_terms(self.conn, addr_terms)
 
         return token_info.data
 
@@ -248,14 +389,12 @@ class LegacyNameAnalyzer:
     def _add_postcode(self, postcode):
         """ Make sure the normalized postcode is present in the word table.
         """
-        if not postcode or re.search(r'[:,;]', postcode) is not None:
-            return
-
         def _create_postcode_from_db(pcode):
             with self.conn.cursor() as cur:
                 cur.execute('SELECT create_postcode_id(%s)', (pcode, ))
 
-        self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)
+        if re.search(r'[:,;]', postcode) is None:
+            self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)
 
 
 class _TokenInfo:
@@ -266,32 +405,18 @@ class _TokenInfo:
         self.data = {}
 
 
-    def add_names(self, conn, names, country_feature):
+    def add_names(self, conn, names):
         """ Add token information for the names of the place.
         """
-        if not names:
-            return
-
         with conn.cursor() as cur:
             # Create the token IDs for all names.
             self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
                                             (names, ))
 
-            # Add country tokens to word table if necessary.
-            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                cur.execute("SELECT create_country(%s, %s)",
-                            (names, country_feature.lower()))
-
 
-    def add_housenumbers(self, conn, address):
+    def add_housenumbers(self, conn, hnrs):
         """ Extract housenumber information from the address.
         """
-        hnrs = [v for k, v in address.items()
-                if k in ('housenumber', 'streetnumber', 'conscriptionnumber')]
-
-        if not hnrs:
-            return
-
         if len(hnrs) == 1:
             token = self.cache.get_housenumber(hnrs[0])
             if token is not None:
@@ -312,27 +437,33 @@ class _TokenInfo:
             self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
 
 
-    def add_address_parent(self, conn, street, place):
-        """ Extract the tokens for street and place terms.
+    def add_street(self, conn, street):
+        """ Add addr:street match terms.
         """
-        def _get_streetplace(name):
+        def _get_street(name):
             with conn.cursor() as cur:
-                cur.execute("""SELECT (addr_ids_from_name(%s) || getorcreate_name_id(make_standard_name(%s), ''))::text,
+                return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))
+
+        self.data['street'] = self.cache.streets.get(street, _get_street)
+
+
+    def add_place(self, conn, place):
+        """ Add addr:place search and match terms.
+        """
+        def _get_place(name):
+            with conn.cursor() as cur:
+                cur.execute("""SELECT (addr_ids_from_name(%s)
+                                       || getorcreate_name_id(make_standard_name(%s), ''))::text,
                                       word_ids_from_name(%s)::text""",
                             (name, name, name))
                 return cur.fetchone()
 
-        if street:
-            self.data['street_search'], self.data['street_match'] = \
-                self.cache.streets.get(street, _get_streetplace)
+        self.data['place_search'], self.data['place_match'] = \
+            self.cache.places.get(place, _get_place)
 
-        if place:
-            self.data['place_search'], self.data['place_match'] = \
-                self.cache.streets.get(place, _get_streetplace)
 
-
-    def add_address_parts(self, conn, address):
-        """ Extract address terms.
+    def add_address_terms(self, conn, terms):
+        """ Add additional address terms.
         """
         def _get_address_term(name):
             with conn.cursor() as cur:
@@ -342,14 +473,10 @@ class _TokenInfo:
                 return cur.fetchone()
 
         tokens = {}
-        for key, value in address.items():
-            if not key.startswith('_') and \
-               key not in ('country', 'street', 'place', 'postcode', 'full',
-                           'housenumber', 'streetnumber', 'conscriptionnumber'):
-                tokens[key] = self.cache.address_terms.get(value, _get_address_term)
+        for key, value in terms:
+            tokens[key] = self.cache.address_terms.get(value, _get_address_term)
 
-        if tokens:
-            self.data['addr'] = tokens
+        self.data['addr'] = tokens
 
 
 class _LRU:
@@ -357,9 +484,11 @@ class _LRU:
         produce the item when there is a cache miss.
     """
 
-    def __init__(self, maxsize=128):
-        self.data = OrderedDict()
+    def __init__(self, maxsize=128, init_data=None):
+        self.data = init_data or OrderedDict()
         self.maxsize = maxsize
+        if init_data is not None and len(init_data) > maxsize:
+            self.maxsize = len(init_data)
 
     def get(self, key, generator):
         """ Get the item with the given key from the cache. If nothing
@@ -386,7 +515,6 @@ class _TokenCache:
     """
     def __init__(self, conn):
         # various LRU caches
-        self.postcodes = _LRU(maxsize=32)
         self.streets = _LRU(maxsize=256)
         self.places = _LRU(maxsize=128)
         self.address_terms = _LRU(maxsize=1024)
@@ -397,6 +525,14 @@ class _TokenCache:
                            FROM generate_series(1, 100) as i""")
             self._cached_housenumbers = {str(r[0]) : r[1] for r in cur}
 
+        # Get postcodes that are already saved
+        postcodes = OrderedDict()
+        with conn.cursor() as cur:
+            cur.execute("""SELECT word FROM word
+                           WHERE class ='place' and type = 'postcode'""")
+            for row in cur:
+                postcodes[row[0]] = None
+        self.postcodes = _LRU(maxsize=32, init_data=postcodes)
 
     def get_housenumber(self, number):
         """ Get a housenumber token from the cache.