git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/legacy_tokenizer.py
port legacy tokenizer to new postcode handling
[nominatim.git] / nominatim / tokenizer / legacy_tokenizer.py
index 3b8f75692f964e9c2e84dc3ada92b156dd0afb7b..36fd5722441a12e92d24434d5dc1317497dd27bc 100644 (file)
@@ -74,10 +74,10 @@ def _check_module(module_dir, conn):
     with conn.cursor() as cur:
         try:
             cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
-                           RETURNS text AS '{}/nominatim.so', 'transliteration'
+                           RETURNS text AS %s, 'transliteration'
                            LANGUAGE c IMMUTABLE STRICT;
                            DROP FUNCTION nominatim_test_import_func(text)
-                        """.format(module_dir))
+                        """, (f'{module_dir}/nominatim.so', ))
         except psycopg2.DatabaseError as err:
             LOG.fatal("Error accessing database module: %s", err)
             raise UsageError("Database module cannot be accessed.") from err
@@ -250,12 +250,12 @@ class LegacyTokenizer(AbstractTokenizer):
         php_file = self.data_dir / "tokenizer.php"
 
         if not php_file.exists() or overwrite:
-            php_file.write_text(dedent("""\
+            php_file.write_text(dedent(f"""\
                 <?php
-                @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
-                @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
-                require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
-                """.format(config)))
+                @define('CONST_Max_Word_Frequency', {config.MAX_WORD_FREQUENCY});
+                @define('CONST_Term_Normalization_Rules', "{config.TERM_NORMALIZATION}");
+                require_once('{config.lib_dir.php}/tokenizer/legacy_tokenizer.php');
+                """), encoding='utf-8')
 
 
     def _init_db_tables(self, config):
@@ -337,8 +337,7 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
         return self.normalizer.transliterate(phrase)
 
 
-    @staticmethod
-    def normalize_postcode(postcode):
+    def normalize_postcode(self, postcode):
         """ Convert the postcode to a standardized form.
 
             This function must yield exactly the same result as the SQL function
@@ -468,15 +467,17 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
             if key == 'postcode':
                 # Make sure the normalized postcode is present in the word table.
                 if re.search(r'[:,;]', value) is None:
-                    self._cache.add_postcode(self.conn,
-                                             self.normalize_postcode(value))
+                    norm_pc = self.normalize_postcode(value)
+                    token_info.set_postcode(norm_pc)
+                    self._cache.add_postcode(self.conn, norm_pc)
             elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                 hnrs.append(value)
             elif key == 'street':
                 token_info.add_street(self.conn, value)
             elif key == 'place':
                 token_info.add_place(self.conn, value)
-            elif not key.startswith('_') and key not in ('country', 'full'):
+            elif not key.startswith('_') \
+                 and key not in ('country', 'full', 'inclusion'):
                 addr_terms.append((key, value))
 
         if hnrs:
@@ -527,6 +528,11 @@ class _TokenInfo:
             self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
 
 
+    def set_postcode(self, postcode):
+        """ Set or replace the postcode token with the given value.
+        """
+        self.data['postcode'] = postcode
+
     def add_street(self, conn, street):
         """ Add addr:street match terms.
         """