git.openstreetmap.org Git - nominatim.git/commitdiff
rename legacy_icu tokenizer to icu tokenizer
author Sarah Hoffmann <lonvia@denofr.de>
Tue, 17 Aug 2021 21:11:47 +0000 (23:11 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Tue, 17 Aug 2021 21:11:47 +0000 (23:11 +0200)
The new icu tokenizer is no longer compatible with the old
legacy tokenizer in terms of data structures. Therefore there
is no longer a need to refer to the legacy tokenizer in the
name.
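
For projects that selected the tokenizer by its old name, the rename only changes the value of the configuration setting (a hedged example; the variable itself appears in the docs/admin/Tokenizers.md hunk below):

```
# before
NOMINATIM_TOKENIZER=legacy_icu
# after
NOMINATIM_TOKENIZER=icu
```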

CMakeLists.txt
docs/admin/Tokenizers.md
lib-php/tokenizer/icu_tokenizer.php [moved from lib-php/tokenizer/legacy_icu_tokenizer.php with 100% similarity]
lib-sql/tokenizer/icu_tokenizer.sql [moved from lib-sql/tokenizer/legacy_icu_tokenizer.sql with 100% similarity]
nominatim/tokenizer/icu_tokenizer.py [moved from nominatim/tokenizer/legacy_icu_tokenizer.py with 98% similarity]
settings/icu_tokenizer.yaml [moved from settings/legacy_icu_tokenizer.yaml with 100% similarity]
test/Makefile
test/bdd/steps/nominatim_environment.py
test/bdd/steps/steps_db_ops.py
test/python/test_tokenizer_icu.py [moved from test/python/test_tokenizer_legacy_icu.py with 96% similarity]

index 0b2d7b11ec291528a62d2b45b518227ac63a7661..ef76a4affc2b1e611abaceaa74684f0564b38653 100644 (file)
@@ -258,6 +258,6 @@ install(FILES settings/env.defaults
               settings/import-address.style
               settings/import-full.style
               settings/import-extratags.style
-              settings/legacy_icu_tokenizer.yaml
+              settings/icu_tokenizer.yaml
               settings/icu-rules/extended-unicode-to-asccii.yaml
         DESTINATION ${NOMINATIM_CONFIGDIR})
index f3454f67df0714be7a75f7afb5303a96f3109743..6f8898c8ee70690d88aabd63661b758c9ed37b38 100644 (file)
@@ -52,6 +52,12 @@ The ICU tokenizer uses the [ICU library](http://site.icu-project.org/) to
 normalize names and queries. It also offers configurable decomposition and
 abbreviation handling.
 
+To enable the tokenizer add the following line to your project configuration:
+
+```
+NOMINATIM_TOKENIZER=icu
+```
+
 ### How it works
 
 On import the tokenizer processes names in the following four stages:
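
The documentation change above ties the user-facing name to the module name. A minimal sketch of that resolution step, assuming Nominatim loads the tokenizer module by the convention `nominatim.tokenizer.<name>_tokenizer` (the factory itself is not part of this diff):

```python
import importlib

def load_tokenizer_module(name):
    # 'name' is the value of NOMINATIM_TOKENIZER, e.g. 'icu';
    # the module naming convention used here is an assumption.
    return importlib.import_module('nominatim.tokenizer.%s_tokenizer' % name)
```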
similarity index 98%
rename from nominatim/tokenizer/legacy_icu_tokenizer.py
rename to nominatim/tokenizer/icu_tokenizer.py
index 44034f842622f08257878b69d392af1f47b00df7..cb4112049fb7e8173b835fa1638db0f6ee3a7cc4 100644 (file)
@@ -52,7 +52,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         if config.TOKENIZER_CONFIG:
             cfgfile = Path(config.TOKENIZER_CONFIG)
         else:
-            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
+            cfgfile = config.config_dir / 'icu_tokenizer.yaml'
 
         loader = ICURuleLoader(cfgfile)
         self.naming_rules = ICUNameProcessorRules(loader=loader)
@@ -88,7 +88,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         with connect(self.dsn) as conn:
             max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
             sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql',
                               max_word_freq=max_word_freq)
 
 
@@ -98,7 +98,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         self.init_from_project()
 
         if self.naming_rules is None:
-            return "Configuration for tokenizer 'legacy_icu' are missing."
+            return "Configuration for tokenizer 'icu' are missing."
 
         return None
 
@@ -130,7 +130,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
             @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
             @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
             @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
-            require_once('{phpdir}/tokenizer/legacy_icu_tokenizer.php');"""))
+            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
 
 
     def _save_config(self, config):
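
Taken together, the hunks above rename every resource the tokenizer resolves by name: the YAML defaults, the SQL functions file, and the PHP module. A hedged sketch of the default-config fallback as it reads after the rename (`TOKENIZER_CONFIG` and `config_dir` are the names used in the diff; the standalone function is illustrative):

```python
from pathlib import Path

def resolve_tokenizer_config(config):
    # An explicit NOMINATIM_TOKENIZER_CONFIG setting wins ...
    if config.TOKENIZER_CONFIG:
        return Path(config.TOKENIZER_CONFIG)
    # ... otherwise fall back to the renamed default in the config directory.
    return config.config_dir / 'icu_tokenizer.yaml'
```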
index b8afdf9b2b9eb3f240f68468a1efc8e6a3d63dca..6dd9a349405eebb39a72e48b43a6ab095544986f 100644 (file)
@@ -5,7 +5,7 @@ bdd:
        cd bdd && behave -DREMOVE_TEMPLATE=1
 
 icu:
-       cd bdd && behave -DREMOVE_TEMPLATE=1 -DTOKENIZER=legacy_icu
+       cd bdd && behave -DREMOVE_TEMPLATE=1 -DTOKENIZER=icu
 
 php:
        cd php && phpunit ./
index 1deb43f38a2f4fae6fb8820226903701b0ea55bd..76f90cfa316af280262a066bbf5da77ccdb3d45d 100644 (file)
@@ -201,7 +201,7 @@ class NominatimEnvironment:
                     self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
                     self.run_nominatim('freeze')
 
-                    if self.tokenizer != 'legacy_icu':
+                    if self.tokenizer != 'icu':
                         phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
                         run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
                     else:
index ac61fc67356aa8ab04274fe69bf9b28f0eddeffd..d1f27235642f390433552f4177baf6521b80ec43 100644 (file)
@@ -280,7 +280,7 @@ def check_word_table_for_postcodes(context, exclude, postcodes):
     plist.sort()
 
     with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-        if nctx.tokenizer == 'legacy_icu':
+        if nctx.tokenizer == 'icu':
             cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
                         (plist,))
         else:
             cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
                         (plist,))
         else:
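
The step above also shows the data-structure divergence the commit message mentions: with the icu tokenizer, postcodes are rows of type 'P' in the word table. A hedged sketch of the same check outside behave (the query text is verbatim from the step; the helper and its `conn` argument are illustrative):

```python
def postcodes_in_word_table(conn, plist):
    # conn: an open psycopg2 connection to the test database (assumed)
    with conn.cursor() as cur:
        cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
                    (plist,))
        return {row[0] for row in cur}
```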
similarity index 96%
rename from test/python/test_tokenizer_legacy_icu.py
rename to test/python/test_tokenizer_icu.py
index ed489662550b8f100308612311f1d6ea51a02d40..5ec434b6f4b349902ca743106a9199f1382979bc 100644 (file)
@@ -6,7 +6,7 @@ import yaml
 
 import pytest
 
-from nominatim.tokenizer import legacy_icu_tokenizer
+from nominatim.tokenizer import icu_tokenizer
 from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.db import properties
@@ -26,7 +26,7 @@ def test_config(def_config, tmp_path):
     sqldir = tmp_path / 'sql'
     sqldir.mkdir()
     (sqldir / 'tokenizer').mkdir()
-    (sqldir / 'tokenizer' / 'legacy_icu_tokenizer.sql').write_text("SELECT 'a'")
+    (sqldir / 'tokenizer' / 'icu_tokenizer.sql').write_text("SELECT 'a'")
     shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer_tables.sql'),
                 str(sqldir / 'tokenizer' / 'icu_tokenizer_tables.sql'))
 
@@ -41,7 +41,7 @@ def tokenizer_factory(dsn, tmp_path, property_table,
     (tmp_path / 'tokenizer').mkdir()
 
     def _maker():
-        return legacy_icu_tokenizer.create(dsn, tmp_path / 'tokenizer')
+        return icu_tokenizer.create(dsn, tmp_path / 'tokenizer')
 
     return _maker
 
@@ -57,7 +57,7 @@ def db_prop(temp_db_conn):
 @pytest.fixture
 def analyzer(tokenizer_factory, test_config, monkeypatch,
              temp_db_with_extensions, tmp_path):
-    sql = tmp_path / 'sql' / 'tokenizer' / 'legacy_icu_tokenizer.sql'
+    sql = tmp_path / 'sql' / 'tokenizer' / 'icu_tokenizer.sql'
     sql.write_text("SELECT 'a';")
 
     monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
     sql.write_text("SELECT 'a';")
 
     monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
@@ -146,8 +146,8 @@ def test_init_new(tokenizer_factory, test_config, monkeypatch, db_prop):
     tok = tokenizer_factory()
     tok.init_new_db(test_config)
 
-    assert db_prop(legacy_icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
-    assert db_prop(legacy_icu_tokenizer.DBCFG_MAXWORDFREQ) is not None
+    assert db_prop(icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
+    assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) is not None
 
 
 def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
@@ -187,11 +187,11 @@ def test_update_sql_functions(db_prop, temp_db_cursor,
     tok.init_new_db(test_config)
     monkeypatch.undo()
 
-    assert db_prop(legacy_icu_tokenizer.DBCFG_MAXWORDFREQ) == '1133'
+    assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) == '1133'
 
     table_factory('test', 'txt TEXT')
 
-    func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_icu_tokenizer.sql'
+    func_file = test_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer.sql'
     func_file.write_text("""INSERT INTO test VALUES ('{{max_word_freq}}')""")
 
     tok.update_sql_functions(test_config)
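
With module and file renamed in step, the updated test suite is addressed by its new path, for example through pytest's Python entry point (equivalent to running `pytest test/python/test_tokenizer_icu.py` from the repository root):

```python
import pytest

# Run only the renamed ICU tokenizer tests; exit code 0 means all passed.
pytest.main(['test/python/test_tokenizer_icu.py'])
```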