The new icu tokenizer is no longer compatible with the old
legacy tokenizer in terms of data structures. Therefore there
is also no longer a need to refer to the legacy tokenizer in
its name.
settings/import-address.style
settings/import-full.style
settings/import-extratags.style
- settings/legacy_icu_tokenizer.yaml
+ settings/icu_tokenizer.yaml
settings/icu-rules/extended-unicode-to-asccii.yaml
DESTINATION ${NOMINATIM_CONFIGDIR})
normalize names and queries. It also offers configurable decomposition and
abbreviation handling.
+To enable the tokenizer, add the following line to your project configuration:
+
+```
+NOMINATIM_TOKENIZER=icu
+```
+
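For projects that need custom rules, the same mechanism can point the tokenizer at a project-specific configuration file. A hedged example (the setting name follows the `config.TOKENIZER_CONFIG` lookup in the loader change below; the path is a placeholder):

```
NOMINATIM_TOKENIZER_CONFIG=/srv/nominatim-project/my-icu-rules.yaml
```

When this setting is left unset, the loader falls back to the renamed default, `settings/icu_tokenizer.yaml`.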
### How it works
On import the tokenizer processes names in the following four stages:
if config.TOKENIZER_CONFIG:
cfgfile = Path(config.TOKENIZER_CONFIG)
else:
- cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
+ cfgfile = config.config_dir / 'icu_tokenizer.yaml'
loader = ICURuleLoader(cfgfile)
self.naming_rules = ICUNameProcessorRules(loader=loader)
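Taken on its own, the fallback above amounts to this small helper (a standalone sketch for illustration only; `resolve_tokenizer_config` is not a function in the code base):

```python
from pathlib import Path

def resolve_tokenizer_config(config):
    # An explicit NOMINATIM_TOKENIZER_CONFIG setting wins; otherwise
    # fall back to the renamed default rules file in the config directory.
    if config.TOKENIZER_CONFIG:
        return Path(config.TOKENIZER_CONFIG)
    return config.config_dir / 'icu_tokenizer.yaml'
```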
with connect(self.dsn) as conn:
max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
sqlp = SQLPreprocessor(conn, config)
- sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
+ sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql',
max_word_freq=max_word_freq)
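The `max_word_freq` parameter passed here is substituted into the SQL file by the preprocessor; the test at the end of this diff exercises exactly that. A minimal sketch of the substitution step, assuming plain Jinja2 template semantics (`render_sql` is an illustrative stand-in, not the real `SQLPreprocessor` API):

```python
import jinja2

def render_sql(sql_text, **params):
    # Fill template placeholders in the SQL source before execution.
    return jinja2.Template(sql_text).render(**params)

sql = render_sql("INSERT INTO test VALUES ('{{max_word_freq}}')",
                 max_word_freq=2500)
assert sql == "INSERT INTO test VALUES ('2500')"
```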
self.init_from_project()
if self.naming_rules is None:
- return "Configuration for tokenizer 'legacy_icu' are missing."
+ return "Configuration for tokenizer 'icu' is missing."
@define('CONST_Max_Word_Frequency', {self.max_word_frequency});
@define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
@define('CONST_Transliteration', "{self.naming_rules.search_rules}");
- require_once('{phpdir}/tokenizer/legacy_icu_tokenizer.php');"""))
+ require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
def _save_config(self, config):
cd bdd && behave -DREMOVE_TEMPLATE=1
icu:
- cd bdd && behave -DREMOVE_TEMPLATE=1 -DTOKENIZER=legacy_icu
+ cd bdd && behave -DREMOVE_TEMPLATE=1 -DTOKENIZER=icu
php:
cd php && phpunit ./
self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
self.run_nominatim('freeze')
- if self.tokenizer != 'legacy_icu':
+ if self.tokenizer != 'icu':
phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
else:
plist.sort()
with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
- if nctx.tokenizer == 'legacy_icu':
+ if nctx.tokenizer == 'icu':
cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
(plist,))
else:
-from nominatim.tokenizer import legacy_icu_tokenizer
+from nominatim.tokenizer import icu_tokenizer
from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.db import properties
sqldir = tmp_path / 'sql'
sqldir.mkdir()
(sqldir / 'tokenizer').mkdir()
- (sqldir / 'tokenizer' / 'legacy_icu_tokenizer.sql').write_text("SELECT 'a'")
+ (sqldir / 'tokenizer' / 'icu_tokenizer.sql').write_text("SELECT 'a'")
shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer_tables.sql'),
str(sqldir / 'tokenizer' / 'icu_tokenizer_tables.sql'))
(tmp_path / 'tokenizer').mkdir()
def _maker():
- return legacy_icu_tokenizer.create(dsn, tmp_path / 'tokenizer')
+ return icu_tokenizer.create(dsn, tmp_path / 'tokenizer')
@pytest.fixture
def analyzer(tokenizer_factory, test_config, monkeypatch,
temp_db_with_extensions, tmp_path):
- sql = tmp_path / 'sql' / 'tokenizer' / 'legacy_icu_tokenizer.sql'
+ sql = tmp_path / 'sql' / 'tokenizer' / 'icu_tokenizer.sql'
sql.write_text("SELECT 'a';")
monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
tok = tokenizer_factory()
tok.init_new_db(test_config)
- assert db_prop(legacy_icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
- assert db_prop(legacy_icu_tokenizer.DBCFG_MAXWORDFREQ) is not None
+ assert db_prop(icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
+ assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) is not None
def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
tok.init_new_db(test_config)
monkeypatch.undo()
- assert db_prop(legacy_icu_tokenizer.DBCFG_MAXWORDFREQ) == '1133'
+ assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) == '1133'
table_factory('test', 'txt TEXT')
- func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_icu_tokenizer.sql'
+ func_file = test_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer.sql'
func_file.write_text("""INSERT INTO test VALUES ('{{max_word_freq}}')""")
tok.update_sql_functions(test_config)