The TERM_NORMALIZATION config option is no longer applicable.
This was already documented, but the change had not yet been implemented.
from textwrap import dedent
from nominatim.db.connection import connect
from textwrap import dedent
from nominatim.db.connection import connect
-from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
self.dsn = dsn
self.data_dir = data_dir
self.loader = None
self.dsn = dsn
self.data_dir = data_dir
self.loader = None
- self.term_normalization = None
def init_new_db(self, config, init_db=True):
def init_new_db(self, config, init_db=True):
"""
self.loader = ICURuleLoader(config)
"""
self.loader = ICURuleLoader(config)
- self.term_normalization = config.TERM_NORMALIZATION
-
self._install_php(config.lib_dir.php)
self._save_config()
self._install_php(config.lib_dir.php)
self._save_config()
with connect(self.dsn) as conn:
self.loader.load_config_from_db(conn)
with connect(self.dsn) as conn:
self.loader.load_config_from_db(conn)
- self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
def finalize_import(self, config):
def finalize_import(self, config):
def check_database(self, config):
""" Check that the tokenizer is set up correctly.
"""
def check_database(self, config):
""" Check that the tokenizer is set up correctly.
"""
+ # Will throw an error if there is an issue.
self.init_from_project(config)
self.init_from_project(config)
- if self.term_normalization is None:
- return "Configuration for tokenizer 'icu' are missing."
-
- return None
-
def update_statistics(self):
""" Recompute frequencies for all name words.
def update_statistics(self):
""" Recompute frequencies for all name words.
php_file.write_text(dedent(f"""\
<?php
@define('CONST_Max_Word_Frequency', 10000000);
php_file.write_text(dedent(f"""\
<?php
@define('CONST_Max_Word_Frequency', 10000000);
- @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
+ @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
@define('CONST_Transliteration', "{self.loader.get_search_rules()}");
require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
@define('CONST_Transliteration', "{self.loader.get_search_rules()}");
require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
"""
with connect(self.dsn) as conn:
self.loader.save_config_to_db(conn)
"""
with connect(self.dsn) as conn:
self.loader.save_config_to_db(conn)
- set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
def _init_db_tables(self, config):
def _init_db_tables(self, config):
import pytest
from nominatim.tokenizer import icu_tokenizer
import pytest
from nominatim.tokenizer import icu_tokenizer
-from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
+import nominatim.tokenizer.icu_rule_loader
from nominatim.db import properties
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.db import properties
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
'token-analysis': [{'analyzer': 'generic',
'variants': [{'words': list(variants)}]}]}
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
'token-analysis': [{'analyzer': 'generic',
'variants': [{'words': list(variants)}]}]}
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
- tok.loader = ICURuleLoader(test_config)
+ tok.loader = nominatim.tokenizer.icu_rule_loader.ICURuleLoader(test_config)
return tok.name_analyzer()
return tok.name_analyzer()
SELECT -nextval('seq_word')::INTEGER; $$ LANGUAGE SQL""")
SELECT -nextval('seq_word')::INTEGER; $$ LANGUAGE SQL""")
-def test_init_new(tokenizer_factory, test_config, monkeypatch, db_prop):
- monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
-
+def test_init_new(tokenizer_factory, test_config, db_prop):
tok = tokenizer_factory()
tok.init_new_db(test_config)
tok = tokenizer_factory()
tok.init_new_db(test_config)
- assert db_prop(icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
+ assert db_prop(nominatim.tokenizer.icu_rule_loader.DBCFG_IMPORT_NORM_RULES) \
+ .startswith(':: lower ();')
def test_init_word_table(tokenizer_factory, test_config, place_row, temp_db_cursor):
def test_init_word_table(tokenizer_factory, test_config, place_row, temp_db_cursor):
assert temp_db_cursor.table_exists('word')
assert temp_db_cursor.table_exists('word')
-def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
- monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
+def test_init_from_project(test_config, tokenizer_factory):
tok = tokenizer_factory()
tok.init_new_db(test_config)
tok = tokenizer_factory()
tok.init_new_db(test_config)
tok = tokenizer_factory()
tok.init_from_project(test_config)
assert tok.loader is not None
tok = tokenizer_factory()
tok.init_from_project(test_config)
assert tok.loader is not None
- assert tok.term_normalization == ':: lower();'
def test_update_sql_functions(db_prop, temp_db_cursor,
def test_update_sql_functions(db_prop, temp_db_cursor,