X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/925195725dfcb7f1a6795c50244c1df6cb7242ce..231250f2eb272b77d54e4b4b18bd85a80413ac34:/nominatim/tokenizer/icu_tokenizer.py diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index cb411204..81b07568 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -8,7 +8,6 @@ import json import logging import re from textwrap import dedent -from pathlib import Path from nominatim.db.connection import connect from nominatim.db.properties import set_property, get_property @@ -18,7 +17,6 @@ from nominatim.tokenizer.icu_rule_loader import ICURuleLoader from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer -DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq" DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization" LOG = logging.getLogger() @@ -40,7 +38,6 @@ class LegacyICUTokenizer(AbstractTokenizer): self.data_dir = data_dir self.naming_rules = None self.term_normalization = None - self.max_word_frequency = None def init_new_db(self, config, init_db=True): @@ -49,18 +46,13 @@ class LegacyICUTokenizer(AbstractTokenizer): This copies all necessary data in the project directory to make sure the tokenizer remains stable even over updates. """ - if config.TOKENIZER_CONFIG: - cfgfile = Path(config.TOKENIZER_CONFIG) - else: - cfgfile = config.config_dir / 'icu_tokenizer.yaml' - - loader = ICURuleLoader(cfgfile) + loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml', + config='TOKENIZER_CONFIG')) self.naming_rules = ICUNameProcessorRules(loader=loader) self.term_normalization = config.TERM_NORMALIZATION - self.max_word_frequency = config.MAX_WORD_FREQUENCY self._install_php(config.lib_dir.php) - self._save_config(config) + self._save_config() if init_db: self.update_sql_functions(config) @@ -73,7 +65,6 @@ class LegacyICUTokenizer(AbstractTokenizer): with connect(self.dsn) as conn: self.naming_rules = ICUNameProcessorRules(conn=conn) self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION) - self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ) def finalize_import(self, _): @@ -86,10 +77,8 @@ class LegacyICUTokenizer(AbstractTokenizer): """ Reimport the SQL functions for this tokenizer. """ with connect(self.dsn) as conn: - max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ) sqlp = SQLPreprocessor(conn, config) - sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql', - max_word_freq=max_word_freq) + sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql') def check_database(self): @@ -127,20 +116,19 @@ class LegacyICUTokenizer(AbstractTokenizer): php_file = self.data_dir / "tokenizer.php" php_file.write_text(dedent(f"""\