From: Sarah Hoffmann
Date: Wed, 2 Apr 2025 18:20:04 +0000 (+0200)
Subject: remove automatic setup of tokenizer directory
X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/186f562dd7208ce7174d12c8cbe18428f185fd36?hp=-c

remove automatic setup of tokenizer directory

The ICU tokenizer doesn't need any extra data anymore, so it makes no
sense to create a directory which then remains empty. If a tokenizer
needs such a directory in the future, it must create it on its own and
correctly handle the situation where no project directory is used at all.
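As a purely hypothetical sketch (not part of this commit), a future
tokenizer that does need on-disk data could set itself up along these
lines; the class name and method body are illustrative only:

    from pathlib import Path
    from typing import Optional

    class FancyTokenizer:  # hypothetical example, not actual Nominatim code
        def __init__(self, dsn: str) -> None:
            self.dsn = dsn
            self.data_dir: Optional[Path] = None

        def init_new_db(self, config, init_db: bool = True) -> None:
            # The tokenizer decides for itself whether it needs a data
            # directory and must tolerate a missing project directory.
            if config.project_dir is not None:
                self.data_dir = config.project_dir / 'tokenizer'
                self.data_dir.mkdir(exist_ok=True)
            # ... continue with the actual database setup ...

This keeps all knowledge about on-disk data inside the tokenizer module
instead of having the factory guess what every tokenizer might need.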
---
186f562dd7208ce7174d12c8cbe18428f185fd36
diff --git a/src/nominatim_db/tokenizer/base.py b/src/nominatim_db/tokenizer/base.py
index 4b96cb23..af2816ec 100644
--- a/src/nominatim_db/tokenizer/base.py
+++ b/src/nominatim_db/tokenizer/base.py
@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Abstract class definitions for tokenizers. These base classes are here
@@ -10,7 +10,6 @@ mainly for documentation purposes.
 """
 from abc import ABC, abstractmethod
 from typing import List, Tuple, Dict, Any, Optional, Iterable
-from pathlib import Path
 
 from ..typing import Protocol
 from ..config import Configuration
@@ -232,6 +231,6 @@ class TokenizerModule(Protocol):
         own tokenizer.
     """
 
-    def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
+    def create(self, dsn: str) -> AbstractTokenizer:
         """ Factory for new tokenizers.
         """
diff --git a/src/nominatim_db/tokenizer/factory.py b/src/nominatim_db/tokenizer/factory.py
index 70b2b0be..570f9f86 100644
--- a/src/nominatim_db/tokenizer/factory.py
+++ b/src/nominatim_db/tokenizer/factory.py
@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Functions for creating a tokenizer or initialising the right one for an
@@ -52,19 +52,10 @@ def create_tokenizer(config: Configuration, init_db: bool = True,
     if module_name is None:
         module_name = config.TOKENIZER
 
-    # Create the directory for the tokenizer data
-    assert config.project_dir is not None
-    basedir = config.project_dir / 'tokenizer'
-    if not basedir.exists():
-        basedir.mkdir()
-    elif not basedir.is_dir():
-        LOG.fatal("Tokenizer directory '%s' cannot be created.", basedir)
-        raise UsageError("Tokenizer setup failed.")
-
     # Import and initialize the tokenizer.
     tokenizer_module = _import_tokenizer(module_name)
 
-    tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
+    tokenizer = tokenizer_module.create(config.get_libpq_dsn())
     tokenizer.init_new_db(config, init_db=init_db)
 
     with connect(config.get_libpq_dsn()) as conn:
@@ -80,10 +71,6 @@ def get_tokenizer_for_db(config: Configuration) -> AbstractTokenizer:
     and initialises it.
     """
     assert config.project_dir is not None
-    basedir = config.project_dir / 'tokenizer'
-    if not basedir.is_dir():
-        # Directory will be repopulated by tokenizer below.
-        basedir.mkdir()
 
     with connect(config.get_libpq_dsn()) as conn:
         name = properties.get_property(conn, 'tokenizer')
@@ -94,7 +81,7 @@ def get_tokenizer_for_db(config: Configuration) -> AbstractTokenizer:
 
     tokenizer_module = _import_tokenizer(name)
 
-    tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
+    tokenizer = tokenizer_module.create(config.get_libpq_dsn())
     tokenizer.init_from_project(config)
 
     return tokenizer
diff --git a/src/nominatim_db/tokenizer/icu_tokenizer.py b/src/nominatim_db/tokenizer/icu_tokenizer.py
index 297c9ef9..889bf531 100644
--- a/src/nominatim_db/tokenizer/icu_tokenizer.py
+++ b/src/nominatim_db/tokenizer/icu_tokenizer.py
@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Tokenizer implementing normalisation as used before Nominatim 4 but using
@@ -12,7 +12,6 @@ from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
                    Dict, Set, Iterable
 import itertools
 import logging
-from pathlib import Path
 
 from psycopg.types.json import Jsonb
 from psycopg import sql as pysql
@@ -38,10 +37,10 @@ WORD_TYPES = (('country_names', 'C'),
               ('housenumbers', 'H'))
 
 
-def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
+def create(dsn: str) -> 'ICUTokenizer':
     """ Create a new instance of the tokenizer provided by this module.
     """
-    return ICUTokenizer(dsn, data_dir)
+    return ICUTokenizer(dsn)
 
 
 class ICUTokenizer(AbstractTokenizer):
@@ -50,9 +49,8 @@ class ICUTokenizer(AbstractTokenizer):
         normalization routines in Nominatim 3.
     """
 
-    def __init__(self, dsn: str, data_dir: Path) -> None:
+    def __init__(self, dsn: str) -> None:
         self.dsn = dsn
-        self.data_dir = data_dir
         self.loader: Optional[ICURuleLoader] = None
 
     def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
diff --git a/test/python/conftest.py b/test/python/conftest.py
index b2ab99ed..046ee5a6 100644
--- a/test/python/conftest.py
+++ b/test/python/conftest.py
@@ -234,6 +234,6 @@ def tokenizer_mock(monkeypatch, property_table):
     property_table.set('tokenizer', 'dummy')
 
     def _create_tokenizer():
-        return dummy_tokenizer.DummyTokenizer(None, None)
+        return dummy_tokenizer.DummyTokenizer(None)
 
     return _create_tokenizer
diff --git a/test/python/dummy_tokenizer.py b/test/python/dummy_tokenizer.py
index 08554129..ce74004a 100644
--- a/test/python/dummy_tokenizer.py
+++ b/test/python/dummy_tokenizer.py
@@ -11,17 +11,16 @@ from nominatim_db.data.place_info import PlaceInfo
 from nominatim_db.config import Configuration
 
 
-def create(dsn, data_dir):
+def create(dsn):
     """ Create a new instance of the tokenizer provided by this module.
     """
-    return DummyTokenizer(dsn, data_dir)
+    return DummyTokenizer(dsn)
 
 
 class DummyTokenizer:
 
-    def __init__(self, dsn, data_dir):
+    def __init__(self, dsn):
         self.dsn = dsn
-        self.data_dir = data_dir
         self.init_state = None
         self.analyser_cache = {}
 
diff --git a/test/python/tokenizer/test_factory.py b/test/python/tokenizer/test_factory.py
index 4f8d2cfe..106cdaaf 100644
--- a/test/python/tokenizer/test_factory.py
+++ b/test/python/tokenizer/test_factory.py
@@ -32,24 +32,9 @@ class TestFactory:
 
         assert isinstance(tokenizer, DummyTokenizer)
         assert tokenizer.init_state == "new"
-        assert (self.config.project_dir / 'tokenizer').is_dir()
 
         assert properties.get_property(temp_db_conn, 'tokenizer') == 'dummy'
 
-    def test_setup_tokenizer_dir_exists(self):
-        (self.config.project_dir / 'tokenizer').mkdir()
-
-        tokenizer = factory.create_tokenizer(self.config)
-
-        assert isinstance(tokenizer, DummyTokenizer)
-        assert tokenizer.init_state == "new"
-
-    def test_setup_tokenizer_dir_failure(self):
-        (self.config.project_dir / 'tokenizer').write_text("foo")
-
-        with pytest.raises(UsageError):
-            factory.create_tokenizer(self.config)
-
     def test_load_tokenizer(self):
         factory.create_tokenizer(self.config)
 
@@ -64,7 +49,6 @@ class TestFactory:
         self.config.project_dir = self.config.project_dir
 
         factory.get_tokenizer_for_db(self.config)
-        assert (self.config.project_dir / 'tokenizer').exists()
 
     def test_load_missing_property(self, temp_db_cursor):
         factory.create_tokenizer(self.config)
diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py
index 12cef894..6d2e9ce7 100644
--- a/test/python/tokenizer/test_icu.py
+++ b/test/python/tokenizer/test_icu.py
@@ -39,12 +39,9 @@ def test_config(project_env, tmp_path):
 
 
 @pytest.fixture
-def tokenizer_factory(dsn, tmp_path, property_table,
-                      sql_preprocessor, place_table, word_table):
-    (tmp_path / 'tokenizer').mkdir()
-
+def tokenizer_factory(dsn, property_table, sql_preprocessor, place_table, word_table):
     def _maker():
-        return icu_tokenizer.create(dsn, tmp_path / 'tokenizer')
+        return icu_tokenizer.create(dsn)
 
     return _maker
 
diff --git a/test/python/tools/test_postcodes.py b/test/python/tools/test_postcodes.py
index b03c9748..7610b1be 100644
--- a/test/python/tools/test_postcodes.py
+++ b/test/python/tools/test_postcodes.py
@@ -63,7 +63,7 @@ class MockPostcodeTable:
 
 @pytest.fixture
 def tokenizer():
-    return dummy_tokenizer.DummyTokenizer(None, None)
+    return dummy_tokenizer.DummyTokenizer(None)
 
 
 @pytest.fixture