The ICU tokenizer no longer needs any extra data, so it makes no
sense to create a directory that then just remains empty. If a
tokenizer needs such a directory in the future, it must create it
itself and correctly handle the situation where no project directory
is used at all.
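
As a rough sketch, a tokenizer that does need on-disk data could
manage the directory itself along the following lines. Every name
below is hypothetical; only the "no project directory" case it guards
against comes from the paragraph above:

    # Hypothetical sketch: a tokenizer that creates its own data
    # directory instead of relying on the factory to provide one.
    from pathlib import Path
    from typing import Optional

    class DataDirTokenizer:
        def __init__(self, dsn: str) -> None:
            self.dsn = dsn
            self.data_dir: Optional[Path] = None

        def init_new_db(self, project_dir: Optional[Path]) -> None:
            # Handle the case where no project directory is used at all.
            if project_dir is None:
                raise RuntimeError("This tokenizer requires a project directory.")
            # Create the directory on demand; tolerate an existing one.
            self.data_dir = project_dir / 'tokenizer'
            self.data_dir.mkdir(parents=True, exist_ok=True)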
#
# This file is part of Nominatim. (https://nominatim.org)
#
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Abstract class definitions for tokenizers. These base classes are here
"""
from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any, Optional, Iterable
"""
from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any, Optional, Iterable
-from pathlib import Path
from ..typing import Protocol
from ..config import Configuration
-    def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
+    def create(self, dsn: str) -> AbstractTokenizer:
        """ Factory for new tokenizers.
        """
#
# This file is part of Nominatim. (https://nominatim.org)
#
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for creating a tokenizer or initialising the right one for an
    if module_name is None:
        module_name = config.TOKENIZER
-    # Create the directory for the tokenizer data
-    assert config.project_dir is not None
-    basedir = config.project_dir / 'tokenizer'
-    if not basedir.exists():
-        basedir.mkdir()
-    elif not basedir.is_dir():
-        LOG.fatal("Tokenizer directory '%s' cannot be created.", basedir)
-        raise UsageError("Tokenizer setup failed.")
-
    # Import and initialize the tokenizer.
    tokenizer_module = _import_tokenizer(module_name)
-    tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
+    tokenizer = tokenizer_module.create(config.get_libpq_dsn())
    tokenizer.init_new_db(config, init_db=init_db)
    with connect(config.get_libpq_dsn()) as conn:
        and initialises it.
    """
    assert config.project_dir is not None
-    basedir = config.project_dir / 'tokenizer'
-    if not basedir.is_dir():
-        # Directory will be repopulated by tokenizer below.
-        basedir.mkdir()
    with connect(config.get_libpq_dsn()) as conn:
        name = properties.get_property(conn, 'tokenizer')
    tokenizer_module = _import_tokenizer(name)
-    tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
+    tokenizer = tokenizer_module.create(config.get_libpq_dsn())
    tokenizer.init_from_project(config)
    return tokenizer
#
# This file is part of Nominatim. (https://nominatim.org)
#
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
Dict, Set, Iterable
import itertools
import logging
-from pathlib import Path
from psycopg.types.json import Jsonb
from psycopg import sql as pysql
-def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
+def create(dsn: str) -> 'ICUTokenizer':
""" Create a new instance of the tokenizer provided by this module.
"""
""" Create a new instance of the tokenizer provided by this module.
"""
- return ICUTokenizer(dsn, data_dir)
+ return ICUTokenizer(dsn)
class ICUTokenizer(AbstractTokenizer):
        normalization routines in Nominatim 3.
    """
-    def __init__(self, dsn: str, data_dir: Path) -> None:
+    def __init__(self, dsn: str) -> None:
-        self.data_dir = data_dir
        self.loader: Optional[ICURuleLoader] = None

    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
    property_table.set('tokenizer', 'dummy')

    def _create_tokenizer():
-        return dummy_tokenizer.DummyTokenizer(None, None)
+        return dummy_tokenizer.DummyTokenizer(None)
from nominatim_db.config import Configuration
-def create(dsn, data_dir):
+def create(dsn):
""" Create a new instance of the tokenizer provided by this module.
"""
""" Create a new instance of the tokenizer provided by this module.
"""
- return DummyTokenizer(dsn, data_dir)
+ return DummyTokenizer(dsn)
-    def __init__(self, dsn, data_dir):
+    def __init__(self, dsn):
-        self.data_dir = data_dir
        self.init_state = None
        self.analyser_cache = {}
        assert isinstance(tokenizer, DummyTokenizer)
        assert tokenizer.init_state == "new"
-        assert (self.config.project_dir / 'tokenizer').is_dir()
        assert properties.get_property(temp_db_conn, 'tokenizer') == 'dummy'
-    def test_setup_tokenizer_dir_exists(self):
-        (self.config.project_dir / 'tokenizer').mkdir()
-
-        tokenizer = factory.create_tokenizer(self.config)
-
-        assert isinstance(tokenizer, DummyTokenizer)
-        assert tokenizer.init_state == "new"
-
-    def test_setup_tokenizer_dir_failure(self):
-        (self.config.project_dir / 'tokenizer').write_text("foo")
-
-        with pytest.raises(UsageError):
-            factory.create_tokenizer(self.config)
-
    def test_load_tokenizer(self):
        factory.create_tokenizer(self.config)
        self.config.project_dir = self.config.project_dir
        factory.get_tokenizer_for_db(self.config)
-        assert (self.config.project_dir / 'tokenizer').exists()
    def test_load_missing_property(self, temp_db_cursor):
        factory.create_tokenizer(self.config)
-def tokenizer_factory(dsn, tmp_path, property_table,
-                      sql_preprocessor, place_table, word_table):
-    (tmp_path / 'tokenizer').mkdir()
-
+def tokenizer_factory(dsn, property_table, sql_preprocessor, place_table, word_table):
-    return icu_tokenizer.create(dsn, tmp_path / 'tokenizer')
+    return icu_tokenizer.create(dsn)
@pytest.fixture
def tokenizer():
-    return dummy_tokenizer.DummyTokenizer(None, None)
+    return dummy_tokenizer.DummyTokenizer(None)
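
Taken together, the visible interface change for tokenizer modules is
the module-level create() entry point, which no longer receives a data
directory. A minimal sketch of a conforming module follows; MyTokenizer
is a made-up name, only the create(dsn) shape comes from the diff above:

    # Illustrative tokenizer module after this change. Only the
    # create(dsn) entry point mirrors the interface in the diff;
    # MyTokenizer is a hypothetical example class.
    class MyTokenizer:
        def __init__(self, dsn: str) -> None:
            self.dsn = dsn

    def create(dsn: str) -> MyTokenizer:
        """ Create a new instance of the tokenizer provided by this module.
        """
        return MyTokenizer(dsn)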