remove automatic setup of tokenizer directory

author Sarah Hoffmann <lonvia@denofr.de>

Wed, 2 Apr 2025 18:20:04 +0000 (20:20 +0200)

committer Sarah Hoffmann <lonvia@denofr.de>

Wed, 2 Apr 2025 18:20:04 +0000 (20:20 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Wed, 2 Apr 2025 18:20:04 +0000 (20:20 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Wed, 2 Apr 2025 18:20:04 +0000 (20:20 +0200)
diff --git a/src/nominatim_db/tokenizer/base.py b/src/nominatim_db/tokenizer/base.py

index 4b96cb235e10ccf80326f38ea8455cc75080d1dc..af2816ecd055a360f3d79b6730f12c676623e385 100644 (file)
--- a/src/nominatim_db/tokenizer/base.py
+++ b/src/nominatim_db/tokenizer/base.py
@@ -2,7 +2,7 @@
  #
  # This file is part of Nominatim. (https://nominatim.org)
  #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
  # For a full list of authors see the git log.
  """
  Abstract class definitions for tokenizers. These base classes are here
@@ -10,7 +10,6 @@ mainly for documentation purposes.
  """
  from abc import ABC, abstractmethod
  from typing import List, Tuple, Dict, Any, Optional, Iterable
-from pathlib import Path
  
  from ..typing import Protocol
  from ..config import Configuration
@@ -232,6 +231,6 @@ class TokenizerModule(Protocol):
          own tokenizer.
      """
  
-    def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
+    def create(self, dsn: str) -> AbstractTokenizer:
          """ Factory for new tokenizers.
          """
diff --git a/src/nominatim_db/tokenizer/factory.py b/src/nominatim_db/tokenizer/factory.py

index 70b2b0beed0b7718e9dfd19582540b2302cb75b0..570f9f865f5aab0ff0e5ec974049ac284b9847d5 100644 (file)
--- a/src/nominatim_db/tokenizer/factory.py
+++ b/src/nominatim_db/tokenizer/factory.py
@@ -2,7 +2,7 @@
  #
  # This file is part of Nominatim. (https://nominatim.org)
  #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
  # For a full list of authors see the git log.
  """
  Functions for creating a tokenizer or initialising the right one for an
@@ -52,19 +52,10 @@ def create_tokenizer(config: Configuration, init_db: bool = True,
      if module_name is None:
          module_name = config.TOKENIZER
  
-    # Create the directory for the tokenizer data
-    assert config.project_dir is not None
-    basedir = config.project_dir / 'tokenizer'
-    if not basedir.exists():
-        basedir.mkdir()
-    elif not basedir.is_dir():
-        LOG.fatal("Tokenizer directory '%s' cannot be created.", basedir)
-        raise UsageError("Tokenizer setup failed.")
-
      # Import and initialize the tokenizer.
      tokenizer_module = _import_tokenizer(module_name)
  
-    tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
+    tokenizer = tokenizer_module.create(config.get_libpq_dsn())
      tokenizer.init_new_db(config, init_db=init_db)
  
      with connect(config.get_libpq_dsn()) as conn:
@@ -80,10 +71,6 @@ def get_tokenizer_for_db(config: Configuration) -> AbstractTokenizer:
          and initialises it.
      """
      assert config.project_dir is not None
-    basedir = config.project_dir / 'tokenizer'
-    if not basedir.is_dir():
-        # Directory will be repopulated by tokenizer below.
-        basedir.mkdir()
  
      with connect(config.get_libpq_dsn()) as conn:
          name = properties.get_property(conn, 'tokenizer')
@@ -94,7 +81,7 @@ def get_tokenizer_for_db(config: Configuration) -> AbstractTokenizer:
  
      tokenizer_module = _import_tokenizer(name)
  
-    tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
+    tokenizer = tokenizer_module.create(config.get_libpq_dsn())
      tokenizer.init_from_project(config)
  
      return tokenizer
diff --git a/src/nominatim_db/tokenizer/icu_tokenizer.py b/src/nominatim_db/tokenizer/icu_tokenizer.py

index 297c9ef9a401e02413d8c223f9633c0d4ad36e33..889bf5315e960dbd8c0f1834a1d128f629ce5df4 100644 (file)
--- a/src/nominatim_db/tokenizer/icu_tokenizer.py
+++ b/src/nominatim_db/tokenizer/icu_tokenizer.py
@@ -2,7 +2,7 @@
  #
  # This file is part of Nominatim. (https://nominatim.org)
  #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
  # For a full list of authors see the git log.
  """
  Tokenizer implementing normalisation as used before Nominatim 4 but using
@@ -12,7 +12,6 @@ from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
                     Dict, Set, Iterable
  import itertools
  import logging
-from pathlib import Path
  
  from psycopg.types.json import Jsonb
  from psycopg import sql as pysql
@@ -38,10 +37,10 @@ WORD_TYPES = (('country_names', 'C'),
                ('housenumbers', 'H'))
  
  
-def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
+def create(dsn: str) -> 'ICUTokenizer':
      """ Create a new instance of the tokenizer provided by this module.
      """
-    return ICUTokenizer(dsn, data_dir)
+    return ICUTokenizer(dsn)
  
  
  class ICUTokenizer(AbstractTokenizer):
@@ -50,9 +49,8 @@ class ICUTokenizer(AbstractTokenizer):
          normalization routines in Nominatim 3.
      """
  
-    def __init__(self, dsn: str, data_dir: Path) -> None:
+    def __init__(self, dsn: str) -> None:
          self.dsn = dsn
-        self.data_dir = data_dir
          self.loader: Optional[ICURuleLoader] = None
  
      def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
diff --git a/test/python/conftest.py b/test/python/conftest.py

index b2ab99ed3514b4ebe3370bae8a6816928ef6eb14..046ee5a6e2c669f558914157253d709e78be4283 100644 (file)
--- a/test/python/conftest.py
+++ b/test/python/conftest.py
@@ -234,6 +234,6 @@ def tokenizer_mock(monkeypatch, property_table):
      property_table.set('tokenizer', 'dummy')
  
      def _create_tokenizer():
-        return dummy_tokenizer.DummyTokenizer(None, None)
+        return dummy_tokenizer.DummyTokenizer(None)
  
      return _create_tokenizer
diff --git a/test/python/dummy_tokenizer.py b/test/python/dummy_tokenizer.py

index 08554129257b74fbc43e526d4621deafac42f27f..ce74004ae7bb90dc72ed6be6c44920769fdd4a4b 100644 (file)
--- a/test/python/dummy_tokenizer.py
+++ b/test/python/dummy_tokenizer.py
@@ -11,17 +11,16 @@ from nominatim_db.data.place_info import PlaceInfo
  from nominatim_db.config import Configuration
  
  
-def create(dsn, data_dir):
+def create(dsn):
      """ Create a new instance of the tokenizer provided by this module.
      """
-    return DummyTokenizer(dsn, data_dir)
+    return DummyTokenizer(dsn)
  
  
  class DummyTokenizer:
  
-    def __init__(self, dsn, data_dir):
+    def __init__(self, dsn):
          self.dsn = dsn
-        self.data_dir = data_dir
          self.init_state = None
          self.analyser_cache = {}
  
diff --git a/test/python/tokenizer/test_factory.py b/test/python/tokenizer/test_factory.py

index 4f8d2cfecf69f2a74f67e3bb3205d59d8a2cce3d..106cdaaf053b17581c1ae31d33cbaa5e591d9978 100644 (file)
--- a/test/python/tokenizer/test_factory.py
+++ b/test/python/tokenizer/test_factory.py
@@ -32,24 +32,9 @@ class TestFactory:
  
          assert isinstance(tokenizer, DummyTokenizer)
          assert tokenizer.init_state == "new"
-        assert (self.config.project_dir / 'tokenizer').is_dir()
  
          assert properties.get_property(temp_db_conn, 'tokenizer') == 'dummy'
  
-    def test_setup_tokenizer_dir_exists(self):
-        (self.config.project_dir / 'tokenizer').mkdir()
-
-        tokenizer = factory.create_tokenizer(self.config)
-
-        assert isinstance(tokenizer, DummyTokenizer)
-        assert tokenizer.init_state == "new"
-
-    def test_setup_tokenizer_dir_failure(self):
-        (self.config.project_dir / 'tokenizer').write_text("foo")
-
-        with pytest.raises(UsageError):
-            factory.create_tokenizer(self.config)
-
      def test_load_tokenizer(self):
          factory.create_tokenizer(self.config)
  
@@ -64,7 +49,6 @@ class TestFactory:
          self.config.project_dir = self.config.project_dir
  
          factory.get_tokenizer_for_db(self.config)
-        assert (self.config.project_dir / 'tokenizer').exists()
  
      def test_load_missing_property(self, temp_db_cursor):
          factory.create_tokenizer(self.config)
diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py

index 12cef894f863cb2336b1be26240e28fb4a8ba28e..6d2e9ce778a648de0f64692dd5187c759d2b72d7 100644 (file)
--- a/test/python/tokenizer/test_icu.py
+++ b/test/python/tokenizer/test_icu.py
@@ -39,12 +39,9 @@ def test_config(project_env, tmp_path):
  
  
  @pytest.fixture
-def tokenizer_factory(dsn, tmp_path, property_table,
-                      sql_preprocessor, place_table, word_table):
-    (tmp_path / 'tokenizer').mkdir()
-
+def tokenizer_factory(dsn, property_table, sql_preprocessor, place_table, word_table):
      def _maker():
-        return icu_tokenizer.create(dsn, tmp_path / 'tokenizer')
+        return icu_tokenizer.create(dsn)
  
      return _maker
  
diff --git a/test/python/tools/test_postcodes.py b/test/python/tools/test_postcodes.py

index b03c9748441a69537859b404dc125db55411ff18..7610b1bede1183a2c3d4ab4f03b45204a378d0c4 100644 (file)
--- a/test/python/tools/test_postcodes.py
+++ b/test/python/tools/test_postcodes.py
@@ -63,7 +63,7 @@ class MockPostcodeTable:
  
  @pytest.fixture
  def tokenizer():
-    return dummy_tokenizer.DummyTokenizer(None, None)
+    return dummy_tokenizer.DummyTokenizer(None)
  
  
  @pytest.fixture
author	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 2 Apr 2025 18:20:04 +0000 (20:20 +0200)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 2 Apr 2025 18:20:04 +0000 (20:20 +0200)
src/nominatim_db/tokenizer/base.py		patch \| blob \| history
src/nominatim_db/tokenizer/factory.py		patch \| blob \| history
src/nominatim_db/tokenizer/icu_tokenizer.py		patch \| blob \| history
test/python/conftest.py		patch \| blob \| history
test/python/dummy_tokenizer.py		patch \| blob \| history
test/python/tokenizer/test_factory.py		patch \| blob \| history
test/python/tokenizer/test_icu.py		patch \| blob \| history
test/python/tools/test_postcodes.py		patch \| blob \| history