The ICU tokenizer no longer needs any extra data, so it makes no
sense to create a directory that then just remains empty. If a
tokenizer needs such a directory in the future, it must create it
itself and correctly handle the situation where no project directory
is used at all.
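
As a rough sketch, a tokenizer that does need on-disk data could
manage the directory itself along the following lines. Every name
below is hypothetical; only the "no project directory" case it guards
against comes from the paragraph above:

    # Hypothetical sketch: a tokenizer that creates its own data
    # directory instead of relying on the factory to provide one.
    from pathlib import Path
    from typing import Optional

    class DataDirTokenizer:
        def __init__(self, dsn: str) -> None:
            self.dsn = dsn
            self.data_dir: Optional[Path] = None

        def init_new_db(self, project_dir: Optional[Path]) -> None:
            # Handle the case where no project directory is used at all.
            if project_dir is None:
                raise RuntimeError("This tokenizer requires a project directory.")
            # Create the directory on demand; tolerate an existing one.
            self.data_dir = project_dir / 'tokenizer'
            self.data_dir.mkdir(parents=True, exist_ok=True)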
#
# This file is part of Nominatim. (https://nominatim.org)
#
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Abstract class definitions for tokenizers. These base classes are here
"""
from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any, Optional, Iterable
"""
from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any, Optional, Iterable
-from pathlib import Path
from ..typing import Protocol
from ..config import Configuration
-    def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
+    def create(self, dsn: str) -> AbstractTokenizer:
        """ Factory for new tokenizers.
        """
#
# This file is part of Nominatim. (https://nominatim.org)
#
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for creating a tokenizer or initialising the right one for an
    if module_name is None:
        module_name = config.TOKENIZER
-    # Create the directory for the tokenizer data
-    assert config.project_dir is not None
-    basedir = config.project_dir / 'tokenizer'
-    if not basedir.exists():
-        basedir.mkdir()
-    elif not basedir.is_dir():
-        LOG.fatal("Tokenizer directory '%s' cannot be created.", basedir)
-        raise UsageError("Tokenizer setup failed.")
-
    # Import and initialize the tokenizer.
    tokenizer_module = _import_tokenizer(module_name)
-    tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
+    tokenizer = tokenizer_module.create(config.get_libpq_dsn())
    tokenizer.init_new_db(config, init_db=init_db)
    with connect(config.get_libpq_dsn()) as conn:
        and initialises it.
    """
    assert config.project_dir is not None
-    basedir = config.project_dir / 'tokenizer'
-    if not basedir.is_dir():
-        # Directory will be repopulated by tokenizer below.
-        basedir.mkdir()
    with connect(config.get_libpq_dsn()) as conn:
        name = properties.get_property(conn, 'tokenizer')
    tokenizer_module = _import_tokenizer(name)
-    tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
+    tokenizer = tokenizer_module.create(config.get_libpq_dsn())
    tokenizer.init_from_project(config)
    return tokenizer
#
# This file is part of Nominatim. (https://nominatim.org)
#
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
Dict, Set, Iterable
import itertools
import logging
-from pathlib import Path
from psycopg.types.json import Jsonb
from psycopg import sql as pysql
-def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
+def create(dsn: str) -> 'ICUTokenizer':
""" Create a new instance of the tokenizer provided by this module.
"""
""" Create a new instance of the tokenizer provided by this module.
"""
- return ICUTokenizer(dsn, data_dir)
+ return ICUTokenizer(dsn)
class ICUTokenizer(AbstractTokenizer):
        normalization routines in Nominatim 3.
    """
-    def __init__(self, dsn: str, data_dir: Path) -> None:
+    def __init__(self, dsn: str) -> None:
-        self.data_dir = data_dir
        self.loader: Optional[ICURuleLoader] = None

    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
    property_table.set('tokenizer', 'dummy')

    def _create_tokenizer():
-        return dummy_tokenizer.DummyTokenizer(None, None)
+        return dummy_tokenizer.DummyTokenizer(None)
from nominatim_db.config import Configuration
-def create(dsn, data_dir):
+def create(dsn):
""" Create a new instance of the tokenizer provided by this module.
"""
""" Create a new instance of the tokenizer provided by this module.
"""
- return DummyTokenizer(dsn, data_dir)
+ return DummyTokenizer(dsn)
-    def __init__(self, dsn, data_dir):
+    def __init__(self, dsn):
-        self.data_dir = data_dir
        self.init_state = None
        self.analyser_cache = {}
        assert isinstance(tokenizer, DummyTokenizer)
        assert tokenizer.init_state == "new"
-        assert (self.config.project_dir / 'tokenizer').is_dir()
        assert properties.get_property(temp_db_conn, 'tokenizer') == 'dummy'
-    def test_setup_tokenizer_dir_exists(self):
-        (self.config.project_dir / 'tokenizer').mkdir()
-
-        tokenizer = factory.create_tokenizer(self.config)
-
-        assert isinstance(tokenizer, DummyTokenizer)
-        assert tokenizer.init_state == "new"
-
-    def test_setup_tokenizer_dir_failure(self):
-        (self.config.project_dir / 'tokenizer').write_text("foo")
-
-        with pytest.raises(UsageError):
-            factory.create_tokenizer(self.config)
-
    def test_load_tokenizer(self):
        factory.create_tokenizer(self.config)
        self.config.project_dir = self.config.project_dir
        factory.get_tokenizer_for_db(self.config)
-        assert (self.config.project_dir / 'tokenizer').exists()
    def test_load_missing_property(self, temp_db_cursor):
        factory.create_tokenizer(self.config)
-def tokenizer_factory(dsn, tmp_path, property_table,
-                      sql_preprocessor, place_table, word_table):
-    (tmp_path / 'tokenizer').mkdir()
-
+def tokenizer_factory(dsn, property_table, sql_preprocessor, place_table, word_table):
-    return icu_tokenizer.create(dsn, tmp_path / 'tokenizer')
+    return icu_tokenizer.create(dsn)
@pytest.fixture
def tokenizer():
-    return dummy_tokenizer.DummyTokenizer(None, None)
+    return dummy_tokenizer.DummyTokenizer(None)
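
Taken together, the visible interface change for tokenizer modules is
the module-level create() entry point, which no longer receives a data
directory. A minimal sketch of a conforming module follows; MyTokenizer
is a made-up name, only the create(dsn) shape comes from the diff above:

    # Illustrative tokenizer module after this change. Only the
    # create(dsn) entry point mirrors the interface in the diff;
    # MyTokenizer is a hypothetical example class.
    class MyTokenizer:
        def __init__(self, dsn: str) -> None:
            self.dsn = dsn

    def create(dsn: str) -> MyTokenizer:
        """ Create a new instance of the tokenizer provided by this module.
        """
        return MyTokenizer(dsn)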