+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
"""
Functions for creating a tokenizer or initialising the right one for an
existing database.
A tokenizer usually also includes PHP code for querying. The appropriate PHP
normalizer module is installed when the tokenizer is created.
"""
+from typing import Optional
import logging
import importlib
+from pathlib import Path
-from ..errors import UsageError
-from ..db import properties
-from ..db.connection import connect
+from nominatim.errors import UsageError
+from nominatim.db import properties
+from nominatim.db.connection import connect
+from nominatim.config import Configuration
+from nominatim.tokenizer.base import AbstractTokenizer, TokenizerModule
LOG = logging.getLogger()
def _import_tokenizer(name: str) -> TokenizerModule:
    """ Load the module that implements the tokenizer with the given name.

        The implementation is looked up as a file '<name>_tokenizer.py'
        in the directory of this source file and then imported as
        'nominatim.tokenizer.<name>_tokenizer'.

        Raises a UsageError when no such source file exists.
    """
    src_file = Path(__file__).parent / (name + '_tokenizer.py')

    # The existence of the source file is tested explicitly instead of
    # catching ModuleNotFoundError around the import. That way an import
    # failure raised from within an existing tokenizer module propagates
    # unchanged and is not reported as an unknown tokenizer.
    if not src_file.is_file():
        LOG.fatal("No tokenizer named '%s' available. "
                  "Check the setting of NOMINATIM_TOKENIZER.", name)
        raise UsageError('Tokenizer not found')

    return importlib.import_module('nominatim.tokenizer.' + name + '_tokenizer')
-def create_tokenizer(config, init_db=True, module_name=None):
+
+def create_tokenizer(config: Configuration, init_db: bool = True,
+ module_name: Optional[str] = None) -> AbstractTokenizer:
""" Create a new tokenizer as defined by the given configuration.
The tokenizer data and code is copied into the 'tokenizer' directory
return tokenizer
def get_tokenizer_for_db(config: Configuration) -> AbstractTokenizer:
    """ Instantiate a tokenizer for an existing database.

        The function looks up the name of the tokenizer in the 'tokenizer'
        property of the database, creates the tokenizer through the
        matching tokenizer module and initialises it from the project
        configuration.

        The 'tokenizer' directory inside the project directory is created
        when it does not exist yet.

        Raises a UsageError when the database does not name a tokenizer or
        when no implementation for the named tokenizer is available.
    """
    basedir = config.project_dir / 'tokenizer'
    # Directory will be repopulated by tokenizer below. Using exist_ok
    # avoids a separate existence check; an existing non-directory at this
    # path still results in a FileExistsError.
    basedir.mkdir(exist_ok=True)

    with connect(config.get_libpq_dsn()) as conn:
        name = properties.get_property(conn, 'tokenizer')

    # NOTE(review): assumes get_property() yields None when the property
    # has never been written - confirm. Without this guard a missing value
    # would only surface as a TypeError while building the module name.
    if name is None:
        LOG.fatal("Tokenizer was not set up properly. Database property missing.")
        raise UsageError('Cannot initialize tokenizer.')

    tokenizer_module = _import_tokenizer(name)

    tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
    tokenizer.init_from_project(config)

    return tokenizer