from pathlib import Path
import re
import shutil
-from textwrap import dedent
from icu import Transliterator
import psycopg
def create(dsn: str, data_dir: Path) -> 'LegacyTokenizer':
""" Create a new instance of the tokenizer provided by this module.
"""
+ LOG.warning('WARNING: the legacy tokenizer is deprecated '
+ 'and will be removed in Nominatim 5.0.')
return LegacyTokenizer(dsn, data_dir)
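
# Illustrative usage sketch, not part of the module or of this change: how the
# factory above might be called. The DSN and project path are made-up example
# values; in Nominatim the tokenizer data normally lives in the project
# directory's 'tokenizer' folder.
def _example_create_tokenizer() -> 'LegacyTokenizer':
    project_dir = Path('/srv/nominatim-project')   # assumed example location
    # create() logs the deprecation warning and returns a tokenizer bound to
    # the given database and data directory.
    return create('dbname=nominatim', project_dir / 'tokenizer')
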
-def _install_module(config_module_path: str, src_dir: Path, module_dir: Path) -> str:
+def _install_module(config_module_path: str, src_dir: Optional[Path], module_dir: Path) -> str:
""" Copies the PostgreSQL normalisation module into the project
directory if necessary. For historical reasons the module is
saved in the '/module' subdirectory and not with the other tokenizer
LOG.info("Using custom path for database module at '%s'", config_module_path)
return config_module_path
+ # Otherwise a source dir must be given.
+ if src_dir is None:
+ raise UsageError("The legacy tokenizer cannot be used with the Nominatim pip module.")
+
# Compatibility mode for builddir installations.
if module_dir.exists() and src_dir.samefile(module_dir):
LOG.info('Running from build directory. Leaving database module as is.')
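
# Illustrative sketch, not part of the module or of this change: the first two
# branches above can be exercised without touching the filesystem. The paths
# below are made-up example values; the remaining branches copy nominatim.so
# from src_dir into the project's '/module' directory (or leave it alone when
# running from the build directory).
def _example_module_resolution() -> None:
    project_module = Path('/srv/nominatim-project/module')   # assumed example path

    # An explicit custom module path is returned unchanged; nothing is copied.
    assert _install_module('/opt/custom/module', None, project_module) == '/opt/custom/module'

    # Without a custom path there must be a source directory to copy from;
    # pip installations do not ship the PostgreSQL module at all.
    try:
        _install_module('', None, project_module)
    except UsageError:
        pass
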
        self.normalization = config.TERM_NORMALIZATION

-        self._install_php(config, overwrite=True)
-
        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)
        if not (config.project_dir / 'module' / 'nominatim.so').exists():
            _install_module(config.DATABASE_MODULE_PATH,
                            config.lib_dir.module,
                            config.project_dir / 'module')

-        self._install_php(config, overwrite=False)
-
    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
return list(s[0] for s in cur)
-    def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
-        """ Install the php script for the tokenizer.
-        """
-        if config.lib_dir.php is not None:
-            php_file = self.data_dir / "tokenizer.php"
-
-            if not php_file.exists() or overwrite:
-                php_file.write_text(dedent(f"""\
-                    <?php
-                    @define('CONST_Max_Word_Frequency', {config.MAX_WORD_FREQUENCY});
-                    @define('CONST_Term_Normalization_Rules', "{config.TERM_NORMALIZATION}");
-                    require_once('{config.lib_dir.php}/tokenizer/legacy_tokenizer.php');
-                    """), encoding='utf-8')
-
-
    def _init_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and fill it with pre-computed word
            frequencies.