X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/4da4cbfe27a576ae011430b2de205c74435e241b..7717bbf59d711d21818c89309dbe08e00b16250f:/src/nominatim_db/tokenizer/legacy_tokenizer.py

diff --git a/src/nominatim_db/tokenizer/legacy_tokenizer.py b/src/nominatim_db/tokenizer/legacy_tokenizer.py
index 136a7331..78805332 100644
--- a/src/nominatim_db/tokenizer/legacy_tokenizer.py
+++ b/src/nominatim_db/tokenizer/legacy_tokenizer.py
@@ -14,14 +14,14 @@ import logging
 from pathlib import Path
 import re
 import shutil
-from textwrap import dedent
 
 from icu import Transliterator
-import psycopg2
-import psycopg2.extras
+import psycopg
+from psycopg import sql as pysql
 
 from ..errors import UsageError
-from ..db.connection import connect, Connection
+from ..db.connection import connect, Connection, drop_tables, table_exists,\
+                            execute_scalar, register_hstore
 from ..config import Configuration
 from ..db import properties
 from ..db import utils as db_utils
@@ -37,10 +37,12 @@ LOG = logging.getLogger()
 def create(dsn: str, data_dir: Path) -> 'LegacyTokenizer':
     """ Create a new instance of the tokenizer provided by this module.
     """
+    LOG.warning('WARNING: the legacy tokenizer is deprecated '
+                'and will be removed in Nominatim 5.0.')
     return LegacyTokenizer(dsn, data_dir)
 
 
-def _install_module(config_module_path: str, src_dir: Path, module_dir: Path) -> str:
+def _install_module(config_module_path: str, src_dir: Optional[Path], module_dir: Path) -> str:
     """ Copies the PostgreSQL normalisation module into the project
         directory if necessary. For historical reasons the module is
         saved in the '/module' subdirectory and not with the other tokenizer
@@ -54,6 +56,10 @@ def _install_module(config_module_path: str, src_dir: Path, module_dir: Path) ->
         LOG.info("Using custom path for database module at '%s'", config_module_path)
         return config_module_path
 
+    # Otherwise a source dir must be given.
+    if src_dir is None:
+        raise UsageError("The legacy tokenizer cannot be used with the Nominatim pip module.")
+
     # Compatibility mode for builddir installations.
     if module_dir.exists() and src_dir.samefile(module_dir):
         LOG.info('Running from build directory. Leaving database module as is.')
@@ -78,12 +84,12 @@ def _check_module(module_dir: str, conn: Connection) -> None:
     """
     with conn.cursor() as cur:
         try:
-            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
-                           RETURNS text AS %s, 'transliteration'
-                           LANGUAGE c IMMUTABLE STRICT;
-                           DROP FUNCTION nominatim_test_import_func(text)
-                        """, (f'{module_dir}/nominatim.so', ))
-        except psycopg2.DatabaseError as err:
+            cur.execute(pysql.SQL("""CREATE FUNCTION nominatim_test_import_func(text)
+                                     RETURNS text AS {}, 'transliteration'
+                                     LANGUAGE c IMMUTABLE STRICT;
+                                     DROP FUNCTION nominatim_test_import_func(text)
+                                  """).format(pysql.Literal(f'{module_dir}/nominatim.so')))
+        except psycopg.DatabaseError as err:
             LOG.fatal("Error accessing database module: %s", err)
             raise UsageError("Database module cannot be accessed.") from err
 
@@ -113,8 +119,6 @@ class LegacyTokenizer(AbstractTokenizer):
 
         self.normalization = config.TERM_NORMALIZATION
 
-        self._install_php(config, overwrite=True)
-
         with connect(self.dsn) as conn:
             _check_module(module_dir, conn)
             self._save_config(conn, config)
@@ -138,8 +142,6 @@ class LegacyTokenizer(AbstractTokenizer):
                             config.lib_dir.module,
                             config.project_dir / 'module')
 
-        self._install_php(config, overwrite=False)
-
     def finalize_import(self, config: Configuration) -> None:
         """ Do any required postprocessing to make the tokenizer data ready
             for use.
@@ -179,11 +181,10 @@ class LegacyTokenizer(AbstractTokenizer):
              * Can nominatim.so be accessed by the database user?
             """
         with connect(self.dsn) as conn:
-            with conn.cursor() as cur:
-                try:
-                    out = cur.scalar("SELECT make_standard_name('a')")
-                except psycopg2.Error as err:
-                    return hint.format(error=str(err))
+            try:
+                out = execute_scalar(conn, "SELECT make_standard_name('a')")
+            except psycopg.Error as err:
+                return hint.format(error=str(err))
 
         if out != 'a':
             return hint.format(error='Unexpected result for make_standard_name()')
@@ -214,9 +215,9 @@ class LegacyTokenizer(AbstractTokenizer):
         """ Recompute the frequency of full words.
         """
         with connect(self.dsn) as conn:
-            if conn.table_exists('search_name'):
+            if table_exists(conn, 'search_name'):
+                drop_tables(conn, "word_frequencies")
                 with conn.cursor() as cur:
-                    cur.drop_table("word_frequencies")
                     LOG.info("Computing word frequencies")
                     cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                      SELECT unnest(name_vector) as id, count(*)
@@ -226,7 +227,7 @@ class LegacyTokenizer(AbstractTokenizer):
                     cur.execute("""UPDATE word SET search_name_count = count
                                    FROM word_frequencies
                                    WHERE word_token like ' %' and word_id = id""")
-                    cur.drop_table("word_frequencies")
+                    drop_tables(conn, "word_frequencies")
         conn.commit()
 
 
@@ -266,21 +267,6 @@ class LegacyTokenizer(AbstractTokenizer):
             return list(s[0] for s in cur)
 
 
-    def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
-        """ Install the php script for the tokenizer.
-        """
-        if config.lib_dir.php is not None:
-            php_file = self.data_dir / "tokenizer.php"
-
-            if not php_file.exists() or overwrite:
-                php_file.write_text(dedent(f"""\
-                    <?php
-                    @define('CONST_Max_Word_Frequency', {config.MAX_WORD_FREQUENCY});
-                    @define('CONST_Term_Normalization_Rules', "{config.TERM_NORMALIZATION}");
-                    require_once('{config.lib_dir.php}/tokenizer/legacy_tokenizer.php');
-                    """), encoding='utf-8')
-
-
     def _init_db_tables(self, config: Configuration) -> None:
         """ Set up the word table and fill it with pre-computed word
             frequencies.
@@ -313,10 +299,10 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
     """
 
     def __init__(self, dsn: str, normalizer: Any):
-        self.conn: Optional[Connection] = connect(dsn).connection
+        self.conn: Optional[Connection] = connect(dsn)
         self.conn.autocommit = True
         self.normalizer = normalizer
-        psycopg2.extras.register_hstore(self.conn)
+        register_hstore(self.conn)
 
         self._cache = _TokenCache(self.conn)
 
@@ -406,7 +392,7 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
                                """, (to_delete, ))
                 if to_add:
                     cur.execute("""SELECT count(create_postcode_id(pc))
-                                   FROM unnest(%s) as pc
+                                   FROM unnest(%s::text[]) as pc
                                 """, (to_add, ))
 
 
@@ -423,7 +409,7 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
         with self.conn.cursor() as cur:
             # Get the old phrases.
             existing_phrases = set()
-            cur.execute("""SELECT word, class, type, operator FROM word
+            cur.execute("""SELECT word, class as cls, type, operator FROM word
                            WHERE class != 'place'
                                  OR (type != 'house' AND type != 'postcode')""")
             for label, cls, typ, oper in cur:
@@ -433,18 +419,19 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
             to_delete = existing_phrases - norm_phrases
 
             if to_add:
-                cur.execute_values(
+                cur.executemany(
                     """ INSERT INTO word (word_id, word_token, word, class, type,
                                           search_name_count, operator)
                         (SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
                                 class, type, 0,
                                 CASE WHEN op in ('in', 'near') THEN op ELSE null END
-                           FROM (VALUES %s) as v(name, class, type, op))""",
+                           FROM (VALUES (%s, %s, %s, %s)) as v(name, class, type, op))""",
                     to_add)
 
             if to_delete and should_replace:
-                cur.execute_values(
-                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
+                cur.executemany(
+                    """ DELETE FROM word
+                            USING (VALUES (%s, %s, %s, %s)) as v(name, in_class, in_type, op)
                         WHERE word = name and class = in_class and type = in_type
                               and ((op = '-' and operator is null) or op = operator)""",
                     to_delete)
@@ -463,7 +450,7 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
                 """INSERT INTO word (word_id, word_token, country_code)
                    (SELECT nextval('seq_word'), lookup_token, %s
                       FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
-                            FROM unnest(%s)n) y
+                            FROM unnest(%s::TEXT[])n) y
                    WHERE NOT EXISTS(SELECT * FROM word
                                     WHERE word_token = lookup_token and country_code = %s))
                 """, (country_code, list(names.values()), country_code))
@@ -536,9 +523,8 @@ class _TokenInfo:
     def add_names(self, conn: Connection, names: Mapping[str, str]) -> None:
         """ Add token information for the names of the place.
         """
-        with conn.cursor() as cur:
-            # Create the token IDs for all names.
-            self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
+        # Create the token IDs for all names.
+        self.data['names'] = execute_scalar(conn, "SELECT make_keywords(%s)::text",
                                              (names, ))
 
 
@@ -576,9 +562,8 @@ class _TokenInfo:
         """ Add addr:street match terms.
         """
         def _get_street(name: str) -> Optional[str]:
-            with conn.cursor() as cur:
-                return cast(Optional[str],
-                            cur.scalar("SELECT word_ids_from_name(%s)::text", (name, )))
+            return cast(Optional[str],
+                        execute_scalar(conn, "SELECT word_ids_from_name(%s)::text", (name, )))
 
         tokens = self.cache.streets.get(street, _get_street)
         self.data['street'] = tokens or '{}'