From: Sarah Hoffmann
Date: Fri, 13 Dec 2024 10:53:10 +0000 (+0100)
Subject: generalize normalization step for search query
X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/2b87c016db1dd7a03b5cafe385209529a7457fc6

generalize normalization step for search query

It is now possible to configure functions for changing the query input
before it is analysed by the tokenizer.

Code is a cleaned-up version of the implementation by @miku.
---

diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml
index c5a809c6..530df1a6 100644
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -1,3 +1,5 @@
+query-preprocessing:
+    - step: normalize
 normalization:
     - ":: lower ()"
     - ":: Hans-Hant"
diff --git a/src/nominatim_api/connection.py b/src/nominatim_api/connection.py
index e104745e..04268dc3 100644
--- a/src/nominatim_api/connection.py
+++ b/src/nominatim_api/connection.py
@@ -18,6 +18,7 @@ from .typing import SaFromClause
 from .sql.sqlalchemy_schema import SearchTables
 from .sql.sqlalchemy_types import Geometry
 from .logging import log
+from .config import Configuration
 
 T = TypeVar('T')
 
@@ -31,9 +32,11 @@ class SearchConnection:
 
     def __init__(self, conn: AsyncConnection,
                  tables: SearchTables,
-                 properties: Dict[str, Any]) -> None:
+                 properties: Dict[str, Any],
+                 config: Configuration) -> None:
         self.connection = conn
         self.t = tables
+        self.config = config
         self._property_cache = properties
         self._classtables: Optional[Set[str]] = None
         self.query_timeout: Optional[int] = None
diff --git a/src/nominatim_api/core.py b/src/nominatim_api/core.py
index 3cf9e989..b98c0ba7 100644
--- a/src/nominatim_api/core.py
+++ b/src/nominatim_api/core.py
@@ -184,7 +184,7 @@ class NominatimAPIAsync:
         assert self._tables is not None
 
         async with self._engine.begin() as conn:
-            yield SearchConnection(conn, self._tables, self._property_cache)
+            yield SearchConnection(conn, self._tables, self._property_cache, self.config)
 
     async def status(self) -> StatusResult:
         """ Return the status of the database.
diff --git a/src/nominatim_api/query_preprocessing/__init__.py b/src/nominatim_api/query_preprocessing/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/nominatim_api/query_preprocessing/base.py b/src/nominatim_api/query_preprocessing/base.py
new file mode 100644
index 00000000..1e0afd05
--- /dev/null
+++ b/src/nominatim_api/query_preprocessing/base.py
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2024 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Common data types and protocols for preprocessing.
+"""
+from typing import List, Callable
+
+from ..typing import Protocol
+from ..search import query as qmod
+from .config import QueryConfig
+
+QueryProcessingFunc = Callable[[List[qmod.Phrase]], List[qmod.Phrase]]
+
+
+class QueryHandler(Protocol):
+    """ Protocol for query preprocessing modules.
+    """
+    def create(self, config: QueryConfig) -> QueryProcessingFunc:
+        """
+            Create a function for preprocessing a query.
+            Arguments:
+                config: A dictionary with the additional configuration options
+                        specified in the tokenizer configuration. The normalizer
+                        for the query is available as config['_normalizer'].
+            Return:
+                A function that takes a list of phrases and returns the modified list.
+        """
+        pass
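
The QueryHandler protocol above is the extension point that the commit message refers to: any module that can be resolved against the nominatim_api.query_preprocessing package and exposes a matching create() function can be listed as a step under query-preprocessing in settings/icu_tokenizer.yaml. The following is only a rough sketch of such a custom step; the module name (say strip_quotes) and its 'characters' option are invented for the example, only the create()/QueryProcessingFunc contract comes from base.py, and it would be enabled with an additional "- step: strip_quotes" entry next to the normalize step.

from typing import List

from nominatim_api.query_preprocessing.config import QueryConfig
from nominatim_api.query_preprocessing.base import QueryProcessingFunc
from nominatim_api.search.query import Phrase


def create(config: QueryConfig) -> QueryProcessingFunc:
    # Extra keys of the step's entry in icu_tokenizer.yaml arrive as plain
    # dictionary keys on the QueryConfig ('characters' is invented here).
    chars = config.get('characters', '"')

    def _process(phrases: List[Phrase]) -> List[Phrase]:
        # Rewrite every phrase and drop the ones that end up empty,
        # mirroring what the normalize step below does.
        stripped = (Phrase(p.ptype, p.text.strip(chars)) for p in phrases)
        return [p for p in stripped if p.text]

    return _process
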
diff --git a/src/nominatim_api/query_preprocessing/config.py b/src/nominatim_api/query_preprocessing/config.py
new file mode 100644
index 00000000..1948945c
--- /dev/null
+++ b/src/nominatim_api/query_preprocessing/config.py
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2024 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Configuration for query preprocessors.
+"""
+from typing import Any, TYPE_CHECKING
+from collections import UserDict
+
+# working around missing generics in Python < 3.8
+# See https://github.com/python/typing/issues/60#issuecomment-869757075
+if TYPE_CHECKING:
+    _BaseUserDict = UserDict[str, Any]
+else:
+    _BaseUserDict = UserDict
+
+
+class QueryConfig(_BaseUserDict):
+    """ The `QueryConfig` class is a read-only dictionary
+        with configuration options for the preprocessor.
+        In addition to the usual dictionary functions, the class provides
+        accessors to standard preprocessor options that are used by many of the
+        preprocessors.
+    """
+
+    def set_normalizer(self, normalizer: Any) -> 'QueryConfig':
+        """ Set the normalizer to be used by the preprocessor.
+        """
+        self['_normalizer'] = normalizer
+
+        return self
diff --git a/src/nominatim_api/query_preprocessing/normalize.py b/src/nominatim_api/query_preprocessing/normalize.py
new file mode 100644
index 00000000..0c12b93f
--- /dev/null
+++ b/src/nominatim_api/query_preprocessing/normalize.py
@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2024 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Normalize query text using an ICU transliterator.
+"""
+from typing import cast
+
+from .config import QueryConfig
+from .base import QueryProcessingFunc
+from ..search.query import Phrase
+
+
+def create(config: QueryConfig) -> QueryProcessingFunc:
+    normalizer = config.get('_normalizer')
+
+    if not normalizer:
+        return lambda p: p
+
+    return lambda phrases: list(
+        filter(lambda p: p.text,
+               (Phrase(p.ptype, cast(str, normalizer.transliterate(p.text)))
+                for p in phrases)))
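
For illustration, QueryConfig and the normalize step can also be exercised by hand, outside the tokenizer. This is only a sketch under the package layout added above; the rule and the input phrase are made up, compare the tests at the end of this patch.

from icu import Transliterator

from nominatim_api.query_preprocessing import normalize
from nominatim_api.query_preprocessing.config import QueryConfig
from nominatim_api.search.query import Phrase, PhraseType

# An ICU transliterator built from rules like those in the 'normalization'
# section of settings/icu_tokenizer.yaml.
trans = Transliterator.createFromRules('normalization', ':: lower ();')

# QueryConfig carries the step's options from the yaml entry; the tokenizer
# additionally injects the normalizer under the '_normalizer' key.
config = QueryConfig({'step': 'normalize'}).set_normalizer(trans)
proc = normalize.create(config)

# Example input only: the returned function rewrites the phrase text.
phrases = [Phrase(PhraseType.NONE, 'Hauptstraße 5')]
assert proc(phrases) == [Phrase(PhraseType.NONE, 'hauptstraße 5')]

# Without a normalizer set, the step degrades to a no-op.
assert normalize.create(QueryConfig())(phrases) == phrases
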
diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py
index fa14531a..5976fbec 100644
--- a/src/nominatim_api/search/icu_tokenizer.py
+++ b/src/nominatim_api/search/icu_tokenizer.py
@@ -16,12 +16,14 @@ from icu import Transliterator
 
 import sqlalchemy as sa
 
+from ..errors import UsageError
 from ..typing import SaRow
 from ..sql.sqlalchemy_types import Json
 from ..connection import SearchConnection
 from ..logging import log
-from ..search import query as qmod
-from ..search.query_analyzer_factory import AbstractQueryAnalyzer
+from . import query as qmod
+from ..query_preprocessing.config import QueryConfig
+from .query_analyzer_factory import AbstractQueryAnalyzer
 
 
 DB_TO_TOKEN_TYPE = {
@@ -151,6 +153,8 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
         self.transliterator = await self.conn.get_cached_value('ICUTOK', 'transliterator',
                                                                 _make_transliterator)
 
+        await self._setup_preprocessing()
+
         if 'word' not in self.conn.t.meta.tables:
             sa.Table('word', self.conn.t.meta,
                      sa.Column('word_id', sa.Integer),
@@ -159,15 +163,36 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                      sa.Column('word', sa.Text),
                      sa.Column('info', Json))
 
+    async def _setup_preprocessing(self) -> None:
+        """ Load the rules for preprocessing and set up the handlers.
+        """
+
+        rules = self.conn.config.load_sub_configuration('icu_tokenizer.yaml',
+                                                        config='TOKENIZER_CONFIG')
+        preprocessing_rules = rules.get('query-preprocessing', [])
+
+        self.preprocessors = []
+
+        for func in preprocessing_rules:
+            if 'step' not in func:
+                raise UsageError("Preprocessing rule is missing the 'step' attribute.")
+            if not isinstance(func['step'], str):
+                raise UsageError("'step' attribute must be a simple string.")
+
+            module = self.conn.config.load_plugin_module(
+                        func['step'], 'nominatim_api.query_preprocessing')
+            self.preprocessors.append(
+                module.create(QueryConfig(func).set_normalizer(self.normalizer)))
+
     async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
         """ Analyze the given list of phrases and return the
             tokenized query.
         """
         log().section('Analyze query (using ICU tokenizer)')
-        normalized = list(filter(lambda p: p.text,
-                                 (qmod.Phrase(p.ptype, self.normalize_text(p.text))
-                                  for p in phrases)))
-        query = qmod.QueryStruct(normalized)
+        for func in self.preprocessors:
+            phrases = func(phrases)
+        query = qmod.QueryStruct(phrases)
+
         log().var_dump('Normalized query', query.source)
         if not query.source:
             return query
diff --git a/src/nominatim_api/typing.py b/src/nominatim_api/typing.py
index 89aa4428..be9e9b58 100644
--- a/src/nominatim_api/typing.py
+++ b/src/nominatim_api/typing.py
@@ -21,9 +21,11 @@ if TYPE_CHECKING:
     from typing import Any
     import sqlalchemy as sa
     import os
-    from typing_extensions import (TypeAlias as TypeAlias)
+    from typing_extensions import (TypeAlias as TypeAlias,
+                                   Protocol as Protocol)
 else:
     TypeAlias = str
+    Protocol = object
 
 
 StrPath = Union[str, 'os.PathLike[str]']
diff --git a/test/python/api/query_processing/test_normalize.py b/test/python/api/query_processing/test_normalize.py
new file mode 100644
index 00000000..db8bbe0b
--- /dev/null
+++ b/test/python/api/query_processing/test_normalize.py
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2024 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for normalizing search queries.
+"""
+from pathlib import Path
+
+import pytest
+
+from icu import Transliterator
+
+import nominatim_api.search.query as qmod
+from nominatim_api.query_preprocessing.config import QueryConfig
+from nominatim_api.query_preprocessing import normalize
+
+def run_preprocessor_on(query, norm):
+    normalizer = Transliterator.createFromRules("normalization", norm)
+    proc = normalize.create(QueryConfig().set_normalizer(normalizer))
+
+    return proc(query)
+
+
+def test_normalize_simple():
+    norm = ':: lower();'
+    query = [qmod.Phrase(qmod.PhraseType.NONE, 'Hallo')]
+
+    out = run_preprocessor_on(query, norm)
+
+    assert len(out) == 1
+    assert out == [qmod.Phrase(qmod.PhraseType.NONE, 'hallo')]
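
The test above covers the plain lowercasing path. A conceivable follow-up test, sketched here only and not part of this patch, would pin down the second behaviour of normalize.create, namely that phrases whose text normalizes to the empty string are dropped (see the filter in normalize.py). It reuses the run_preprocessor_on helper from the file above; the ICU rule is chosen purely for the example.

def test_normalize_drops_empty_phrases():
    # Punctuation-only phrases normalize to '' and must disappear,
    # while the remaining phrases come back lowercased.
    norm = ':: lower (); [[:Punctuation:]]+ > ;'
    query = [qmod.Phrase(qmod.PhraseType.NONE, 'Hallo'),
             qmod.Phrase(qmod.PhraseType.NONE, '!!!')]

    out = run_preprocessor_on(query, norm)

    assert out == [qmod.Phrase(qmod.PhraseType.NONE, 'hallo')]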