+query-preprocessing:
+ - step: normalize
normalization:
- ":: lower ()"
- ":: Hans-Hant"
from .sql.sqlalchemy_schema import SearchTables
from .sql.sqlalchemy_types import Geometry
from .logging import log
+from .config import Configuration
T = TypeVar('T')
def __init__(self, conn: AsyncConnection,
tables: SearchTables,
- properties: Dict[str, Any]) -> None:
+ properties: Dict[str, Any],
+ config: Configuration) -> None:
self.connection = conn
self.t = tables
+ self.config = config
self._property_cache = properties
self._classtables: Optional[Set[str]] = None
self.query_timeout: Optional[int] = None
assert self._tables is not None
async with self._engine.begin() as conn:
- yield SearchConnection(conn, self._tables, self._property_cache)
+ yield SearchConnection(conn, self._tables, self._property_cache, self.config)
async def status(self) -> StatusResult:
""" Return the status of the database.
--- /dev/null
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2024 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Common data types and protocols for preprocessing.
+"""
+from typing import List, Callable
+
+from ..typing import Protocol
+from ..search import query as qmod
+from .config import QueryConfig
+
+QueryProcessingFunc = Callable[[List[qmod.Phrase]], List[qmod.Phrase]]
+
+
+class QueryHandler(Protocol):
+ """ Protocol for query preprocessing modules.
+ """
+ def create(self, config: QueryConfig) -> QueryProcessingFunc:
+ """
+ Create a function for preprocessing the phrases of a query.
+ Arguments:
+ config: A dictionary-like `QueryConfig` with the configuration
+ options for the preprocessing step. A normalizer set via
+ `QueryConfig.set_normalizer()` is available under the
+ `_normalizer` key.
+ Return:
+ A function that takes the list of query phrases and returns
+ the list of phrases as modified by the preprocessor.
+ """
+ pass
--- /dev/null
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2024 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Configuration for query preprocessors.
+"""
+from typing import Any, TYPE_CHECKING
+from collections import UserDict
+
+# working around missing generics in Python < 3.9
+# See https://github.com/python/typing/issues/60#issuecomment-869757075
+if TYPE_CHECKING:
+ _BaseUserDict = UserDict[str, Any]
+else:
+ _BaseUserDict = UserDict
+
+
+class QueryConfig(_BaseUserDict):
+ """ The `QueryConfig` class is a dictionary with configuration
+ options for the preprocessor. Apart from `set_normalizer()`, which
+ stores the normalizer in the dictionary, it is meant to be used
+ read-only.
+ In addition to the usual dictionary functions, the class provides
+ accessors to standard preprocessor options that are used by many of the
+ preprocessors.
+ """
+
+ def set_normalizer(self, normalizer: Any) -> 'QueryConfig':
+ """ Set the normalizer to be used.
+
+ The normalizer is stored in the dictionary under the key
+ `_normalizer`. The configuration object itself is returned, so
+ that the call can be chained.
+ """
+ self['_normalizer'] = normalizer
+
+ return self
--- /dev/null
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2024 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Normalize query text using an ICU transliterator.
+"""
+from typing import cast
+
+from .config import QueryConfig
+from .base import QueryProcessingFunc
+from ..search.query import Phrase
+
+
+def create(config: QueryConfig) -> QueryProcessingFunc:
+    """ Create a preprocessing function that normalizes the phrase texts.
+
+        The normalizer is read from the `_normalizer` entry of the
+        configuration. Without a normalizer the phrases are handed back
+        unchanged. Otherwise each phrase is replaced by a new phrase of
+        the same type with transliterated text, and phrases whose text
+        ends up empty are dropped.
+    """
+    transliterator = config.get('_normalizer')
+
+    if transliterator:
+        return lambda phrases: [
+            phrase for phrase in (
+                Phrase(p.ptype, cast(str, transliterator.transliterate(p.text)))
+                for p in phrases)
+            if phrase.text]
+
+    # Nothing to normalize with: hand the phrases back untouched.
+    return lambda p: p
import sqlalchemy as sa
+from ..errors import UsageError
from ..typing import SaRow
from ..sql.sqlalchemy_types import Json
from ..connection import SearchConnection
from ..logging import log
-from ..search import query as qmod
-from ..search.query_analyzer_factory import AbstractQueryAnalyzer
+from . import query as qmod
+from ..query_preprocessing.config import QueryConfig
+from .query_analyzer_factory import AbstractQueryAnalyzer
DB_TO_TOKEN_TYPE = {
self.transliterator = await self.conn.get_cached_value('ICUTOK', 'transliterator',
_make_transliterator)
+ await self._setup_preprocessing()
+
if 'word' not in self.conn.t.meta.tables:
sa.Table('word', self.conn.t.meta,
sa.Column('word_id', sa.Integer),
sa.Column('word', sa.Text),
sa.Column('info', Json))
+    async def _setup_preprocessing(self) -> None:
+        """ Load the rules for preprocessing and set up the handlers.
+
+            The rules are read from the 'query-preprocessing' section of
+            the tokenizer configuration. For each rule, the module named
+            in its 'step' attribute is loaded and its `create()` function
+            is called with the rule's configuration and the normalizer.
+            The resulting functions are collected in `self.preprocessors`
+            in the order in which they are configured.
+
+            Raises a UsageError when a rule has no 'step' attribute or
+            when the attribute is not a string.
+        """
+        rules = self.conn.config.load_sub_configuration('icu_tokenizer.yaml',
+                                                         config='TOKENIZER_CONFIG')
+        # A section that is present but left empty presumably comes back
+        # as None from the YAML loader. Treat it like a missing section
+        # instead of failing when iterating over it.
+        preprocessing_rules = rules.get('query-preprocessing') or []
+
+        self.preprocessors = []
+
+        for rule in preprocessing_rules:
+            if 'step' not in rule:
+                raise UsageError("Preprocessing rule is missing the 'step' attribute.")
+            if not isinstance(rule['step'], str):
+                raise UsageError("'step' attribute must be a simple string.")
+
+            module = self.conn.config.load_plugin_module(
+                rule['step'], 'nominatim_api.query_preprocessing')
+            self.preprocessors.append(
+                module.create(QueryConfig(rule).set_normalizer(self.normalizer)))
+
async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
""" Analyze the given list of phrases and return the
tokenized query.
"""
log().section('Analyze query (using ICU tokenizer)')
- normalized = list(filter(lambda p: p.text,
- (qmod.Phrase(p.ptype, self.normalize_text(p.text))
- for p in phrases)))
- query = qmod.QueryStruct(normalized)
+ for func in self.preprocessors:
+ phrases = func(phrases)
+ query = qmod.QueryStruct(phrases)
+
log().var_dump('Normalized query', query.source)
if not query.source:
return query
from typing import Any
import sqlalchemy as sa
import os
- from typing_extensions import (TypeAlias as TypeAlias)
+ from typing_extensions import (TypeAlias as TypeAlias,
+ Protocol as Protocol)
else:
TypeAlias = str
+ Protocol = object
StrPath = Union[str, 'os.PathLike[str]']
--- /dev/null
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2024 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for normalizing search queries.
+"""
+from pathlib import Path
+
+import pytest
+
+from icu import Transliterator
+
+import nominatim_api.search.query as qmod
+from nominatim_api.query_preprocessing.config import QueryConfig
+from nominatim_api.query_preprocessing import normalize
+
+def run_preprocessor_on(query, norm):
+    """ Run the normalize preprocessor over the phrase list `query`,
+        using a transliterator built from the ICU rules in `norm`,
+        and return the processed phrases.
+    """
+    config = QueryConfig().set_normalizer(
+        Transliterator.createFromRules("normalization", norm))
+
+    return normalize.create(config)(query)
+
+
+def test_normalize_simple():
+    """ A single phrase is lower-cased by a lower-casing rule set.
+    """
+    phrases = [qmod.Phrase(qmod.PhraseType.NONE, 'Hallo')]
+    expected = [qmod.Phrase(qmod.PhraseType.NONE, 'hallo')]
+
+    result = run_preprocessor_on(phrases, ':: lower();')
+
+    assert len(result) == 1
+    assert result == expected