# Provenance -- reconstructed from a gitweb "commitdiff_plain" page whose
# line breaks had been lost:
#
#   From:      TuringVerified
#   Date:      Thu, 13 Mar 2025 14:31:21 +0000 (+0530)
#   Subject:   Add generic preprocessor
#   Commit:    4665ea3e773b5620f5b65cf2396a91192cc8cda0
#   X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/4665ea3e773b5620f5b65cf2396a91192cc8cda0?ds=inline;hp=--cc
#
# Files touched by the commit:
#   settings/icu_tokenizer.yaml
#       (index bb81f80b..5c79041e 100644, hunk @@ -1,5 +1,13 @@)
#   src/nominatim_api/query_preprocessing/regex_replace.py
#       (new file, index 00000000..883fa991, hunk @@ -0,0 +1,46 @@)
#
# Hunk for settings/icu_tokenizer.yaml ('+' marks added lines; the YAML
# indentation was lost in the source and is reconstructed here -- verify
# against the repository before relying on it):
#
#     query-preprocessing:
#         - step: split_japanese_phrases
#   +     - step: regex_replace
#   +       replacements:
#   +         - pattern: \b(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})(?:\.(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})){3}\b # Filter for IPv4 addresses
#   +           replace: ''
#   +         - pattern: \b(?:(?:[A-Fa-f0-9]{1,4}:){1,7}|:)(?:[A-Fa-f0-9]{1,4})?\b # Filter for IPv6 addresses
#   +           replace: ''
#   +         - pattern: https?://[^\s]* # Filter URLs starting with http or https
#   +           replace: ''
#         - step: normalize
#     normalization:
#         - ":: lower ()"
#
# Everything below is the content of
# src/nominatim_api/query_preprocessing/regex_replace.py.

# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Query preprocessing step that rewrites the text of query phrases
using regular-expression rules taken from the step configuration.
"""
from typing import List, Tuple
import re

from .config import QueryConfig
from .base import QueryProcessingFunc
from ..search.query import Phrase


class _GenericPreprocessing:
    """ Apply a list of regex substitutions to the text of each phrase.

        The rules are read from the 'replacements' entry of the step
        configuration. Each rule is a mapping with the keys 'pattern'
        (a regular expression) and 'replace' (the substitution text).
        Rules are applied in the order in which they are configured.
    """

    def __init__(self, config: QueryConfig) -> None:
        """ Read the replacement rules from `config` and compile them.

            A missing or empty 'replacements' entry results in a
            preprocessor that leaves all phrases unchanged.

            Raises `re.error` when a pattern is not a valid regular
            expression and `KeyError` when a rule lacks the 'pattern'
            or 'replace' key.
        """
        self.config = config

        # Fall back to an empty rule set. A string fallback would be
        # iterated character by character and break on item['pattern'].
        rules = config.get('replacements') or ()

        # Compile once here instead of on every phrase, so that a broken
        # rule is reported when the step is created, not during a query.
        self._rules: List[Tuple['re.Pattern[str]', str]] = [
            (re.compile(rule['pattern']), rule['replace']) for rule in rules
        ]

    def split_phrase(self, phrase: Phrase) -> Phrase:
        """ Run all configured replacements over the text of `phrase`.

            Despite its name, this method does not split anything: the
            phrase is modified in place and the same object is returned.
            Phrases whose text is None are returned untouched.
        """
        text = phrase.text
        if text is None:
            return phrase

        # Each rule operates on the output of the previous one.
        for pattern, replacement in self._rules:
            text = pattern.sub(replacement, text)

        phrase.text = text
        return phrase

    def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
        """ Apply the regex replacements to every phrase in `phrases`.

            Returns a new list with the (in-place modified) phrases in
            their original order.
        """
        return [self.split_phrase(p) for p in phrases]


def create(config: QueryConfig) -> QueryProcessingFunc:
    """ Create a function for generic regex-based preprocessing.
    """
    return _GenericPreprocessing(config)