From: TuringVerified Date: Thu, 20 Mar 2025 14:30:34 +0000 (+0530) Subject: Update documentation, optimise regex_replace, add tests X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/6d5a4a20c5813054215efcb20dda52d76082e54a?hp=--cc Update documentation, optimise regex_replace, add tests --- 6d5a4a20c5813054215efcb20dda52d76082e54a diff --git a/docs/customize/Tokenizers.md b/docs/customize/Tokenizers.md index d290c148..2430be7e 100644 --- a/docs/customize/Tokenizers.md +++ b/docs/customize/Tokenizers.md @@ -67,7 +67,12 @@ Here is an example configuration file: ``` yaml query-preprocessing: - - normalize + - step: regex_replace + replacements: + - pattern: https?://[^\s]* # Filter URLs starting with http or https + replace: '' + - step: normalize + normalization: - ":: lower ()" - "ß > 'ss'" # German szet is unambiguously equal to double ss @@ -88,8 +93,8 @@ token-analysis: replacements: ['ä', 'ae'] ``` -The configuration file contains four sections: -`normalization`, `transliteration`, `sanitizers` and `token-analysis`. +The configuration file contains five sections: +`query-preprocessing`, `normalization`, `transliteration`, `sanitizers` and `token-analysis`. #### Query preprocessing @@ -106,6 +111,17 @@ The following is a list of preprocessors that are shipped with Nominatim. heading_level: 6 docstring_section_style: spacy +::: nominatim_api.query_preprocessing.regex_replace + options: + members: False + heading_level: 6 + docstring_section_style: spacy + description: + This option runs any given regex pattern on the input and replaces values accordingly + replacements: + - pattern: regex pattern + replace: string to replace with + #### Normalization and Transliteration diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index 5c79041e..bb81f80b 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -1,13 +1,5 @@ query-preprocessing: - step: split_japanese_phrases - - step: regex_replace - replacements: - - pattern: \b(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})(?:\.(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})){3}\b # Filter for IPv4 addresses - replace: '' - - pattern: \b(?:(?:[A-Fa-f0-9]{1,4}:){1,7}|:)(?:[A-Fa-f0-9]{1,4})?\b # Filter for IPv6 addresses - replace: '' - - pattern: https?://[^\s]* # Filter URLs starting with http or https - replace: '' - step: normalize normalization: - ":: lower ()" diff --git a/src/nominatim_api/query_preprocessing/regex_replace.py b/src/nominatim_api/query_preprocessing/regex_replace.py index 883fa991..b711d54b 100644 --- a/src/nominatim_api/query_preprocessing/regex_replace.py +++ b/src/nominatim_api/query_preprocessing/regex_replace.py @@ -20,24 +20,25 @@ class _GenericPreprocessing: def __init__(self, config: QueryConfig) -> None: self.config = config + match_patterns = self.config.get('replacements', 'Key not found') + self.compiled_patterns = [ + (re.compile(item['pattern']), item['replace']) for item in match_patterns + ] + def split_phrase(self, phrase: Phrase) -> Phrase: """ This function performs replacements on the given text using regex patterns. """ - - if phrase.text is None: - return phrase - - match_patterns = self.config.get('replacements', 'Key not found') - for item in match_patterns: - phrase.text = re.sub(item['pattern'], item['replace'], phrase.text) + for item in self.compiled_patterns: + phrase.text = item[0].sub(item[1], phrase.text) return phrase def __call__(self, phrases: List[Phrase]) -> List[Phrase]: """Apply regex replacements to the given addresses. """ - return [self.split_phrase(p) for p in phrases] + result = [p for p in map(self.split_phrase, phrases) if p.text.strip()] + return result if result else [] def create(config: QueryConfig) -> QueryProcessingFunc: diff --git a/test/python/api/query_processing/test_regex_replace.py b/test/python/api/query_processing/test_regex_replace.py new file mode 100644 index 00000000..288ac23e --- /dev/null +++ b/test/python/api/query_processing/test_regex_replace.py @@ -0,0 +1,51 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2025 by the Nominatim developer community. +# For a full list of authors see the git log. +''' +Tests for replacing values in an input using custom regex. +''' +import pytest + +import nominatim_api.search.query as qmod +from nominatim_api.query_preprocessing.config import QueryConfig +from nominatim_api.query_preprocessing import regex_replace + + +def run_preprocessor_on(query): + config = QueryConfig() + config.set_normalizer(None) + + config['replacements'] = [ + {'pattern': r'\b(?:\d{1,3}\.){3}\d{1,3}\b', 'replace': ''}, # IPv4 + {'pattern': r'https?://\S+', 'replace': ''} # HTTP/HTTPS URLs + ] + + proc = regex_replace.create(config) + return proc(query) + + +@pytest.mark.parametrize('inp,outp', [ + (['45.67.89.101'], []), + (['198.51.100.23'], []), + (['203.0.113.255'], []), + (['http://www.openstreetmap.org'], []), + (['https://www.openstreetmap.org/edit'], []), + (['http://osm.org'], []), + (['https://www.openstreetmap.org/user/abc'], []), + (['https://tile.openstreetmap.org/12/2048/2048.png'], []), + (['Check the map at https://www.openstreetmap.org'], ['Check the map at ']), + (['Use 203.0.113.255 for routing'], ['Use for routing']), + (['Find maps at https://osm.org and http://openstreetmap.org'], ['Find maps at and ']), + (['203.0.113.255', 'Some Address'], ['Some Address']), + (['https://osm.org', 'Another Place'], ['Another Place']), +]) +def test_split_phrases(inp, outp): + query = [qmod.Phrase(qmod.PHRASE_ANY, text) for text in inp] + + out = run_preprocessor_on(query) + expected_out = [qmod.Phrase(qmod.PHRASE_ANY, text) for text in outp] + + assert out == expected_out, f"Expected {expected_out}, but got {out}"