``` yaml
query-preprocessing:
- - normalize
+ - step: regex_replace
+ replacements:
+ - pattern: https?://[^\s]* # Filter URLs starting with http or https
+ replace: ''
+ - step: normalize
+
normalization:
- ":: lower ()"
- "ß > 'ss'" # German eszett (ß) is unambiguously equal to a double s
replacements: ['ä', 'ae']
```
-The configuration file contains four sections:
-`normalization`, `transliteration`, `sanitizers` and `token-analysis`.
+The configuration file contains five sections:
+`query-preprocessing`, `normalization`, `transliteration`, `sanitizers` and `token-analysis`.
#### Query preprocessing
heading_level: 6
docstring_section_style: spacy
+::: nominatim_api.query_preprocessing.regex_replace
+ options:
+ members: False
+ heading_level: 6
+ docstring_section_style: spacy
+ description:
+ This option runs any given regex pattern on the input and replaces values accordingly
+ replacements:
+ - pattern: regex pattern
+ replace: string to replace with
+
#### Normalization and Transliteration
def __init__(self, config: QueryConfig) -> None:
    """Store the configuration and pre-compile the replacement patterns.

    The 'replacements' entry of the configuration is read as a list of
    mappings, each providing a 'pattern' (regular expression) and a
    'replace' (substitution string) key. A missing or empty entry is
    treated as "no replacements", so phrases are then left unchanged.
    """
    self.config = config
    # Fall back to an empty list. A string default would be iterated
    # character by character and fail with an obscure TypeError on
    # item['pattern'].
    match_patterns = self.config.get('replacements', None) or []
    # Compile once at construction time instead of on every query.
    self.compiled_patterns = [
        (re.compile(item['pattern']), item['replace'])
        for item in match_patterns
    ]
+
def split_phrase(self, phrase: Phrase) -> Phrase:
    """Apply all compiled regex replacements to the text of a phrase.

    The patterns in self.compiled_patterns are applied in list order,
    each one operating on the result of the previous one. The phrase
    is modified in place and the same object is returned.
    """
    for pattern, replacement in self.compiled_patterns:
        phrase.text = pattern.sub(replacement, phrase.text)
    return phrase
def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
    """Apply the regex replacements to the given phrases.

    Every phrase is passed through split_phrase(). Phrases whose text
    is empty or consists only of whitespace afterwards are dropped, so
    the returned list may be shorter than the input or empty.
    """
    return [p for p in map(self.split_phrase, phrases) if p.text.strip()]
def create(config: QueryConfig) -> QueryProcessingFunc:
--- /dev/null
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
+'''
+Tests for replacing values in an input using custom regex.
+'''
+import pytest
+
+import nominatim_api.search.query as qmod
+from nominatim_api.query_preprocessing.config import QueryConfig
+from nominatim_api.query_preprocessing import regex_replace
+
+
def run_preprocessor_on(query):
    """Run the regex_replace preprocessor over the given list of phrases.

    The preprocessor is configured without a normalizer and with two
    replacement rules that delete IPv4 addresses and HTTP(S) URLs.
    """
    replacements = [
        {'pattern': r'\b(?:\d{1,3}\.){3}\d{1,3}\b', 'replace': ''},  # IPv4
        {'pattern': r'https?://\S+', 'replace': ''},  # HTTP/HTTPS URLs
    ]

    config = QueryConfig()
    config.set_normalizer(None)
    config['replacements'] = replacements

    preprocessor = regex_replace.create(config)
    return preprocessor(query)
+
+
@pytest.mark.parametrize('inp,outp', [
    # Phrases that consist only of an IP address or URL are removed.
    (['45.67.89.101'], []),
    (['198.51.100.23'], []),
    (['203.0.113.255'], []),
    (['http://www.openstreetmap.org'], []),
    (['https://www.openstreetmap.org/edit'], []),
    (['http://osm.org'], []),
    (['https://www.openstreetmap.org/user/abc'], []),
    (['https://tile.openstreetmap.org/12/2048/2048.png'], []),
    # Only the match itself is deleted. The whitespace around it stays,
    # hence the trailing and the doubled spaces in the expected output.
    (['Check the map at https://www.openstreetmap.org'], ['Check the map at ']),
    (['Use 203.0.113.255 for routing'], ['Use  for routing']),
    (['Find maps at https://osm.org and http://openstreetmap.org'], ['Find maps at  and ']),
    # Emptied phrases are dropped, the remaining ones are kept.
    (['203.0.113.255', 'Some Address'], ['Some Address']),
    (['https://osm.org', 'Another Place'], ['Another Place']),
])
def test_split_phrases(inp, outp):
    """Check the preprocessor output for IP addresses and URLs in phrases."""
    query = [qmod.Phrase(qmod.PHRASE_ANY, text) for text in inp]

    out = run_preprocessor_on(query)
    expected_out = [qmod.Phrase(qmod.PHRASE_ANY, text) for text in outp]

    assert out == expected_out, f"Expected {expected_out}, but got {out}"