git.openstreetmap.org Git - nominatim.git/commitdiff
Update documentation, optimise regex_replace, add tests
author TuringVerified <imerminu@tutanota.com>
Thu, 20 Mar 2025 14:30:34 +0000 (20:00 +0530)
committer TuringVerified <imerminu@tutanota.com>
Tue, 1 Apr 2025 13:24:30 +0000 (18:54 +0530)
docs/customize/Tokenizers.md
settings/icu_tokenizer.yaml
src/nominatim_api/query_preprocessing/regex_replace.py
test/python/api/query_processing/test_regex_replace.py [new file with mode: 0644]

index d290c14816ae162872f793e2b605346aace7bcc1..2430be7efdb69a033f1ac8d1fa7104a7aab1206c 100644 (file)
@@ -67,7 +67,12 @@ Here is an example configuration file:
 
 ``` yaml
 query-preprocessing:
-    - normalize
+    - step: regex_replace
+      replacements:
+        - pattern: https?://[^\s]* # Filter URLs starting with http or https
+          replace: ''
+    - step: normalize
+
 normalization:
     - ":: lower ()"
     - "ß > 'ss'" # German szet is unambiguously equal to double ss
@@ -88,8 +93,8 @@ token-analysis:
             replacements: ['ä', 'ae']
 ```
 
-The configuration file contains four sections:
-`normalization`, `transliteration`, `sanitizers` and `token-analysis`.
+The configuration file contains five sections:
+`query-preprocessing`, `normalization`, `transliteration`, `sanitizers` and `token-analysis`.
 
 #### Query preprocessing
 
@@ -106,6 +111,17 @@ The following is a list of preprocessors that are shipped with Nominatim.
         heading_level: 6
         docstring_section_style: spacy
 
+::: nominatim_api.query_preprocessing.regex_replace
+    options:
+        members: False
+        heading_level: 6
+        docstring_section_style: spacy
+    description: 
+        This option runs any given regex pattern on the input and replaces values accordingly
+    replacements:
+        - pattern: regex pattern
+          replace: string to replace with
+
 
 #### Normalization and Transliteration
 
index 5c79041eeb6aa10e3c02852439ce4c016ede5333..bb81f80bd2b70fd2ed3a95b20782023be5a1ab10 100644 (file)
@@ -1,13 +1,5 @@
 query-preprocessing:
     - step: split_japanese_phrases
-    - step: regex_replace
-      replacements:
-        - pattern: \b(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})(?:\.(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})){3}\b # Filter for IPv4 addresses
-          replace: ''
-        - pattern: \b(?:(?:[A-Fa-f0-9]{1,4}:){1,7}|:)(?:[A-Fa-f0-9]{1,4})?\b # Filter for IPv6 addresses
-          replace: ''
-        - pattern: https?://[^\s]* # Filter URLs starting with http or https
-          replace: ''
     - step: normalize
 normalization:
     - ":: lower ()"
index 883fa99182a39ae87bf7d3237b8163d80887cf02..b711d54b7f6c0d823c73493c8d3dd6ad9332e4a0 100644 (file)
@@ -20,24 +20,25 @@ class _GenericPreprocessing:
     def __init__(self, config: QueryConfig) -> None:
         self.config = config
 
+        match_patterns = self.config.get('replacements', 'Key not found')
+        self.compiled_patterns = [
+            (re.compile(item['pattern']), item['replace']) for item in match_patterns
+            ]
+
     def split_phrase(self, phrase: Phrase) -> Phrase:
         """
         This function performs replacements on the given text using regex patterns.
         """
-
-        if phrase.text is None:
-            return phrase
-
-        match_patterns = self.config.get('replacements', 'Key not found')
-        for item in match_patterns:
-            phrase.text = re.sub(item['pattern'], item['replace'], phrase.text)
+        for item in self.compiled_patterns:
+            phrase.text = item[0].sub(item[1], phrase.text)
 
         return phrase
 
     def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
         """Apply regex replacements to the given addresses.
         """
-        return [self.split_phrase(p) for p in phrases]
+        result = [p for p in map(self.split_phrase, phrases) if p.text.strip()]
+        return result if result else []
 
 
 def create(config: QueryConfig) -> QueryProcessingFunc:
diff --git a/test/python/api/query_processing/test_regex_replace.py b/test/python/api/query_processing/test_regex_replace.py
new file mode 100644 (file)
index 0000000..288ac23
--- /dev/null
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
+'''
+Tests for replacing values in an input using custom regex.
+'''
+import pytest
+
+import nominatim_api.search.query as qmod
+from nominatim_api.query_preprocessing.config import QueryConfig
+from nominatim_api.query_preprocessing import regex_replace
+
+
+def run_preprocessor_on(query):
+    config = QueryConfig()
+    config.set_normalizer(None)
+
+    config['replacements'] = [
+        {'pattern': r'\b(?:\d{1,3}\.){3}\d{1,3}\b', 'replace': ''},  # IPv4
+        {'pattern': r'https?://\S+', 'replace': ''}  # HTTP/HTTPS URLs
+    ]
+
+    proc = regex_replace.create(config)
+    return proc(query)
+
+
+@pytest.mark.parametrize('inp,outp', [
+    (['45.67.89.101'], []),
+    (['198.51.100.23'], []),
+    (['203.0.113.255'], []),
+    (['http://www.openstreetmap.org'], []),
+    (['https://www.openstreetmap.org/edit'], []),
+    (['http://osm.org'], []),
+    (['https://www.openstreetmap.org/user/abc'], []),
+    (['https://tile.openstreetmap.org/12/2048/2048.png'], []),
+    (['Check the map at https://www.openstreetmap.org'], ['Check the map at ']),
+    (['Use 203.0.113.255 for routing'], ['Use  for routing']),
+    (['Find maps at https://osm.org and http://openstreetmap.org'], ['Find maps at  and ']),
+    (['203.0.113.255', 'Some Address'], ['Some Address']),
+    (['https://osm.org', 'Another Place'], ['Another Place']),
+])
+def test_split_phrases(inp, outp):
+    query = [qmod.Phrase(qmod.PHRASE_ANY, text) for text in inp]
+
+    out = run_preprocessor_on(query)
+    expected_out = [qmod.Phrase(qmod.PHRASE_ANY, text) for text in outp]
+
+    assert out == expected_out, f"Expected {expected_out}, but got {out}"