Update documentation, optimise regex_replace, add tests

author TuringVerified <imerminu@tutanota.com>

Thu, 20 Mar 2025 14:30:34 +0000 (20:00 +0530)

committer TuringVerified <imerminu@tutanota.com>

Tue, 1 Apr 2025 13:24:30 +0000 (18:54 +0530)
author TuringVerified <imerminu@tutanota.com>
Thu, 20 Mar 2025 14:30:34 +0000 (20:00 +0530)
committer TuringVerified <imerminu@tutanota.com>
Tue, 1 Apr 2025 13:24:30 +0000 (18:54 +0530)
diff --git a/docs/customize/Tokenizers.md b/docs/customize/Tokenizers.md

index d290c14816ae162872f793e2b605346aace7bcc1..2430be7efdb69a033f1ac8d1fa7104a7aab1206c 100644 (file)
--- a/docs/customize/Tokenizers.md
+++ b/docs/customize/Tokenizers.md
@@ -67,7 +67,12 @@ Here is an example configuration file:
  
  ``` yaml
  query-preprocessing:
-    - normalize
+    - step: regex_replace
+      replacements:
+        - pattern: https?://[^\s]* # Filter URLs starting with http or https
+          replace: ''
+    - step: normalize
+
  normalization:
      - ":: lower ()"
      - "ß > 'ss'" # German szet is unambiguously equal to double ss
@@ -88,8 +93,8 @@ token-analysis:
              replacements: ['ä', 'ae']
  ```
  
-The configuration file contains four sections:
-`normalization`, `transliteration`, `sanitizers` and `token-analysis`.
+The configuration file contains five sections:
+`query-preprocessing`, `normalization`, `transliteration`, `sanitizers` and `token-analysis`.
  
  #### Query preprocessing
  
@@ -106,6 +111,17 @@ The following is a list of preprocessors that are shipped with Nominatim.
          heading_level: 6
          docstring_section_style: spacy
  
+::: nominatim_api.query_preprocessing.regex_replace
+    options:
+        members: False
+        heading_level: 6
+        docstring_section_style: spacy
+    description: 
+        This option runs each configured regex pattern on the input and replaces every match with the given replacement string.
+    replacements:
+        - pattern: regex pattern
+          replace: string to replace with
+
  
  #### Normalization and Transliteration
  
diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml

index 5c79041eeb6aa10e3c02852439ce4c016ede5333..bb81f80bd2b70fd2ed3a95b20782023be5a1ab10 100644 (file)
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -1,13 +1,5 @@
  query-preprocessing:
      - step: split_japanese_phrases
-    - step: regex_replace
-      replacements:
-        - pattern: \b(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})(?:\.(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})){3}\b # Filter for IPv4 addresses
-          replace: ''
-        - pattern: \b(?:(?:[A-Fa-f0-9]{1,4}:){1,7}|:)(?:[A-Fa-f0-9]{1,4})?\b # Filter for IPv6 addresses
-          replace: ''
-        - pattern: https?://[^\s]* # Filter URLs starting with http or https
-          replace: ''
      - step: normalize
  normalization:
      - ":: lower ()"
diff --git a/src/nominatim_api/query_preprocessing/regex_replace.py b/src/nominatim_api/query_preprocessing/regex_replace.py

index 883fa99182a39ae87bf7d3237b8163d80887cf02..b711d54b7f6c0d823c73493c8d3dd6ad9332e4a0 100644 (file)
--- a/src/nominatim_api/query_preprocessing/regex_replace.py
+++ b/src/nominatim_api/query_preprocessing/regex_replace.py
@@ -20,24 +20,25 @@ class _GenericPreprocessing:
      def __init__(self, config: QueryConfig) -> None:
          self.config = config
  
+        match_patterns = self.config.get('replacements', 'Key not found')
+        self.compiled_patterns = [
+            (re.compile(item['pattern']), item['replace']) for item in match_patterns
+            ]
+
      def split_phrase(self, phrase: Phrase) -> Phrase:
          """
          This function performs replacements on the given text using regex patterns.
          """
-
-        if phrase.text is None:
-            return phrase
-
-        match_patterns = self.config.get('replacements', 'Key not found')
-        for item in match_patterns:
-            phrase.text = re.sub(item['pattern'], item['replace'], phrase.text)
+        for item in self.compiled_patterns:
+            phrase.text = item[0].sub(item[1], phrase.text)
  
          return phrase
  
      def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
          """Apply regex replacements to the given addresses.
          """
-        return [self.split_phrase(p) for p in phrases]
+        result = [p for p in map(self.split_phrase, phrases) if p.text.strip()]
+        return result if result else []
  
  
  def create(config: QueryConfig) -> QueryProcessingFunc:
diff --git a/test/python/api/query_processing/test_regex_replace.py b/test/python/api/query_processing/test_regex_replace.py

new file mode 100644 (file)

index 0000000..288ac23
--- /dev/null
+++ b/test/python/api/query_processing/test_regex_replace.py
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
+'''
+Tests for replacing values in an input using custom regex.
+'''
+import pytest
+
+import nominatim_api.search.query as qmod
+from nominatim_api.query_preprocessing.config import QueryConfig
+from nominatim_api.query_preprocessing import regex_replace
+
+
+def run_preprocessor_on(query):
+    config = QueryConfig()
+    config.set_normalizer(None)
+
+    config['replacements'] = [
+        {'pattern': r'\b(?:\d{1,3}\.){3}\d{1,3}\b', 'replace': ''},  # IPv4
+        {'pattern': r'https?://\S+', 'replace': ''}  # HTTP/HTTPS URLs
+    ]
+
+    proc = regex_replace.create(config)
+    return proc(query)
+
+
+@pytest.mark.parametrize('inp,outp', [
+    (['45.67.89.101'], []),
+    (['198.51.100.23'], []),
+    (['203.0.113.255'], []),
+    (['http://www.openstreetmap.org'], []),
+    (['https://www.openstreetmap.org/edit'], []),
+    (['http://osm.org'], []),
+    (['https://www.openstreetmap.org/user/abc'], []),
+    (['https://tile.openstreetmap.org/12/2048/2048.png'], []),
+    (['Check the map at https://www.openstreetmap.org'], ['Check the map at ']),
+    (['Use 203.0.113.255 for routing'], ['Use  for routing']),
+    (['Find maps at https://osm.org and http://openstreetmap.org'], ['Find maps at  and ']),
+    (['203.0.113.255', 'Some Address'], ['Some Address']),
+    (['https://osm.org', 'Another Place'], ['Another Place']),
+])
+def test_split_phrases(inp, outp):
+    query = [qmod.Phrase(qmod.PHRASE_ANY, text) for text in inp]
+
+    out = run_preprocessor_on(query)
+    expected_out = [qmod.Phrase(qmod.PHRASE_ANY, text) for text in outp]
+
+    assert out == expected_out, f"Expected {expected_out}, but got {out}"
author	TuringVerified <imerminu@tutanota.com>
	Thu, 20 Mar 2025 14:30:34 +0000 (20:00 +0530)
committer	TuringVerified <imerminu@tutanota.com>
	Tue, 1 Apr 2025 13:24:30 +0000 (18:54 +0530)
docs/customize/Tokenizers.md		patch \| blob \| history
settings/icu_tokenizer.yaml		patch \| blob \| history
src/nominatim_api/query_preprocessing/regex_replace.py		patch \| blob \| history
test/python/api/query_processing/test_regex_replace.py	[new file with mode: 0644]	patch \| blob