]> git.openstreetmap.org Git - nominatim.git/commitdiff
Add generic preprocessor
authorTuringVerified <imerminu@tutanota.com>
Thu, 13 Mar 2025 14:31:21 +0000 (20:01 +0530)
committerTuringVerified <imerminu@tutanota.com>
Tue, 1 Apr 2025 13:24:30 +0000 (18:54 +0530)
settings/icu_tokenizer.yaml
src/nominatim_api/query_preprocessing/regex_replace.py [new file with mode: 0644]

index bb81f80bd2b70fd2ed3a95b20782023be5a1ab10..5c79041eeb6aa10e3c02852439ce4c016ede5333 100644 (file)
@@ -1,5 +1,13 @@
 query-preprocessing:
     - step: split_japanese_phrases
+    - step: regex_replace
+      replacements:
+        - pattern: \b(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})(?:\.(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})){3}\b # Filter for IPv4 addresses
+          replace: ''
+        - pattern: \b(?:(?:[A-Fa-f0-9]{1,4}:){1,7}|:)(?:[A-Fa-f0-9]{1,4})?\b # Filter for IPv6 addresses
+          replace: ''
+        - pattern: https?://[^\s]* # Filter URLs starting with http or https
+          replace: ''
     - step: normalize
 normalization:
     - ":: lower ()"
diff --git a/src/nominatim_api/query_preprocessing/regex_replace.py b/src/nominatim_api/query_preprocessing/regex_replace.py
new file mode 100644 (file)
index 0000000..883fa99
--- /dev/null
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Query preprocessor that replaces parts of the phrase text based on regex rules taken from the configuration.
+"""
+from typing import List
+import re
+
+from .config import QueryConfig
+from .base import QueryProcessingFunc
+from ..search.query import Phrase
+
+
+class _GenericPreprocessing:
+    """ Query preprocessor that applies a list of regex replacements
+        to the text of each phrase.
+
+        The rules are read from the 'replacements' entry of the step
+        configuration. Each rule is a mapping with a 'pattern' (regular
+        expression) and a 'replace' (replacement string) entry. Rules
+        are applied in the order in which they are configured.
+    """
+
+    def __init__(self, config: QueryConfig) -> None:
+        self.config = config
+
+        # Compile the rules once here instead of on every phrase. A missing
+        # or empty 'replacements' entry simply means there is nothing to
+        # replace. Invalid patterns raise re.error right away.
+        self.compiled_patterns = [
+            (re.compile(item['pattern']), item['replace'])
+            for item in (self.config.get('replacements') or ())
+        ]
+
+    def split_phrase(self, phrase: Phrase) -> Phrase:
+        """ Apply all configured replacements to the text of the phrase.
+
+            The phrase is modified in place and returned. Phrases whose
+            text is None are returned untouched.
+        """
+        if phrase.text is None:
+            return phrase
+
+        for pattern, replacement in self.compiled_patterns:
+            phrase.text = pattern.sub(replacement, phrase.text)
+
+        return phrase
+
+    def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
+        """ Apply the regex replacements to every phrase in the list
+            and return the resulting list of phrases.
+        """
+        return [self.split_phrase(p) for p in phrases]
+
+
+def create(config: QueryConfig) -> QueryProcessingFunc:
+    """ Set up the regex replacement preprocessor for the given
+        configuration and return it as the processing function.
+    """
+    preprocessor = _GenericPreprocessing(config)
+    return preprocessor