add japanese phrase preprocessing

author Sarah Hoffmann <lonvia@denofr.de>

Wed, 8 Jan 2025 18:43:25 +0000 (19:43 +0100)

committer Sarah Hoffmann <lonvia@denofr.de>

Thu, 9 Jan 2025 08:24:10 +0000 (09:24 +0100)
author Sarah Hoffmann <lonvia@denofr.de>
Wed, 8 Jan 2025 18:43:25 +0000 (19:43 +0100)
committer Sarah Hoffmann <lonvia@denofr.de>
Thu, 9 Jan 2025 08:24:10 +0000 (09:24 +0100)
diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml

index 437319fab801d53fbd5b2cb5581e1dd88a0ff79a..6cf30d59e55b961d32ae7fc1b9a08bf9118d1c62 100644 (file)
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -1,4 +1,5 @@
  query-preprocessing:
+    - step: split_japanese_phrases
      - step: normalize
  normalization:
      - ":: lower ()"
diff --git a/src/nominatim_api/query_preprocessing/split_japanese_phrases.py b/src/nominatim_api/query_preprocessing/split_japanese_phrases.py

new file mode 100644 (file)

index 0000000..7ab55b5
--- /dev/null
+++ b/src/nominatim_api/query_preprocessing/split_japanese_phrases.py
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+This preprocessor divides Japanese addresses into up to three parts:
+prefecture, municipality, and the remainder.
+The division is not strict; it is a simple match on the suffix characters listed below.
+"""
+from typing import List
+import re
+
+from .config import QueryConfig
+from .base import QueryProcessingFunc
+from ..search.query import Phrase
+
+MATCH_PATTERNS = [  # verbose regexes, tried in order; the first match wins
+    r'''
+                (...??[都都道府県縣])            # [group1] prefecture
+                (.+?[市区區町村])              # [group2] municipalities (city/wards/towns/villages)
+                (.+)                         # [group3] other words
+                ''',  # prefecture + municipality + remainder
+    r'''
+                (...??[都都道府県縣])            # [group1] prefecture
+                (.+)                         # [group3] other words
+                ''',  # prefecture + remainder
+    r'''
+                (.+?[市区區町村])              # [group2] municipalities (city/wards/towns/villages)
+                (.+)                         # [group3] other words
+                '''  # municipality + remainder
+]
+
+
+class _JapanesePreprocessing:
+    """ Callable preprocessor that marks the parts of Japanese address phrases with ':'."""
+    def __init__(self, config: QueryConfig) -> None:
+        self.config = config  # kept on the instance; no method of this class reads it
+
+    def split_phrase(self, phrase: Phrase) -> Phrase:
+        """ Join the groups of the first matching pattern in MATCH_PATTERNS with ':'
+            and return them as a new Phrase; return `phrase` unchanged if none match.
+        """
+        for pattern in MATCH_PATTERNS:
+            result = re.match(pattern, phrase.text, re.VERBOSE)  # anchored at text start
+            if result is not None:
+                return Phrase(phrase.ptype, ':'.join(result.groups()))  # same phrase type
+
+        return phrase
+
+    def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
+        """ Apply split_phrase() to every phrase and return the results as a new list.
+        """
+        return [self.split_phrase(p) for p in phrases]
+
+
+def create(config: QueryConfig) -> QueryProcessingFunc:
+    """ Create the Japanese phrase-splitting preprocessor for the given configuration.
+    """
+    return _JapanesePreprocessing(config)
diff --git a/test/python/api/query_processing/test_split_japanese_phrases.py b/test/python/api/query_processing/test_split_japanese_phrases.py

new file mode 100644 (file)

index 0000000..6055f9d
--- /dev/null
+++ b/test/python/api/query_processing/test_split_japanese_phrases.py
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for Japanese phrase splitting.
+"""
+from pathlib import Path
+
+import pytest
+
+from icu import Transliterator
+
+import nominatim_api.search.query as qmod
+from nominatim_api.query_preprocessing.config import QueryConfig
+from nominatim_api.query_preprocessing import split_japanese_phrases
+
+def run_preprocessor_on(query):
+    """ Create the split_japanese_phrases preprocessor and apply it to `query`."""
+    proc = split_japanese_phrases.create(QueryConfig().set_normalizer(None))
+    return proc(query)
+
+
+@pytest.mark.parametrize('inp,outp', [('大阪府大阪市大阪', '大阪府:大阪市:大阪'),
+                                      ('大阪府大阪', '大阪府:大阪'),
+                                      ('大阪市大阪', '大阪市:大阪')])
+def test_split_phrases(inp, outp):
+    """ Each input phrase must come back with ':' inserted between its address parts."""
+    query = [qmod.Phrase(qmod.PhraseType.NONE, inp)]
+    out = run_preprocessor_on(query)
+
+    assert out == [qmod.Phrase(qmod.PhraseType.NONE, outp)]
author	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 8 Jan 2025 18:43:25 +0000 (19:43 +0100)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Thu, 9 Jan 2025 08:24:10 +0000 (09:24 +0100)
settings/icu_tokenizer.yaml		patch \| blob \| history
src/nominatim_api/query_preprocessing/split_japanese_phrases.py	[new file with mode: 0644]	patch \| blob
test/python/api/query_processing/test_split_japanese_phrases.py	[new file with mode: 0644]	patch \| blob