--- /dev/null
+# SPDX-License-Identifier: GPL-3.0-or-later
+# This file is part of Nominatim. (https://nominatim.org)
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
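+This module splits Japanese addresses into up to three parts:
+prefecture, municipality, and the remaining words.
+The split is not strict; it is a simple heuristic keyed on the
+administrative suffix characters (for example 都/道/府/県 for prefectures
+and 市/区/町/村 for municipalities) matched by the patterns below.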
+from typing import List
+import re
+from .config import QueryConfig
+from .base import QueryProcessingFunc
+from ..search.query import Phrase
+ r'''
+ (...??[都都道府県縣]) # [group1] prefecture
+ (.+?[市区區町村]) # [group2] municipalities (city/wards/towns/villages)
+ (.+) # [group3] other words
+ ''',
+ r'''
+ (...??[都都道府県縣]) # [group1] prefecture
+ (.+) # [group3] other words
+ ''',
+ r'''
+ (.+?[市区區町村]) # [group2] municipalities (city/wards/towns/villages)
+ (.+) # [group3] other words
+ '''
class _JapanesePreprocessing:
    """Query preprocessor that marks the parts of a Japanese address.

    Each phrase text is tried against the regular expressions in
    ``MATCH_PATTERNS``; on the first match the captured groups are
    joined with ':' to form the text of a new phrase.
    """

    def __init__(self, config: QueryConfig) -> None:
        # The configuration is only stored; nothing in this class reads it.
        self.config = config

    def split_phrase(self, phrase: Phrase) -> Phrase:
        """Return *phrase* with its text split by the first matching pattern.

        The patterns are applied in verbose mode, anchored at the start
        of the text, in the order given by ``MATCH_PATTERNS``. When one
        matches, a new ``Phrase`` with the same phrase type is returned
        whose text is the captured groups joined with ':'. When none
        matches, the original phrase object is returned unchanged.
        """
        # Lazy generator: patterns after the first successful one are
        # never evaluated.
        attempts = (re.match(regex, phrase.text, re.VERBOSE)
                    for regex in MATCH_PATTERNS)
        hit = next((m for m in attempts if m is not None), None)

        if hit is None:
            return phrase

        return Phrase(phrase.ptype, ':'.join(hit.groups()))

    def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
        """Apply :meth:`split_phrase` to every phrase and return the
        results as a new list in the same order.
        """
        return list(map(self.split_phrase, phrases))
def create(config: QueryConfig) -> QueryProcessingFunc:
    """Build the Japanese phrase-splitting preprocessor.

    The given configuration is handed to a new
    ``_JapanesePreprocessing`` instance, which is returned as the
    callable preprocessing function.
    """
    preprocessor = _JapanesePreprocessing(config)
    return preprocessor
--- /dev/null
+# SPDX-License-Identifier: GPL-3.0-or-later
+# This file is part of Nominatim. (https://nominatim.org)
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
+Tests for Japanese phrase splitting.
+from pathlib import Path
+import pytest
+from icu import Transliterator
+import nominatim_api.search.query as qmod
+from nominatim_api.query_preprocessing.config import QueryConfig
+from nominatim_api.query_preprocessing import split_japanese_phrases
def run_preprocessor_on(query):
    """Run the Japanese phrase splitter over *query* and return its output.

    The preprocessor is created from a fresh ``QueryConfig`` whose
    normalizer has been set to ``None``.
    """
    config = QueryConfig().set_normalizer(None)
    preprocessor = split_japanese_phrases.create(config)
    return preprocessor(query)
@pytest.mark.parametrize('inp,outp', [
    ('大阪府大阪市大阪', '大阪府:大阪市:大阪'),
    ('大阪府大阪', '大阪府:大阪'),
    ('大阪市大阪', '大阪市:大阪'),
])
def test_split_phrases(inp, outp):
    """Check that a single untyped phrase is split at the expected places.

    Covers the three pattern shapes: prefecture + municipality + rest,
    prefecture + rest, and municipality + rest.
    """
    expected = [qmod.Phrase(qmod.PhraseType.NONE, outp)]
    phrases = [qmod.Phrase(qmod.PhraseType.NONE, inp)]

    assert run_preprocessor_on(phrases) == expected