From: Sarah Hoffmann Date: Wed, 8 Jan 2025 18:43:25 +0000 (+0100) Subject: add japanese phrase preprocessing X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/efc09a5cfcfa85663db7d2a4deb86808a0a72de6 add japanese phrase preprocessing Code adapted from GSOC code by @miku. --- diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index 437319fa..6cf30d59 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -1,4 +1,5 @@ query-preprocessing: + - step: split_japanese_phrases - step: normalize normalization: - ":: lower ()" diff --git a/src/nominatim_api/query_preprocessing/split_japanese_phrases.py b/src/nominatim_api/query_preprocessing/split_japanese_phrases.py new file mode 100644 index 00000000..7ab55b5f --- /dev/null +++ b/src/nominatim_api/query_preprocessing/split_japanese_phrases.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2025 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +This module splits Japanese addresses into up to three parts: +prefecture, municipality, and the remaining words. +The split is a simple heuristic keyed on administrative suffix characters, not a strict parse. 
# Regular expressions (re.VERBOSE syntax) describing the supported address
# layouts. They are tried in order against the start of a phrase and the
# first one that matches wins, so the most specific layout comes first.
#
# * `...??` consumes two characters before the prefecture suffix, or three
#   when two are not enough.
# * The original character class listed 都 twice. The second one is presumably
#   the CJK compatibility code point U+FA26; it is spelled as an escape here
#   so that both forms are accepted.  NOTE(review): confirm against upstream.
# * The trailing group is written as `(?s:.+)` so that it also consumes line
#   breaks. `re.match` does not have to reach the end of the text, so a plain
#   `.+` would silently drop everything after the first newline.
MATCH_PATTERNS = [
    r'''
        (...??[都\uFA26道府県縣])  # [group1] prefecture
        (.+?[市区區町村])          # [group2] municipalities (city/wards/towns/villages)
        ((?s:.+))                  # [group3] other words
    ''',
    r'''
        (...??[都\uFA26道府県縣])  # [group1] prefecture
        ((?s:.+))                  # [group3] other words
    ''',
    r'''
        (.+?[市区區町村])          # [group2] municipalities (city/wards/towns/villages)
        ((?s:.+))                  # [group3] other words
    ''',
]

# Compiled once at import time instead of handing the pattern strings to
# `re.match` again for every phrase.
_COMPILED_PATTERNS = tuple(re.compile(pattern, re.VERBOSE) for pattern in MATCH_PATTERNS)


class _JapanesePreprocessing:
    """ Query preprocessor that marks the parts of a Japanese address.

        Each phrase whose text matches one of MATCH_PATTERNS is replaced by
        a phrase of the same type in which the matched parts are joined
        with ':'. All other phrases are passed through untouched.
    """

    def __init__(self, config: QueryConfig) -> None:
        # Stored only; nothing in this class reads the configuration.
        self.config = config

    def split_phrase(self, phrase: Phrase) -> Phrase:
        """ Return `phrase` with ':' inserted between its address parts.

            The patterns are tried in order and the first one matching at
            the start of the text decides the split. When no pattern
            matches, the very same phrase object is returned.
        """
        for pattern in _COMPILED_PATTERNS:
            result = pattern.match(phrase.text)
            if result is not None:
                return Phrase(phrase.ptype, ':'.join(result.groups()))

        return phrase

    def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
        """ Apply split_phrase() to every phrase and return the results
            as a new list in the same order.
        """
        return [self.split_phrase(p) for p in phrases]


def create(config: QueryConfig) -> QueryProcessingFunc:
    """ Create the Japanese phrase-splitting preprocessing function
        for the given query configuration.
    """
    return _JapanesePreprocessing(config)
def run_preprocessor_on(query):
    """ Run the Japanese phrase splitter over the list of phrases in
        `query` and return the resulting list.
    """
    proc = split_japanese_phrases.create(QueryConfig().set_normalizer(None))

    return proc(query)


@pytest.mark.parametrize('inp,outp', [
    # one case per pattern: all three parts, prefecture only, municipality only
    ('大阪府大阪市大阪', '大阪府:大阪市:大阪'),
    ('大阪府大阪', '大阪府:大阪'),
    ('大阪市大阪', '大阪市:大阪'),
    # prefecture name with three characters before the suffix
    ('神奈川県横浜市中区', '神奈川県:横浜市:中区'),
    # nothing follows the prefecture suffix, so there is nothing to split
    ('東京都', '東京都'),
    # text without any of the suffix characters is passed through unchanged
    ('Berlin', 'Berlin'),
])
def test_split_phrases(inp, outp):
    """ Check that the phrase text is split at the expected positions and
        that phrases matching no pattern come back unchanged.
    """
    query = [qmod.Phrase(qmod.PhraseType.NONE, inp)]

    out = run_preprocessor_on(query)

    assert out == [qmod.Phrase(qmod.PhraseType.NONE, outp)]