src/nominatim_api/query_preprocessing/split_japanese_phrases.py

   1 # SPDX-License-Identifier: GPL-3.0-or-later
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2025 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 This file divides Japanese addresses into three categories:
   9 prefecture, municipality, and other.
The division is not strict; it is a simple split based on the suffix
characters listed in MATCH_PATTERNS below.
  11 """
  12 from typing import List
  13 import re
  14
  15 from .config import QueryConfig
  16 from .base import QueryProcessingFunc
  17 from ..search.query import Phrase
  18
# Regular expressions used to split a Japanese address into its parts.
# split_phrase() tries them in list order and uses the first one that
# matches. They are applied with re.match() and the re.VERBOSE flag, so
# each pattern is anchored at the start of the text, and the whitespace
# and '#' comments inside the pattern strings are ignored by the regex
# engine.
#
# NOTE: the '[groupN]' labels inside the pattern strings name the role a
# group plays in the first (three-group) pattern. The second and third
# pattern contain only two capture groups each, so their actual group
# numbers are 1 and 2 regardless of the label.
#
# NOTE(review): the prefecture character class shows the same character
# twice; one of them may be a visually identical variant code point.
# Verify the code points before deduplicating.
MATCH_PATTERNS = [
    # Prefecture + municipality + remainder.
    # '...??' consumes two characters and, lazily, an optional third one
    # before the prefecture suffix character.
    r'''
                (...??[都都道府県縣])            # [group1] prefecture
                (.+?[市区區町村])              # [group2] municipalities (city/wards/towns/villages)
                (.+)                         # [group3] other words
                ''',
    # Prefecture + remainder (no municipality suffix found after it).
    r'''
                (...??[都都道府県縣])            # [group1] prefecture
                (.+)                         # [group3] other words
                ''',
    # Municipality + remainder (no leading prefecture found).
    r'''
                (.+?[市区區町村])              # [group2] municipalities (city/wards/towns/villages)
                (.+)                         # [group3] other words
                '''
]
  34
  35
  36 class _JapanesePreprocessing:
  37
  38     def __init__(self, config: QueryConfig) -> None:
  39         self.config = config
  40
  41     def split_phrase(self, phrase: Phrase) -> Phrase:
  42         """
  43         This function performs a division on the given text using a regular expression.
  44         """
  45         for pattern in MATCH_PATTERNS:
  46             result = re.match(pattern, phrase.text, re.VERBOSE)
  47             if result is not None:
  48                 return Phrase(phrase.ptype, ':'.join(result.groups()))
  49
  50         return phrase
  51
  52     def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
  53         """Split a Japanese address using japanese_tokenizer.
  54         """
  55         return [self.split_phrase(p) for p in phrases]
  56
  57
  58 def create(config: QueryConfig) -> QueryProcessingFunc:
  59     """ Create a function of japanese preprocessing.
  60     """
  61     return _JapanesePreprocessing(config)