Merge remote-tracking branch 'upstream/master'

[nominatim.git] / src / nominatim_db / tokenizer / token_analysis / generic.py
diff --git a/src/nominatim_db/tokenizer/token_analysis/generic.py b/src/nominatim_db/tokenizer/token_analysis/generic.py

index cd649e62f54a4fd37a430a7d1822b99d89aeba40..fa9dc4dfa54c66e6f25a408129ba327d228b38b0 100644 (file)
--- a/src/nominatim_db/tokenizer/token_analysis/generic.py
+++ b/src/nominatim_db/tokenizer/token_analysis/generic.py
@@ -2,7 +2,7 @@
  #
  # This file is part of Nominatim. (https://nominatim.org)
  #
  #
  # This file is part of Nominatim. (https://nominatim.org)
  #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
  # For a full list of authors see the git log.
  """
  Generic processor for names that creates abbreviation variants.
  # For a full list of authors see the git log.
  """
  Generic processor for names that creates abbreviation variants.
@@ -10,22 +10,21 @@ Generic processor for names that creates abbreviation variants.
  from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast
  import itertools
  
  from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast
  import itertools
  
-import datrie
-
-from nominatim_core.errors import UsageError
+from ...errors import UsageError
  from ...data.place_name import PlaceName
  from .config_variants import get_variant_config
  from .generic_mutation import MutationVariantGenerator
  from ...data.place_name import PlaceName
  from .config_variants import get_variant_config
  from .generic_mutation import MutationVariantGenerator
+from .simple_trie import SimpleTrie
+
+# Configuration section
  
  
-### Configuration section
  
  def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, Any]:
      """ Extract and preprocess the configuration for this module.
      """
      config: Dict[str, Any] = {}
  
  
  def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, Any]:
      """ Extract and preprocess the configuration for this module.
      """
      config: Dict[str, Any] = {}
  
-    config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
-                                                                 normalizer)
+    config['replacements'], _ = get_variant_config(rules.get('variants'), normalizer)
      config['variant_only'] = rules.get('mode', '') == 'variant-only'
  
      # parse mutation rules
      config['variant_only'] = rules.get('mode', '') == 'variant-only'
  
      # parse mutation rules
@@ -47,7 +46,7 @@ def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, An
      return config
  
  
      return config
  
  
-### Analysis section
+# Analysis section
  
  def create(normalizer: Any, transliterator: Any,
             config: Mapping[str, Any]) -> 'GenericTokenAnalysis':
  
  def create(normalizer: Any, transliterator: Any,
             config: Mapping[str, Any]) -> 'GenericTokenAnalysis':
@@ -67,24 +66,18 @@ class GenericTokenAnalysis:
          self.variant_only = config['variant_only']
  
          # Set up the replacement trie
          self.variant_only = config['variant_only']
  
          # Set up the replacement trie
-        if config['replacements']:
-            self.replacements = datrie.Trie(config['chars'])
-            for src, repllist in config['replacements']:
-                self.replacements[src] = repllist
-        else:
-            self.replacements = None
+        self.replacements: Optional[SimpleTrie[List[str]]] = \
+            SimpleTrie(config['replacements']) if config['replacements'] else None
  
          # set up mutation rules
          self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
  
  
          # set up mutation rules
          self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
  
-
      def get_canonical_id(self, name: PlaceName) -> str:
          """ Return the normalized form of the name. This is the standard form
              from which possible variants for the name can be derived.
          """
          return cast(str, self.norm.transliterate(name.name)).strip()
  
      def get_canonical_id(self, name: PlaceName) -> str:
          """ Return the normalized form of the name. This is the standard form
              from which possible variants for the name can be derived.
          """
          return cast(str, self.norm.transliterate(name.name)).strip()
  
-
      def compute_variants(self, norm_name: str) -> List[str]:
          """ Compute the spelling variants for the given normalized name
              and transliterate the result.
      def compute_variants(self, norm_name: str) -> List[str]:
          """ Compute the spelling variants for the given normalized name
              and transliterate the result.
@@ -96,7 +89,6 @@ class GenericTokenAnalysis:
  
          return [name for name in self._transliterate_unique_list(norm_name, variants) if name]
  
  
          return [name for name in self._transliterate_unique_list(norm_name, variants) if name]
  
-
      def _transliterate_unique_list(self, norm_name: str,
                                     iterable: Iterable[str]) -> Iterator[Optional[str]]:
          seen = set()
      def _transliterate_unique_list(self, norm_name: str,
                                     iterable: Iterable[str]) -> Iterator[Optional[str]]:
          seen = set()
@@ -108,7 +100,6 @@ class GenericTokenAnalysis:
                  seen.add(variant)
                  yield self.to_ascii.transliterate(variant).strip()
  
                  seen.add(variant)
                  yield self.to_ascii.transliterate(variant).strip()
  
-
      def _generate_word_variants(self, norm_name: str) -> Iterable[str]:
          baseform = '^ ' + norm_name + ' ^'
          baselen = len(baseform)
      def _generate_word_variants(self, norm_name: str) -> Iterable[str]:
          baseform = '^ ' + norm_name + ' ^'
          baselen = len(baseform)
@@ -119,10 +110,10 @@ class GenericTokenAnalysis:
              pos = 0
              force_space = False
              while pos < baselen:
              pos = 0
              force_space = False
              while pos < baselen:
-                full, repl = self.replacements.longest_prefix_item(baseform[pos:],
-                                                                   (None, None))
-                if full is not None:
-                    done = baseform[startpos:pos]
+                frm = pos
+                repl, pos = self.replacements.longest_prefix(baseform, pos)
+                if repl is not None:
+                    done = baseform[startpos:frm]
                      partials = [v + done + r
                                  for v, r in itertools.product(partials, repl)
                                  if not force_space or r.startswith(' ')]
                      partials = [v + done + r
                                  for v, r in itertools.product(partials, repl)
                                  if not force_space or r.startswith(' ')]
@@ -131,11 +122,10 @@ class GenericTokenAnalysis:
                          # to be helpful. Only use the original term.
                          startpos = 0
                          break
                          # to be helpful. Only use the original term.
                          startpos = 0
                          break
-                    startpos = pos + len(full)
-                    if full[-1] == ' ':
-                        startpos -= 1
+                    if baseform[pos - 1] == ' ':
+                        pos -= 1
                          force_space = True
                          force_space = True
-                    pos = startpos
+                    startpos = pos
                  else:
                      pos += 1
                      force_space = False
                  else:
                      pos += 1
                      force_space = False