replace datrie library with a more simple pure-Python class

author Sarah Hoffmann <lonvia@denofr.de>

Tue, 18 Feb 2025 20:09:12 +0000 (21:09 +0100)

committer Sarah Hoffmann <lonvia@denofr.de>

Mon, 24 Feb 2025 09:24:21 +0000 (10:24 +0100)
author Sarah Hoffmann <lonvia@denofr.de>
Tue, 18 Feb 2025 20:09:12 +0000 (21:09 +0100)
committer Sarah Hoffmann <lonvia@denofr.de>
Mon, 24 Feb 2025 09:24:21 +0000 (10:24 +0100)
diff --git a/src/nominatim_db/tokenizer/token_analysis/generic.py b/src/nominatim_db/tokenizer/token_analysis/generic.py

index 4aa84de76ba8945fd18e412abb974dbce8cfe6a0..fa9dc4dfa54c66e6f25a408129ba327d228b38b0 100644 (file)
--- a/src/nominatim_db/tokenizer/token_analysis/generic.py
+++ b/src/nominatim_db/tokenizer/token_analysis/generic.py
@@ -2,7 +2,7 @@
  #
  # This file is part of Nominatim. (https://nominatim.org)
  #
  #
  # This file is part of Nominatim. (https://nominatim.org)
  #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
  # For a full list of authors see the git log.
  """
  Generic processor for names that creates abbreviation variants.
  # For a full list of authors see the git log.
  """
  Generic processor for names that creates abbreviation variants.
@@ -10,12 +10,11 @@ Generic processor for names that creates abbreviation variants.
  from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast
  import itertools
  
  from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast
  import itertools
  
-import datrie
-
  from ...errors import UsageError
  from ...data.place_name import PlaceName
  from .config_variants import get_variant_config
  from .generic_mutation import MutationVariantGenerator
  from ...errors import UsageError
  from ...data.place_name import PlaceName
  from .config_variants import get_variant_config
  from .generic_mutation import MutationVariantGenerator
+from .simple_trie import SimpleTrie
  
  # Configuration section
  
  
  # Configuration section
  
@@ -25,8 +24,7 @@ def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, An
      """
      config: Dict[str, Any] = {}
  
      """
      config: Dict[str, Any] = {}
  
-    config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
-                                                                 normalizer)
+    config['replacements'], _ = get_variant_config(rules.get('variants'), normalizer)
      config['variant_only'] = rules.get('mode', '') == 'variant-only'
  
      # parse mutation rules
      config['variant_only'] = rules.get('mode', '') == 'variant-only'
  
      # parse mutation rules
@@ -68,12 +66,8 @@ class GenericTokenAnalysis:
          self.variant_only = config['variant_only']
  
          # Set up datrie
          self.variant_only = config['variant_only']
  
          # Set up datrie
-        if config['replacements']:
-            self.replacements = datrie.Trie(config['chars'])
-            for src, repllist in config['replacements']:
-                self.replacements[src] = repllist
-        else:
-            self.replacements = None
+        self.replacements: Optional[SimpleTrie[List[str]]] = \
+            SimpleTrie(config['replacements']) if config['replacements'] else None
  
          # set up mutation rules
          self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
  
          # set up mutation rules
          self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
@@ -116,10 +110,10 @@ class GenericTokenAnalysis:
              pos = 0
              force_space = False
              while pos < baselen:
              pos = 0
              force_space = False
              while pos < baselen:
-                full, repl = self.replacements.longest_prefix_item(baseform[pos:],
-                                                                   (None, None))
-                if full is not None:
-                    done = baseform[startpos:pos]
+                frm = pos
+                repl, pos = self.replacements.longest_prefix(baseform, pos)
+                if repl is not None:
+                    done = baseform[startpos:frm]
                      partials = [v + done + r
                                  for v, r in itertools.product(partials, repl)
                                  if not force_space or r.startswith(' ')]
                      partials = [v + done + r
                                  for v, r in itertools.product(partials, repl)
                                  if not force_space or r.startswith(' ')]
@@ -128,11 +122,10 @@ class GenericTokenAnalysis:
                          # to be helpful. Only use the original term.
                          startpos = 0
                          break
                          # to be helpful. Only use the original term.
                          startpos = 0
                          break
-                    startpos = pos + len(full)
-                    if full[-1] == ' ':
-                        startpos -= 1
+                    if baseform[pos - 1] == ' ':
+                        pos -= 1
                          force_space = True
                          force_space = True
-                    pos = startpos
+                    startpos = pos
                  else:
                      pos += 1
                      force_space = False
                  else:
                      pos += 1
                      force_space = False
diff --git a/src/nominatim_db/tokenizer/token_analysis/simple_trie.py b/src/nominatim_db/tokenizer/token_analysis/simple_trie.py

new file mode 100644 (file)

index 0000000..c86551d
--- /dev/null
+++ b/src/nominatim_db/tokenizer/token_analysis/simple_trie.py
@@ -0,0 +1,84 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Simple dict-based implementation of a trie structure.
+"""
+from typing import TypeVar, Generic, Tuple, Optional, List, Dict
+from collections import defaultdict
+
+T = TypeVar('T')
+
+
+class SimpleTrie(Generic[T]):
+    """ A simple read-only trie structure.
+        This structure supports examply one lookup operation,
+        which is longest-prefix lookup.
+    """
+
+    def __init__(self, data: Optional[List[Tuple[str, T]]] = None) -> None:
+        self._tree: Dict[str, 'SimpleTrie[T]'] = defaultdict(SimpleTrie[T])
+        self._value: Optional[T] = None
+        self._prefix = ''
+
+        if data:
+            for key, value in data:
+                self._add(key, 0, value)
+
+            self._make_compact()
+
+    def _add(self, word: str, pos: int, value: T) -> None:
+        """ (Internal) Add a sub-word to the trie.
+            The word is added from index 'pos'. If the sub-word to add
+            is empty, then the trie saves the given value.
+        """
+        if pos < len(word):
+            self._tree[word[pos]]._add(word, pos + 1, value)
+        else:
+            self._value = value
+
+    def _make_compact(self) -> None:
+        """ (Internal) Compress tree where there is exactly one subtree
+            and no value.
+
+            Compression works recursively starting at the leaf.
+        """
+        for t in self._tree.values():
+            t._make_compact()
+
+        if len(self._tree) == 1 and self._value is None:
+            assert not self._prefix
+            for k, v in self._tree.items():
+                self._prefix = k + v._prefix
+                self._tree = v._tree
+                self._value = v._value
+
+    def longest_prefix(self, word: str, start: int = 0) -> Tuple[Optional[T], int]:
+        """ Return the longest prefix match for the given word starting at
+            the position 'start'.
+
+            The function returns a tuple with the value for the longest match and
+            the position of the word after the match. If no match was found at
+            all, the function returns (None, start).
+        """
+        cur = self
+        pos = start
+        result: Tuple[Optional[T], int] = None, start
+
+        while True:
+            if cur._prefix:
+                if not word.startswith(cur._prefix, pos):
+                    return result
+                pos += len(cur._prefix)
+
+            if cur._value:
+                result = cur._value, pos
+
+            if pos >= len(word) or word[pos] not in cur._tree:
+                return result
+
+            cur = cur._tree[word[pos]]
+            pos += 1
diff --git a/test/python/tokenizer/token_analysis/test_simple_trie.py b/test/python/tokenizer/token_analysis/test_simple_trie.py

new file mode 100644 (file)

index 0000000..0384a45
--- /dev/null
+++ b/test/python/tokenizer/token_analysis/test_simple_trie.py
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for simplified trie structure.
+"""
+
+from nominatim_db.tokenizer.token_analysis.simple_trie import SimpleTrie
+
+def test_single_item_trie():
+    t = SimpleTrie([('foob', 42)])
+
+    assert t.longest_prefix('afoobar') == (None, 0)
+    assert t.longest_prefix('afoobar', start=1) == (42, 5)
+    assert t.longest_prefix('foob') == (42, 4)
+    assert t.longest_prefix('123foofoo', 3) == (None, 3)
+
+def test_complex_item_tree():
+    t = SimpleTrie([('a', 1),
+                    ('b', 2),
+                    ('auto', 3),
+                    ('buto', 4),
+                    ('automat', 5),
+                    ('bu', 6),
+                    ('bx', 7)])
+
+    assert t.longest_prefix('a') == (1, 1)
+    assert t.longest_prefix('au') == (1, 1)
+    assert t.longest_prefix('aut') == (1, 1)
+    assert t.longest_prefix('auto') == (3, 4)
+    assert t.longest_prefix('automat') == (5, 7)
+    assert t.longest_prefix('automatx') == (5, 7)
+    assert t.longest_prefix('butomat') == (4, 4)
+    assert t.longest_prefix('butomat', 1) == (None, 1)
author	Sarah Hoffmann <lonvia@denofr.de>
	Tue, 18 Feb 2025 20:09:12 +0000 (21:09 +0100)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Mon, 24 Feb 2025 09:24:21 +0000 (10:24 +0100)
src/nominatim_db/tokenizer/token_analysis/generic.py		patch \| blob \| history
src/nominatim_db/tokenizer/token_analysis/simple_trie.py	[new file with mode: 0644]	patch \| blob
test/python/tokenizer/token_analysis/test_simple_trie.py	[new file with mode: 0644]	patch \| blob