git.openstreetmap.org Git - nominatim.git/commitdiff
Merge pull request #3659 from lonvia/custom-datrie-structure
author Sarah Hoffmann <lonvia@denofr.de>
Mon, 24 Feb 2025 15:49:42 +0000 (16:49 +0100)
committer GitHub <noreply@github.com>
Mon, 24 Feb 2025 15:49:42 +0000 (16:49 +0100)
Replace datrie library with a simple custom Python implementation

docs/admin/Installation.md
docs/develop/Development-Environment.md
packaging/nominatim-db/pyproject.toml
src/nominatim_db/tokenizer/token_analysis/generic.py
src/nominatim_db/tokenizer/token_analysis/simple_trie.py [new file with mode: 0644]
test/python/tokenizer/token_analysis/test_simple_trie.py [new file with mode: 0644]

index d837439936ac2162b5001f5a1b70ff07d97ad1c1..2571de5d953814c045f22be26024357ecd084593 100644 (file)
@@ -37,7 +37,6 @@ Furthermore the following Python libraries are required:
   * [Jinja2](https://palletsprojects.com/p/jinja/)
   * [PyICU](https://pypi.org/project/PyICU/)
   * [PyYaml](https://pyyaml.org/) (5.1+)
-  * [datrie](https://github.com/pytries/datrie)
 
 These will be installed automatically when using pip installation.
 
index 2425ec78c9315cbd5e83ca2391e9e84decbf2452..9ade79162f853bc8c15ba61a3c45cffdaaf75c5f 100644 (file)
@@ -70,7 +70,7 @@ To set up the virtual environment with all necessary packages run:
 virtualenv ~/nominatim-dev-venv
 ~/nominatim-dev-venv/bin/pip install\
     psutil psycopg[binary] PyICU SQLAlchemy \
-    python-dotenv jinja2 pyYAML datrie behave \
+    python-dotenv jinja2 pyYAML behave \
     mkdocs mkdocstrings mkdocs-gen-files pytest pytest-asyncio flake8 \
     types-jinja2 types-markupsafe types-psutil types-psycopg2 \
     types-pygments types-pyyaml types-requests types-ujson \
index c34ce937ea6fb457bb5d05d86600ecf02d885f41..3c99fd2a1c9ca67cc0a6e0862143842c56cd64f3 100644 (file)
@@ -19,7 +19,6 @@ dependencies = [
     "python-dotenv",
     "jinja2",
     "pyYAML>=5.1",
-    "datrie",
     "psutil",
     "PyICU"
 ]
index 4aa84de76ba8945fd18e412abb974dbce8cfe6a0..fa9dc4dfa54c66e6f25a408129ba327d228b38b0 100644 (file)
@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Generic processor for names that creates abbreviation variants.
@@ -10,12 +10,11 @@ Generic processor for names that creates abbreviation variants.
 from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast
 import itertools
 
-import datrie
-
 from ...errors import UsageError
 from ...data.place_name import PlaceName
 from .config_variants import get_variant_config
 from .generic_mutation import MutationVariantGenerator
+from .simple_trie import SimpleTrie
 
 # Configuration section
 
@@ -25,8 +24,7 @@ def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, An
     """
     config: Dict[str, Any] = {}
 
-    config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
-                                                                 normalizer)
+    config['replacements'], _ = get_variant_config(rules.get('variants'), normalizer)
     config['variant_only'] = rules.get('mode', '') == 'variant-only'
 
     # parse mutation rules
@@ -68,12 +66,8 @@ class GenericTokenAnalysis:
         self.variant_only = config['variant_only']
 
         # Set up datrie
-        if config['replacements']:
-            self.replacements = datrie.Trie(config['chars'])
-            for src, repllist in config['replacements']:
-                self.replacements[src] = repllist
-        else:
-            self.replacements = None
+        self.replacements: Optional[SimpleTrie[List[str]]] = \
+            SimpleTrie(config['replacements']) if config['replacements'] else None
 
         # set up mutation rules
         self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
@@ -116,10 +110,10 @@ class GenericTokenAnalysis:
             pos = 0
             force_space = False
             while pos < baselen:
-                full, repl = self.replacements.longest_prefix_item(baseform[pos:],
-                                                                   (None, None))
-                if full is not None:
-                    done = baseform[startpos:pos]
+                frm = pos
+                repl, pos = self.replacements.longest_prefix(baseform, pos)
+                if repl is not None:
+                    done = baseform[startpos:frm]
                     partials = [v + done + r
                                 for v, r in itertools.product(partials, repl)
                                 if not force_space or r.startswith(' ')]
@@ -128,11 +122,10 @@ class GenericTokenAnalysis:
                         # to be helpful. Only use the original term.
                         startpos = 0
                         break
-                    startpos = pos + len(full)
-                    if full[-1] == ' ':
-                        startpos -= 1
+                    if baseform[pos - 1] == ' ':
+                        pos -= 1
                         force_space = True
-                    pos = startpos
+                    startpos = pos
                 else:
                     pos += 1
                     force_space = False
diff --git a/src/nominatim_db/tokenizer/token_analysis/simple_trie.py b/src/nominatim_db/tokenizer/token_analysis/simple_trie.py
new file mode 100644 (file)
index 0000000..c86551d
--- /dev/null
@@ -0,0 +1,84 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Simple dict-based implementation of a trie structure.
+"""
+from typing import TypeVar, Generic, Tuple, Optional, List, Dict
+from collections import defaultdict
+
+T = TypeVar('T')
+
+
+class SimpleTrie(Generic[T]):
+    """ A simple read-only trie structure.
+        This structure supports examply one lookup operation,
+        which is longest-prefix lookup.
+    """
+
+    def __init__(self, data: Optional[List[Tuple[str, T]]] = None) -> None:
+        self._tree: Dict[str, 'SimpleTrie[T]'] = defaultdict(SimpleTrie[T])
+        self._value: Optional[T] = None
+        self._prefix = ''
+
+        if data:
+            for key, value in data:
+                self._add(key, 0, value)
+
+            self._make_compact()
+
+    def _add(self, word: str, pos: int, value: T) -> None:
+        """ (Internal) Add a sub-word to the trie.
+            The word is added from index 'pos'. If the sub-word to add
+            is empty, then the trie saves the given value.
+        """
+        if pos < len(word):
+            self._tree[word[pos]]._add(word, pos + 1, value)
+        else:
+            self._value = value
+
+    def _make_compact(self) -> None:
+        """ (Internal) Compress tree where there is exactly one subtree
+            and no value.
+
+            Compression works recursively starting at the leaf.
+        """
+        for t in self._tree.values():
+            t._make_compact()
+
+        if len(self._tree) == 1 and self._value is None:
+            assert not self._prefix
+            for k, v in self._tree.items():
+                self._prefix = k + v._prefix
+                self._tree = v._tree
+                self._value = v._value
+
+    def longest_prefix(self, word: str, start: int = 0) -> Tuple[Optional[T], int]:
+        """ Return the longest prefix match for the given word starting at
+            the position 'start'.
+
+            The function returns a tuple with the value for the longest match and
+            the position of the word after the match. If no match was found at
+            all, the function returns (None, start).
+        """
+        cur = self
+        pos = start
+        result: Tuple[Optional[T], int] = None, start
+
+        while True:
+            if cur._prefix:
+                if not word.startswith(cur._prefix, pos):
+                    return result
+                pos += len(cur._prefix)
+
+            if cur._value:
+                result = cur._value, pos
+
+            if pos >= len(word) or word[pos] not in cur._tree:
+                return result
+
+            cur = cur._tree[word[pos]]
+            pos += 1
diff --git a/test/python/tokenizer/token_analysis/test_simple_trie.py b/test/python/tokenizer/token_analysis/test_simple_trie.py
new file mode 100644 (file)
index 0000000..0384a45
--- /dev/null
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for simplified trie structure.
+"""
+
+from nominatim_db.tokenizer.token_analysis.simple_trie import SimpleTrie
+
+def test_single_item_trie():
+    t = SimpleTrie([('foob', 42)])
+
+    assert t.longest_prefix('afoobar') == (None, 0)
+    assert t.longest_prefix('afoobar', start=1) == (42, 5)
+    assert t.longest_prefix('foob') == (42, 4)
+    assert t.longest_prefix('123foofoo', 3) == (None, 3)
+
+def test_complex_item_tree():
+    t = SimpleTrie([('a', 1),
+                    ('b', 2),
+                    ('auto', 3),
+                    ('buto', 4),
+                    ('automat', 5),
+                    ('bu', 6),
+                    ('bx', 7)])
+
+    assert t.longest_prefix('a') == (1, 1)
+    assert t.longest_prefix('au') == (1, 1)
+    assert t.longest_prefix('aut') == (1, 1)
+    assert t.longest_prefix('auto') == (3, 4)
+    assert t.longest_prefix('automat') == (5, 7)
+    assert t.longest_prefix('automatx') == (5, 7)
+    assert t.longest_prefix('butomat') == (4, 4)
+    assert t.longest_prefix('butomat', 1) == (None, 1)