#
# This file is part of Nominatim. (https://nominatim.org)
#
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Generic processor for names that creates abbreviation variants.
from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast
import itertools
-import datrie
-
from ...errors import UsageError
from ...data.place_name import PlaceName
from .config_variants import get_variant_config
from .generic_mutation import MutationVariantGenerator
+from .simple_trie import SimpleTrie
# Configuration section
"""
config: Dict[str, Any] = {}
- config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
- normalizer)
+ config['replacements'], _ = get_variant_config(rules.get('variants'), normalizer)
config['variant_only'] = rules.get('mode', '') == 'variant-only'
# parse mutation rules
self.variant_only = config['variant_only']
# Set up datrie
- if config['replacements']:
- self.replacements = datrie.Trie(config['chars'])
- for src, repllist in config['replacements']:
- self.replacements[src] = repllist
- else:
- self.replacements = None
+ self.replacements: Optional[SimpleTrie[List[str]]] = \
+ SimpleTrie(config['replacements']) if config['replacements'] else None
# set up mutation rules
self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
pos = 0
force_space = False
while pos < baselen:
- full, repl = self.replacements.longest_prefix_item(baseform[pos:],
- (None, None))
- if full is not None:
- done = baseform[startpos:pos]
+ frm = pos
+ repl, pos = self.replacements.longest_prefix(baseform, pos)
+ if repl is not None:
+ done = baseform[startpos:frm]
partials = [v + done + r
for v, r in itertools.product(partials, repl)
if not force_space or r.startswith(' ')]
# to be helpful. Only use the original term.
startpos = 0
break
- startpos = pos + len(full)
- if full[-1] == ' ':
- startpos -= 1
+ if baseform[pos - 1] == ' ':
+ pos -= 1
force_space = True
- pos = startpos
+ startpos = pos
else:
pos += 1
force_space = False
--- /dev/null
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Simple dict-based implementation of a trie structure.
+"""
+from typing import TypeVar, Generic, Tuple, Optional, List, Dict
+from collections import defaultdict
+
+T = TypeVar('T')
+
+
+class SimpleTrie(Generic[T]):
+ """ A simple read-only trie structure.
+ This structure supports examply one lookup operation,
+ which is longest-prefix lookup.
+ """
+
+ def __init__(self, data: Optional[List[Tuple[str, T]]] = None) -> None:
+ self._tree: Dict[str, 'SimpleTrie[T]'] = defaultdict(SimpleTrie[T])
+ self._value: Optional[T] = None
+ self._prefix = ''
+
+ if data:
+ for key, value in data:
+ self._add(key, 0, value)
+
+ self._make_compact()
+
+ def _add(self, word: str, pos: int, value: T) -> None:
+ """ (Internal) Add a sub-word to the trie.
+ The word is added from index 'pos'. If the sub-word to add
+ is empty, then the trie saves the given value.
+ """
+ if pos < len(word):
+ self._tree[word[pos]]._add(word, pos + 1, value)
+ else:
+ self._value = value
+
+ def _make_compact(self) -> None:
+ """ (Internal) Compress tree where there is exactly one subtree
+ and no value.
+
+ Compression works recursively starting at the leaf.
+ """
+ for t in self._tree.values():
+ t._make_compact()
+
+ if len(self._tree) == 1 and self._value is None:
+ assert not self._prefix
+ for k, v in self._tree.items():
+ self._prefix = k + v._prefix
+ self._tree = v._tree
+ self._value = v._value
+
+ def longest_prefix(self, word: str, start: int = 0) -> Tuple[Optional[T], int]:
+ """ Return the longest prefix match for the given word starting at
+ the position 'start'.
+
+ The function returns a tuple with the value for the longest match and
+ the position of the word after the match. If no match was found at
+ all, the function returns (None, start).
+ """
+ cur = self
+ pos = start
+ result: Tuple[Optional[T], int] = None, start
+
+ while True:
+ if cur._prefix:
+ if not word.startswith(cur._prefix, pos):
+ return result
+ pos += len(cur._prefix)
+
+ if cur._value:
+ result = cur._value, pos
+
+ if pos >= len(word) or word[pos] not in cur._tree:
+ return result
+
+ cur = cur._tree[word[pos]]
+ pos += 1
--- /dev/null
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for simplified trie structure.
+"""
+
+from nominatim_db.tokenizer.token_analysis.simple_trie import SimpleTrie
+
+def test_single_item_trie():
+ t = SimpleTrie([('foob', 42)])
+
+ assert t.longest_prefix('afoobar') == (None, 0)
+ assert t.longest_prefix('afoobar', start=1) == (42, 5)
+ assert t.longest_prefix('foob') == (42, 4)
+ assert t.longest_prefix('123foofoo', 3) == (None, 3)
+
+def test_complex_item_tree():
+ t = SimpleTrie([('a', 1),
+ ('b', 2),
+ ('auto', 3),
+ ('buto', 4),
+ ('automat', 5),
+ ('bu', 6),
+ ('bx', 7)])
+
+ assert t.longest_prefix('a') == (1, 1)
+ assert t.longest_prefix('au') == (1, 1)
+ assert t.longest_prefix('aut') == (1, 1)
+ assert t.longest_prefix('auto') == (3, 4)
+ assert t.longest_prefix('automat') == (5, 7)
+ assert t.longest_prefix('automatx') == (5, 7)
+ assert t.longest_prefix('butomat') == (4, 4)
+ assert t.longest_prefix('butomat', 1) == (None, 1)