From 13db4c9731072c8e8af5f3eccbefb8723233f51c Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Tue, 18 Feb 2025 21:09:12 +0100
Subject: [PATCH] replace datrie library with a simpler pure-Python class

---
 .../tokenizer/token_analysis/generic.py       | 31 +++----
 .../tokenizer/token_analysis/simple_trie.py   | 84 +++++++++++++++++++
 .../token_analysis/test_simple_trie.py        | 37 ++++++++
 3 files changed, 133 insertions(+), 19 deletions(-)
 create mode 100644 src/nominatim_db/tokenizer/token_analysis/simple_trie.py
 create mode 100644 test/python/tokenizer/token_analysis/test_simple_trie.py

diff --git a/src/nominatim_db/tokenizer/token_analysis/generic.py b/src/nominatim_db/tokenizer/token_analysis/generic.py
index 4aa84de7..fa9dc4df 100644
--- a/src/nominatim_db/tokenizer/token_analysis/generic.py
+++ b/src/nominatim_db/tokenizer/token_analysis/generic.py
@@ -2,7 +2,7 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Generic processor for names that creates abbreviation variants.
@@ -10,12 +10,11 @@ Generic processor for names that creates abbreviation variants.
 from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast
 import itertools
 
-import datrie
-
 from ...errors import UsageError
 from ...data.place_name import PlaceName
 from .config_variants import get_variant_config
 from .generic_mutation import MutationVariantGenerator
+from .simple_trie import SimpleTrie
 
 # Configuration section
 
@@ -25,8 +24,7 @@ def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, An
     """
     config: Dict[str, Any] = {}
 
-    config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
-                                                                 normalizer)
+    config['replacements'], _ = get_variant_config(rules.get('variants'), normalizer)
     config['variant_only'] = rules.get('mode', '') == 'variant-only'
 
     # parse mutation rules
@@ -68,12 +66,8 @@ class GenericTokenAnalysis:
         self.variant_only = config['variant_only']
 
-        # Set up datrie
-        if config['replacements']:
-            self.replacements = datrie.Trie(config['chars'])
-            for src, repllist in config['replacements']:
-                self.replacements[src] = repllist
-        else:
-            self.replacements = None
+        # set up replacement rules
+        self.replacements: Optional[SimpleTrie[List[str]]] = \
+            SimpleTrie(config['replacements']) if config['replacements'] else None
 
         # set up mutation rules
         self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
@@ -116,10 +110,10 @@
             pos = 0
             force_space = False
             while pos < baselen:
-                full, repl = self.replacements.longest_prefix_item(baseform[pos:],
-                                                                   (None, None))
-                if full is not None:
-                    done = baseform[startpos:pos]
+                frm = pos
+                repl, pos = self.replacements.longest_prefix(baseform, pos)
+                if repl is not None:
+                    done = baseform[startpos:frm]
                     partials = [v + done + r
                                 for v, r in itertools.product(partials, repl)
                                 if not force_space or r.startswith(' ')]
@@ -128,11 +122,10 @@
                         # to be helpful. Only use the original term.
                        startpos = 0
                        break
-                    startpos = pos + len(full)
-                    if full[-1] == ' ':
-                        startpos -= 1
+                    if baseform[pos - 1] == ' ':
+                        pos -= 1
                         force_space = True
-                    pos = startpos
+                    startpos = pos
                 else:
                     pos += 1
                     force_space = False
diff --git a/src/nominatim_db/tokenizer/token_analysis/simple_trie.py b/src/nominatim_db/tokenizer/token_analysis/simple_trie.py
new file mode 100644
index 00000000..c86551df
--- /dev/null
+++ b/src/nominatim_db/tokenizer/token_analysis/simple_trie.py
@@ -0,0 +1,84 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Simple dict-based implementation of a trie structure.
+"""
+from typing import TypeVar, Generic, Tuple, Optional, List, Dict
+from collections import defaultdict
+
+T = TypeVar('T')
+
+
+class SimpleTrie(Generic[T]):
+    """ A simple read-only trie structure.
+        This structure supports exactly one lookup operation,
+        which is longest-prefix lookup.
+    """
+
+    def __init__(self, data: Optional[List[Tuple[str, T]]] = None) -> None:
+        self._tree: Dict[str, 'SimpleTrie[T]'] = defaultdict(SimpleTrie[T])
+        self._value: Optional[T] = None
+        self._prefix = ''
+
+        if data:
+            for key, value in data:
+                self._add(key, 0, value)
+
+            self._make_compact()
+
+    def _add(self, word: str, pos: int, value: T) -> None:
+        """ (Internal) Add a sub-word to the trie.
+            The word is added from index 'pos'. If the sub-word to add
+            is empty, then the trie saves the given value.
+        """
+        if pos < len(word):
+            self._tree[word[pos]]._add(word, pos + 1, value)
+        else:
+            self._value = value
+
+    def _make_compact(self) -> None:
+        """ (Internal) Compress tree where there is exactly one subtree
+            and no value.
+
+            Compression works recursively starting at the leaf.
+        """
+        for t in self._tree.values():
+            t._make_compact()
+
+        if len(self._tree) == 1 and self._value is None:
+            assert not self._prefix
+            k, v = next(iter(self._tree.items()))
+            self._prefix = k + v._prefix
+            self._tree = v._tree
+            self._value = v._value
+
+    def longest_prefix(self, word: str, start: int = 0) -> Tuple[Optional[T], int]:
+        """ Return the longest prefix match for the given word starting at
+            the position 'start'.
+
+            The function returns a tuple with the value for the longest match and
+            the position in the word after the match. If no match was found at
+            all, the function returns (None, start).
+        """
+        cur = self
+        pos = start
+        result: Tuple[Optional[T], int] = None, start
+
+        while True:
+            if cur._prefix:
+                if not word.startswith(cur._prefix, pos):
+                    return result
+                pos += len(cur._prefix)
+
+            if cur._value is not None:
+                result = cur._value, pos
+
+            if pos >= len(word) or word[pos] not in cur._tree:
+                return result
+
+            cur = cur._tree[word[pos]]
+            pos += 1
diff --git a/test/python/tokenizer/token_analysis/test_simple_trie.py b/test/python/tokenizer/token_analysis/test_simple_trie.py
new file mode 100644
index 00000000..0384a456
--- /dev/null
+++ b/test/python/tokenizer/token_analysis/test_simple_trie.py
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for simplified trie structure.
+""" + +from nominatim_db.tokenizer.token_analysis.simple_trie import SimpleTrie + +def test_single_item_trie(): + t = SimpleTrie([('foob', 42)]) + + assert t.longest_prefix('afoobar') == (None, 0) + assert t.longest_prefix('afoobar', start=1) == (42, 5) + assert t.longest_prefix('foob') == (42, 4) + assert t.longest_prefix('123foofoo', 3) == (None, 3) + +def test_complex_item_tree(): + t = SimpleTrie([('a', 1), + ('b', 2), + ('auto', 3), + ('buto', 4), + ('automat', 5), + ('bu', 6), + ('bx', 7)]) + + assert t.longest_prefix('a') == (1, 1) + assert t.longest_prefix('au') == (1, 1) + assert t.longest_prefix('aut') == (1, 1) + assert t.longest_prefix('auto') == (3, 4) + assert t.longest_prefix('automat') == (5, 7) + assert t.longest_prefix('automatx') == (5, 7) + assert t.longest_prefix('butomat') == (4, 4) + assert t.longest_prefix('butomat', 1) == (None, 1) -- 2.39.5