From 8f3845660f18bdbf2dd42dd2c6db6c7fa5160f3e Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Fri, 19 Apr 2024 17:52:22 +0200
Subject: [PATCH] add full tokens to addresses

This is now needed to weigh results.
---
 nominatim/tokenizer/icu_tokenizer.py | 40 +++++-----------------
 test/python/tokenizer/test_icu.py    |  8 +++---
 2 files changed, 10 insertions(+), 38 deletions(-)

diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 251f4da5..4b9dac69 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -712,10 +712,11 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                 token_info.add_street(self._retrieve_full_tokens(item.name))
             elif item.kind == 'place':
                 if not item.suffix:
-                    token_info.add_place(self._compute_partial_tokens(item.name))
+                    token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
             elif not item.kind.startswith('_') and not item.suffix and \
                  item.kind not in ('country', 'full', 'inclusion'):
-                token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
+                token_info.add_address_term(item.kind,
+                                            itertools.chain(*self._compute_name_tokens([item])))
 
 
     def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
@@ -756,36 +757,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):
 
         return result
 
-    def _compute_partial_tokens(self, name: str) -> List[int]:
-        """ Normalize the given term, split it into partial words and return
-            then token list for them.
-        """
-        assert self.conn is not None
-        norm_name = self._search_normalized(name)
-
-        tokens = []
-        need_lookup = []
-        for partial in norm_name.split():
-            token = self._cache.partials.get(partial)
-            if token:
-                tokens.append(token)
-            else:
-                need_lookup.append(partial)
-
-        if need_lookup:
-            with self.conn.cursor() as cur:
-                cur.execute("""SELECT word, getorcreate_partial_word(word)
-                               FROM unnest(%s) word""",
-                            (need_lookup, ))
-
-                for partial, token in cur:
-                    assert token is not None
-                    tokens.append(token)
-                    self._cache.partials[partial] = token
-
-        return tokens
-
-
     def _retrieve_full_tokens(self, name: str) -> List[int]:
         """ Get the full name token for the given name, if it exists.
             The name is only retrieved for the standard analyser.
@@ -957,8 +928,9 @@ class _TokenInfo:
     def add_address_term(self, key: str, partials: Iterable[int]) -> None:
         """ Add additional address terms.
         """
-        if partials:
-            self.address_tokens[key] = self._mk_array(partials)
+        array = self._mk_array(partials)
+        if len(array) > 2:
+            self.address_tokens[key] = array
 
     def set_postcode(self, postcode: Optional[str]) -> None:
         """ Set the postcode to the given one.
diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py
index 9f6eae62..2a4865db 100644
--- a/test/python/tokenizer/test_icu.py
+++ b/test/python/tokenizer/test_icu.py
@@ -554,7 +554,7 @@ class TestPlaceAddress:
     def test_process_place_place(self):
         info = self.process_address(place='Honu Lulu')
 
-        assert eval(info['place']) == self.name_token_set('HONU', 'LULU')
+        assert eval(info['place']) == self.name_token_set('HONU', 'LULU', '#HONU LULU')
 
 
     def test_process_place_place_extra(self):
@@ -574,8 +574,8 @@ class TestPlaceAddress:
                                     suburb='Zwickau', street='Hauptstr',
                                     full='right behind the church')
 
-        city = self.name_token_set('ZWICKAU')
-        state = self.name_token_set('SACHSEN')
+        city = self.name_token_set('ZWICKAU', '#ZWICKAU')
+        state = self.name_token_set('SACHSEN', '#SACHSEN')
 
 
         result = {k: eval(v) for k,v in info['addr'].items()}
@@ -587,7 +587,7 @@ class TestPlaceAddress:
 
         result = {k: eval(v) for k,v in info['addr'].items()}
 
-        assert result == {'city': self.name_token_set('Bruxelles')}
+        assert result == {'city': self.name_token_set('Bruxelles', '#Bruxelles')}
 
 
     def test_process_place_address_terms_empty(self):
-- 
2.39.5
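
A note on the `len(array) > 2` check in the reworked add_address_term: the
patch leaves _mk_array untouched, and the new code applies len() to its
result and stores it directly, which suggests it returns the token ids as a
PostgreSQL array literal string such as '{101,102}'. Under that assumption,
an empty token list renders as the two-character string '{}', so the check
keeps a term only when at least one token survived. The sketch below is a
self-contained illustration of that behaviour; TokenInfoSketch and its
_mk_array body are simplified stand-ins, not the actual Nominatim code:

    # Sketch of the reworked _TokenInfo.add_address_term.
    # Assumption: _mk_array renders token ids as a PostgreSQL array
    # literal string, so an empty input yields '{}' (length 2) and
    # len(array) > 2 means "at least one token".
    from typing import Dict, Iterable


    class TokenInfoSketch:
        def __init__(self) -> None:
            self.address_tokens: Dict[str, str] = {}

        @staticmethod
        def _mk_array(tokens: Iterable[int]) -> str:
            # Stand-in for Nominatim's helper: '{101,102}'-style literal.
            return '{%s}' % ','.join(str(t) for t in tokens)

        def add_address_term(self, key: str, partials: Iterable[int]) -> None:
            array = self._mk_array(partials)
            if len(array) > 2:   # skip empty arrays: '{}' alone is 2 chars
                self.address_tokens[key] = array


    info = TokenInfoSketch()
    info.add_address_term('city', [101, 102])  # kept
    info.add_address_term('state', [])         # dropped: would be '{}'
    assert info.address_tokens == {'city': '{101,102}'}

This lines up with the updated test expectations, where the '#'-prefixed
entries (e.g. '#ZWICKAU') appear to denote the newly added full-name tokens
alongside the existing partial tokens.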