git.openstreetmap.org Git - nominatim.git/commitdiff
Merge remote-tracking branch 'upstream/master'
authorSarah Hoffmann <lonvia@denofr.de>
Fri, 3 May 2024 14:34:22 +0000 (16:34 +0200)
committerSarah Hoffmann <lonvia@denofr.de>
Fri, 3 May 2024 14:34:22 +0000 (16:34 +0200)
1  2 
nominatim/tokenizer/icu_tokenizer.py

index 9032d71b9069c5210d625ae21641f835610c0423,4b9dac69e18eb63760c07ff71a981f1343ab6ac3..70273b90e0af59e43a6a58108ffb427b31b5a654
@@@ -163,8 -163,6 +163,8 @@@ class ICUTokenizer(AbstractTokenizer)
                  else:
                      LOG.info('Computing word frequencies')
                      cur.drop_table('word_frequencies')
 +                    cur.execute('ANALYSE search_name')
 +                    cur.execute('ANALYSE word')
                      cur.execute("""
                        CREATE TEMP TABLE word_frequencies AS
                        WITH word_freq AS MATERIALIZED (
                                              END) as info
                                      FROM word LEFT JOIN word_frequencies wf
                                           ON word.word_id = wf.id
 +                                    ORDER BY word_id
                                  """)
                      cur.drop_table('word_frequencies')
  
@@@ -715,10 -712,11 +715,11 @@@ class ICUNameAnalyzer(AbstractAnalyzer)
                  token_info.add_street(self._retrieve_full_tokens(item.name))
              elif item.kind == 'place':
                  if not item.suffix:
-                     token_info.add_place(self._compute_partial_tokens(item.name))
+                     token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
              elif not item.kind.startswith('_') and not item.suffix and \
                   item.kind not in ('country', 'full', 'inclusion'):
-                 token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
+                 token_info.add_address_term(item.kind,
+                                             itertools.chain(*self._compute_name_tokens([item])))
  
  
      def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
          return result
  
  
-     def _compute_partial_tokens(self, name: str) -> List[int]:
-         """ Normalize the given term, split it into partial words and return
-             then token list for them.
-         """
-         assert self.conn is not None
-         norm_name = self._search_normalized(name)
-         tokens = []
-         need_lookup = []
-         for partial in norm_name.split():
-             token = self._cache.partials.get(partial)
-             if token:
-                 tokens.append(token)
-             else:
-                 need_lookup.append(partial)
-         if need_lookup:
-             with self.conn.cursor() as cur:
-                 cur.execute("""SELECT word, getorcreate_partial_word(word)
-                                FROM unnest(%s) word""",
-                             (need_lookup, ))
-                 for partial, token in cur:
-                     assert token is not None
-                     tokens.append(token)
-                     self._cache.partials[partial] = token
-         return tokens
      def _retrieve_full_tokens(self, name: str) -> List[int]:
          """ Get the full name token for the given name, if it exists.
              The name is only retrieved for the standard analyser.
@@@ -960,8 -928,9 +931,9 @@@ class _TokenInfo
      def add_address_term(self, key: str, partials: Iterable[int]) -> None:
          """ Add additional address terms.
          """
-         if partials:
-             self.address_tokens[key] = self._mk_array(partials)
+         array = self._mk_array(partials)
+         if len(array) > 2:
+             self.address_tokens[key] = array
  
      def set_postcode(self, postcode: Optional[str]) -> None:
          """ Set the postcode to the given one.