From: Sarah Hoffmann Date: Fri, 3 May 2024 14:34:22 +0000 (+0200) Subject: Merge remote-tracking branch 'upstream/master' X-Git-Tag: deploy~8 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/0eddfe588069fc8833d6948d01e07bd055e4c3bc?hp=-c Merge remote-tracking branch 'upstream/master' --- 0eddfe588069fc8833d6948d01e07bd055e4c3bc diff --combined nominatim/tokenizer/icu_tokenizer.py index 9032d71b,4b9dac69..70273b90 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@@ -163,8 -163,6 +163,8 @@@ class ICUTokenizer(AbstractTokenizer) else: LOG.info('Computing word frequencies') cur.drop_table('word_frequencies') + cur.execute('ANALYSE search_name') + cur.execute('ANALYSE word') cur.execute(""" CREATE TEMP TABLE word_frequencies AS WITH word_freq AS MATERIALIZED ( @@@ -192,7 -190,6 +192,7 @@@ END) as info FROM word LEFT JOIN word_frequencies wf ON word.word_id = wf.id + ORDER BY word_id """) cur.drop_table('word_frequencies') @@@ -715,10 -712,11 +715,11 @@@ class ICUNameAnalyzer(AbstractAnalyzer) token_info.add_street(self._retrieve_full_tokens(item.name)) elif item.kind == 'place': if not item.suffix: - token_info.add_place(self._compute_partial_tokens(item.name)) + token_info.add_place(itertools.chain(*self._compute_name_tokens([item]))) elif not item.kind.startswith('_') and not item.suffix and \ item.kind not in ('country', 'full', 'inclusion'): - token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name)) + token_info.add_address_term(item.kind, + itertools.chain(*self._compute_name_tokens([item]))) def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]: @@@ -759,36 -757,6 +760,6 @@@ return result - def _compute_partial_tokens(self, name: str) -> List[int]: - """ Normalize the given term, split it into partial words and return - then token list for them. - """ - assert self.conn is not None - norm_name = self._search_normalized(name) - - tokens = [] - need_lookup = [] - for partial in norm_name.split(): - token = self._cache.partials.get(partial) - if token: - tokens.append(token) - else: - need_lookup.append(partial) - - if need_lookup: - with self.conn.cursor() as cur: - cur.execute("""SELECT word, getorcreate_partial_word(word) - FROM unnest(%s) word""", - (need_lookup, )) - - for partial, token in cur: - assert token is not None - tokens.append(token) - self._cache.partials[partial] = token - - return tokens - - def _retrieve_full_tokens(self, name: str) -> List[int]: """ Get the full name token for the given name, if it exists. The name is only retrieved for the standard analyser. @@@ -960,8 -928,9 +931,9 @@@ class _TokenInfo def add_address_term(self, key: str, partials: Iterable[int]) -> None: """ Add additional address terms. """ - if partials: - self.address_tokens[key] = self._mk_array(partials) + array = self._mk_array(partials) + if len(array) > 2: + self.address_tokens[key] = array def set_postcode(self, postcode: Optional[str]) -> None: """ Set the postcode to the given one.