From: Sarah Hoffmann Date: Mon, 31 Mar 2025 15:14:19 +0000 (+0200) Subject: Merge remote-tracking branch 'upstream/master' X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/acf4a74f8ae821ef0893be3a823f6fecaed3cad9?hp=-c Merge remote-tracking branch 'upstream/master' --- acf4a74f8ae821ef0893be3a823f6fecaed3cad9 diff --combined src/nominatim_api/search/db_search_builder.py index c63803d2,4dcaf14d..0292335e --- a/src/nominatim_api/search/db_search_builder.py +++ b/src/nominatim_api/search/db_search_builder.py @@@ -208,7 -208,7 +208,7 @@@ class SearchBuilder addr_partials = [t for r in address for t in self.query.get_partials_list(r)] addr_tokens = list({t.token for t in addr_partials}) - exp_count = min(t.count for t in name_partials.values()) / (2**(len(name_partials) - 1)) + exp_count = min(t.count for t in name_partials.values()) / (3**(len(name_partials) - 1)) if (len(name_partials) > 3 or exp_count < 8000): yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens) @@@ -220,7 -220,7 +220,7 @@@ if name_fulls: fulls_count = sum(t.count for t in name_fulls) - if fulls_count < 50000 or addr_count < 50000: + if fulls_count < 80000 or addr_count < 50000: yield penalty, fulls_count / (2**len(addr_tokens)), \ self.get_full_name_ranking(name_fulls, addr_partials, fulls_count > 30000 / max(1, len(addr_tokens))) @@@ -264,8 -264,6 +264,6 @@@ address lookups will use the index, when the occurrences are not too many. """ - # At this point drop unindexed partials from the address. - # This might yield wrong results, nothing we can do about that. if use_lookup: addr_restrict_tokens = [] addr_lookup_tokens = [t.token for t in addr_partials] diff --combined src/nominatim_db/tokenizer/icu_tokenizer.py index 858cb64c,297c9ef9..19b83863 --- a/src/nominatim_db/tokenizer/icu_tokenizer.py +++ b/src/nominatim_db/tokenizer/icu_tokenizer.py @@@ -121,10 -121,10 +121,10 @@@ class ICUTokenizer(AbstractTokenizer) SELECT unnest(nameaddress_vector) as id, count(*) FROM search_name GROUP BY id) SELECT coalesce(a.id, w.id) as id, - (CASE WHEN w.count is null THEN '{}'::JSONB + (CASE WHEN w.count is null or w.count <= 1 THEN '{}'::JSONB ELSE jsonb_build_object('count', w.count) END || - CASE WHEN a.count is null THEN '{}'::JSONB + CASE WHEN a.count is null or a.count <= 1 THEN '{}'::JSONB ELSE jsonb_build_object('addr_count', a.count) END) as info FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id; """) @@@ -134,12 -134,12 +134,13 @@@ drop_tables(conn, 'tmp_word') cur.execute("""CREATE TABLE tmp_word AS SELECT word_id, word_token, type, word, - (CASE WHEN wf.info is null THEN word.info - ELSE coalesce(word.info, '{}'::jsonb) || wf.info - END) as info + coalesce(word.info, '{}'::jsonb) + - 'count' - 'addr_count' || + coalesce(wf.info, '{}'::jsonb) + as info FROM word LEFT JOIN word_frequencies wf ON word.word_id = wf.id + ORDER BY word_id """) drop_tables(conn, 'word_frequencies') @@@ -585,10 -585,14 +586,14 @@@ class ICUNameAnalyzer(AbstractAnalyzer) if word_id: result = self._cache.housenumbers.get(word_id, result) if result[0] is None: - variants = analyzer.compute_variants(word_id) + varout = analyzer.compute_variants(word_id) + if isinstance(varout, tuple): + variants = varout[0] + else: + variants = varout if variants: hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)", - (word_id, list(variants))) + (word_id, variants)) result = hid, variants[0] self._cache.housenumbers[word_id] = result @@@ -633,13 -637,17 +638,17 @@@ full, part = self._cache.names.get(token_id, (None, None)) if full is None: - variants = analyzer.compute_variants(word_id) + varset = analyzer.compute_variants(word_id) + if isinstance(varset, tuple): + variants, lookups = varset + else: + variants, lookups = varset, None if not variants: continue with self.conn.cursor() as cur: - cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)", - (token_id, variants)) + cur.execute("SELECT * FROM getorcreate_full_word(%s, %s, %s)", + (token_id, variants, lookups)) full, part = cast(Tuple[int, List[int]], cur.fetchone()) self._cache.names[token_id] = (full, part)