From: Sarah Hoffmann
Date: Wed, 19 Mar 2025 15:01:23 +0000 (+0100)
Subject: Merge remote-tracking branch 'upstream/master'
X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/be63329a69762a3d13487b060f3cbc7b815bdb48?hp=-c

Merge remote-tracking branch 'upstream/master'
---

be63329a69762a3d13487b060f3cbc7b815bdb48
diff --combined lib-sql/functions/address_lookup.sql
index cba11dbf,6d7a7bd5..b59b7656
--- a/lib-sql/functions/address_lookup.sql
+++ b/lib-sql/functions/address_lookup.sql
@@@ -47,7 -47,7 +47,7 @@@ BEGI
    RETURN trim((avals(name))[array_length(avals(name), 1)]);
  END;
  $$
- LANGUAGE plpgsql IMMUTABLE;
+ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
  
  
  --housenumber only needed for tiger data
@@@ -84,7 -84,7 +84,7 @@@ BEGI
    RETURN array_to_string(result,', ');
  END;
  $$
- LANGUAGE plpgsql STABLE;
+ LANGUAGE plpgsql STABLE PARALLEL SAFE;
  
  DROP TYPE IF EXISTS addressdata_place;
  CREATE TYPE addressdata_place AS (
@@@ -232,7 -232,7 +232,7 @@@ BEGI
    FOR location IN
      SELECT placex.place_id, osm_type, osm_id, name, class, type,
             coalesce(extratags->'linked_place', extratags->'place') as place_type,
-            admin_level, fromarea, isaddress,
+            admin_level, fromarea, isaddress and linked_place_id is NULL as isaddress,
             CASE WHEN rank_address = 11 THEN 5 ELSE rank_address END as rank_address,
             distance, country_code, postcode
      FROM place_addressline join placex on (address_place_id = placex.place_id)
@@@ -331,4 -331,4 +331,4 @@@
    RETURN;
  END;
  $$
- LANGUAGE plpgsql STABLE;
+ LANGUAGE plpgsql STABLE PARALLEL SAFE;
diff --combined src/nominatim_api/search/db_search_builder.py
index 4987f156,ddfddaa6..c63803d2
--- a/src/nominatim_api/search/db_search_builder.py
+++ b/src/nominatim_api/search/db_search_builder.py
@@@ -214,20 -214,20 +214,20 @@@ class SearchBuilder
              yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens)
              return
  
-         addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 30000
+         addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 50000
          # Partial term to frequent. Try looking up by rare full names first.
          name_fulls = self.query.get_tokens(name, qmod.TOKEN_WORD)
          if name_fulls:
              fulls_count = sum(t.count for t in name_fulls)
  
-             if fulls_count < 50000 or addr_count < 30000:
+             if fulls_count < 80000 or addr_count < 50000:
                  yield penalty, fulls_count / (2**len(addr_tokens)), \
                      self.get_full_name_ranking(name_fulls, addr_partials,
                                                 fulls_count > 30000 / max(1, len(addr_tokens)))
  
          # To catch remaining results, lookup by name and address
          # We only do this if there is a reasonable number of results expected.
-         exp_count = exp_count / (2**len(addr_tokens)) if addr_tokens else exp_count
+         exp_count /= 2**len(addr_tokens)
          if exp_count < 10000 and addr_count < 20000:
              penalty += 0.35 * max(1 if name_fulls else 0.1,
                                    5 - len(name_partials) - len(addr_tokens))
@@@ -268,7 -268,12 +268,7 @@@
          # This might yield wrong results, nothing we can do about that.
          if use_lookup:
              addr_restrict_tokens = []
-             addr_lookup_tokens = []
-             for t in addr_partials:
-                 if t.addr_count > 20000:
-                     addr_restrict_tokens.append(t.token)
-                 else:
-                     addr_lookup_tokens.append(t.token)
+             addr_lookup_tokens = [t.token for t in addr_partials]
          else:
              addr_restrict_tokens = [t.token for t in addr_partials]
              addr_lookup_tokens = []
diff --combined src/nominatim_api/search/icu_tokenizer.py
index ecc2c1c7,1bd0030d..cc5b6cf0
--- a/src/nominatim_api/search/icu_tokenizer.py
+++ b/src/nominatim_api/search/icu_tokenizer.py
@@@ -166,12 -166,6 +166,12 @@@ class ICUQueryAnalyzer(AbstractQueryAna
          log().section('Analyze query (using ICU tokenizer)')
          for func in self.preprocessors:
              phrases = func(phrases)
+ 
+         if len(phrases) == 1 \
+                 and phrases[0].text.count(' ') > 3 \
+                 and max(len(s) for s in phrases[0].text.split()) < 3:
+             normalized = []
+ 
          query = qmod.QueryStruct(phrases)
  
          log().var_dump('Normalized query', query.source)
@@@ -199,10 -193,12 +199,12 @@@
          self.add_extra_tokens(query)
  
          for start, end, pc in self.postcode_parser.parse(query):
+             term = ' '.join(n.term_lookup for n in query.nodes[start + 1:end + 1])
              query.add_token(qmod.TokenRange(start, end), qmod.TOKEN_POSTCODE,
                              ICUToken(penalty=0.1, token=0, count=1, addr_count=1,
-                                      lookup_word=pc, word_token=pc, info=None))
+                                      lookup_word=pc, word_token=term,
+                                      info=None))
  
          self.rerank_tokens(query)
  
          log().table_dump('Word tokens', _dump_word_tokens(query))
@@@ -273,10 -269,10 +275,10 @@@
          """
          for i, node, tlist in query.iter_token_lists():
              if tlist.ttype == qmod.TOKEN_POSTCODE:
+                 tlen = len(cast(ICUToken, tlist.tokens[0]).word_token)
                  for repl in node.starting:
                      if repl.end == tlist.end and repl.ttype != qmod.TOKEN_POSTCODE \
-                        and (repl.ttype != qmod.TOKEN_HOUSENUMBER
-                             or len(tlist.tokens[0].lookup_word) > 4):
+                        and (repl.ttype != qmod.TOKEN_HOUSENUMBER or tlen > 4):
                          repl.add_penalty(0.39)
              elif (tlist.ttype == qmod.TOKEN_HOUSENUMBER
                    and len(tlist.tokens[0].lookup_word) <= 3):