git.openstreetmap.org Git - nominatim.git/commitdiff
Merge remote-tracking branch 'upstream/master'
author Sarah Hoffmann <lonvia@denofr.de>
Wed, 19 Mar 2025 15:01:23 +0000 (16:01 +0100)
committer Sarah Hoffmann <lonvia@denofr.de>
Wed, 19 Mar 2025 15:01:23 +0000 (16:01 +0100)
1  2 
lib-sql/functions/address_lookup.sql
src/nominatim_api/search/db_search_builder.py
src/nominatim_api/search/icu_tokenizer.py

index cba11dbf3400d7fcc45a74f9c7fb85ed1a2f4c69,6d7a7bd529aba9baedca5a829f4fa7b0267e91a1..b59b7656bca93ddd3c8d88e819a93ca57fed1396
@@@ -47,7 -47,7 +47,7 @@@ BEGI
    RETURN trim((avals(name))[array_length(avals(name), 1)]);
  END;
  $$
- LANGUAGE plpgsql IMMUTABLE;
+ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
  
  
  --housenumber only needed for tiger data
@@@ -84,7 -84,7 +84,7 @@@ BEGI
    RETURN array_to_string(result,', ');
  END;
  $$
- LANGUAGE plpgsql STABLE;
+ LANGUAGE plpgsql STABLE PARALLEL SAFE;
  
  DROP TYPE IF EXISTS addressdata_place;
  CREATE TYPE addressdata_place AS (
@@@ -232,7 -232,7 +232,7 @@@ BEGI
    FOR location IN
      SELECT placex.place_id, osm_type, osm_id, name, class, type,
             coalesce(extratags->'linked_place', extratags->'place') as place_type,
 -           admin_level, fromarea, isaddress,
 +           admin_level, fromarea, isaddress and linked_place_id is NULL as isaddress,
             CASE WHEN rank_address = 11 THEN 5 ELSE rank_address END as rank_address,
             distance, country_code, postcode
        FROM place_addressline join placex on (address_place_id = placex.place_id)
    RETURN;
  END;
  $$
- LANGUAGE plpgsql STABLE;
+ LANGUAGE plpgsql STABLE PARALLEL SAFE;
index 4987f156e9b9a6e134e56a48dafade15729165ab,ddfddaa64878a0d6f05d297d7427cf4416156259..c63803d21b6d10935c3cdc9f758caed48d91c308
@@@ -214,20 -214,20 +214,20 @@@ class SearchBuilder
              yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens)
              return
  
 -        addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 30000
 +        addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 50000
          # Partial term to frequent. Try looking up by rare full names first.
          name_fulls = self.query.get_tokens(name, qmod.TOKEN_WORD)
          if name_fulls:
              fulls_count = sum(t.count for t in name_fulls)
  
 -            if fulls_count < 50000 or addr_count < 30000:
 +            if fulls_count < 80000 or addr_count < 50000:
                  yield penalty, fulls_count / (2**len(addr_tokens)), \
                      self.get_full_name_ranking(name_fulls, addr_partials,
                                                 fulls_count > 30000 / max(1, len(addr_tokens)))
  
          # To catch remaining results, lookup by name and address
          # We only do this if there is a reasonable number of results expected.
-         exp_count = exp_count / (2**len(addr_tokens)) if addr_tokens else exp_count
+         exp_count /= 2**len(addr_tokens)
          if exp_count < 10000 and addr_count < 20000:
              penalty += 0.35 * max(1 if name_fulls else 0.1,
                                    5 - len(name_partials) - len(addr_tokens))
          # This might yield wrong results, nothing we can do about that.
          if use_lookup:
              addr_restrict_tokens = []
 -            addr_lookup_tokens = []
 -            for t in addr_partials:
 -                if t.addr_count > 20000:
 -                    addr_restrict_tokens.append(t.token)
 -                else:
 -                    addr_lookup_tokens.append(t.token)
 +            addr_lookup_tokens = [t.token for t in addr_partials]
          else:
              addr_restrict_tokens = [t.token for t in addr_partials]
              addr_lookup_tokens = []
index ecc2c1c7f1c917ade41d5e7b2efba10b89caf96f,1bd0030d54f0319581c22dd5921c2deeaa8f0927..cc5b6cf098c1c00bdc0e30ebf8a4d44ccc1f640f
@@@ -166,12 -166,6 +166,12 @@@ class ICUQueryAnalyzer(AbstractQueryAna
          log().section('Analyze query (using ICU tokenizer)')
          for func in self.preprocessors:
              phrases = func(phrases)
 +
 +        if len(phrases) == 1 \
 +                and phrases[0].text.count(' ') > 3 \
 +                and max(len(s) for s in phrases[0].text.split()) < 3:
 +            normalized = []
 +
          query = qmod.QueryStruct(phrases)
  
          log().var_dump('Normalized query', query.source)
  
          self.add_extra_tokens(query)
          for start, end, pc in self.postcode_parser.parse(query):
+             term = ' '.join(n.term_lookup for n in query.nodes[start + 1:end + 1])
              query.add_token(qmod.TokenRange(start, end),
                              qmod.TOKEN_POSTCODE,
                              ICUToken(penalty=0.1, token=0, count=1, addr_count=1,
-                                      lookup_word=pc, word_token=pc, info=None))
+                                      lookup_word=pc, word_token=term,
+                                      info=None))
          self.rerank_tokens(query)
  
          log().table_dump('Word tokens', _dump_word_tokens(query))
          """
          for i, node, tlist in query.iter_token_lists():
              if tlist.ttype == qmod.TOKEN_POSTCODE:
+                 tlen = len(cast(ICUToken, tlist.tokens[0]).word_token)
                  for repl in node.starting:
                      if repl.end == tlist.end and repl.ttype != qmod.TOKEN_POSTCODE \
-                        and (repl.ttype != qmod.TOKEN_HOUSENUMBER
-                             or len(tlist.tokens[0].lookup_word) > 4):
+                        and (repl.ttype != qmod.TOKEN_HOUSENUMBER or tlen > 4):
                          repl.add_penalty(0.39)
              elif (tlist.ttype == qmod.TOKEN_HOUSENUMBER
                    and len(tlist.tokens[0].lookup_word) <= 3):