git.openstreetmap.org Git - nominatim.git/commitdiff
Merge remote-tracking branch 'upstream/master'
author Sarah Hoffmann <lonvia@denofr.de>
Mon, 31 Mar 2025 15:14:19 +0000 (17:14 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Mon, 31 Mar 2025 15:14:19 +0000 (17:14 +0200)
src/nominatim_api/search/db_search_builder.py
src/nominatim_db/tokenizer/icu_tokenizer.py

index c63803d21b6d10935c3cdc9f758caed48d91c308,4dcaf14dc3b89c9a4f9845c19a1314b47dcf4864..0292335eb918391c296cb8d05735aeb82e5ea501
@@@ -208,7 -208,7 +208,7 @@@ class SearchBuilder
          addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
          addr_tokens = list({t.token for t in addr_partials})
  
-         exp_count = min(t.count for t in name_partials.values()) / (2**(len(name_partials) - 1))
+         exp_count = min(t.count for t in name_partials.values()) / (3**(len(name_partials) - 1))
  
          if (len(name_partials) > 3 or exp_count < 8000):
              yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens)
          if name_fulls:
              fulls_count = sum(t.count for t in name_fulls)
  
 -            if fulls_count < 50000 or addr_count < 50000:
 +            if fulls_count < 80000 or addr_count < 50000:
                  yield penalty, fulls_count / (2**len(addr_tokens)), \
                      self.get_full_name_ranking(name_fulls, addr_partials,
                                                 fulls_count > 30000 / max(1, len(addr_tokens)))
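
Taken together, the two tweaks make the partial-name estimate shrink faster per extra partial and let the full-name path survive a higher frequency ceiling. A worked example of the new estimate, with token counts invented for illustration:

    # Hypothetical counts for three name partials; only the formula is from the diff.
    partial_counts = [24000, 51000, 78000]

    # Rarest partial, damped by 3 per additional partial (the divisor was 2 before this merge).
    exp_count = min(partial_counts) / (3 ** (len(partial_counts) - 1))
    assert exp_count == 24000 / 9      # ~2667 < 8000, so lookup_by_names is yielded
    # Old divisor: 24000 / 2**2 == 6000, also below 8000, but the gap widens
    # quickly as more partials are added.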
            address lookups will use the index when the occurrences are not
            too many.
          """
-         # At this point drop unindexed partials from the address.
-         # This might yield wrong results, nothing we can do about that.
          if use_lookup:
              addr_restrict_tokens = []
              addr_lookup_tokens = [t.token for t in addr_partials]
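
With the explanatory comment gone, the branch is easy to misread: when use_lookup is set, no address token is restricted and every partial goes into the lookup list. A toy run of those two lines, with a stand-in token type and invented counts:

    from collections import namedtuple

    Token = namedtuple('Token', 'token count')            # stand-in for the real token class
    addr_partials = [Token(11, 4000), Token(42, 12000)]   # hypothetical address partials

    addr_restrict_tokens = []                             # as in the diff: nothing restricted
    addr_lookup_tokens = [t.token for t in addr_partials]
    assert addr_lookup_tokens == [11, 42]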
index 858cb64c63ef4843a49d42da20fa0c2f65fc05ed,297c9ef9a401e02413d8c223f9633c0d4ad36e33..19b838639ab0e557a7cba97fbe5e012a9bf81b70
@@@ -121,10 -121,10 +121,10 @@@ class ICUTokenizer(AbstractTokenizer)
                             SELECT unnest(nameaddress_vector) as id, count(*)
                                   FROM search_name GROUP BY id)
                    SELECT coalesce(a.id, w.id) as id,
-                          (CASE WHEN w.count is null THEN '{}'::JSONB
+                          (CASE WHEN w.count is null or w.count <= 1 THEN '{}'::JSONB
                                ELSE jsonb_build_object('count', w.count) END
                            ||
-                           CASE WHEN a.count is null THEN '{}'::JSONB
+                           CASE WHEN a.count is null or a.count <= 1 THEN '{}'::JSONB
                                ELSE jsonb_build_object('addr_count', a.count) END) as info
                    FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
                    """)
                  drop_tables(conn, 'tmp_word')
                  cur.execute("""CREATE TABLE tmp_word AS
                                  SELECT word_id, word_token, type, word,
-                                        (CASE WHEN wf.info is null THEN word.info
-                                         ELSE coalesce(word.info, '{}'::jsonb) || wf.info
-                                         END) as info
+                                        coalesce(word.info, '{}'::jsonb)
+                                        - 'count' - 'addr_count' ||
+                                        coalesce(wf.info, '{}'::jsonb)
+                                        as info
                                  FROM word LEFT JOIN word_frequencies wf
                                       ON word.word_id = wf.id
 +                                ORDER BY word_id
                              """)
                  drop_tables(conn, 'word_frequencies')
  
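In JSONB, - removes a key and || lets the right-hand object win, so the new expression strips stale frequencies from word.info before merging in the fresh ones; words whose frequencies disappeared now lose their old count/addr_count keys instead of keeping them. The same logic as plain dicts, payloads invented:

    def merged_info(word_info, wf_info):
        # Dict version of: coalesce(word.info,'{}') - 'count' - 'addr_count' || coalesce(wf.info,'{}')
        base = dict(word_info or {})
        base.pop('count', None)        # "- 'count'": drop the stale frequency
        base.pop('addr_count', None)   # "- 'addr_count'": drop the stale address frequency
        base.update(wf_info or {})     # "||": fresh frequencies win
        return base

    assert merged_info({'count': 5, 'lookup': 'x'}, {'count': 9}) == {'lookup': 'x', 'count': 9}
    assert merged_info({'count': 5}, None) == {}          # stale count gone even without new data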
@@@ -585,10 -585,14 +586,14 @@@ class ICUNameAnalyzer(AbstractAnalyzer)
              if word_id:
                  result = self._cache.housenumbers.get(word_id, result)
                  if result[0] is None:
-                     variants = analyzer.compute_variants(word_id)
+                     varout = analyzer.compute_variants(word_id)
+                     if isinstance(varout, tuple):
+                         variants = varout[0]
+                     else:
+                         variants = varout
                      if variants:
                          hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)",
-                                              (word_id, list(variants)))
+                                              (word_id, variants))
                          result = hid, variants[0]
                          self._cache.housenumbers[word_id] = result
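
compute_variants apparently grew a second return value; the housenumber path only needs the variant list, so it unwraps a possible tuple (and the create_analyzed_hnr_id call drops the list() wrapper, presumably because variants already is one). A normalising helper in the same spirit, with return shapes inferred from this diff and example values invented:

    def plain_variants(varout):
        # compute_variants returns either a list of variants or a (variants, lookups) tuple.
        return varout[0] if isinstance(varout, tuple) else varout

    assert plain_variants(['1a', '1 a']) == ['1a', '1 a']
    assert plain_variants((['1a'], ['1a'])) == ['1a']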
  
  
              full, part = self._cache.names.get(token_id, (None, None))
              if full is None:
-                 variants = analyzer.compute_variants(word_id)
+                 varset = analyzer.compute_variants(word_id)
+                 if isinstance(varset, tuple):
+                     variants, lookups = varset
+                 else:
+                     variants, lookups = varset, None
                  if not variants:
                      continue
  
                  with self.conn.cursor() as cur:
-                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
-                                 (token_id, variants))
+                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s, %s)",
+                                 (token_id, variants, lookups))
                      full, part = cast(Tuple[int, List[int]], cur.fetchone())
  
                  self._cache.names[token_id] = (full, part)
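
The full-name path keeps both halves: the variants for the word table and, where the analyzer provides them, separate lookup terms that travel to getorcreate_full_word as a new third argument (None for analyzers without them). A compact normaliser mirroring the branch above, example values invented:

    def split_variants(varset):
        # Normalise compute_variants output to (variants, lookups); lookups may be None.
        return varset if isinstance(varset, tuple) else (varset, None)

    assert split_variants(['main street']) == (['main street'], None)
    assert split_variants((['main street'], ['mainstreet'])) == (['main street'], ['mainstreet'])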