git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/api/search/icu_tokenizer.py
don't even try heavily penalized searches
[nominatim.git] / nominatim / api / search / icu_tokenizer.py
index 72e0f547bcbaf9f5bb0798b8d26ce8b228b22249..23cfa5a166c003a1b5638f0334d10636a335d935 100644 (file)
@@ -97,6 +97,7 @@ class ICUToken(qmod.Token):
         """ Create a ICUToken from the row of the word table.
         """
         count = 1 if row.info is None else row.info.get('count', 1)
         """ Create a ICUToken from the row of the word table.
         """
         count = 1 if row.info is None else row.info.get('count', 1)
+        addr_count = 1 if row.info is None else row.info.get('addr_count', 1)
 
         penalty = 0.0
         if row.type == 'w':
 
         penalty = 0.0
         if row.type == 'w':
@@ -123,7 +124,8 @@ class ICUToken(qmod.Token):
 
         return ICUToken(penalty=penalty, token=row.word_id, count=count,
                         lookup_word=lookup_word, is_indexed=True,
 
         return ICUToken(penalty=penalty, token=row.word_id, count=count,
                         lookup_word=lookup_word, is_indexed=True,
-                        word_token=row.word_token, info=row.info)
+                        word_token=row.word_token, info=row.info,
+                        addr_count=addr_count)
 
 
 
 
 
 
@@ -186,7 +188,10 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                         if trange.start == 0:
                             query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
                     else:
                         if trange.start == 0:
                             query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
                     else:
-                        query.add_token(trange, qmod.TokenType.QUALIFIER, token)
+                        if trange.start == 0 and trange.end == query.num_token_slots():
+                            query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
+                        else:
+                            query.add_token(trange, qmod.TokenType.QUALIFIER, token)
                 else:
                     query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token)
 
                 else:
                     query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token)
 
@@ -203,7 +208,12 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
             standardized form search will work with. All information removed
             at this stage is inevitably lost.
         """
             standardized form search will work with. All information removed
             at this stage is inevitably lost.
         """
-        return cast(str, self.normalizer.transliterate(text))
+        norm = cast(str, self.normalizer.transliterate(text))
+        numspaces = norm.count(' ')
+        if numspaces > 4 and len(norm) <= (numspaces + 1) * 3:
+            return ''
+
+        return norm
 
 
     def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
 
 
     def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
@@ -254,7 +264,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
             if len(part.token) <= 4 and part[0].isdigit()\
                and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
                 query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
             if len(part.token) <= 4 and part[0].isdigit()\
                and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
                 query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
-                                ICUToken(0.5, 0, 1, part.token, True, part.token, None))
+                                ICUToken(0.5, 0, 1, 1, part.token, True, part.token, None))
 
 
     def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
 
 
     def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None: