git.openstreetmap.org Git - nominatim.git/blobdiff - src/nominatim_api/search/icu_tokenizer.py
Merge remote-tracking branch 'upstream/master'
[nominatim.git] / src / nominatim_api / search / icu_tokenizer.py
index 971e95beec1a6935b58e7c9cc4879d9797a73f1b..c2a265105a69d08eb3d7d8a75331e4a8c4d61dc9 100644 (file)
@@ -123,7 +123,7 @@ class ICUToken(qmod.Token):
             lookup_word = row.word_token
 
         return ICUToken(penalty=penalty, token=row.word_id, count=max(1, count),
-                        lookup_word=lookup_word, is_indexed=True,
+                        lookup_word=lookup_word,
                         word_token=row.word_token, info=row.info,
                         addr_count=max(1, addr_count))
 
@@ -208,7 +208,12 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
             standardized form search will work with. All information removed
             at this stage is inevitably lost.
         """
-        return cast(str, self.normalizer.transliterate(text))
+        norm = cast(str, self.normalizer.transliterate(text))
+        numspaces = norm.count(' ')
+        if numspaces > 4 and len(norm) <= (numspaces + 1) * 3:
+            return ''
+
+        return norm
 
 
     def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
@@ -259,7 +264,9 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
             if len(part.token) <= 4 and part[0].isdigit()\
                and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
                 query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
-                                ICUToken(0.5, 0, 1, 1, part.token, True, part.token, None))
+                                ICUToken(penalty=0.5, token=0,
+                                         count=1, addr_count=1, lookup_word=part.token,
+                                         word_token=part.token, info=None))
 
 
     def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None: