]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/api/search/legacy_tokenizer.py
Merge remote-tracking branch 'upstream/master'
[nominatim.git] / nominatim / api / search / legacy_tokenizer.py
index 3346584ccd1b35b4e74e4725ee079cb54e45a905..031f2456c455d108c21e1ea14a048b9f6d396df2 100644 (file)
@@ -44,7 +44,7 @@ class LegacyToken(qmod.Token):
 
     @property
     def info(self) -> Dict[str, Any]:
-        """ Dictionary of additional propoerties of the token.
+        """ Dictionary of additional properties of the token.
             Should only be used for debugging purposes.
         """
         return {'category': self.category,
@@ -107,15 +107,15 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
         for row in await self.lookup_in_db(lookup_words):
             for trange in words[row.word_token.strip()]:
                 token, ttype = self.make_token(row)
-                if ttype == qmod.TokenType.CATEGORY:
+                if ttype == qmod.TokenType.NEAR_ITEM:
                     if trange.start == 0:
-                        query.add_token(trange, qmod.TokenType.CATEGORY, token)
+                        query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
                 elif ttype == qmod.TokenType.QUALIFIER:
                     query.add_token(trange, qmod.TokenType.QUALIFIER, token)
                     if trange.start == 0 or trange.end == query.num_token_slots():
                         token = copy(token)
                         token.penalty += 0.1 * (query.num_token_slots())
-                        query.add_token(trange, qmod.TokenType.CATEGORY, token)
+                        query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
                 elif ttype != qmod.TokenType.PARTIAL or trange.start + 1 == trange.end:
                     query.add_token(trange, ttype, token)
 
@@ -127,6 +127,15 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
         return query
 
 
+    def normalize_text(self, text: str) -> str:
+        """ Bring the given text into a normalized form.
+
+            This only removes case, so some difference with the normalization
+            in the phrase remains.
+        """
+        return text.lower()
+
+
     def split_query(self, query: qmod.QueryStruct) -> Tuple[List[str],
                                                             Dict[str, List[qmod.TokenRange]]]:
         """ Transliterate the phrases and split them into tokens.
@@ -186,7 +195,7 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
                 ttype = qmod.TokenType.POSTCODE
                 lookup_word = row.word_token[1:]
             else:
-                ttype = qmod.TokenType.CATEGORY if row.operator in ('in', 'near')\
+                ttype = qmod.TokenType.NEAR_ITEM if row.operator in ('in', 'near')\
                         else qmod.TokenType.QUALIFIER
                 lookup_word = row.word
         elif row.word_token.startswith(' '):
@@ -200,7 +209,8 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
                 is_indexed = False
 
         return LegacyToken(penalty=penalty, token=row.word_id,
-                           count=row.search_name_count or 1,
+                           count=max(1, row.search_name_count or 1),
+                           addr_count=1, # not supported
                            lookup_word=lookup_word,
                            word_token=row.word_token.strip(),
                            category=(rowclass, row.type) if rowclass is not None else None,
@@ -217,7 +227,7 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
             if len(part) <= 4 and part.isdigit()\
                and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
                 query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
-                                LegacyToken(penalty=0.5, token=0, count=1,
+                                LegacyToken(penalty=0.5, token=0, count=1, addr_count=1,
                                             lookup_word=part, word_token=part,
                                             category=None, country=None,
                                             operator=None, is_indexed=True))