Avoid splitting the first token when a housenumber is present

[nominatim.git] / nominatim / api / search / icu_tokenizer.py
diff --git a/nominatim/api/search/icu_tokenizer.py b/nominatim/api/search/icu_tokenizer.py

index 14698a28867ca7ae0fc783f6b6e11385ffe45d8a..f259995db112bbbe537aaa3855f2d4d78e36f5e2 100644 (file)
--- a/nominatim/api/search/icu_tokenizer.py
+++ b/nominatim/api/search/icu_tokenizer.py
@@ -21,10 +21,7 @@ from nominatim.typing import SaRow
  from nominatim.api.connection import SearchConnection
  from nominatim.api.logging import log
  from nominatim.api.search import query as qmod
-
-# XXX: TODO
-class AbstractQueryAnalyzer:
-    pass
+from nominatim.api.search.query_analyzer_factory import AbstractQueryAnalyzer
  
  
  DB_TO_TOKEN_TYPE = {
@@ -156,7 +153,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
          """
          log().section('Analyze query (using ICU tokenizer)')
          normalized = list(filter(lambda p: p.text,
-                                 (qmod.Phrase(p.ptype, self.normalizer.transliterate(p.text))
+                                 (qmod.Phrase(p.ptype, self.normalize_text(p.text))
                                    for p in phrases)))
          query = qmod.QueryStruct(normalized)
          log().var_dump('Normalized query', query.source)
@@ -190,6 +187,14 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
          return query
  
  
+    def normalize_text(self, text: str) -> str:
+        """ Bring the given text into a normalized form. That is the
+            standardized form search will work with. All information removed
+            at this stage is inevitably lost.
+        """
+        return cast(str, self.normalizer.transliterate(text))
+
+
      def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
          """ Transliterate the phrases and split them into tokens.
  
@@ -251,12 +256,11 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                         and (repl.ttype != qmod.TokenType.HOUSENUMBER
                              or len(tlist.tokens[0].lookup_word) > 4):
                          repl.add_penalty(0.39)
-            elif tlist.ttype == qmod.TokenType.HOUSENUMBER:
+            elif tlist.ttype == qmod.TokenType.HOUSENUMBER \
+                 and len(tlist.tokens[0].lookup_word) <= 3:
                  if any(c.isdigit() for c in tlist.tokens[0].lookup_word):
                      for repl in node.starting:
-                        if repl.end == tlist.end and repl.ttype != qmod.TokenType.HOUSENUMBER \
-                           and (repl.ttype != qmod.TokenType.HOUSENUMBER
-                                or len(tlist.tokens[0].lookup_word) <= 3):
+                        if repl.end == tlist.end and repl.ttype != qmod.TokenType.HOUSENUMBER:
                              repl.add_penalty(0.5 - tlist.tokens[0].penalty)
              elif tlist.ttype not in (qmod.TokenType.COUNTRY, qmod.TokenType.PARTIAL):
                  norm = parts[i].normalized