avoid splitting of first token when a housenumber is present

[nominatim.git] / nominatim / api / search / icu_tokenizer.py
diff --git a/nominatim/api/search/icu_tokenizer.py b/nominatim/api/search/icu_tokenizer.py

index 14698a28867ca7ae0fc783f6b6e11385ffe45d8a..f259995db112bbbe537aaa3855f2d4d78e36f5e2 100644 (file)
--- a/nominatim/api/search/icu_tokenizer.py
+++ b/nominatim/api/search/icu_tokenizer.py
@@ -21,10 +21,7 @@ from nominatim.typing import SaRow
  from nominatim.api.connection import SearchConnection
  from nominatim.api.logging import log
  from nominatim.api.search import query as qmod
-
-# XXX: TODO
-class AbstractQueryAnalyzer:
-    pass
+from nominatim.api.search.query_analyzer_factory import AbstractQueryAnalyzer
  
  
  DB_TO_TOKEN_TYPE = {
@@ -156,7 +153,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
          """
          log().section('Analyze query (using ICU tokenizer)')
          normalized = list(filter(lambda p: p.text,
-                                 (qmod.Phrase(p.ptype, self.normalizer.transliterate(p.text))
+                                 (qmod.Phrase(p.ptype, self.normalize_text(p.text))
                                    for p in phrases)))
          query = qmod.QueryStruct(normalized)
          log().var_dump('Normalized query', query.source)
@@ -190,6 +187,14 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
          return query
  
  
+    def normalize_text(self, text: str) -> str:
+        """ Bring the given text into a normalized form. That is the
+            standardized form search will work with. All information removed
+            at this stage is inevitably lost.
+        """
+        return cast(str, self.normalizer.transliterate(text))
+
+
      def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
          """ Transliterate the phrases and split them into tokens.
  
@@ -251,12 +256,11 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                         and (repl.ttype != qmod.TokenType.HOUSENUMBER
                              or len(tlist.tokens[0].lookup_word) > 4):
                          repl.add_penalty(0.39)
-            elif tlist.ttype == qmod.TokenType.HOUSENUMBER:
+            elif tlist.ttype == qmod.TokenType.HOUSENUMBER \
+                 and len(tlist.tokens[0].lookup_word) <= 3:
                  if any(c.isdigit() for c in tlist.tokens[0].lookup_word):
                      for repl in node.starting:
-                        if repl.end == tlist.end and repl.ttype != qmod.TokenType.HOUSENUMBER \
-                           and (repl.ttype != qmod.TokenType.HOUSENUMBER
-                                or len(tlist.tokens[0].lookup_word) <= 3):
+                        if repl.end == tlist.end and repl.ttype != qmod.TokenType.HOUSENUMBER:
                              repl.add_penalty(0.5 - tlist.tokens[0].penalty)
              elif tlist.ttype not in (qmod.TokenType.COUNTRY, qmod.TokenType.PARTIAL):
                  norm = parts[i].normalized