remove word_number counting for phrases
diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py
index 6f1dcf7902ab65e5a4481d6a3d7b65e6274deab8..04e781caa3080195ba373b7a6fe1224c75d72d5e 100644
--- a/src/nominatim_api/search/icu_tokenizer.py
+++ b/src/nominatim_api/search/icu_tokenizer.py
@@ -50,15 +50,16 @@ PENALTY_IN_TOKEN_BREAK = {
 @dataclasses.dataclass
 class QueryPart:
     """ Normalized and transliterated form of a single term in the query.
+
         When the term came out of a split during the transliteration,
         the normalized string is the full word before transliteration.
-        The word number keeps track of the word before transliteration
-        and can be used to identify partial transliterated terms.
+        Check the subsequent break type to figure out if the word is
+        continued.
+
         Penalty is the break penalty for the break following the token.
     """
     token: str
     normalized: str
-    word_number: int
     penalty: float
 
 
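With word_number gone, code that needs to know whether two adjacent parts came
from the same pre-transliteration word has to consult the break type recorded
after each part instead. A minimal sketch of the idea (hypothetical helper, not
part of this commit; the string 'token' stands in for qmod.BreakType.TOKEN):

    # Regroup transliterated parts into their original words by walking
    # the break types recorded after each part. A TOKEN break is one
    # inserted by the transliteration itself, so the word continues.
    def group_words(parts: list[str], breaks: list[str]) -> list[list[str]]:
        words: list[list[str]] = [[]]
        for part, btype in zip(parts, breaks):
            words[-1].append(part)
            if btype != 'token':   # any other break ends the word
                words.append([])
        return [w for w in words if w]

    # group_words(['北', '京', '飯', '店'], ['token', 'word', 'token', 'word'])
    # -> [['北', '京'], ['飯', '店']]
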
@@ -244,7 +245,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
             standardized form search will work with. All information removed
             at this stage is inevitably lost.
         """
-        return cast(str, self.normalizer.transliterate(text))
+        return cast(str, self.normalizer.transliterate(text)).strip('-: ')
 
     def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
         """ Transliterate the phrases and split them into tokens.
@@ -256,7 +257,6 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
         parts: QueryParts = []
         phrase_start = 0
         words = defaultdict(list)
-        wordnr = 0
         for phrase in query.source:
             query.nodes[-1].ptype = phrase.ptype
             phrase_split = re.split('([ :-])', phrase.text)
@@ -271,12 +271,11 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                 if trans:
                     for term in trans.split(' '):
                         if term:
-                            parts.append(QueryPart(term, word, wordnr,
+                            parts.append(QueryPart(term, word,
                                                    PENALTY_IN_TOKEN_BREAK[qmod.BreakType.TOKEN]))
                             query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
                     query.nodes[-1].btype = qmod.BreakType(breakchar)
                     parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType(breakchar)]
-                wordnr += 1
 
             for word, wrange in yield_words(parts, phrase_start):
                 words[word].append(wrange)
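Without the word counter, split_query only records the term, the full
normalized word and the break penalty for each part. A condensed,
self-contained sketch of the loop's shape (the ICU transliterator is stubbed;
the names here are illustrative, not the module's API):

    import re

    def translit(word: str) -> str:
        # Stub for the ICU transliterator: it may split one word into
        # several space-separated terms, e.g. '北京' -> '北 京'.
        return ' '.join(word)

    def split_words(text: str) -> list[tuple[str, str]]:
        # Return (term, original_word) pairs as QueryPart now stores
        # them; no word number is tracked any more.
        parts = []
        for word in re.split('[ :-]', text):
            for term in translit(word).split(' '):
                if term:
                    parts.append((term, word))
        return parts

    # split_words('北京 飯店')
    # -> [('北', '北京'), ('京', '北京'), ('飯', '飯店'), ('店', '飯店')]
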
@@ -323,7 +322,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
             elif tlist.ttype not in (qmod.TokenType.COUNTRY, qmod.TokenType.PARTIAL):
                 norm = parts[i].normalized
                 for j in range(i + 1, tlist.end):
-                    if parts[j - 1].word_number != parts[j].word_number:
+                    if node.btype != qmod.BreakType.TOKEN:
                         norm += ' ' + parts[j].normalized
                 for token in tlist.tokens:
                     cast(ICUToken, token).rematch(norm)
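On the reranking side, the normalized form of a multi-part token span is now
reassembled from break types: a part following a TOKEN break already carries
the complete word and is skipped, while any other break marks a word boundary.
A simplified sketch of that reassembly (assumed inputs; 'token' again stands in
for qmod.BreakType.TOKEN):

    def join_normalized(normalized: list[str], breaks: list[str]) -> str:
        # breaks[k] is the break type recorded after normalized[k].
        norm = normalized[0]
        for btype, part in zip(breaks, normalized[1:]):
            if btype != 'token':
                # word boundary: append the next word
                norm += ' ' + part
            # TOKEN break: same word, already stored in full, so skip
        return norm

    # join_normalized(['北京', '北京', '飯店'], ['token', 'word'])
    # -> '北京 飯店'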