remove word_number counting for phrases
diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py
index 6f1dcf7902ab65e5a4481d6a3d7b65e6274deab8..04e781caa3080195ba373b7a6fe1224c75d72d5e 100644
--- a/src/nominatim_api/search/icu_tokenizer.py
+++ b/src/nominatim_api/search/icu_tokenizer.py
@@ -50,15 +50,16 @@ PENALTY_IN_TOKEN_BREAK = {
 @dataclasses.dataclass
 class QueryPart:
     """ Normalized and transliterated form of a single term in the query.
+
         When the term came out of a split during the transliteration,
         the normalized string is the full word before transliteration.
-        The word number keeps track of the word before transliteration
-        and can be used to identify partial transliterated terms.
+        Check the subsequent break type to figure out if the word is
+        continued.
+
         Penalty is the break penalty for the break following the token.
     """
     token: str
     normalized: str
-    word_number: int
     penalty: float
 
 
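With word_number gone, code that needs to know whether two adjacent parts came
from the same pre-transliteration word has to consult the break type recorded
after each part instead. A minimal sketch of the idea (hypothetical helper, not
part of this commit; the string 'token' stands in for qmod.BreakType.TOKEN):

    # Regroup transliterated parts into their original words by walking
    # the break types recorded after each part. A TOKEN break is one
    # inserted by the transliteration itself, so the word continues.
    def group_words(parts: list[str], breaks: list[str]) -> list[list[str]]:
        words: list[list[str]] = [[]]
        for part, btype in zip(parts, breaks):
            words[-1].append(part)
            if btype != 'token':   # any other break ends the word
                words.append([])
        return [w for w in words if w]

    # group_words(['北', '京', '飯', '店'], ['token', 'word', 'token', 'word'])
    # -> [['北', '京'], ['飯', '店']]
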
@@ -244,7 +245,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
             standardized form search will work with. All information removed
             at this stage is inevitably lost.
         """
-        return cast(str, self.normalizer.transliterate(text))
+        return cast(str, self.normalizer.transliterate(text)).strip('-: ')
 
     def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
         """ Transliterate the phrases and split them into tokens.
@@ -256,7 +257,6 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
         parts: QueryParts = []
         phrase_start = 0
         words = defaultdict(list)
-        wordnr = 0
         for phrase in query.source:
             query.nodes[-1].ptype = phrase.ptype
             phrase_split = re.split('([ :-])', phrase.text)
@@ -271,12 +271,11 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                 if trans:
                     for term in trans.split(' '):
                         if term:
-                            parts.append(QueryPart(term, word, wordnr,
+                            parts.append(QueryPart(term, word,
                                                    PENALTY_IN_TOKEN_BREAK[qmod.BreakType.TOKEN]))
                             query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
                     query.nodes[-1].btype = qmod.BreakType(breakchar)
                     parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType(breakchar)]
-                wordnr += 1
 
             for word, wrange in yield_words(parts, phrase_start):
                 words[word].append(wrange)
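Without the word counter, split_query only records the term, the full
normalized word and the break penalty for each part. A condensed,
self-contained sketch of the loop's shape (the ICU transliterator is stubbed;
the names here are illustrative, not the module's API):

    import re

    def translit(word: str) -> str:
        # Stub for the ICU transliterator: it may split one word into
        # several space-separated terms, e.g. '北京' -> '北 京'.
        return ' '.join(word)

    def split_words(text: str) -> list[tuple[str, str]]:
        # Return (term, original_word) pairs as QueryPart now stores
        # them; no word number is tracked any more.
        parts = []
        for word in re.split('[ :-]', text):
            for term in translit(word).split(' '):
                if term:
                    parts.append((term, word))
        return parts

    # split_words('北京 飯店')
    # -> [('北', '北京'), ('京', '北京'), ('飯', '飯店'), ('店', '飯店')]
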
@@ -323,7 +322,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
             elif tlist.ttype not in (qmod.TokenType.COUNTRY, qmod.TokenType.PARTIAL):
                 norm = parts[i].normalized
                 for j in range(i + 1, tlist.end):
-                    if parts[j - 1].word_number != parts[j].word_number:
+                    if node.btype != qmod.BreakType.TOKEN:
                         norm += ' ' + parts[j].normalized
                 for token in tlist.tokens:
                     cast(ICUToken, token).rematch(norm)
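On the reranking side, the normalized form of a multi-part token span is now
reassembled from break types: a part following a TOKEN break already carries
the complete word and is skipped, while any other break marks a word boundary.
A simplified sketch of that reassembly (assumed inputs; 'token' again stands in
for qmod.BreakType.TOKEN):

    def join_normalized(normalized: list[str], breaks: list[str]) -> str:
        # breaks[k] is the break type recorded after normalized[k].
        norm = normalized[0]
        for btype, part in zip(breaks, normalized[1:]):
            if btype != 'token':
                # word boundary: append the next word
                norm += ' ' + part
            # TOKEN break: same word, already stored in full, so skip
        return norm

    # join_normalized(['北京', '北京', '飯店'], ['token', 'word'])
    # -> '北京 飯店'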