X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/d984100e23d28253b3659e7b628cde2c8b436cf8..4577669213ea392fa7e25a2fce444f387763f4c8:/src/nominatim_api/search/icu_tokenizer.py

diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py
index 6f1dcf79..35621125 100644
--- a/src/nominatim_api/search/icu_tokenizer.py
+++ b/src/nominatim_api/search/icu_tokenizer.py
@@ -37,28 +37,29 @@ DB_TO_TOKEN_TYPE = {
 }
 
 PENALTY_IN_TOKEN_BREAK = {
-     qmod.BreakType.START: 0.5,
-     qmod.BreakType.END: 0.5,
-     qmod.BreakType.PHRASE: 0.5,
-     qmod.BreakType.SOFT_PHRASE: 0.5,
-     qmod.BreakType.WORD: 0.1,
-     qmod.BreakType.PART: 0.0,
-     qmod.BreakType.TOKEN: 0.0
+     qmod.BREAK_START: 0.5,
+     qmod.BREAK_END: 0.5,
+     qmod.BREAK_PHRASE: 0.5,
+     qmod.BREAK_SOFT_PHRASE: 0.5,
+     qmod.BREAK_WORD: 0.1,
+     qmod.BREAK_PART: 0.0,
+     qmod.BREAK_TOKEN: 0.0
 }
 
 
 @dataclasses.dataclass
 class QueryPart:
     """ Normalized and transliterated form of a single term in the query.
+
         When the term came out of a split during the transliteration,
         the normalized string is the full word before transliteration.
-        The word number keeps track of the word before transliteration
-        and can be used to identify partial transliterated terms.
+        Check the subsequent break type to figure out if the word is
+        continued.
+
         Penalty is the break penalty for the break following the token.
     """
     token: str
     normalized: str
-    word_number: int
     penalty: float
 
 
@@ -66,19 +67,20 @@ QueryParts = List[QueryPart]
 WordDict = Dict[str, List[qmod.TokenRange]]
 
 
-def yield_words(terms: List[QueryPart], start: int) -> Iterator[Tuple[str, qmod.TokenRange]]:
-    """ Return all combinations of words in the terms list after the
-        given position.
+def extract_words(terms: List[QueryPart], start: int,  words: WordDict) -> None:
+    """ Add all combinations of words in the terms list after the
+        given position to the word list.
     """
     total = len(terms)
+    base_penalty = PENALTY_IN_TOKEN_BREAK[qmod.BREAK_WORD]
     for first in range(start, total):
         word = terms[first].token
-        penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType.WORD]
-        yield word, qmod.TokenRange(first, first + 1, penalty=penalty)
+        penalty = base_penalty
+        words[word].append(qmod.TokenRange(first, first + 1, penalty=penalty))
         for last in range(first + 1, min(first + 20, total)):
             word = ' '.join((word, terms[last].token))
             penalty += terms[last - 1].penalty
-            yield word, qmod.TokenRange(first, last + 1, penalty=penalty)
+            words[word].append(qmod.TokenRange(first, last + 1, penalty=penalty))
 
 
 @dataclasses.dataclass
@@ -244,7 +246,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
             standardized form search will work with. All information removed
             at this stage is inevitably lost.
         """
-        return cast(str, self.normalizer.transliterate(text))
+        return cast(str, self.normalizer.transliterate(text)).strip('-: ')
 
     def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
         """ Transliterate the phrases and split them into tokens.
@@ -255,8 +257,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
         """
         parts: QueryParts = []
         phrase_start = 0
-        words = defaultdict(list)
-        wordnr = 0
+        words: WordDict = defaultdict(list)
         for phrase in query.source:
             query.nodes[-1].ptype = phrase.ptype
             phrase_split = re.split('([ :-])', phrase.text)
@@ -271,18 +272,16 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                 if trans:
                     for term in trans.split(' '):
                         if term:
-                            parts.append(QueryPart(term, word, wordnr,
-                                                   PENALTY_IN_TOKEN_BREAK[qmod.BreakType.TOKEN]))
-                            query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
-                    query.nodes[-1].btype = qmod.BreakType(breakchar)
-                    parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType(breakchar)]
-                wordnr += 1
+                            parts.append(QueryPart(term, word,
+                                                   PENALTY_IN_TOKEN_BREAK[qmod.BREAK_TOKEN]))
+                            query.add_node(qmod.BREAK_TOKEN, phrase.ptype)
+                    query.nodes[-1].btype = breakchar
+                    parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[breakchar]
 
-            for word, wrange in yield_words(parts, phrase_start):
-                words[word].append(wrange)
+            extract_words(parts, phrase_start, words)
 
             phrase_start = len(parts)
-        query.nodes[-1].btype = qmod.BreakType.END
+        query.nodes[-1].btype = qmod.BREAK_END
 
         return parts, words
 
@@ -323,16 +322,16 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
             elif tlist.ttype not in (qmod.TokenType.COUNTRY, qmod.TokenType.PARTIAL):
                 norm = parts[i].normalized
                 for j in range(i + 1, tlist.end):
-                    if parts[j - 1].word_number != parts[j].word_number:
+                    if node.btype != qmod.BREAK_TOKEN:
                         norm += '  ' + parts[j].normalized
                 for token in tlist.tokens:
                     cast(ICUToken, token).rematch(norm)
 
 
 def _dump_transliterated(query: qmod.QueryStruct, parts: QueryParts) -> str:
-    out = query.nodes[0].btype.value
+    out = query.nodes[0].btype
     for node, part in zip(query.nodes[1:], parts):
-        out += part.token + node.btype.value
+        out += part.token + node.btype
     return out