X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/c314a3092c5b51c7782015f6fa9ac093b46fa174..e4295dba10bdb05045e35c772b3d8ca3cb042fd1:/src/nominatim_api/search/token_assignment.py

diff --git a/src/nominatim_api/search/token_assignment.py b/src/nominatim_api/search/token_assignment.py
index 5ac63d6f..8d25aa8f 100644
--- a/src/nominatim_api/search/token_assignment.py
+++ b/src/nominatim_api/search/token_assignment.py
@@ -14,7 +14,6 @@ import dataclasses
 from ..logging import log
 from . import query as qmod
 
-# pylint: disable=too-many-return-statements,too-many-branches
 
 @dataclasses.dataclass
 class TypedRange:
@@ -25,18 +24,20 @@ class TypedRange:
 
 
 PENALTY_TOKENCHANGE = {
-    qmod.BreakType.START: 0.0,
-    qmod.BreakType.END: 0.0,
-    qmod.BreakType.PHRASE: 0.0,
-    qmod.BreakType.WORD: 0.1,
-    qmod.BreakType.PART: 0.2,
-    qmod.BreakType.TOKEN: 0.4
+    qmod.BREAK_START: 0.0,
+    qmod.BREAK_END: 0.0,
+    qmod.BREAK_PHRASE: 0.0,
+    qmod.BREAK_SOFT_PHRASE: 0.0,
+    qmod.BREAK_WORD: 0.1,
+    qmod.BREAK_PART: 0.2,
+    qmod.BREAK_TOKEN: 0.4
 }
 
 TypedRangeSeq = List[TypedRange]
 
+
 @dataclasses.dataclass
-class TokenAssignment: # pylint: disable=too-many-instance-attributes
+class TokenAssignment:
     """ Representation of a possible assignment of token types
         to the tokens in a tokenized query.
     """
@@ -49,24 +50,23 @@ class TokenAssignment: # pylint: disable=too-many-instance-attributes
     near_item: Optional[qmod.TokenRange] = None
     qualifier: Optional[qmod.TokenRange] = None
 
-
     @staticmethod
     def from_ranges(ranges: TypedRangeSeq) -> 'TokenAssignment':
         """ Create a new token assignment from a sequence of typed spans.
         """
         out = TokenAssignment()
         for token in ranges:
-            if token.ttype == qmod.TokenType.PARTIAL:
+            if token.ttype == qmod.TOKEN_PARTIAL:
                 out.address.append(token.trange)
-            elif token.ttype == qmod.TokenType.HOUSENUMBER:
+            elif token.ttype == qmod.TOKEN_HOUSENUMBER:
                 out.housenumber = token.trange
-            elif token.ttype == qmod.TokenType.POSTCODE:
+            elif token.ttype == qmod.TOKEN_POSTCODE:
                 out.postcode = token.trange
-            elif token.ttype == qmod.TokenType.COUNTRY:
+            elif token.ttype == qmod.TOKEN_COUNTRY:
                 out.country = token.trange
-            elif token.ttype == qmod.TokenType.NEAR_ITEM:
+            elif token.ttype == qmod.TOKEN_NEAR_ITEM:
                 out.near_item = token.trange
-            elif token.ttype == qmod.TokenType.QUALIFIER:
+            elif token.ttype == qmod.TOKEN_QUALIFIER:
                 out.qualifier = token.trange
         return out
 
@@ -83,34 +83,29 @@ class _TokenSequence:
         self.direction = direction
         self.penalty = penalty
 
-
     def __str__(self) -> str:
-        seq = ''.join(f'[{r.trange.start} - {r.trange.end}: {r.ttype.name}]' for r in self.seq)
+        seq = ''.join(f'[{r.trange.start} - {r.trange.end}: {r.ttype}]' for r in self.seq)
         return f'{seq} (dir: {self.direction}, penalty: {self.penalty})'
 
-
     @property
     def end_pos(self) -> int:
         """ Return the index of the global end of the current sequence.
         """
         return self.seq[-1].trange.end if self.seq else 0
 
-
     def has_types(self, *ttypes: qmod.TokenType) -> bool:
         """ Check if the current sequence contains any typed ranges of
             the given types.
         """
         return any(s.ttype in ttypes for s in self.seq)
 
-
     def is_final(self) -> bool:
         """ Return true when the sequence cannot be extended by any
             form of token anymore.
         """
         # Country and category must be the final term for left-to-right
         return len(self.seq) > 1 and \
-               self.seq[-1].ttype in (qmod.TokenType.COUNTRY, qmod.TokenType.NEAR_ITEM)
-
+            self.seq[-1].ttype in (qmod.TOKEN_COUNTRY, qmod.TOKEN_NEAR_ITEM)
 
     def appendable(self, ttype: qmod.TokenType) -> Optional[int]:
         """ Check if the give token type is appendable to the existing sequence.
@@ -119,23 +114,23 @@ class _TokenSequence:
             new direction of the sequence after adding such a type. The
             token is not added.
         """
-        if ttype == qmod.TokenType.WORD:
+        if ttype == qmod.TOKEN_WORD:
             return None
 
         if not self.seq:
             # Append unconditionally to the empty list
-            if ttype == qmod.TokenType.COUNTRY:
+            if ttype == qmod.TOKEN_COUNTRY:
                 return -1
-            if ttype in (qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER):
+            if ttype in (qmod.TOKEN_HOUSENUMBER, qmod.TOKEN_QUALIFIER):
                 return 1
             return self.direction
 
         # Name tokens are always acceptable and don't change direction
-        if ttype == qmod.TokenType.PARTIAL:
+        if ttype == qmod.TOKEN_PARTIAL:
             # qualifiers cannot appear in the middle of the query. They need
             # to be near the next phrase.
             if self.direction == -1 \
-               and any(t.ttype == qmod.TokenType.QUALIFIER for t in self.seq[:-1]):
+               and any(t.ttype == qmod.TOKEN_QUALIFIER for t in self.seq[:-1]):
                 return None
             return self.direction
 
@@ -143,60 +138,59 @@ class _TokenSequence:
         if self.has_types(ttype):
             return None
 
-        if ttype == qmod.TokenType.HOUSENUMBER:
+        if ttype == qmod.TOKEN_HOUSENUMBER:
             if self.direction == 1:
-                if len(self.seq) == 1 and self.seq[0].ttype == qmod.TokenType.QUALIFIER:
+                if len(self.seq) == 1 and self.seq[0].ttype == qmod.TOKEN_QUALIFIER:
                     return None
                 if len(self.seq) > 2 \
-                   or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY):
-                    return None # direction left-to-right: housenumber must come before anything
-            elif self.direction == -1 \
-                 or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY):
-                return -1 # force direction right-to-left if after other terms
+                   or self.has_types(qmod.TOKEN_POSTCODE, qmod.TOKEN_COUNTRY):
+                    return None  # direction left-to-right: housenumber must come before anything
+            elif (self.direction == -1
+                  or self.has_types(qmod.TOKEN_POSTCODE, qmod.TOKEN_COUNTRY)):
+                return -1  # force direction right-to-left if after other terms
 
             return self.direction
 
-        if ttype == qmod.TokenType.POSTCODE:
+        if ttype == qmod.TOKEN_POSTCODE:
             if self.direction == -1:
-                if self.has_types(qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER):
+                if self.has_types(qmod.TOKEN_HOUSENUMBER, qmod.TOKEN_QUALIFIER):
                     return None
                 return -1
             if self.direction == 1:
-                return None if self.has_types(qmod.TokenType.COUNTRY) else 1
-            if self.has_types(qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER):
+                return None if self.has_types(qmod.TOKEN_COUNTRY) else 1
+            if self.has_types(qmod.TOKEN_HOUSENUMBER, qmod.TOKEN_QUALIFIER):
                 return 1
             return self.direction
 
-        if ttype == qmod.TokenType.COUNTRY:
+        if ttype == qmod.TOKEN_COUNTRY:
             return None if self.direction == -1 else 1
 
-        if ttype == qmod.TokenType.NEAR_ITEM:
+        if ttype == qmod.TOKEN_NEAR_ITEM:
             return self.direction
 
-        if ttype == qmod.TokenType.QUALIFIER:
+        if ttype == qmod.TOKEN_QUALIFIER:
             if self.direction == 1:
                 if (len(self.seq) == 1
-                    and self.seq[0].ttype in (qmod.TokenType.PARTIAL, qmod.TokenType.NEAR_ITEM)) \
+                    and self.seq[0].ttype in (qmod.TOKEN_PARTIAL, qmod.TOKEN_NEAR_ITEM)) \
                    or (len(self.seq) == 2
-                       and self.seq[0].ttype == qmod.TokenType.NEAR_ITEM
-                       and self.seq[1].ttype == qmod.TokenType.PARTIAL):
+                       and self.seq[0].ttype == qmod.TOKEN_NEAR_ITEM
+                       and self.seq[1].ttype == qmod.TOKEN_PARTIAL):
                     return 1
                 return None
             if self.direction == -1:
                 return -1
 
-            tempseq = self.seq[1:] if self.seq[0].ttype == qmod.TokenType.NEAR_ITEM else self.seq
+            tempseq = self.seq[1:] if self.seq[0].ttype == qmod.TOKEN_NEAR_ITEM else self.seq
             if len(tempseq) == 0:
                 return 1
-            if len(tempseq) == 1 and self.seq[0].ttype == qmod.TokenType.HOUSENUMBER:
+            if len(tempseq) == 1 and self.seq[0].ttype == qmod.TOKEN_HOUSENUMBER:
                 return None
-            if len(tempseq) > 1 or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY):
+            if len(tempseq) > 1 or self.has_types(qmod.TOKEN_POSTCODE, qmod.TOKEN_COUNTRY):
                 return -1
             return 0
 
         return None
 
-
     def advance(self, ttype: qmod.TokenType, end_pos: int,
                 btype: qmod.BreakType) -> Optional['_TokenSequence']:
         """ Return a new token sequence state with the given token type
@@ -211,7 +205,7 @@ class _TokenSequence:
             new_penalty = 0.0
         else:
             last = self.seq[-1]
-            if btype != qmod.BreakType.PHRASE and last.ttype == ttype:
+            if btype != qmod.BREAK_PHRASE and last.ttype == ttype:
                 # extend the existing range
                 newseq = self.seq[:-1] + [TypedRange(ttype, last.trange.replace_end(end_pos))]
                 new_penalty = 0.0
@@ -223,7 +217,6 @@ class _TokenSequence:
 
         return _TokenSequence(newseq, newdir, self.penalty + new_penalty)
 
-
     def _adapt_penalty_from_priors(self, priors: int, new_dir: int) -> bool:
         if priors >= 2:
             if self.direction == 0:
@@ -236,7 +229,6 @@ class _TokenSequence:
 
         return True
 
-
     def recheck_sequence(self) -> bool:
         """ Check that the sequence is a fully valid token assignment
             and adapt direction and penalties further if necessary.
@@ -248,25 +240,24 @@ class _TokenSequence:
         # housenumbers may not be further than 2 words from the beginning.
         # If there are two words in front, give it a penalty.
         hnrpos = next((i for i, tr in enumerate(self.seq)
-                       if tr.ttype == qmod.TokenType.HOUSENUMBER),
+                       if tr.ttype == qmod.TOKEN_HOUSENUMBER),
                       None)
         if hnrpos is not None:
             if self.direction != -1:
-                priors = sum(1 for t in self.seq[:hnrpos] if t.ttype == qmod.TokenType.PARTIAL)
+                priors = sum(1 for t in self.seq[:hnrpos] if t.ttype == qmod.TOKEN_PARTIAL)
                 if not self._adapt_penalty_from_priors(priors, -1):
                     return False
             if self.direction != 1:
-                priors = sum(1 for t in self.seq[hnrpos+1:] if t.ttype == qmod.TokenType.PARTIAL)
+                priors = sum(1 for t in self.seq[hnrpos+1:] if t.ttype == qmod.TOKEN_PARTIAL)
                 if not self._adapt_penalty_from_priors(priors, 1):
                     return False
-            if any(t.ttype == qmod.TokenType.NEAR_ITEM for t in self.seq):
+            if any(t.ttype == qmod.TOKEN_NEAR_ITEM for t in self.seq):
                 self.penalty += 1.0
 
         return True
 
-
     def _get_assignments_postcode(self, base: TokenAssignment,
-                                  query_len: int)  -> Iterator[TokenAssignment]:
+                                  query_len: int) -> Iterator[TokenAssignment]:
         """ Yield possible assignments of Postcode searches with an
             address component.
         """
@@ -278,13 +269,11 @@ class _TokenSequence:
             # <address>,<postcode> should give preference to address search
             if base.postcode.start == 0:
                 penalty = self.penalty
-                self.direction = -1 # name searches are only possible backwards
             else:
                 penalty = self.penalty + 0.1
-                self.direction = 1 # name searches are only possible forwards
+            penalty += 0.1 * max(0, len(base.address) - 1)
             yield dataclasses.replace(base, penalty=penalty)
 
-
     def _get_assignments_address_forward(self, base: TokenAssignment,
                                          query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
         """ Yield possible assignments of address searches with
@@ -292,6 +281,11 @@ class _TokenSequence:
         """
         first = base.address[0]
 
+        # The postcode must come after the name.
+        if base.postcode and base.postcode < first:
+            log().var_dump('skip forward', (base.postcode, first))
+            return
+
         log().comment('first word = name')
         yield dataclasses.replace(base, penalty=self.penalty,
                                   name=first, address=base.address[1:])
@@ -303,7 +297,7 @@ class _TokenSequence:
         #  * the containing phrase is strictly typed
         if (base.housenumber and first.end < base.housenumber.start)\
            or (base.qualifier and base.qualifier > first)\
-           or (query.nodes[first.start].ptype != qmod.PhraseType.NONE):
+           or (query.nodes[first.start].ptype != qmod.PHRASE_ANY):
             return
 
         penalty = self.penalty
@@ -320,7 +314,6 @@ class _TokenSequence:
             yield dataclasses.replace(base, name=name, address=[addr] + base.address[1:],
                                       penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])
 
-
     def _get_assignments_address_backward(self, base: TokenAssignment,
                                           query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
         """ Yield possible assignments of address searches with
@@ -328,7 +321,12 @@ class _TokenSequence:
         """
         last = base.address[-1]
 
-        if self.direction == -1 or len(base.address) > 1:
+        # The postcode must come before the name for backward direction.
+        if base.postcode and base.postcode > last:
+            log().var_dump('skip backward', (base.postcode, last))
+            return
+
+        if self.direction == -1 or len(base.address) > 1 or base.postcode:
             log().comment('last word = name')
             yield dataclasses.replace(base, penalty=self.penalty,
                                       name=last, address=base.address[:-1])
@@ -340,7 +338,7 @@ class _TokenSequence:
         #  * the containing phrase is strictly typed
         if (base.housenumber and last.start > base.housenumber.end)\
            or (base.qualifier and base.qualifier < last)\
-           or (query.nodes[last.start].ptype != qmod.PhraseType.NONE):
+           or (query.nodes[last.start].ptype != qmod.PHRASE_ANY):
             return
 
         penalty = self.penalty
@@ -355,7 +353,6 @@ class _TokenSequence:
             yield dataclasses.replace(base, name=name, address=base.address[:-1] + [addr],
                                       penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])
 
-
     def get_assignments(self, query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
         """ Yield possible assignments for the current sequence.
 
@@ -405,7 +402,7 @@ def yield_token_assignments(query: qmod.QueryStruct) -> Iterator[TokenAssignment
         another. It does not include penalties for transitions within a
         type.
     """
-    todo = [_TokenSequence([], direction=0 if query.source[0].ptype == qmod.PhraseType.NONE else 1)]
+    todo = [_TokenSequence([], direction=0 if query.source[0].ptype == qmod.PHRASE_ANY else 1)]
 
     while todo:
         state = todo.pop()