Merge pull request #3678 from lonvia/search-tweaks

[nominatim.git] / src / nominatim_api / search / token_assignment.py
diff --git a/src/nominatim_api/search/token_assignment.py b/src/nominatim_api/search/token_assignment.py

index 5ac63d6f844e013cbfc94e50eae7e830cd68f262..8d25aa8f10bdc3c751a2788c89c5710b50a4f8fb 100644 (file)
--- a/src/nominatim_api/search/token_assignment.py
+++ b/src/nominatim_api/search/token_assignment.py
@@ -14,7 +14,6 @@ import dataclasses
  from ..logging import log
  from . import query as qmod
  
-# pylint: disable=too-many-return-statements,too-many-branches
  
  @dataclasses.dataclass
  class TypedRange:
@@ -25,18 +24,20 @@ class TypedRange:
  
  
  PENALTY_TOKENCHANGE = {
-    qmod.BreakType.START: 0.0,
-    qmod.BreakType.END: 0.0,
-    qmod.BreakType.PHRASE: 0.0,
-    qmod.BreakType.WORD: 0.1,
-    qmod.BreakType.PART: 0.2,
-    qmod.BreakType.TOKEN: 0.4
+    qmod.BREAK_START: 0.0,
+    qmod.BREAK_END: 0.0,
+    qmod.BREAK_PHRASE: 0.0,
+    qmod.BREAK_SOFT_PHRASE: 0.0,
+    qmod.BREAK_WORD: 0.1,
+    qmod.BREAK_PART: 0.2,
+    qmod.BREAK_TOKEN: 0.4
  }
  
  TypedRangeSeq = List[TypedRange]
  
+
  @dataclasses.dataclass
-class TokenAssignment: # pylint: disable=too-many-instance-attributes
+class TokenAssignment:
      """ Representation of a possible assignment of token types
          to the tokens in a tokenized query.
      """
@@ -49,24 +50,23 @@ class TokenAssignment: # pylint: disable=too-many-instance-attributes
      near_item: Optional[qmod.TokenRange] = None
      qualifier: Optional[qmod.TokenRange] = None
  
-
      @staticmethod
      def from_ranges(ranges: TypedRangeSeq) -> 'TokenAssignment':
          """ Create a new token assignment from a sequence of typed spans.
          """
          out = TokenAssignment()
          for token in ranges:
-            if token.ttype == qmod.TokenType.PARTIAL:
+            if token.ttype == qmod.TOKEN_PARTIAL:
                  out.address.append(token.trange)
-            elif token.ttype == qmod.TokenType.HOUSENUMBER:
+            elif token.ttype == qmod.TOKEN_HOUSENUMBER:
                  out.housenumber = token.trange
-            elif token.ttype == qmod.TokenType.POSTCODE:
+            elif token.ttype == qmod.TOKEN_POSTCODE:
                  out.postcode = token.trange
-            elif token.ttype == qmod.TokenType.COUNTRY:
+            elif token.ttype == qmod.TOKEN_COUNTRY:
                  out.country = token.trange
-            elif token.ttype == qmod.TokenType.NEAR_ITEM:
+            elif token.ttype == qmod.TOKEN_NEAR_ITEM:
                  out.near_item = token.trange
-            elif token.ttype == qmod.TokenType.QUALIFIER:
+            elif token.ttype == qmod.TOKEN_QUALIFIER:
                  out.qualifier = token.trange
          return out
  
@@ -83,34 +83,29 @@ class _TokenSequence:
          self.direction = direction
          self.penalty = penalty
  
-
      def __str__(self) -> str:
-        seq = ''.join(f'[{r.trange.start} - {r.trange.end}: {r.ttype.name}]' for r in self.seq)
+        seq = ''.join(f'[{r.trange.start} - {r.trange.end}: {r.ttype}]' for r in self.seq)
          return f'{seq} (dir: {self.direction}, penalty: {self.penalty})'
  
-
      @property
      def end_pos(self) -> int:
          """ Return the index of the global end of the current sequence.
          """
          return self.seq[-1].trange.end if self.seq else 0
  
-
      def has_types(self, *ttypes: qmod.TokenType) -> bool:
          """ Check if the current sequence contains any typed ranges of
              the given types.
          """
          return any(s.ttype in ttypes for s in self.seq)
  
-
      def is_final(self) -> bool:
          """ Return true when the sequence cannot be extended by any
              form of token anymore.
          """
          # Country and category must be the final term for left-to-right
          return len(self.seq) > 1 and \
-               self.seq[-1].ttype in (qmod.TokenType.COUNTRY, qmod.TokenType.NEAR_ITEM)
-
+            self.seq[-1].ttype in (qmod.TOKEN_COUNTRY, qmod.TOKEN_NEAR_ITEM)
  
      def appendable(self, ttype: qmod.TokenType) -> Optional[int]:
          """ Check if the given token type is appendable to the existing sequence.
@@ -119,23 +114,23 @@ class _TokenSequence:
              new direction of the sequence after adding such a type. The
              token is not added.
          """
-        if ttype == qmod.TokenType.WORD:
+        if ttype == qmod.TOKEN_WORD:
              return None
  
          if not self.seq:
              # Append unconditionally to the empty list
-            if ttype == qmod.TokenType.COUNTRY:
+            if ttype == qmod.TOKEN_COUNTRY:
                  return -1
-            if ttype in (qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER):
+            if ttype in (qmod.TOKEN_HOUSENUMBER, qmod.TOKEN_QUALIFIER):
                  return 1
              return self.direction
  
          # Name tokens are always acceptable and don't change direction
-        if ttype == qmod.TokenType.PARTIAL:
+        if ttype == qmod.TOKEN_PARTIAL:
              # qualifiers cannot appear in the middle of the query. They need
              # to be near the next phrase.
              if self.direction == -1 \
-               and any(t.ttype == qmod.TokenType.QUALIFIER for t in self.seq[:-1]):
+               and any(t.ttype == qmod.TOKEN_QUALIFIER for t in self.seq[:-1]):
                  return None
              return self.direction
  
@@ -143,60 +138,59 @@ class _TokenSequence:
          if self.has_types(ttype):
              return None
  
-        if ttype == qmod.TokenType.HOUSENUMBER:
+        if ttype == qmod.TOKEN_HOUSENUMBER:
              if self.direction == 1:
-                if len(self.seq) == 1 and self.seq[0].ttype == qmod.TokenType.QUALIFIER:
+                if len(self.seq) == 1 and self.seq[0].ttype == qmod.TOKEN_QUALIFIER:
                      return None
                  if len(self.seq) > 2 \
-                   or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY):
-                    return None # direction left-to-right: housenumber must come before anything
-            elif self.direction == -1 \
-                 or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY):
-                return -1 # force direction right-to-left if after other terms
+                   or self.has_types(qmod.TOKEN_POSTCODE, qmod.TOKEN_COUNTRY):
+                    return None  # direction left-to-right: housenumber must come before anything
+            elif (self.direction == -1
+                  or self.has_types(qmod.TOKEN_POSTCODE, qmod.TOKEN_COUNTRY)):
+                return -1  # force direction right-to-left if after other terms
  
              return self.direction
  
-        if ttype == qmod.TokenType.POSTCODE:
+        if ttype == qmod.TOKEN_POSTCODE:
              if self.direction == -1:
-                if self.has_types(qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER):
+                if self.has_types(qmod.TOKEN_HOUSENUMBER, qmod.TOKEN_QUALIFIER):
                      return None
                  return -1
              if self.direction == 1:
-                return None if self.has_types(qmod.TokenType.COUNTRY) else 1
-            if self.has_types(qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER):
+                return None if self.has_types(qmod.TOKEN_COUNTRY) else 1
+            if self.has_types(qmod.TOKEN_HOUSENUMBER, qmod.TOKEN_QUALIFIER):
                  return 1
              return self.direction
  
-        if ttype == qmod.TokenType.COUNTRY:
+        if ttype == qmod.TOKEN_COUNTRY:
              return None if self.direction == -1 else 1
  
-        if ttype == qmod.TokenType.NEAR_ITEM:
+        if ttype == qmod.TOKEN_NEAR_ITEM:
              return self.direction
  
-        if ttype == qmod.TokenType.QUALIFIER:
+        if ttype == qmod.TOKEN_QUALIFIER:
              if self.direction == 1:
                  if (len(self.seq) == 1
-                    and self.seq[0].ttype in (qmod.TokenType.PARTIAL, qmod.TokenType.NEAR_ITEM)) \
+                    and self.seq[0].ttype in (qmod.TOKEN_PARTIAL, qmod.TOKEN_NEAR_ITEM)) \
                     or (len(self.seq) == 2
-                       and self.seq[0].ttype == qmod.TokenType.NEAR_ITEM
-                       and self.seq[1].ttype == qmod.TokenType.PARTIAL):
+                       and self.seq[0].ttype == qmod.TOKEN_NEAR_ITEM
+                       and self.seq[1].ttype == qmod.TOKEN_PARTIAL):
                      return 1
                  return None
              if self.direction == -1:
                  return -1
  
-            tempseq = self.seq[1:] if self.seq[0].ttype == qmod.TokenType.NEAR_ITEM else self.seq
+            tempseq = self.seq[1:] if self.seq[0].ttype == qmod.TOKEN_NEAR_ITEM else self.seq
              if len(tempseq) == 0:
                  return 1
-            if len(tempseq) == 1 and self.seq[0].ttype == qmod.TokenType.HOUSENUMBER:
+            if len(tempseq) == 1 and self.seq[0].ttype == qmod.TOKEN_HOUSENUMBER:
                  return None
-            if len(tempseq) > 1 or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY):
+            if len(tempseq) > 1 or self.has_types(qmod.TOKEN_POSTCODE, qmod.TOKEN_COUNTRY):
                  return -1
              return 0
  
          return None
  
-
      def advance(self, ttype: qmod.TokenType, end_pos: int,
                  btype: qmod.BreakType) -> Optional['_TokenSequence']:
          """ Return a new token sequence state with the given token type
@@ -211,7 +205,7 @@ class _TokenSequence:
              new_penalty = 0.0
          else:
              last = self.seq[-1]
-            if btype != qmod.BreakType.PHRASE and last.ttype == ttype:
+            if btype != qmod.BREAK_PHRASE and last.ttype == ttype:
                  # extend the existing range
                  newseq = self.seq[:-1] + [TypedRange(ttype, last.trange.replace_end(end_pos))]
                  new_penalty = 0.0
@@ -223,7 +217,6 @@ class _TokenSequence:
  
          return _TokenSequence(newseq, newdir, self.penalty + new_penalty)
  
-
      def _adapt_penalty_from_priors(self, priors: int, new_dir: int) -> bool:
          if priors >= 2:
              if self.direction == 0:
@@ -236,7 +229,6 @@ class _TokenSequence:
  
          return True
  
-
      def recheck_sequence(self) -> bool:
          """ Check that the sequence is a fully valid token assignment
              and adapt direction and penalties further if necessary.
@@ -248,25 +240,24 @@ class _TokenSequence:
          # housenumbers may not be further than 2 words from the beginning.
          # If there are two words in front, give it a penalty.
          hnrpos = next((i for i, tr in enumerate(self.seq)
-                       if tr.ttype == qmod.TokenType.HOUSENUMBER),
+                       if tr.ttype == qmod.TOKEN_HOUSENUMBER),
                        None)
          if hnrpos is not None:
              if self.direction != -1:
-                priors = sum(1 for t in self.seq[:hnrpos] if t.ttype == qmod.TokenType.PARTIAL)
+                priors = sum(1 for t in self.seq[:hnrpos] if t.ttype == qmod.TOKEN_PARTIAL)
                  if not self._adapt_penalty_from_priors(priors, -1):
                      return False
              if self.direction != 1:
-                priors = sum(1 for t in self.seq[hnrpos+1:] if t.ttype == qmod.TokenType.PARTIAL)
+                priors = sum(1 for t in self.seq[hnrpos+1:] if t.ttype == qmod.TOKEN_PARTIAL)
                  if not self._adapt_penalty_from_priors(priors, 1):
                      return False
-            if any(t.ttype == qmod.TokenType.NEAR_ITEM for t in self.seq):
+            if any(t.ttype == qmod.TOKEN_NEAR_ITEM for t in self.seq):
                  self.penalty += 1.0
  
          return True
  
-
      def _get_assignments_postcode(self, base: TokenAssignment,
-                                  query_len: int)  -> Iterator[TokenAssignment]:
+                                  query_len: int) -> Iterator[TokenAssignment]:
          """ Yield possible assignments of Postcode searches with an
              address component.
          """
@@ -278,13 +269,11 @@ class _TokenSequence:
              # <address>,<postcode> should give preference to address search
              if base.postcode.start == 0:
                  penalty = self.penalty
-                self.direction = -1 # name searches are only possible backwards
              else:
                  penalty = self.penalty + 0.1
-                self.direction = 1 # name searches are only possible forwards
+            penalty += 0.1 * max(0, len(base.address) - 1)
              yield dataclasses.replace(base, penalty=penalty)
  
-
      def _get_assignments_address_forward(self, base: TokenAssignment,
                                           query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
          """ Yield possible assignments of address searches with
@@ -292,6 +281,11 @@ class _TokenSequence:
          """
          first = base.address[0]
  
+        # The postcode must come after the name.
+        if base.postcode and base.postcode < first:
+            log().var_dump('skip forward', (base.postcode, first))
+            return
+
          log().comment('first word = name')
          yield dataclasses.replace(base, penalty=self.penalty,
                                    name=first, address=base.address[1:])
@@ -303,7 +297,7 @@ class _TokenSequence:
          #  * the containing phrase is strictly typed
          if (base.housenumber and first.end < base.housenumber.start)\
             or (base.qualifier and base.qualifier > first)\
-           or (query.nodes[first.start].ptype != qmod.PhraseType.NONE):
+           or (query.nodes[first.start].ptype != qmod.PHRASE_ANY):
              return
  
          penalty = self.penalty
@@ -320,7 +314,6 @@ class _TokenSequence:
              yield dataclasses.replace(base, name=name, address=[addr] + base.address[1:],
                                        penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])
  
-
      def _get_assignments_address_backward(self, base: TokenAssignment,
                                            query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
          """ Yield possible assignments of address searches with
@@ -328,7 +321,12 @@ class _TokenSequence:
          """
          last = base.address[-1]
  
-        if self.direction == -1 or len(base.address) > 1:
+        # The postcode must come before the name for backward direction.
+        if base.postcode and base.postcode > last:
+            log().var_dump('skip backward', (base.postcode, last))
+            return
+
+        if self.direction == -1 or len(base.address) > 1 or base.postcode:
              log().comment('last word = name')
              yield dataclasses.replace(base, penalty=self.penalty,
                                        name=last, address=base.address[:-1])
@@ -340,7 +338,7 @@ class _TokenSequence:
          #  * the containing phrase is strictly typed
          if (base.housenumber and last.start > base.housenumber.end)\
             or (base.qualifier and base.qualifier < last)\
-           or (query.nodes[last.start].ptype != qmod.PhraseType.NONE):
+           or (query.nodes[last.start].ptype != qmod.PHRASE_ANY):
              return
  
          penalty = self.penalty
@@ -355,7 +353,6 @@ class _TokenSequence:
              yield dataclasses.replace(base, name=name, address=base.address[:-1] + [addr],
                                        penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])
  
-
      def get_assignments(self, query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
          """ Yield possible assignments for the current sequence.
  
@@ -405,7 +402,7 @@ def yield_token_assignments(query: qmod.QueryStruct) -> Iterator[TokenAssignment
          another. It does not include penalties for transitions within a
          type.
      """
-    todo = [_TokenSequence([], direction=0 if query.source[0].ptype == qmod.PhraseType.NONE else 1)]
+    todo = [_TokenSequence([], direction=0 if query.source[0].ptype == qmod.PHRASE_ANY else 1)]
  
      while todo:
          state = todo.pop()