src/nominatim_api/search/token_assignment.py

   1 # SPDX-License-Identifier: GPL-3.0-or-later
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2024 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Create query interpretations where each vertice in the query is assigned
   9 a specific function (expressed as a token type).
  10 """
  11 from typing import Optional, List, Iterator
  12 import dataclasses
  13
  14 from ..logging import log
  15 from . import query as qmod
  16
  17
  18 @dataclasses.dataclass
  19 class TypedRange:
  20     """ A token range for a specific type of tokens.
  21     """
  22     ttype: qmod.TokenType
  23     trange: qmod.TokenRange
  24
  25
  26 PENALTY_TOKENCHANGE = {
  27     qmod.BREAK_START: 0.0,
  28     qmod.BREAK_END: 0.0,
  29     qmod.BREAK_PHRASE: 0.0,
  30     qmod.BREAK_SOFT_PHRASE: 0.0,
  31     qmod.BREAK_WORD: 0.1,
  32     qmod.BREAK_PART: 0.2,
  33     qmod.BREAK_TOKEN: 0.4
  34 }
  35
  36 TypedRangeSeq = List[TypedRange]
  37
  38
  39 @dataclasses.dataclass
  40 class TokenAssignment:
  41     """ Representation of a possible assignment of token types
  42         to the tokens in a tokenized query.
  43     """
  44     penalty: float = 0.0
  45     name: Optional[qmod.TokenRange] = None
  46     address: List[qmod.TokenRange] = dataclasses.field(default_factory=list)
  47     housenumber: Optional[qmod.TokenRange] = None
  48     postcode: Optional[qmod.TokenRange] = None
  49     country: Optional[qmod.TokenRange] = None
  50     near_item: Optional[qmod.TokenRange] = None
  51     qualifier: Optional[qmod.TokenRange] = None
  52
  53     @staticmethod
  54     def from_ranges(ranges: TypedRangeSeq) -> 'TokenAssignment':
  55         """ Create a new token assignment from a sequence of typed spans.
  56         """
  57         out = TokenAssignment()
  58         for token in ranges:
  59             if token.ttype == qmod.TOKEN_PARTIAL:
  60                 out.address.append(token.trange)
  61             elif token.ttype == qmod.TOKEN_HOUSENUMBER:
  62                 out.housenumber = token.trange
  63             elif token.ttype == qmod.TOKEN_POSTCODE:
  64                 out.postcode = token.trange
  65             elif token.ttype == qmod.TOKEN_COUNTRY:
  66                 out.country = token.trange
  67             elif token.ttype == qmod.TOKEN_NEAR_ITEM:
  68                 out.near_item = token.trange
  69             elif token.ttype == qmod.TOKEN_QUALIFIER:
  70                 out.qualifier = token.trange
  71         return out
  72
  73
  74 class _TokenSequence:
  75     """ Working state used to put together the token assignments.
  76
  77         Represents an intermediate state while traversing the tokenized
  78         query.
  79     """
  80     def __init__(self, seq: TypedRangeSeq,
  81                  direction: int = 0, penalty: float = 0.0) -> None:
  82         self.seq = seq
  83         self.direction = direction
  84         self.penalty = penalty
  85
  86     def __str__(self) -> str:
  87         seq = ''.join(f'[{r.trange.start} - {r.trange.end}: {r.ttype}]' for r in self.seq)
  88         return f'{seq} (dir: {self.direction}, penalty: {self.penalty})'
  89
  90     @property
  91     def end_pos(self) -> int:
  92         """ Return the index of the global end of the current sequence.
  93         """
  94         return self.seq[-1].trange.end if self.seq else 0
  95
  96     def has_types(self, *ttypes: qmod.TokenType) -> bool:
  97         """ Check if the current sequence contains any typed ranges of
  98             the given types.
  99         """
 100         return any(s.ttype in ttypes for s in self.seq)
 101
 102     def is_final(self) -> bool:
 103         """ Return true when the sequence cannot be extended by any
 104             form of token anymore.
 105         """
 106         # Country and category must be the final term for left-to-right
 107         return len(self.seq) > 1 and \
 108             self.seq[-1].ttype in (qmod.TOKEN_COUNTRY, qmod.TOKEN_NEAR_ITEM)
 109
 110     def appendable(self, ttype: qmod.TokenType) -> Optional[int]:
 111         """ Check if the give token type is appendable to the existing sequence.
 112
 113             Returns None if the token type is not appendable, otherwise the
 114             new direction of the sequence after adding such a type. The
 115             token is not added.
 116         """
 117         if ttype == qmod.TOKEN_WORD:
 118             return None
 119
 120         if not self.seq:
 121             # Append unconditionally to the empty list
 122             if ttype == qmod.TOKEN_COUNTRY:
 123                 return -1
 124             if ttype in (qmod.TOKEN_HOUSENUMBER, qmod.TOKEN_QUALIFIER):
 125                 return 1
 126             return self.direction
 127
 128         # Name tokens are always acceptable and don't change direction
 129         if ttype == qmod.TOKEN_PARTIAL:
 130             # qualifiers cannot appear in the middle of the query. They need
 131             # to be near the next phrase.
 132             if self.direction == -1 \
 133                and any(t.ttype == qmod.TOKEN_QUALIFIER for t in self.seq[:-1]):
 134                 return None
 135             return self.direction
 136
 137         # Other tokens may only appear once
 138         if self.has_types(ttype):
 139             return None
 140
 141         if ttype == qmod.TOKEN_HOUSENUMBER:
 142             if self.direction == 1:
 143                 if len(self.seq) == 1 and self.seq[0].ttype == qmod.TOKEN_QUALIFIER:
 144                     return None
 145                 if len(self.seq) > 2 \
 146                    or self.has_types(qmod.TOKEN_POSTCODE, qmod.TOKEN_COUNTRY):
 147                     return None  # direction left-to-right: housenumber must come before anything
 148             elif (self.direction == -1
 149                   or self.has_types(qmod.TOKEN_POSTCODE, qmod.TOKEN_COUNTRY)):
 150                 return -1  # force direction right-to-left if after other terms
 151
 152             return self.direction
 153
 154         if ttype == qmod.TOKEN_POSTCODE:
 155             if self.direction == -1:
 156                 if self.has_types(qmod.TOKEN_HOUSENUMBER, qmod.TOKEN_QUALIFIER):
 157                     return None
 158                 return -1
 159             if self.direction == 1:
 160                 return None if self.has_types(qmod.TOKEN_COUNTRY) else 1
 161             if self.has_types(qmod.TOKEN_HOUSENUMBER, qmod.TOKEN_QUALIFIER):
 162                 return 1
 163             return self.direction
 164
 165         if ttype == qmod.TOKEN_COUNTRY:
 166             return None if self.direction == -1 else 1
 167
 168         if ttype == qmod.TOKEN_NEAR_ITEM:
 169             return self.direction
 170
 171         if ttype == qmod.TOKEN_QUALIFIER:
 172             if self.direction == 1:
 173                 if (len(self.seq) == 1
 174                     and self.seq[0].ttype in (qmod.TOKEN_PARTIAL, qmod.TOKEN_NEAR_ITEM)) \
 175                    or (len(self.seq) == 2
 176                        and self.seq[0].ttype == qmod.TOKEN_NEAR_ITEM
 177                        and self.seq[1].ttype == qmod.TOKEN_PARTIAL):
 178                     return 1
 179                 return None
 180             if self.direction == -1:
 181                 return -1
 182
 183             tempseq = self.seq[1:] if self.seq[0].ttype == qmod.TOKEN_NEAR_ITEM else self.seq
 184             if len(tempseq) == 0:
 185                 return 1
 186             if len(tempseq) == 1 and self.seq[0].ttype == qmod.TOKEN_HOUSENUMBER:
 187                 return None
 188             if len(tempseq) > 1 or self.has_types(qmod.TOKEN_POSTCODE, qmod.TOKEN_COUNTRY):
 189                 return -1
 190             return 0
 191
 192         return None
 193
 194     def advance(self, ttype: qmod.TokenType, end_pos: int,
 195                 btype: qmod.BreakType) -> Optional['_TokenSequence']:
 196         """ Return a new token sequence state with the given token type
 197             extended.
 198         """
 199         newdir = self.appendable(ttype)
 200         if newdir is None:
 201             return None
 202
 203         if not self.seq:
 204             newseq = [TypedRange(ttype, qmod.TokenRange(0, end_pos))]
 205             new_penalty = 0.0
 206         else:
 207             last = self.seq[-1]
 208             if btype != qmod.BREAK_PHRASE and last.ttype == ttype:
 209                 # extend the existing range
 210                 newseq = self.seq[:-1] + [TypedRange(ttype, last.trange.replace_end(end_pos))]
 211                 new_penalty = 0.0
 212             else:
 213                 # start a new range
 214                 newseq = list(self.seq) + [TypedRange(ttype,
 215                                                       qmod.TokenRange(last.trange.end, end_pos))]
 216                 new_penalty = PENALTY_TOKENCHANGE[btype]
 217
 218         return _TokenSequence(newseq, newdir, self.penalty + new_penalty)
 219
 220     def _adapt_penalty_from_priors(self, priors: int, new_dir: int) -> bool:
 221         if priors >= 2:
 222             if self.direction == 0:
 223                 self.direction = new_dir
 224             else:
 225                 if priors == 2:
 226                     self.penalty += 0.8
 227                 else:
 228                     return False
 229
 230         return True
 231
 232     def recheck_sequence(self) -> bool:
 233         """ Check that the sequence is a fully valid token assignment
 234             and adapt direction and penalties further if necessary.
 235
 236             This function catches some impossible assignments that need
 237             forward context and can therefore not be excluded when building
 238             the assignment.
 239         """
 240         # housenumbers may not be further than 2 words from the beginning.
 241         # If there are two words in front, give it a penalty.
 242         hnrpos = next((i for i, tr in enumerate(self.seq)
 243                        if tr.ttype == qmod.TOKEN_HOUSENUMBER),
 244                       None)
 245         if hnrpos is not None:
 246             if self.direction != -1:
 247                 priors = sum(1 for t in self.seq[:hnrpos] if t.ttype == qmod.TOKEN_PARTIAL)
 248                 if not self._adapt_penalty_from_priors(priors, -1):
 249                     return False
 250             if self.direction != 1:
 251                 priors = sum(1 for t in self.seq[hnrpos+1:] if t.ttype == qmod.TOKEN_PARTIAL)
 252                 if not self._adapt_penalty_from_priors(priors, 1):
 253                     return False
 254             if any(t.ttype == qmod.TOKEN_NEAR_ITEM for t in self.seq):
 255                 self.penalty += 1.0
 256
 257         return True
 258
 259     def _get_assignments_postcode(self, base: TokenAssignment,
 260                                   query_len: int) -> Iterator[TokenAssignment]:
 261         """ Yield possible assignments of Postcode searches with an
 262             address component.
 263         """
 264         assert base.postcode is not None
 265
 266         if (base.postcode.start == 0 and self.direction != -1)\
 267            or (base.postcode.end == query_len and self.direction != 1):
 268             log().comment('postcode search')
 269             # <address>,<postcode> should give preference to address search
 270             if base.postcode.start == 0:
 271                 penalty = self.penalty
 272             else:
 273                 penalty = self.penalty + 0.1
 274             penalty += 0.1 * max(0, len(base.address) - 1)
 275             yield dataclasses.replace(base, penalty=penalty)
 276
 277     def _get_assignments_address_forward(self, base: TokenAssignment,
 278                                          query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
 279         """ Yield possible assignments of address searches with
 280             left-to-right reading.
 281         """
 282         first = base.address[0]
 283
 284         # The postcode must come after the name.
 285         if base.postcode and base.postcode < first:
 286             log().var_dump('skip forward', (base.postcode, first))
 287             return
 288
 289         log().comment('first word = name')
 290         yield dataclasses.replace(base, penalty=self.penalty,
 291                                   name=first, address=base.address[1:])
 292
 293         # To paraphrase:
 294         #  * if another name term comes after the first one and before the
 295         #    housenumber
 296         #  * a qualifier comes after the name
 297         #  * the containing phrase is strictly typed
 298         if (base.housenumber and first.end < base.housenumber.start)\
 299            or (base.qualifier and base.qualifier > first)\
 300            or (query.nodes[first.start].ptype != qmod.PHRASE_ANY):
 301             return
 302
 303         penalty = self.penalty
 304
 305         # Penalty for:
 306         #  * <name>, <street>, <housenumber> , ...
 307         #  * queries that are comma-separated
 308         if (base.housenumber and base.housenumber > first) or len(query.source) > 1:
 309             penalty += 0.25
 310
 311         for i in range(first.start + 1, first.end):
 312             name, addr = first.split(i)
 313             log().comment(f'split first word = name ({i - first.start})')
 314             yield dataclasses.replace(base, name=name, address=[addr] + base.address[1:],
 315                                       penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])
 316
 317     def _get_assignments_address_backward(self, base: TokenAssignment,
 318                                           query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
 319         """ Yield possible assignments of address searches with
 320             right-to-left reading.
 321         """
 322         last = base.address[-1]
 323
 324         # The postcode must come before the name for backward direction.
 325         if base.postcode and base.postcode > last:
 326             log().var_dump('skip backward', (base.postcode, last))
 327             return
 328
 329         if self.direction == -1 or len(base.address) > 1 or base.postcode:
 330             log().comment('last word = name')
 331             yield dataclasses.replace(base, penalty=self.penalty,
 332                                       name=last, address=base.address[:-1])
 333
 334         # To paraphrase:
 335         #  * if another name term comes before the last one and after the
 336         #    housenumber
 337         #  * a qualifier comes before the name
 338         #  * the containing phrase is strictly typed
 339         if (base.housenumber and last.start > base.housenumber.end)\
 340            or (base.qualifier and base.qualifier < last)\
 341            or (query.nodes[last.start].ptype != qmod.PHRASE_ANY):
 342             return
 343
 344         penalty = self.penalty
 345         if base.housenumber and base.housenumber < last:
 346             penalty += 0.4
 347         if len(query.source) > 1:
 348             penalty += 0.25
 349
 350         for i in range(last.start + 1, last.end):
 351             addr, name = last.split(i)
 352             log().comment(f'split last word = name ({i - last.start})')
 353             yield dataclasses.replace(base, name=name, address=base.address[:-1] + [addr],
 354                                       penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])
 355
 356     def get_assignments(self, query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
 357         """ Yield possible assignments for the current sequence.
 358
 359             This function splits up general name assignments into name
 360             and address and yields all possible variants of that.
 361         """
 362         base = TokenAssignment.from_ranges(self.seq)
 363
 364         num_addr_tokens = sum(t.end - t.start for t in base.address)
 365         if num_addr_tokens > 50:
 366             return
 367
 368         # Postcode search (postcode-only search is covered in next case)
 369         if base.postcode is not None and base.address:
 370             yield from self._get_assignments_postcode(base, query.num_token_slots())
 371
 372         # Postcode or country-only search
 373         if not base.address:
 374             if not base.housenumber and (base.postcode or base.country or base.near_item):
 375                 log().comment('postcode/country search')
 376                 yield dataclasses.replace(base, penalty=self.penalty)
 377         else:
 378             # <postcode>,<address> should give preference to postcode search
 379             if base.postcode and base.postcode.start == 0:
 380                 self.penalty += 0.1
 381
 382             # Right-to-left reading of the address
 383             if self.direction != -1:
 384                 yield from self._get_assignments_address_forward(base, query)
 385
 386             # Left-to-right reading of the address
 387             if self.direction != 1:
 388                 yield from self._get_assignments_address_backward(base, query)
 389
 390             # variant for special housenumber searches
 391             if base.housenumber and not base.qualifier:
 392                 yield dataclasses.replace(base, penalty=self.penalty)
 393
 394
 395 def yield_token_assignments(query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
 396     """ Return possible word type assignments to word positions.
 397
 398         The assignments are computed from the concrete tokens listed
 399         in the tokenized query.
 400
 401         The result includes the penalty for transitions from one word type to
 402         another. It does not include penalties for transitions within a
 403         type.
 404     """
 405     todo = [_TokenSequence([], direction=0 if query.source[0].ptype == qmod.PHRASE_ANY else 1)]
 406
 407     while todo:
 408         state = todo.pop()
 409         node = query.nodes[state.end_pos]
 410
 411         for tlist in node.starting:
 412             newstate = state.advance(tlist.ttype, tlist.end, node.btype)
 413             if newstate is not None:
 414                 if newstate.end_pos == query.num_token_slots():
 415                     if newstate.recheck_sequence():
 416                         log().var_dump('Assignment', newstate)
 417                         yield from newstate.get_assignments(query)
 418                 elif not newstate.is_final():
 419                     todo.append(newstate)