# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2023 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of query analysis for the legacy tokenizer.
"""
from typing import Tuple, Dict, List, Optional, Iterator, Any, cast
from copy import copy
from collections import defaultdict
import dataclasses

import sqlalchemy as sa

from nominatim.typing import SaRow
from nominatim.api.connection import SearchConnection
from nominatim.api.logging import log
from nominatim.api.search import query as qmod
from nominatim.api.search.query_analyzer_factory import AbstractQueryAnalyzer

def yield_words(terms: List[str], start: int) -> Iterator[Tuple[str, qmod.TokenRange]]:
    """ Return all combinations of words in the terms list after the
        given position.
    """
    total = len(terms)
    for first in range(start, total):
        word = terms[first]
        yield word, qmod.TokenRange(first, first + 1)
        for last in range(first + 1, min(first + 20, total)):
            word = ' '.join((word, terms[last]))
            yield word, qmod.TokenRange(first, last + 1)

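# Illustrative example (not part of the original module): for
# terms ['rue', 'de', 'la', 'paix'] and start=0, yield_words() produces every
# contiguous word combination together with its token range, e.g.
#   ('rue', TokenRange(0, 1)), ('rue de', TokenRange(0, 2)),
#   ('rue de la', TokenRange(0, 3)), ('rue de la paix', TokenRange(0, 4)),
#   ('de', TokenRange(1, 2)), ...
# with at most 20 consecutive terms per starting word.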

@dataclasses.dataclass
class LegacyToken(qmod.Token):
    """ Specialised token for legacy tokenizer.
    """
    word_token: str
    category: Optional[Tuple[str, str]]
    country: Optional[str]
    operator: Optional[str]

    @property
    def info(self) -> Dict[str, Any]:
        """ Dictionary of additional properties of the token.
            Should only be used for debugging purposes.
        """
        return {'category': self.category,
                'country': self.country,
                'operator': self.operator}


    def get_category(self) -> Tuple[str, str]:
        """ Return the category of the token as a (class, type) tuple.
        """
        assert self.category
        return self.category

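# Illustrative example (not part of the original module): a token built from a
# special-phrase row such as (class='amenity', type='restaurant', operator='in')
# reports
#   token.get_category() == ('amenity', 'restaurant')
#   token.info == {'category': ('amenity', 'restaurant'),
#                  'country': None, 'operator': 'in'}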

class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
    """ Converter for query strings into a tokenized query
        using the tokens created by a legacy tokenizer.
    """

    def __init__(self, conn: SearchConnection) -> None:
        self.conn = conn

    async def setup(self) -> None:
        """ Set up static data structures needed for the analysis.
        """
        self.max_word_freq = int(await self.conn.get_property('tokenizer_maxwordfreq'))
        if 'word' not in self.conn.t.meta.tables:
            sa.Table('word', self.conn.t.meta,
                     sa.Column('word_id', sa.Integer),
                     sa.Column('word_token', sa.Text, nullable=False),
                     sa.Column('word', sa.Text),
                     sa.Column('class', sa.Text),
                     sa.Column('type', sa.Text),
                     sa.Column('country_code', sa.Text),
                     sa.Column('search_name_count', sa.Integer),
                     sa.Column('operator', sa.Text))


    async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
        """ Analyze the given list of phrases and return the
            tokenized query.
        """
        log().section('Analyze query (using Legacy tokenizer)')

        normalized = []
        if phrases:
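            # Normalization happens in the database: a single SELECT evaluates
            # make_standard_name() for every phrase, so the result set consists
            # of one row with one normalized string per phrase.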
            for row in await self.conn.execute(sa.select(*(sa.func.make_standard_name(p.text)
                                                            for p in phrases))):
                normalized = [qmod.Phrase(p.ptype, r) for r, p in zip(row, phrases) if r]
                break

        query = qmod.QueryStruct(normalized)
        log().var_dump('Normalized query', query.source)
        if not query.source:
            return query

        parts, words = self.split_query(query)
        lookup_words = list(words.keys())
        log().var_dump('Split query', parts)
        log().var_dump('Extracted words', lookup_words)

        for row in await self.lookup_in_db(lookup_words):
            for trange in words[row.word_token.strip()]:
                token, ttype = self.make_token(row)
                if ttype == qmod.TokenType.CATEGORY:
                    if trange.start == 0:
                        query.add_token(trange, qmod.TokenType.CATEGORY, token)
                elif ttype == qmod.TokenType.QUALIFIER:
                    query.add_token(trange, qmod.TokenType.QUALIFIER, token)
                    if trange.start == 0 or trange.end == query.num_token_slots():
                        token = copy(token)
                        token.penalty += 0.1 * (query.num_token_slots())
                        query.add_token(trange, qmod.TokenType.CATEGORY, token)
                elif ttype != qmod.TokenType.PARTIAL or trange.start + 1 == trange.end:
                    query.add_token(trange, ttype, token)

        self.add_extra_tokens(query, parts)
        self.rerank_tokens(query)

        log().table_dump('Word tokens', _dump_word_tokens(query))

        return query


    def split_query(self, query: qmod.QueryStruct) -> Tuple[List[str],
                                                            Dict[str, List[qmod.TokenRange]]]:
        """ Split the normalized phrases into tokens.

            Returns a list of terms and a dictionary of words for lookup
            together with their positions.
        """
        parts: List[str] = []
        phrase_start = 0
        words = defaultdict(list)
        for phrase in query.source:
            query.nodes[-1].ptype = phrase.ptype
            for trans in phrase.text.split(' '):
                if trans:
                    for term in trans.split(' '):
                        if term:
                            parts.append(trans)
                            query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
                    query.nodes[-1].btype = qmod.BreakType.WORD
            query.nodes[-1].btype = qmod.BreakType.PHRASE
            for word, wrange in yield_words(parts, phrase_start):
                words[word].append(wrange)
            phrase_start = len(parts)
        query.nodes[-1].btype = qmod.BreakType.END

        return parts, words

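    # Illustrative example (not part of the original module): for a single
    # normalized phrase 'rue de paris', split_query() returns
    #   parts == ['rue', 'de', 'paris']
    # and a words dict mapping e.g. 'rue' -> [TokenRange(0, 1)],
    # 'rue de' -> [TokenRange(0, 2)], 'rue de paris' -> [TokenRange(0, 3)],
    # 'de paris' -> [TokenRange(1, 3)], 'paris' -> [TokenRange(2, 3)], ...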

    async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]':
        """ Return the token information from the database for the
            given word tokens.
        """
        t = self.conn.t.meta.tables['word']

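        # Full words are stored with a leading space in the legacy word table,
        # so each term is looked up both as-is (partial words) and with a
        # prepended space (full words).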
        sql = t.select().where(t.c.word_token.in_(words + [' ' + w for w in words]))

        return await self.conn.execute(sql)


    def make_token(self, row: SaRow) -> Tuple[LegacyToken, qmod.TokenType]:
        """ Create a LegacyToken from a row of the word table.
            Also determine the type of the token.
        """
        penalty = 0.0
        is_indexed = True

        rowclass = getattr(row, 'class')

        if row.country_code is not None:
            ttype = qmod.TokenType.COUNTRY
            lookup_word = row.country_code
        elif rowclass is not None:
            if rowclass == 'place' and row.type == 'house':
                ttype = qmod.TokenType.HOUSENUMBER
                lookup_word = row.word_token[1:]
            elif rowclass == 'place' and row.type == 'postcode':
                ttype = qmod.TokenType.POSTCODE
                lookup_word = row.word_token[1:]
            else:
                ttype = qmod.TokenType.CATEGORY if row.operator in ('in', 'near')\
                        else qmod.TokenType.QUALIFIER
                lookup_word = row.word
        elif row.word_token.startswith(' '):
            ttype = qmod.TokenType.WORD
            lookup_word = row.word or row.word_token[1:]
        else:
            ttype = qmod.TokenType.PARTIAL
            lookup_word = row.word_token
            penalty = 0.21
            if row.search_name_count > self.max_word_freq:
                is_indexed = False

        return LegacyToken(penalty=penalty, token=row.word_id,
                           count=row.search_name_count or 1,
                           lookup_word=lookup_word,
                           word_token=row.word_token.strip(),
                           category=(rowclass, row.type) if rowclass is not None else None,
                           country=row.country_code,
                           operator=row.operator,
                           is_indexed=is_indexed),\
               ttype

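    # Summary of the mapping above (illustration only):
    #   country_code set                    -> TokenType.COUNTRY
    #   class='place', type='house'         -> TokenType.HOUSENUMBER
    #   class='place', type='postcode'      -> TokenType.POSTCODE
    #   other class, operator 'in'/'near'   -> TokenType.CATEGORY, else QUALIFIER
    #   word_token with leading space       -> TokenType.WORD
    #   anything else                       -> TokenType.PARTIAL (penalty 0.21)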

    def add_extra_tokens(self, query: qmod.QueryStruct, parts: List[str]) -> None:
        """ Add tokens to the query that are not saved in the database.
        """
        for part, node, i in zip(parts, query.nodes, range(1000)):
            if len(part) <= 4 and part.isdigit()\
               and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
                query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
                                LegacyToken(penalty=0.5, token=0, count=1,
                                            lookup_word=part, word_token=part,
                                            category=None, country=None,
                                            operator=None, is_indexed=True))

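    # Illustrative example (not part of the original module): a short numeric
    # part like '5' that has no house-number token in the word table receives a
    # synthetic HOUSENUMBER token with a penalty of 0.5.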

    def rerank_tokens(self, query: qmod.QueryStruct) -> None:
        """ Add penalties to tokens that depend on the presence of other tokens.
        """
        for _, node, tlist in query.iter_token_lists():
            if tlist.ttype == qmod.TokenType.POSTCODE:
                for repl in node.starting:
                    if repl.end == tlist.end and repl.ttype != qmod.TokenType.POSTCODE \
                       and (repl.ttype != qmod.TokenType.HOUSENUMBER
                            or len(tlist.tokens[0].lookup_word) > 4):
                        repl.add_penalty(0.39)
            elif tlist.ttype == qmod.TokenType.HOUSENUMBER:
                if any(c.isdigit() for c in tlist.tokens[0].lookup_word):
                    for repl in node.starting:
                        if repl.end == tlist.end and repl.ttype != qmod.TokenType.HOUSENUMBER \
                           and (repl.ttype != qmod.TokenType.HOUSENUMBER
                                or len(tlist.tokens[0].lookup_word) <= 3):
                            repl.add_penalty(0.5 - tlist.tokens[0].penalty)
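        # Illustrative example (not part of the original module): if '60439' can
        # be read both as a postcode and as another full word over the same
        # token range, the non-postcode reading is penalised by 0.39; readings
        # competing with a digit-containing house number are penalised by
        # 0.5 minus the house-number token's own penalty.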


def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
    yield ['type', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
    for node in query.nodes:
        for tlist in node.starting:
            for token in tlist.tokens:
                t = cast(LegacyToken, token)
                yield [tlist.ttype.name, t.token, t.word_token or '',
                       t.lookup_word or '', t.penalty, t.count, t.info]


async def create_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer:
    """ Create and set up a new query analyzer for a database based
        on the legacy tokenizer.
    """
    out = LegacyQueryAnalyzer(conn)
    await out.setup()

    return out
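
# Typical usage sketch (illustration only, not part of the module; assumes an
# open SearchConnection `conn` and that qmod.PhraseType.NONE marks an
# unrestricted phrase):
#
#   analyzer = await create_query_analyzer(conn)
#   query = await analyzer.analyze_query(
#       [qmod.Phrase(qmod.PhraseType.NONE, 'Bahnhofstrasse 5')])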