From d8240f9ee475daef1412f0e2cdf36efeba6666f3 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Mon, 22 May 2023 11:07:14 +0200
Subject: [PATCH] add query analyser for legacy tokenizer

---
 nominatim/api/results.py                      |   9 +-
 nominatim/api/search/legacy_tokenizer.py      | 263 ++++++++++++++++++
 nominatim/db/sqlalchemy_schema.py             |   7 -
 .../api/search/test_legacy_query_analyzer.py  | 245 ++++++++++++++++
 4 files changed, 514 insertions(+), 10 deletions(-)
 create mode 100644 nominatim/api/search/legacy_tokenizer.py
 create mode 100644 test/python/api/search/test_legacy_query_analyzer.py

diff --git a/nominatim/api/results.py b/nominatim/api/results.py
index 98b13380..56243e8d 100644
--- a/nominatim/api/results.py
+++ b/nominatim/api/results.py
@@ -23,6 +23,7 @@ from nominatim.api.types import Point, Bbox, LookupDetails
 from nominatim.api.connection import SearchConnection
 from nominatim.api.logging import log
 from nominatim.api.localization import Locales
+from nominatim.api.search.query_analyzer_factory import make_query_analyzer
 
 # This file defines complex result data classes.
 # pylint: disable=too-many-instance-attributes
@@ -420,10 +421,12 @@ async def complete_keywords(conn: SearchConnection, result: BaseResult) -> None:
     result.name_keywords = []
     result.address_keywords = []
 
-    for name_tokens, address_tokens in await conn.execute(sql):
-        t = conn.t.word
-        sel = sa.select(t.c.word_id, t.c.word_token, t.c.word)
+    await make_query_analyzer(conn)
+    t = conn.t.meta.tables['word']
+    sel = sa.select(t.c.word_id, t.c.word_token, t.c.word)
 
+    for name_tokens, address_tokens in await conn.execute(sql):
         for row in await conn.execute(sel.where(t.c.word_id == sa.any_(name_tokens))):
             result.name_keywords.append(WordInfo(*row))
 
diff --git a/nominatim/api/search/legacy_tokenizer.py b/nominatim/api/search/legacy_tokenizer.py
new file mode 100644
index 00000000..96975704
--- /dev/null
+++ b/nominatim/api/search/legacy_tokenizer.py
@@ -0,0 +1,263 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2023 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Implementation of query analysis for the legacy tokenizer.
+"""
+from typing import Tuple, Dict, List, Optional, Iterator, Any, cast
+from copy import copy
+from collections import defaultdict
+import dataclasses
+
+import sqlalchemy as sa
+
+from nominatim.typing import SaRow
+from nominatim.api.connection import SearchConnection
+from nominatim.api.logging import log
+from nominatim.api.search import query as qmod
+from nominatim.api.search.query_analyzer_factory import AbstractQueryAnalyzer
+
+
+def yield_words(terms: List[str], start: int) -> Iterator[Tuple[str, qmod.TokenRange]]:
+    """ Return all combinations of words in the terms list after the
+        given position.
+    """
+    total = len(terms)
+    for first in range(start, total):
+        word = terms[first]
+        yield word, qmod.TokenRange(first, first + 1)
+        for last in range(first + 1, min(first + 20, total)):
+            word = ' '.join((word, terms[last]))
+            yield word, qmod.TokenRange(first, last + 1)
+
+
+@dataclasses.dataclass
+class LegacyToken(qmod.Token):
+    """ Specialised token for legacy tokenizer.
+    """
+    word_token: str
+    category: Optional[Tuple[str, str]]
+    country: Optional[str]
+    operator: Optional[str]
+
+    @property
+    def info(self) -> Dict[str, Any]:
+        """ Dictionary of additional properties of the token.
+            Should only be used for debugging purposes.
+        """
+        return {'category': self.category,
+                'country': self.country,
+                'operator': self.operator}
+
+
+    def get_category(self) -> Tuple[str, str]:
+        assert self.category
+        return self.category
+
+
+class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
+    """ Converter for query strings into a tokenized query
+        using the tokens created by a legacy tokenizer.
+    """
+
+    def __init__(self, conn: SearchConnection) -> None:
+        self.conn = conn
+
+    async def setup(self) -> None:
+        """ Set up static data structures needed for the analysis.
+        """
+        self.max_word_freq = int(await self.conn.get_property('tokenizer_maxwordfreq'))
+        if 'word' not in self.conn.t.meta.tables:
+            sa.Table('word', self.conn.t.meta,
+                     sa.Column('word_id', sa.Integer),
+                     sa.Column('word_token', sa.Text, nullable=False),
+                     sa.Column('word', sa.Text),
+                     sa.Column('class', sa.Text),
+                     sa.Column('type', sa.Text),
+                     sa.Column('country_code', sa.Text),
+                     sa.Column('search_name_count', sa.Integer),
+                     sa.Column('operator', sa.Text))
+
+
+    async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
+        """ Analyze the given list of phrases and return the
+            tokenized query.
+        """
+        log().section('Analyze query (using Legacy tokenizer)')
+
+        normalized = []
+        if phrases:
+            for row in await self.conn.execute(sa.select(*(sa.func.make_standard_name(p.text)
+                                                            for p in phrases))):
+                normalized = [qmod.Phrase(p.ptype, r) for r, p in zip(row, phrases) if r]
+                break
+
+        query = qmod.QueryStruct(normalized)
+        log().var_dump('Normalized query', query.source)
+        if not query.source:
+            return query
+
+        parts, words = self.split_query(query)
+        lookup_words = list(words.keys())
+        log().var_dump('Split query', parts)
+        log().var_dump('Extracted words', lookup_words)
+
+        for row in await self.lookup_in_db(lookup_words):
+            for trange in words[row.word_token.strip()]:
+                token, ttype = self.make_token(row)
+                if ttype == qmod.TokenType.CATEGORY:
+                    if trange.start == 0:
+                        query.add_token(trange, qmod.TokenType.CATEGORY, token)
+                elif ttype == qmod.TokenType.QUALIFIER:
+                    query.add_token(trange, qmod.TokenType.QUALIFIER, token)
+                    if trange.start == 0 or trange.end == query.num_token_slots():
+                        token = copy(token)
+                        token.penalty += 0.1 * (query.num_token_slots())
+                        query.add_token(trange, qmod.TokenType.CATEGORY, token)
+                elif ttype != qmod.TokenType.PARTIAL or trange.start + 1 == trange.end:
+                    query.add_token(trange, ttype, token)
+
+        self.add_extra_tokens(query, parts)
+        self.rerank_tokens(query)
+
+        log().table_dump('Word tokens', _dump_word_tokens(query))
+
+        return query
+
+
+    def split_query(self, query: qmod.QueryStruct) -> Tuple[List[str],
+                                                             Dict[str, List[qmod.TokenRange]]]:
+        """ Transliterate the phrases and split them into tokens.
+
+            Returns a list of transliterated tokens and a dictionary
+            of words for lookup together with their position.
+        """
+        parts: List[str] = []
+        phrase_start = 0
+        words = defaultdict(list)
+        for phrase in query.source:
+            query.nodes[-1].ptype = phrase.ptype
+            for trans in phrase.text.split(' '):
+                if trans:
+                    for term in trans.split(' '):
+                        if term:
+                            parts.append(trans)
+                            query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
+                    query.nodes[-1].btype = qmod.BreakType.WORD
+            query.nodes[-1].btype = qmod.BreakType.PHRASE
+            for word, wrange in yield_words(parts, phrase_start):
+                words[word].append(wrange)
+            phrase_start = len(parts)
+        query.nodes[-1].btype = qmod.BreakType.END
+
+        return parts, words
+
+
+    async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]':
+        """ Return the token information from the database for the
+            given word tokens.
+        """
+        t = self.conn.t.meta.tables['word']
+
+        sql = t.select().where(t.c.word_token.in_(words + [' ' + w for w in words]))
+
+        return await self.conn.execute(sql)
+
+
+    def make_token(self, row: SaRow) -> Tuple[LegacyToken, qmod.TokenType]:
+        """ Create a LegacyToken from the row of the word table.
+            Also determines the type of token.
+        """
+        penalty = 0.0
+        is_indexed = True
+
+        rowclass = getattr(row, 'class')
+
+        if row.country_code is not None:
+            ttype = qmod.TokenType.COUNTRY
+            lookup_word = row.country_code
+        elif rowclass is not None:
+            if rowclass == 'place' and row.type == 'house':
+                ttype = qmod.TokenType.HOUSENUMBER
+                lookup_word = row.word_token[1:]
+            elif rowclass == 'place' and row.type == 'postcode':
+                ttype = qmod.TokenType.POSTCODE
+                lookup_word = row.word_token[1:]
+            else:
+                ttype = qmod.TokenType.CATEGORY if row.operator in ('in', 'near')\
+                        else qmod.TokenType.QUALIFIER
+                lookup_word = row.word
+        elif row.word_token.startswith(' '):
+            ttype = qmod.TokenType.WORD
+            lookup_word = row.word or row.word_token[1:]
+        else:
+            ttype = qmod.TokenType.PARTIAL
+            lookup_word = row.word_token
+            penalty = 0.21
+            if row.search_name_count > self.max_word_freq:
+                is_indexed = False
+
+        return LegacyToken(penalty=penalty, token=row.word_id,
+                           count=row.search_name_count or 1,
+                           lookup_word=lookup_word,
+                           word_token=row.word_token.strip(),
+                           category=(rowclass, row.type) if rowclass is not None else None,
+                           country=row.country_code,
+                           operator=row.operator,
+                           is_indexed=is_indexed),\
+               ttype
+
+
+    def add_extra_tokens(self, query: qmod.QueryStruct, parts: List[str]) -> None:
+        """ Add tokens to query that are not saved in the database.
+        """
+        for part, node, i in zip(parts, query.nodes, range(1000)):
+            if len(part) <= 4 and part.isdigit()\
+               and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
+                query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
+                                LegacyToken(penalty=0.5, token=0, count=1,
+                                            lookup_word=part, word_token=part,
+                                            category=None, country=None,
+                                            operator=None, is_indexed=True))
+
+
+    def rerank_tokens(self, query: qmod.QueryStruct) -> None:
+        """ Add penalties to tokens that depend on presence of other token.
+        """
+        for _, node, tlist in query.iter_token_lists():
+            if tlist.ttype == qmod.TokenType.POSTCODE:
+                for repl in node.starting:
+                    if repl.end == tlist.end and repl.ttype != qmod.TokenType.POSTCODE \
+                       and (repl.ttype != qmod.TokenType.HOUSENUMBER
+                            or len(tlist.tokens[0].lookup_word) > 4):
+                        repl.add_penalty(0.39)
+            elif tlist.ttype == qmod.TokenType.HOUSENUMBER:
+                if any(c.isdigit() for c in tlist.tokens[0].lookup_word):
+                    for repl in node.starting:
+                        if repl.end == tlist.end and repl.ttype != qmod.TokenType.HOUSENUMBER \
+                           and (repl.ttype != qmod.TokenType.HOUSENUMBER
+                                or len(tlist.tokens[0].lookup_word) <= 3):
+                            repl.add_penalty(0.5 - tlist.tokens[0].penalty)
+
+
+
+def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
+    yield ['type', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
+    for node in query.nodes:
+        for tlist in node.starting:
+            for token in tlist.tokens:
+                t = cast(LegacyToken, token)
+                yield [tlist.ttype.name, t.token, t.word_token or '',
+                       t.lookup_word or '', t.penalty, t.count, t.info]
+
+
+async def create_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer:
+    """ Create and set up a new query analyzer for a database based
+        on the legacy tokenizer.
+    """
+    out = LegacyQueryAnalyzer(conn)
+    await out.setup()
+
+    return out
diff --git a/nominatim/db/sqlalchemy_schema.py b/nominatim/db/sqlalchemy_schema.py
index 26bbefcf..550f1f12 100644
--- a/nominatim/db/sqlalchemy_schema.py
+++ b/nominatim/db/sqlalchemy_schema.py
@@ -113,13 +113,6 @@ class SearchTables:
                                  sa.Column('postcode', sa.Text),
                                  sa.Column('country_code', sa.String(2)))
 
-        self.word = sa.Table('word', meta,
-                             sa.Column('word_id', sa.Integer),
-                             sa.Column('word_token', sa.Text, nullable=False),
-                             sa.Column('type', sa.Text, nullable=False),
-                             sa.Column('word', sa.Text),
-                             sa.Column('info', self.types.Json))
-
         self.country_name = sa.Table('country_name', meta,
                                      sa.Column('country_code', sa.String(2)),
                                      sa.Column('name', self.types.Composite),
diff --git a/test/python/api/search/test_legacy_query_analyzer.py b/test/python/api/search/test_legacy_query_analyzer.py
new file mode 100644
index 00000000..c2115853
--- /dev/null
+++ b/test/python/api/search/test_legacy_query_analyzer.py
@@ -0,0 +1,245 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2023 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for query analyzer for legacy tokenizer.
+"""
+from pathlib import Path
+
+import pytest
+import pytest_asyncio
+
+from nominatim.api import NominatimAPIAsync
+from nominatim.api.search.query import Phrase, PhraseType, TokenType, BreakType
+import nominatim.api.search.legacy_tokenizer as tok
+from nominatim.api.logging import set_log_output, get_and_disable
+
+
+async def add_word(conn, word_id, word_token, word, count):
+    t = conn.t.meta.tables['word']
+    await conn.execute(t.insert(), {'word_id': word_id,
+                                    'word_token': word_token,
+                                    'search_name_count': count,
+                                    'word': word})
+
+
+async def add_housenumber(conn, word_id, hnr):
+    t = conn.t.meta.tables['word']
+    await conn.execute(t.insert(), {'word_id': word_id,
+                                    'word_token': ' ' + hnr,
+                                    'word': hnr,
+                                    'class': 'place',
+                                    'type': 'house'})
+
+
+async def add_postcode(conn, word_id, postcode):
+    t = conn.t.meta.tables['word']
+    await conn.execute(t.insert(), {'word_id': word_id,
+                                    'word_token': ' ' + postcode,
+                                    'word': postcode,
+                                    'class': 'place',
+                                    'type': 'postcode'})
+
+
+async def add_special_term(conn, word_id, word_token, cls, typ, op):
+    t = conn.t.meta.tables['word']
+    await conn.execute(t.insert(), {'word_id': word_id,
+                                    'word_token': word_token,
+                                    'word': word_token,
+                                    'class': cls,
+                                    'type': typ,
+                                    'operator': op})
+
+
+def make_phrase(query):
+    return [Phrase(PhraseType.NONE, s) for s in query.split(',')]
+
+
+@pytest_asyncio.fixture
+async def conn(table_factory, temp_db_cursor):
+    """ Create an asynchronous SQLAlchemy engine for the test DB.
+    """
+    table_factory('nominatim_properties',
+                  definition='property TEXT, value TEXT',
+                  content=(('tokenizer_maxwordfreq', '10000'), ))
+    table_factory('word',
+                  definition="""word_id INT, word_token TEXT, word TEXT,
+                                class TEXT, type TEXT, country_code TEXT,
+                                search_name_count INT, operator TEXT
+                             """)
+
+    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION make_standard_name(name TEXT)
+                              RETURNS TEXT AS $$ SELECT lower(name); $$ LANGUAGE SQL;""")
+
+    api = NominatimAPIAsync(Path('/invalid'), {})
+    async with api.begin() as conn:
+        yield conn
+    await api.close()
+
+
+@pytest.mark.asyncio
+async def test_empty_phrase(conn):
+    ana = await tok.create_query_analyzer(conn)
+
+    query = await ana.analyze_query([])
+
+    assert len(query.source) == 0
+    assert query.num_token_slots() == 0
+
+
+@pytest.mark.asyncio
+async def test_single_phrase_with_unknown_terms(conn):
+    ana = await tok.create_query_analyzer(conn)
+
+    await add_word(conn, 1, 'foo', 'FOO', 3)
+
+    query = await ana.analyze_query(make_phrase('foo BAR'))
+
+    assert len(query.source) == 1
+    assert query.source[0].ptype == PhraseType.NONE
+    assert query.source[0].text == 'foo bar'
+
+    assert query.num_token_slots() == 2
+    assert len(query.nodes[0].starting) == 1
+    assert not query.nodes[1].starting
+
+
+@pytest.mark.asyncio
+async def test_multiple_phrases(conn):
+    ana = await tok.create_query_analyzer(conn)
+
+    await add_word(conn, 1, 'one', 'one', 13)
+    await add_word(conn, 2, 'two', 'two', 45)
+    await add_word(conn, 100, 'one two', 'one two', 3)
+    await add_word(conn, 3, 'three', 'three', 4584)
+
+    query = await ana.analyze_query(make_phrase('one two,three'))
+
+    assert len(query.source) == 2
+
+
+@pytest.mark.asyncio
+async def test_housenumber_token(conn):
+    ana = await tok.create_query_analyzer(conn)
+
+    await add_housenumber(conn, 556, '45 a')
+
+    query = await ana.analyze_query(make_phrase('45 A'))
+
+    assert query.num_token_slots() == 2
+    assert len(query.nodes[0].starting) == 2
+
+    query.nodes[0].starting.sort(key=lambda tl: tl.end)
+
+    hn1 = query.nodes[0].starting[0]
+    assert hn1.ttype == TokenType.HOUSENUMBER
+    assert hn1.end == 1
+    assert hn1.tokens[0].token == 0
+
+    hn2 = query.nodes[0].starting[1]
+    assert hn2.ttype == TokenType.HOUSENUMBER
+    assert hn2.end == 2
+    assert hn2.tokens[0].token == 556
+
+
+@pytest.mark.asyncio
+async def test_postcode_token(conn):
+    ana = await tok.create_query_analyzer(conn)
+
+    await add_postcode(conn, 34, '45ax')
+
+    query = await ana.analyze_query(make_phrase('45AX'))
+
+    assert query.num_token_slots() == 1
+    assert [tl.ttype for tl in query.nodes[0].starting] == [TokenType.POSTCODE]
+
+
+@pytest.mark.asyncio
+async def test_partial_tokens(conn):
+    ana = await tok.create_query_analyzer(conn)
+
+    await add_word(conn, 1, ' foo', 'foo', 99)
+    await add_word(conn, 1, 'foo', 'FOO', 99)
+    await add_word(conn, 1, 'bar', 'FOO', 990000)
+
+    query = await ana.analyze_query(make_phrase('foo bar'))
+
+    assert query.num_token_slots() == 2
+
+    first = query.nodes[0].starting
+    first.sort(key=lambda tl: tl.tokens[0].penalty)
+    assert [tl.ttype for tl in first] == [TokenType.WORD, TokenType.PARTIAL]
+    assert all(tl.tokens[0].lookup_word == 'foo' for tl in first)
+
+    second = query.nodes[1].starting
+    assert [tl.ttype for tl in second] == [TokenType.PARTIAL]
+    assert not second[0].tokens[0].is_indexed
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize('term,order', [('23456', ['POSTCODE', 'HOUSENUMBER', 'WORD', 'PARTIAL']),
+                                        ('3', ['HOUSENUMBER', 'POSTCODE', 'WORD', 'PARTIAL'])
+                                       ])
+async def test_penalty_postcodes_and_housenumbers(conn, term, order):
+    ana = await tok.create_query_analyzer(conn)
+
+    await add_postcode(conn, 1, term)
+    await add_housenumber(conn, 2, term)
+    await add_word(conn, 3, term, term, 5)
+    await add_word(conn, 4, ' ' + term, term, 1)
+
+    query = await ana.analyze_query(make_phrase(term))
+
+    assert query.num_token_slots() == 1
+
+    torder = [(tl.tokens[0].penalty, tl.ttype) for tl in query.nodes[0].starting]
+    print(query.nodes[0].starting)
+    torder.sort()
+
+    assert [t[1] for t in torder] == [TokenType[o] for o in order]
+
+
+@pytest.mark.asyncio
+async def test_category_words_only_at_beginning(conn):
+    ana = await tok.create_query_analyzer(conn)
+
+    await add_special_term(conn, 1, 'foo', 'amenity', 'restaurant', 'in')
+    await add_word(conn, 2, ' bar', 'BAR', 1)
+
+    query = await ana.analyze_query(make_phrase('foo BAR foo'))
+
+    assert query.num_token_slots() == 3
+    assert len(query.nodes[0].starting) == 1
+    assert query.nodes[0].starting[0].ttype == TokenType.CATEGORY
+    assert not query.nodes[2].starting
+
+
+@pytest.mark.asyncio
+async def test_qualifier_words(conn):
+    ana = await tok.create_query_analyzer(conn)
+
+    await add_special_term(conn, 1, 'foo', 'amenity', 'restaurant', '-')
+    await add_word(conn, 2, ' bar', 'w', None)
+
+    query = await ana.analyze_query(make_phrase('foo BAR foo BAR foo'))
+
+    assert query.num_token_slots() == 5
+    assert set(t.ttype for t in query.nodes[0].starting) == {TokenType.CATEGORY, TokenType.QUALIFIER}
+    assert set(t.ttype for t in query.nodes[2].starting) == {TokenType.QUALIFIER}
+    assert set(t.ttype for t in query.nodes[4].starting) == {TokenType.CATEGORY, TokenType.QUALIFIER}
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize('logtype', ['text', 'html'])
+async def test_log_output(conn, logtype):
+    ana = await tok.create_query_analyzer(conn)
+
+    await add_word(conn, 1, 'foo', 'FOO', 99)
+
+    set_log_output(logtype)
+    await ana.analyze_query(make_phrase('foo'))
+
+    assert get_and_disable()
-- 
2.39.5
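
Reviewer note: a minimal sketch of how the new analyzer is driven, following the pattern of the
tests above. The open SearchConnection `conn`, the `tokenize()` helper name and the comma-based
phrase splitting are assumptions taken from the test helpers, not part of a fixed API.

    # sketch: run the legacy query analysis on a free-form search string
    import nominatim.api.search.legacy_tokenizer as tok
    from nominatim.api.search.query import Phrase, PhraseType

    async def tokenize(conn, text):
        # create_query_analyzer() reads 'tokenizer_maxwordfreq' and registers
        # the legacy 'word' table on the connection before any analysis runs
        ana = await tok.create_query_analyzer(conn)
        # one Phrase per comma-separated part, as in make_phrase() above
        phrases = [Phrase(PhraseType.NONE, s) for s in text.split(',')]
        return await ana.analyze_query(phrases)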