From 07b7fd1dbbb1c2aab6f678a29ba0b5711ebff53e Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 15 Mar 2024 10:54:13 +0100 Subject: [PATCH] add address counts to tokens --- nominatim/api/search/icu_tokenizer.py | 6 ++++-- nominatim/api/search/legacy_tokenizer.py | 3 ++- nominatim/api/search/query.py | 2 +- nominatim/tokenizer/base.py | 2 +- nominatim/tokenizer/legacy_tokenizer.py | 2 +- test/python/api/search/test_api_search_query.py | 3 ++- test/python/api/search/test_db_search_builder.py | 12 +++++++----- test/python/api/search/test_token_assignment.py | 3 ++- test/python/cli/conftest.py | 8 ++++---- test/python/tokenizer/test_icu.py | 12 ++++++++---- 10 files changed, 32 insertions(+), 21 deletions(-) diff --git a/nominatim/api/search/icu_tokenizer.py b/nominatim/api/search/icu_tokenizer.py index 1c2565d1..05ec7690 100644 --- a/nominatim/api/search/icu_tokenizer.py +++ b/nominatim/api/search/icu_tokenizer.py @@ -97,6 +97,7 @@ class ICUToken(qmod.Token): """ Create a ICUToken from the row of the word table. """ count = 1 if row.info is None else row.info.get('count', 1) + addr_count = 1 if row.info is None else row.info.get('addr_count', 1) penalty = 0.0 if row.type == 'w': @@ -123,7 +124,8 @@ class ICUToken(qmod.Token): return ICUToken(penalty=penalty, token=row.word_id, count=count, lookup_word=lookup_word, is_indexed=True, - word_token=row.word_token, info=row.info) + word_token=row.word_token, info=row.info, + addr_count=addr_count) @@ -257,7 +259,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): if len(part.token) <= 4 and part[0].isdigit()\ and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER): query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER, - ICUToken(0.5, 0, 1, part.token, True, part.token, None)) + ICUToken(0.5, 0, 1, 1, part.token, True, part.token, None)) def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None: diff --git a/nominatim/api/search/legacy_tokenizer.py b/nominatim/api/search/legacy_tokenizer.py index 86d42a54..bd17706e 100644 --- a/nominatim/api/search/legacy_tokenizer.py +++ b/nominatim/api/search/legacy_tokenizer.py @@ -210,6 +210,7 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer): return LegacyToken(penalty=penalty, token=row.word_id, count=row.search_name_count or 1, + addr_count=1, # not supported lookup_word=lookup_word, word_token=row.word_token.strip(), category=(rowclass, row.type) if rowclass is not None else None, @@ -226,7 +227,7 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer): if len(part) <= 4 and part.isdigit()\ and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER): query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER, - LegacyToken(penalty=0.5, token=0, count=1, + LegacyToken(penalty=0.5, token=0, count=1, addr_count=1, lookup_word=part, word_token=part, category=None, country=None, operator=None, is_indexed=True)) diff --git a/nominatim/api/search/query.py b/nominatim/api/search/query.py index bd91c2ec..a0d7add1 100644 --- a/nominatim/api/search/query.py +++ b/nominatim/api/search/query.py @@ -99,10 +99,10 @@ class Token(ABC): penalty: float token: int count: int + addr_count: int lookup_word: str is_indexed: bool - addr_count: int = 1 @abstractmethod def get_category(self) -> Tuple[str, str]: diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py index 29bcc8e1..12c826eb 100644 --- a/nominatim/tokenizer/base.py +++ b/nominatim/tokenizer/base.py @@ -201,7 +201,7 @@ class AbstractTokenizer(ABC): @abstractmethod - def update_statistics(self, config: Configuration) -> None: + def update_statistics(self, config: Configuration, threads: int = 1) -> None: """ Recompute any tokenizer statistics necessary for efficient lookup. This function is meant to be called from time to time by the user to improve performance. However, the tokenizer must not depend on diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index f3a00839..93808cc3 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -210,7 +210,7 @@ class LegacyTokenizer(AbstractTokenizer): self._save_config(conn, config) - def update_statistics(self, _: Configuration) -> None: + def update_statistics(self, config: Configuration, threads: int = 1) -> None: """ Recompute the frequency of full words. """ with connect(self.dsn) as conn: diff --git a/test/python/api/search/test_api_search_query.py b/test/python/api/search/test_api_search_query.py index fe850ce9..bfdceb41 100644 --- a/test/python/api/search/test_api_search_query.py +++ b/test/python/api/search/test_api_search_query.py @@ -18,7 +18,8 @@ class MyToken(query.Token): def mktoken(tid: int): - return MyToken(3.0, tid, 1, 'foo', True) + return MyToken(penalty=3.0, token=tid, count=1, addr_count=1, + lookup_word='foo', is_indexed=True) @pytest.mark.parametrize('ptype,ttype', [('NONE', 'WORD'), diff --git a/test/python/api/search/test_db_search_builder.py b/test/python/api/search/test_db_search_builder.py index d3aea900..68f71298 100644 --- a/test/python/api/search/test_db_search_builder.py +++ b/test/python/api/search/test_db_search_builder.py @@ -31,7 +31,9 @@ def make_query(*args): for end, ttype, tinfo in tlist: for tid, word in tinfo: q.add_token(TokenRange(start, end), ttype, - MyToken(0.5 if ttype == TokenType.PARTIAL else 0.0, tid, 1, word, True)) + MyToken(penalty=0.5 if ttype == TokenType.PARTIAL else 0.0, + token=tid, count=1, addr_count=1, + lookup_word=word, is_indexed=True)) return q @@ -395,14 +397,14 @@ def make_counted_searches(name_part, name_full, address_part, address_full, q.add_node(BreakType.END, PhraseType.NONE) q.add_token(TokenRange(0, 1), TokenType.PARTIAL, - MyToken(0.5, 1, name_part, 'name_part', True)) + MyToken(0.5, 1, name_part, 1, 'name_part', True)) q.add_token(TokenRange(0, 1), TokenType.WORD, - MyToken(0, 101, name_full, 'name_full', True)) + MyToken(0, 101, name_full, 1, 'name_full', True)) for i in range(num_address_parts): q.add_token(TokenRange(i + 1, i + 2), TokenType.PARTIAL, - MyToken(0.5, 2, address_part, 'address_part', True)) + MyToken(0.5, 2, address_part, 1, 'address_part', True)) q.add_token(TokenRange(i + 1, i + 2), TokenType.WORD, - MyToken(0, 102, address_full, 'address_full', True)) + MyToken(0, 102, address_full, 1, 'address_full', True)) builder = SearchBuilder(q, SearchDetails()) diff --git a/test/python/api/search/test_token_assignment.py b/test/python/api/search/test_token_assignment.py index 54e8af14..cde8495d 100644 --- a/test/python/api/search/test_token_assignment.py +++ b/test/python/api/search/test_token_assignment.py @@ -19,7 +19,8 @@ class MyToken(Token): def make_query(*args): q = QueryStruct([Phrase(args[0][1], '')]) - dummy = MyToken(3.0, 45, 1, 'foo', True) + dummy = MyToken(penalty=3.0, token=45, count=1, addr_count=1, + lookup_word='foo', is_indexed=True) for btype, ptype, _ in args[1:]: q.add_node(btype, ptype) diff --git a/test/python/cli/conftest.py b/test/python/cli/conftest.py index 1bb393fb..28aba597 100644 --- a/test/python/cli/conftest.py +++ b/test/python/cli/conftest.py @@ -32,16 +32,16 @@ class DummyTokenizer: self.update_statistics_called = False self.update_word_tokens_called = False - def update_sql_functions(self, *args): + def update_sql_functions(self, *args, **kwargs): self.update_sql_functions_called = True - def finalize_import(self, *args): + def finalize_import(self, *args, **kwargs): self.finalize_import_called = True - def update_statistics(self, *args): + def update_statistics(self, *args, **kwargs): self.update_statistics_called = True - def update_word_tokens(self, *args): + def update_word_tokens(self, *args, **kwargs): self.update_word_tokens_called = True diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py index aa1afe16..9f6eae62 100644 --- a/test/python/tokenizer/test_icu.py +++ b/test/python/tokenizer/test_icu.py @@ -227,16 +227,20 @@ def test_update_statistics_reverse_only(word_table, tokenizer_factory, test_conf def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_factory, test_config): word_table.add_full_word(1000, 'hello') + word_table.add_full_word(1001, 'bye') table_factory('search_name', - 'place_id BIGINT, name_vector INT[]', - [(12, [1000])]) + 'place_id BIGINT, name_vector INT[], nameaddress_vector INT[]', + [(12, [1000], [1001])]) tok = tokenizer_factory() tok.update_statistics(test_config) assert temp_db_cursor.scalar("""SELECT count(*) FROM word - WHERE type = 'W' and - (info->>'count')::int > 0""") > 0 + WHERE type = 'W' and word_id = 1000 and + (info->>'count')::int > 0""") == 1 + assert temp_db_cursor.scalar("""SELECT count(*) FROM word + WHERE type = 'W' and word_id = 1001 and + (info->>'addr_count')::int > 0""") == 1 def test_normalize_postcode(analyzer): -- 2.39.5