From: Sarah Hoffmann Date: Wed, 5 Mar 2025 15:03:23 +0000 (+0100) Subject: Merge remote-tracking branch 'upstream/master' X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/4af0bc6056edf3596b07cfaf975bf195a51b8557?hp=96ce8f83bda30d1fc96756d36d7f47890f118575 Merge remote-tracking branch 'upstream/master' --- diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml index 667684c6..a2ca7412 100644 --- a/settings/country_settings.yaml +++ b/settings/country_settings.yaml @@ -1809,7 +1809,8 @@ us: languages: en names: !include country-names/us.yaml postcode: - pattern: "ddddd" + pattern: "(ddddd)(?:-dddd)?" + output: \1 # Uruguay (Uruguay) diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py index 3b85f26d..ecc2c1c7 100644 --- a/src/nominatim_api/search/icu_tokenizer.py +++ b/src/nominatim_api/search/icu_tokenizer.py @@ -8,7 +8,6 @@ Implementation of query analysis for the ICU tokenizer. """ from typing import Tuple, Dict, List, Optional, Iterator, Any, cast -from collections import defaultdict import dataclasses import difflib import re @@ -25,7 +24,9 @@ from ..connection import SearchConnection from ..logging import log from . import query as qmod from ..query_preprocessing.config import QueryConfig +from ..query_preprocessing.base import QueryProcessingFunc from .query_analyzer_factory import AbstractQueryAnalyzer +from .postcode_parser import PostcodeParser DB_TO_TOKEN_TYPE = { @@ -47,42 +48,6 @@ PENALTY_IN_TOKEN_BREAK = { } -@dataclasses.dataclass -class QueryPart: - """ Normalized and transliterated form of a single term in the query. - - When the term came out of a split during the transliteration, - the normalized string is the full word before transliteration. - Check the subsequent break type to figure out if the word is - continued. - - Penalty is the break penalty for the break following the token. - """ - token: str - normalized: str - penalty: float - - -QueryParts = List[QueryPart] -WordDict = Dict[str, List[qmod.TokenRange]] - - -def extract_words(terms: List[QueryPart], start: int, words: WordDict) -> None: - """ Add all combinations of words in the terms list after the - given position to the word list. - """ - total = len(terms) - base_penalty = PENALTY_IN_TOKEN_BREAK[qmod.BREAK_WORD] - for first in range(start, total): - word = terms[first].token - penalty = base_penalty - words[word].append(qmod.TokenRange(first, first + 1, penalty=penalty)) - for last in range(first + 1, min(first + 20, total)): - word = ' '.join((word, terms[last].token)) - penalty += terms[last - 1].penalty - words[word].append(qmod.TokenRange(first, last + 1, penalty=penalty)) - - @dataclasses.dataclass class ICUToken(qmod.Token): """ Specialised token for ICU tokenizer. @@ -148,60 +113,51 @@ class ICUToken(qmod.Token): addr_count=max(1, addr_count)) -class ICUQueryAnalyzer(AbstractQueryAnalyzer): - """ Converter for query strings into a tokenized query - using the tokens created by a ICU tokenizer. - """ - def __init__(self, conn: SearchConnection) -> None: - self.conn = conn - - async def setup(self) -> None: - """ Set up static data structures needed for the analysis. 
- """ - async def _make_normalizer() -> Any: - rules = await self.conn.get_property('tokenizer_import_normalisation') - return Transliterator.createFromRules("normalization", rules) - - self.normalizer = await self.conn.get_cached_value('ICUTOK', 'normalizer', - _make_normalizer) - - async def _make_transliterator() -> Any: - rules = await self.conn.get_property('tokenizer_import_transliteration') - return Transliterator.createFromRules("transliteration", rules) - - self.transliterator = await self.conn.get_cached_value('ICUTOK', 'transliterator', - _make_transliterator) - - await self._setup_preprocessing() - - if 'word' not in self.conn.t.meta.tables: - sa.Table('word', self.conn.t.meta, - sa.Column('word_id', sa.Integer), - sa.Column('word_token', sa.Text, nullable=False), - sa.Column('type', sa.Text, nullable=False), - sa.Column('word', sa.Text), - sa.Column('info', Json)) +@dataclasses.dataclass +class ICUAnalyzerConfig: + postcode_parser: PostcodeParser + normalizer: Transliterator + transliterator: Transliterator + preprocessors: List[QueryProcessingFunc] - async def _setup_preprocessing(self) -> None: - """ Load the rules for preprocessing and set up the handlers. - """ + @staticmethod + async def create(conn: SearchConnection) -> 'ICUAnalyzerConfig': + rules = await conn.get_property('tokenizer_import_normalisation') + normalizer = Transliterator.createFromRules("normalization", rules) - rules = self.conn.config.load_sub_configuration('icu_tokenizer.yaml', - config='TOKENIZER_CONFIG') - preprocessing_rules = rules.get('query-preprocessing', []) + rules = await conn.get_property('tokenizer_import_transliteration') + transliterator = Transliterator.createFromRules("transliteration", rules) - self.preprocessors = [] + preprocessing_rules = conn.config.load_sub_configuration('icu_tokenizer.yaml', + config='TOKENIZER_CONFIG')\ + .get('query-preprocessing', []) + preprocessors: List[QueryProcessingFunc] = [] for func in preprocessing_rules: if 'step' not in func: raise UsageError("Preprocessing rule is missing the 'step' attribute.") if not isinstance(func['step'], str): raise UsageError("'step' attribute must be a simple string.") - module = self.conn.config.load_plugin_module( + module = conn.config.load_plugin_module( func['step'], 'nominatim_api.query_preprocessing') - self.preprocessors.append( - module.create(QueryConfig(func).set_normalizer(self.normalizer))) + preprocessors.append( + module.create(QueryConfig(func).set_normalizer(normalizer))) + + return ICUAnalyzerConfig(PostcodeParser(conn.config), + normalizer, transliterator, preprocessors) + + +class ICUQueryAnalyzer(AbstractQueryAnalyzer): + """ Converter for query strings into a tokenized query + using the tokens created by a ICU tokenizer. 
+ """ + def __init__(self, conn: SearchConnection, config: ICUAnalyzerConfig) -> None: + self.conn = conn + self.postcode_parser = config.postcode_parser + self.normalizer = config.normalizer + self.transliterator = config.transliterator + self.preprocessors = config.preprocessors async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct: """ Analyze the given list of phrases and return the @@ -222,8 +178,9 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): if not query.source: return query - parts, words = self.split_query(query) - log().var_dump('Transliterated query', lambda: _dump_transliterated(query, parts)) + self.split_query(query) + log().var_dump('Transliterated query', lambda: query.get_transliterated_query()) + words = query.extract_words(base_penalty=PENALTY_IN_TOKEN_BREAK[qmod.BREAK_WORD]) for row in await self.lookup_in_db(list(words.keys())): for trange in words[row.word_token]: @@ -240,8 +197,13 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): else: query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token) - self.add_extra_tokens(query, parts) - self.rerank_tokens(query, parts) + self.add_extra_tokens(query) + for start, end, pc in self.postcode_parser.parse(query): + query.add_token(qmod.TokenRange(start, end), + qmod.TOKEN_POSTCODE, + ICUToken(penalty=0.1, token=0, count=1, addr_count=1, + lookup_word=pc, word_token=pc, info=None)) + self.rerank_tokens(query) log().table_dump('Word tokens', _dump_word_tokens(query)) @@ -254,16 +216,9 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): """ return cast(str, self.normalizer.transliterate(text)).strip('-: ') - def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]: + def split_query(self, query: qmod.QueryStruct) -> None: """ Transliterate the phrases and split them into tokens. - - Returns the list of transliterated tokens together with their - normalized form and a dictionary of words for lookup together - with their position. """ - parts: QueryParts = [] - phrase_start = 0 - words: WordDict = defaultdict(list) for phrase in query.source: query.nodes[-1].ptype = phrase.ptype phrase_split = re.split('([ :-])', phrase.text) @@ -278,38 +233,42 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): if trans: for term in trans.split(' '): if term: - parts.append(QueryPart(term, word, - PENALTY_IN_TOKEN_BREAK[qmod.BREAK_TOKEN])) - query.add_node(qmod.BREAK_TOKEN, phrase.ptype) - query.nodes[-1].btype = breakchar - parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[breakchar] + query.add_node(qmod.BREAK_TOKEN, phrase.ptype, + PENALTY_IN_TOKEN_BREAK[qmod.BREAK_TOKEN], + term, word) + query.nodes[-1].adjust_break(breakchar, + PENALTY_IN_TOKEN_BREAK[breakchar]) - extract_words(parts, phrase_start, words) - - phrase_start = len(parts) - query.nodes[-1].btype = qmod.BREAK_END - - return parts, words + query.nodes[-1].adjust_break(qmod.BREAK_END, PENALTY_IN_TOKEN_BREAK[qmod.BREAK_END]) async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]': """ Return the token information from the database for the given word tokens. + + This function excludes postcode tokens """ t = self.conn.t.meta.tables['word'] - return await self.conn.execute(t.select().where(t.c.word_token.in_(words))) + return await self.conn.execute(t.select() + .where(t.c.word_token.in_(words)) + .where(t.c.type != 'P')) - def add_extra_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None: + def add_extra_tokens(self, query: qmod.QueryStruct) -> None: """ Add tokens to query that are not saved in the database. 
""" - for part, node, i in zip(parts, query.nodes, range(1000)): - if len(part.token) <= 4 and part.token.isdigit()\ - and not node.has_tokens(i+1, qmod.TOKEN_HOUSENUMBER): - query.add_token(qmod.TokenRange(i, i+1), qmod.TOKEN_HOUSENUMBER, + need_hnr = False + for i, node in enumerate(query.nodes): + is_full_token = node.btype not in (qmod.BREAK_TOKEN, qmod.BREAK_PART) + if need_hnr and is_full_token \ + and len(node.term_normalized) <= 4 and node.term_normalized.isdigit(): + query.add_token(qmod.TokenRange(i-1, i), qmod.TOKEN_HOUSENUMBER, ICUToken(penalty=0.5, token=0, - count=1, addr_count=1, lookup_word=part.token, - word_token=part.token, info=None)) + count=1, addr_count=1, + lookup_word=node.term_lookup, + word_token=node.term_lookup, info=None)) + + need_hnr = is_full_token and not node.has_tokens(i+1, qmod.TOKEN_HOUSENUMBER) - def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None: + def rerank_tokens(self, query: qmod.QueryStruct) -> None: """ Add penalties to tokens that depend on presence of other token. """ for i, node, tlist in query.iter_token_lists(): @@ -326,28 +285,22 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): if repl.end == tlist.end and repl.ttype != qmod.TOKEN_HOUSENUMBER: repl.add_penalty(0.5 - tlist.tokens[0].penalty) elif tlist.ttype not in (qmod.TOKEN_COUNTRY, qmod.TOKEN_PARTIAL): - norm = parts[i].normalized - for j in range(i + 1, tlist.end): - if node.btype != qmod.BREAK_TOKEN: - norm += ' ' + parts[j].normalized + norm = ' '.join(n.term_normalized for n in query.nodes[i + 1:tlist.end + 1] + if n.btype != qmod.BREAK_TOKEN) + if not norm: + # Can happen when the token only covers a partial term + norm = query.nodes[i + 1].term_normalized for token in tlist.tokens: cast(ICUToken, token).rematch(norm) -def _dump_transliterated(query: qmod.QueryStruct, parts: QueryParts) -> str: - out = query.nodes[0].btype - for node, part in zip(query.nodes[1:], parts): - out += part.token + node.btype - return out - - def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]: - yield ['type', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info'] - for node in query.nodes: + yield ['type', 'from', 'to', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info'] + for i, node in enumerate(query.nodes): for tlist in node.starting: for token in tlist.tokens: t = cast(ICUToken, token) - yield [tlist.ttype, t.token, t.word_token or '', + yield [tlist.ttype, str(i), str(tlist.end), t.token, t.word_token or '', t.lookup_word or '', t.penalty, t.count, t.info] @@ -355,7 +308,17 @@ async def create_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer """ Create and set up a new query analyzer for a database based on the ICU tokenizer. 
""" - out = ICUQueryAnalyzer(conn) - await out.setup() + async def _get_config() -> ICUAnalyzerConfig: + if 'word' not in conn.t.meta.tables: + sa.Table('word', conn.t.meta, + sa.Column('word_id', sa.Integer), + sa.Column('word_token', sa.Text, nullable=False), + sa.Column('type', sa.Text, nullable=False), + sa.Column('word', sa.Text), + sa.Column('info', Json)) + + return await ICUAnalyzerConfig.create(conn) + + config = await conn.get_cached_value('ICUTOK', 'config', _get_config) - return out + return ICUQueryAnalyzer(conn, config) diff --git a/src/nominatim_api/search/postcode_parser.py b/src/nominatim_api/search/postcode_parser.py new file mode 100644 index 00000000..bb3ef1a4 --- /dev/null +++ b/src/nominatim_api/search/postcode_parser.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2025 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Handling of arbitrary postcode tokens in tokenized query string. +""" +from typing import Tuple, Set, Dict, List +import re +from collections import defaultdict + +import yaml + +from ..config import Configuration +from . import query as qmod + + +class PostcodeParser: + """ Pattern-based parser for postcodes in tokenized queries. + + The postcode patterns are read from the country configuration. + The parser does currently not return country restrictions. + """ + + def __init__(self, config: Configuration) -> None: + # skip over includes here to avoid loading the complete country name data + yaml.add_constructor('!include', lambda loader, node: [], + Loader=yaml.SafeLoader) + cdata = yaml.safe_load(config.find_config_file('country_settings.yaml') + .read_text(encoding='utf-8')) + + unique_patterns: Dict[str, Dict[str, List[str]]] = {} + for cc, data in cdata.items(): + if data.get('postcode'): + pat = data['postcode']['pattern'].replace('d', '[0-9]').replace('l', '[a-z]') + out = data['postcode'].get('output') + if pat not in unique_patterns: + unique_patterns[pat] = defaultdict(list) + unique_patterns[pat][out].append(cc) + + self.global_pattern = re.compile( + '(?:(?P[a-z][a-z])(?P[ -]?))?(?P(?:(?:' + + ')|(?:'.join(unique_patterns) + '))[:, >].*)') + + self.local_patterns = [(re.compile(f"{pat}[:, >]"), list(info.items())) + for pat, info in unique_patterns.items()] + + def parse(self, query: qmod.QueryStruct) -> Set[Tuple[int, int, str]]: + """ Parse postcodes in the given list of query tokens taking into + account the list of breaks from the nodes. 
+ + The result is a sequence of tuples with + [start node id, end node id, postcode token] + """ + nodes = query.nodes + outcodes: Set[Tuple[int, int, str]] = set() + + for i in range(query.num_token_slots()): + if nodes[i].btype in '<,: ' and nodes[i + 1].btype != '`' \ + and (i == 0 or nodes[i - 1].ptype != qmod.PHRASE_POSTCODE): + if nodes[i].ptype == qmod.PHRASE_ANY: + word = nodes[i + 1].term_normalized + nodes[i + 1].btype + if word[-1] in ' -' and nodes[i + 2].btype != '`' \ + and nodes[i + 1].ptype == qmod.PHRASE_ANY: + word += nodes[i + 2].term_normalized + nodes[i + 2].btype + if word[-1] in ' -' and nodes[i + 3].btype != '`' \ + and nodes[i + 2].ptype == qmod.PHRASE_ANY: + word += nodes[i + 3].term_normalized + nodes[i + 3].btype + + self._match_word(word, i, False, outcodes) + elif nodes[i].ptype == qmod.PHRASE_POSTCODE: + word = nodes[i + 1].term_normalized + nodes[i + 1].btype + for j in range(i + 1, query.num_token_slots()): + if nodes[j].ptype != qmod.PHRASE_POSTCODE: + break + word += nodes[j + 1].term_normalized + nodes[j + 1].btype + + self._match_word(word, i, True, outcodes) + + return outcodes + + def _match_word(self, word: str, pos: int, fullmatch: bool, + outcodes: Set[Tuple[int, int, str]]) -> None: + # Use global pattern to check for presence of any postcode. + m = self.global_pattern.fullmatch(word) + if m: + # If there was a match, check against each pattern separately + # because multiple patterns might be machting at the end. + cc = m.group('cc') + pc_word = m.group('pc') + cc_spaces = len(m.group('space') or '') + for pattern, info in self.local_patterns: + lm = pattern.fullmatch(pc_word) if fullmatch else pattern.match(pc_word) + if lm: + trange = (pos, pos + cc_spaces + sum(c in ' ,-:>' for c in lm.group(0))) + for out, out_ccs in info: + if cc is None or cc in out_ccs: + if out: + outcodes.add((*trange, lm.expand(out).upper())) + else: + outcodes.add((*trange, lm.group(0)[:-1].upper())) diff --git a/src/nominatim_api/search/query.py b/src/nominatim_api/search/query.py index 8530c4f2..07bb685b 100644 --- a/src/nominatim_api/search/query.py +++ b/src/nominatim_api/search/query.py @@ -7,8 +7,9 @@ """ Datastructures for a tokenized query. """ -from typing import List, Tuple, Optional, Iterator +from typing import Dict, List, Tuple, Optional, Iterator from abc import ABC, abstractmethod +from collections import defaultdict import dataclasses @@ -171,11 +172,33 @@ class TokenList: @dataclasses.dataclass class QueryNode: """ A node of the query representing a break between terms. + + The node also contains information on the source term + ending at the node. The tokens are created from this information. """ btype: BreakType ptype: PhraseType + + penalty: float + """ Penalty for the break at this node. + """ + term_lookup: str + """ Transliterated term following this node. + """ + term_normalized: str + """ Normalised form of term following this node. + When the token resulted from a split during transliteration, + then this string contains the complete source term. + """ + starting: List[TokenList] = dataclasses.field(default_factory=list) + def adjust_break(self, btype: BreakType, penalty: float) -> None: + """ Change the break type and penalty for this node. + """ + self.btype = btype + self.penalty = penalty + def has_tokens(self, end: int, *ttypes: TokenType) -> bool: """ Check if there are tokens of the given types ending at the given node. 
@@ -218,19 +241,22 @@ class QueryStruct: def __init__(self, source: List[Phrase]) -> None: self.source = source self.nodes: List[QueryNode] = \ - [QueryNode(BREAK_START, source[0].ptype if source else PHRASE_ANY)] + [QueryNode(BREAK_START, source[0].ptype if source else PHRASE_ANY, + 0.0, '', '')] def num_token_slots(self) -> int: """ Return the length of the query in vertice steps. """ return len(self.nodes) - 1 - def add_node(self, btype: BreakType, ptype: PhraseType) -> None: + def add_node(self, btype: BreakType, ptype: PhraseType, + break_penalty: float = 0.0, + term_lookup: str = '', term_normalized: str = '') -> None: """ Append a new break node with the given break type. The phrase type denotes the type for any tokens starting at the node. """ - self.nodes.append(QueryNode(btype, ptype)) + self.nodes.append(QueryNode(btype, ptype, break_penalty, term_lookup, term_normalized)) def add_token(self, trange: TokenRange, ttype: TokenType, token: Token) -> None: """ Add a token to the query. 'start' and 'end' are the indexes of the @@ -287,3 +313,42 @@ class QueryStruct: if t.token == token: return f"[{tlist.ttype}]{t.lookup_word}" return 'None' + + def get_transliterated_query(self) -> str: + """ Return a string representation of the transliterated query + with the character representation of the different break types. + + For debugging purposes only. + """ + return ''.join(''.join((n.term_lookup, n.btype)) for n in self.nodes) + + def extract_words(self, base_penalty: float = 0.0, + start: int = 0, + endpos: Optional[int] = None) -> Dict[str, List[TokenRange]]: + """ Add all combinations of words that can be formed from the terms + between the given start and endnode. The terms are joined with + spaces for each break. Words can never go across a BREAK_PHRASE. + + The functions returns a dictionary of possible words with their + position within the query and a penalty. The penalty is computed + from the base_penalty plus the penalty for each node the word + crosses. + """ + if endpos is None: + endpos = len(self.nodes) + + words: Dict[str, List[TokenRange]] = defaultdict(list) + + for first in range(start, endpos - 1): + word = self.nodes[first + 1].term_lookup + penalty = base_penalty + words[word].append(TokenRange(first, first + 1, penalty=penalty)) + if self.nodes[first + 1].btype != BREAK_PHRASE: + for last in range(first + 2, min(first + 20, endpos)): + word = ' '.join((word, self.nodes[last].term_lookup)) + penalty += self.nodes[last - 1].penalty + words[word].append(TokenRange(first, last, penalty=penalty)) + if self.nodes[last].btype == BREAK_PHRASE: + break + + return words diff --git a/src/nominatim_db/tokenizer/icu_tokenizer.py b/src/nominatim_db/tokenizer/icu_tokenizer.py index 16122d08..858cb64c 100644 --- a/src/nominatim_db/tokenizer/icu_tokenizer.py +++ b/src/nominatim_db/tokenizer/icu_tokenizer.py @@ -382,76 +382,15 @@ class ICUNameAnalyzer(AbstractAnalyzer): return postcode.strip().upper() def update_postcodes_from_db(self) -> None: - """ Update postcode tokens in the word table from the location_postcode - table. + """ Postcode update. + + Removes all postcodes from the word table because they are not + needed. Postcodes are recognised by pattern. """ assert self.conn is not None - analyzer = self.token_analysis.analysis.get('@postcode') with self.conn.cursor() as cur: - # First get all postcode names currently in the word table. 
- cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'") - word_entries = set((entry[0] for entry in cur)) - - # Then compute the required postcode names from the postcode table. - needed_entries = set() - cur.execute("SELECT country_code, postcode FROM location_postcode") - for cc, postcode in cur: - info = PlaceInfo({'country_code': cc, - 'class': 'place', 'type': 'postcode', - 'address': {'postcode': postcode}}) - address = self.sanitizer.process_names(info)[1] - for place in address: - if place.kind == 'postcode': - if analyzer is None: - postcode_name = place.name.strip().upper() - variant_base = None - else: - postcode_name = analyzer.get_canonical_id(place) - variant_base = place.get_attr("variant") - - if variant_base: - needed_entries.add(f'{postcode_name}@{variant_base}') - else: - needed_entries.add(postcode_name) - break - - # Now update the word table. - self._delete_unused_postcode_words(word_entries - needed_entries) - self._add_missing_postcode_words(needed_entries - word_entries) - - def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None: - assert self.conn is not None - if tokens: - with self.conn.cursor() as cur: - cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)", - (list(tokens), )) - - def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None: - assert self.conn is not None - if not tokens: - return - - analyzer = self.token_analysis.analysis.get('@postcode') - terms = [] - - for postcode_name in tokens: - if '@' in postcode_name: - term, variant = postcode_name.split('@', 2) - term = self._search_normalized(term) - if analyzer is None: - variants = [term] - else: - variants = analyzer.compute_variants(variant) - if term not in variants: - variants.append(term) - else: - variants = [self._search_normalized(postcode_name)] - terms.append((postcode_name, variants)) - - if terms: - with self.conn.cursor() as cur: - cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms) + cur.execute("DELETE FROM word WHERE type = 'P'") def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]], should_replace: bool) -> None: @@ -719,32 +658,9 @@ class ICUNameAnalyzer(AbstractAnalyzer): analyzer = self.token_analysis.analysis.get('@postcode') if analyzer is None: - postcode_name = item.name.strip().upper() - variant_base = None - else: - postcode_name = analyzer.get_canonical_id(item) - variant_base = item.get_attr("variant") - - if variant_base: - postcode = f'{postcode_name}@{variant_base}' + return item.name.strip().upper() else: - postcode = postcode_name - - if postcode not in self._cache.postcodes: - term = self._search_normalized(postcode_name) - if not term: - return None - - variants = {term} - if analyzer is not None and variant_base: - variants.update(analyzer.compute_variants(variant_base)) - - with self.conn.cursor() as cur: - cur.execute("SELECT create_postcode_word(%s, %s)", - (postcode, list(variants))) - self._cache.postcodes.add(postcode) - - return postcode_name + return analyzer.get_canonical_id(item) class _TokenInfo: @@ -837,5 +753,4 @@ class _TokenCache: self.names: Dict[str, Tuple[int, List[int]]] = {} self.partials: Dict[str, int] = {} self.fulls: Dict[str, List[int]] = {} - self.postcodes: Set[str] = set() self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {} diff --git a/test/bdd/api/search/postcode.feature b/test/bdd/api/search/postcode.feature index bb1b755b..fb722862 100644 --- a/test/bdd/api/search/postcode.feature +++ b/test/bdd/api/search/postcode.feature @@ 
-3,9 +3,8 @@ Feature: Searches with postcodes Various searches involving postcodes - @Fail Scenario: US 5+4 ZIP codes are shortened to 5 ZIP codes if not found - When sending json search query "36067 1111, us" with address + When sending json search query "36067-1111, us" with address Then result addresses contain | postcode | | 36067 | diff --git a/test/bdd/db/import/postcodes.feature b/test/bdd/db/import/postcodes.feature index 3f4976f1..a9b07bfe 100644 --- a/test/bdd/db/import/postcodes.feature +++ b/test/bdd/db/import/postcodes.feature @@ -170,7 +170,7 @@ Feature: Import of postcodes | object | postcode | | W93 | 11200 | - Scenario: Postcodes are added to the postcode and word table + Scenario: Postcodes are added to the postcode Given the places | osm | class | type | addr+postcode | addr+housenumber | geometry | | N34 | place | house | 01982 | 111 |country:de | @@ -178,7 +178,6 @@ Feature: Import of postcodes Then location_postcode contains exactly | country | postcode | geometry | | de | 01982 | country:de | - And there are word tokens for postcodes 01982 @Fail @@ -195,7 +194,7 @@ Feature: Import of postcodes | E45 2 | gb | 23 | 5 | | Y45 | gb | 21 | 5 | - Scenario: Postcodes outside all countries are not added to the postcode and word table + Scenario: Postcodes outside all countries are not added to the postcode table Given the places | osm | class | type | addr+postcode | addr+housenumber | addr+place | geometry | | N34 | place | house | 01982 | 111 | Null Island | 0 0.00001 | @@ -205,7 +204,6 @@ Feature: Import of postcodes When importing Then location_postcode contains exactly | country | postcode | geometry | - And there are no word tokens for postcodes 01982 When sending search query "111, 01982 Null Island" Then results contain | osm | display_name | diff --git a/test/bdd/db/update/postcode.feature b/test/bdd/db/update/postcode.feature index 39318101..61b52f3d 100644 --- a/test/bdd/db/update/postcode.feature +++ b/test/bdd/db/update/postcode.feature @@ -2,7 +2,7 @@ Feature: Update of postcode Tests for updating of data related to postcodes - Scenario: A new postcode appears in the postcode and word table + Scenario: A new postcode appears in the postcode table Given the places | osm | class | type | addr+postcode | addr+housenumber | geometry | | N34 | place | house | 01982 | 111 |country:de | @@ -18,9 +18,8 @@ Feature: Update of postcode | country | postcode | geometry | | de | 01982 | country:de | | ch | 4567 | country:ch | - And there are word tokens for postcodes 01982,4567 - Scenario: When the last postcode is deleted, it is deleted from postcode and word + Scenario: When the last postcode is deleted, it is deleted from postcode Given the places | osm | class | type | addr+postcode | addr+housenumber | geometry | | N34 | place | house | 01982 | 111 |country:de | @@ -31,10 +30,8 @@ Feature: Update of postcode Then location_postcode contains exactly | country | postcode | geometry | | ch | 4567 | country:ch | - And there are word tokens for postcodes 4567 - And there are no word tokens for postcodes 01982 - Scenario: A postcode is not deleted from postcode and word when it exist in another country + Scenario: A postcode is not deleted from postcode when it exist in another country Given the places | osm | class | type | addr+postcode | addr+housenumber | geometry | | N34 | place | house | 01982 | 111 |country:de | @@ -45,7 +42,6 @@ Feature: Update of postcode Then location_postcode contains exactly | country | postcode | geometry | | fr | 01982 | country:fr | - And there 
are word tokens for postcodes 01982 Scenario: Updating a postcode is reflected in postcode table Given the places @@ -59,7 +55,6 @@ Feature: Update of postcode Then location_postcode contains exactly | country | postcode | geometry | | de | 20453 | country:de | - And there are word tokens for postcodes 20453 Scenario: When changing from a postcode type, the entry appears in placex When importing @@ -80,7 +75,6 @@ Feature: Update of postcode Then location_postcode contains exactly | country | postcode | geometry | | de | 20453 | country:de | - And there are word tokens for postcodes 20453 Scenario: When changing to a postcode type, the entry disappears from placex When importing @@ -101,7 +95,6 @@ Feature: Update of postcode Then location_postcode contains exactly | country | postcode | geometry | | de | 01982 | country:de | - And there are word tokens for postcodes 01982 Scenario: When a parent is deleted, the postcode gets a new parent Given the grid with origin DE diff --git a/test/python/api/search/test_api_search_query.py b/test/python/api/search/test_api_search_query.py index 412a5bf2..08a1f7aa 100644 --- a/test/python/api/search/test_api_search_query.py +++ b/test/python/api/search/test_api_search_query.py @@ -21,6 +21,9 @@ def mktoken(tid: int): return MyToken(penalty=3.0, token=tid, count=1, addr_count=1, lookup_word='foo') +@pytest.fixture +def qnode(): + return query.QueryNode(query.BREAK_PHRASE, query.PHRASE_ANY, 0.0 ,'', '') @pytest.mark.parametrize('ptype,ttype', [(query.PHRASE_ANY, 'W'), (query.PHRASE_AMENITY, 'Q'), @@ -37,27 +40,24 @@ def test_phrase_incompatible(ptype): assert not query._phrase_compatible_with(ptype, query.TOKEN_PARTIAL, True) -def test_query_node_empty(): - qn = query.QueryNode(query.BREAK_PHRASE, query.PHRASE_ANY) +def test_query_node_empty(qnode): + assert not qnode.has_tokens(3, query.TOKEN_PARTIAL) + assert qnode.get_tokens(3, query.TOKEN_WORD) is None - assert not qn.has_tokens(3, query.TOKEN_PARTIAL) - assert qn.get_tokens(3, query.TOKEN_WORD) is None +def test_query_node_with_content(qnode): + qnode.starting.append(query.TokenList(2, query.TOKEN_PARTIAL, [mktoken(100), mktoken(101)])) + qnode.starting.append(query.TokenList(2, query.TOKEN_WORD, [mktoken(1000)])) -def test_query_node_with_content(): - qn = query.QueryNode(query.BREAK_PHRASE, query.PHRASE_ANY) - qn.starting.append(query.TokenList(2, query.TOKEN_PARTIAL, [mktoken(100), mktoken(101)])) - qn.starting.append(query.TokenList(2, query.TOKEN_WORD, [mktoken(1000)])) + assert not qnode.has_tokens(3, query.TOKEN_PARTIAL) + assert not qnode.has_tokens(2, query.TOKEN_COUNTRY) + assert qnode.has_tokens(2, query.TOKEN_PARTIAL) + assert qnode.has_tokens(2, query.TOKEN_WORD) - assert not qn.has_tokens(3, query.TOKEN_PARTIAL) - assert not qn.has_tokens(2, query.TOKEN_COUNTRY) - assert qn.has_tokens(2, query.TOKEN_PARTIAL) - assert qn.has_tokens(2, query.TOKEN_WORD) - - assert qn.get_tokens(3, query.TOKEN_PARTIAL) is None - assert qn.get_tokens(2, query.TOKEN_COUNTRY) is None - assert len(qn.get_tokens(2, query.TOKEN_PARTIAL)) == 2 - assert len(qn.get_tokens(2, query.TOKEN_WORD)) == 1 + assert qnode.get_tokens(3, query.TOKEN_PARTIAL) is None + assert qnode.get_tokens(2, query.TOKEN_COUNTRY) is None + assert len(qnode.get_tokens(2, query.TOKEN_PARTIAL)) == 2 + assert len(qnode.get_tokens(2, query.TOKEN_WORD)) == 1 def test_query_struct_empty(): diff --git a/test/python/api/search/test_icu_query_analyzer.py b/test/python/api/search/test_icu_query_analyzer.py index eb453fda..fc200bca 100644 --- 
a/test/python/api/search/test_icu_query_analyzer.py +++ b/test/python/api/search/test_icu_query_analyzer.py @@ -102,12 +102,11 @@ async def test_splitting_in_transliteration(conn): @pytest.mark.asyncio @pytest.mark.parametrize('term,order', [('23456', ['P', 'H', 'W', 'w']), - ('3', ['H', 'P', 'W', 'w']) + ('3', ['H', 'W', 'w']) ]) async def test_penalty_postcodes_and_housenumbers(conn, term, order): ana = await tok.create_query_analyzer(conn) - await add_word(conn, 1, term, 'P', None) await add_word(conn, 2, term, 'H', term) await add_word(conn, 3, term, 'w', term) await add_word(conn, 4, term, 'W', term) @@ -179,8 +178,10 @@ async def test_add_unknown_housenumbers(conn): assert query.nodes[1].starting[0].ttype == qmod.TOKEN_HOUSENUMBER assert len(query.nodes[1].starting[0].tokens) == 1 assert query.nodes[1].starting[0].tokens[0].token == 1 - assert not query.nodes[2].starting - assert not query.nodes[3].starting + assert query.nodes[2].has_tokens(3, qmod.TOKEN_POSTCODE) + assert not query.nodes[2].has_tokens(3, qmod.TOKEN_HOUSENUMBER) + assert not query.nodes[2].has_tokens(4, qmod.TOKEN_HOUSENUMBER) + assert not query.nodes[3].has_tokens(4, qmod.TOKEN_HOUSENUMBER) @pytest.mark.asyncio diff --git a/test/python/api/search/test_postcode_parser.py b/test/python/api/search/test_postcode_parser.py new file mode 100644 index 00000000..284aba5b --- /dev/null +++ b/test/python/api/search/test_postcode_parser.py @@ -0,0 +1,154 @@ + +# SPDX-License-Identifier: GPL-3.0-or-later +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2025 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Test for parsing of postcodes in queries. +""" +import re +from itertools import zip_longest + +import pytest + +from nominatim_api.search.postcode_parser import PostcodeParser +from nominatim_api.search.query import QueryStruct, PHRASE_ANY, PHRASE_POSTCODE, PHRASE_STREET + +@pytest.fixture +def pc_config(project_env): + country_file = project_env.project_dir / 'country_settings.yaml' + country_file.write_text(r""" +ab: + postcode: + pattern: "ddddd ll" +ba: + postcode: + pattern: "ddddd" +de: + postcode: + pattern: "ddddd" +gr: + postcode: + pattern: "(ddd) ?(dd)" + output: \1 \2 +in: + postcode: + pattern: "(ddd) ?(ddd)" + output: \1\2 +mc: + postcode: + pattern: "980dd" +mz: + postcode: + pattern: "(dddd)(?:-dd)?" 
+bn: + postcode: + pattern: "(ll) ?(dddd)" + output: \1\2 +ky: + postcode: + pattern: "(d)-(dddd)" + output: KY\1-\2 + """) + + return project_env + +def mk_query(inp): + query = QueryStruct([]) + phrase_split = re.split(r"([ ,:'-])", inp) + + for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue='>'): + query.add_node(breakchar, PHRASE_ANY, 0.1, word, word) + + return query + + +@pytest.mark.parametrize('query,pos', [('45325 Berlin', 0), + ('45325:Berlin', 0), + ('45325,Berlin', 0), + ('Berlin 45325', 1), + ('Berlin,45325', 1), + ('Berlin:45325', 1), + ('Hansastr,45325 Berlin', 1), + ('Hansastr 45325 Berlin', 1)]) +def test_simple_postcode(pc_config, query, pos): + parser = PostcodeParser(pc_config) + + result = parser.parse(mk_query(query)) + + assert result == {(pos, pos + 1, '45325'), (pos, pos + 1, '453 25')} + +def test_contained_postcode(pc_config): + parser = PostcodeParser(pc_config) + + assert parser.parse(mk_query('12345 dx')) == {(0, 1, '12345'), (0, 1, '123 45'), + (0, 2, '12345 DX')} + + + +@pytest.mark.parametrize('query,frm,to', [('345987', 0, 1), ('345 987', 0, 2), + ('Aina 345 987', 1, 3), + ('Aina 23 345 987 ff', 2, 4)]) +def test_postcode_with_space(pc_config, query, frm, to): + parser = PostcodeParser(pc_config) + + result = parser.parse(mk_query(query)) + + assert result == {(frm, to, '345987')} + +def test_overlapping_postcode(pc_config): + parser = PostcodeParser(pc_config) + + assert parser.parse(mk_query('123 456 78')) == {(0, 2, '123456'), (1, 3, '456 78')} + + +@pytest.mark.parametrize('query', ['45325-Berlin', "45325'Berlin", + 'Berlin-45325', "Berlin'45325", '45325Berlin' + '345-987', "345'987", '345,987', '345:987']) +def test_not_a_postcode(pc_config, query): + parser = PostcodeParser(pc_config) + + assert not parser.parse(mk_query(query)) + + +@pytest.mark.parametrize('query', ['ba 12233', 'ba-12233']) +def test_postcode_with_country_prefix(pc_config, query): + parser = PostcodeParser(pc_config) + + assert (0, 2, '12233') in parser.parse(mk_query(query)) + + +def test_postcode_with_joined_country_prefix(pc_config): + parser = PostcodeParser(pc_config) + + assert parser.parse(mk_query('ba12233')) == {(0, 1, '12233')} + + +def test_postcode_with_non_matching_country_prefix(pc_config): + parser = PostcodeParser(pc_config) + + assert not parser.parse(mk_query('ky12233')) + +def test_postcode_inside_postcode_phrase(pc_config): + parser = PostcodeParser(pc_config) + + query = QueryStruct([]) + query.nodes[-1].ptype = PHRASE_STREET + query.add_node(',', PHRASE_STREET, 0.1, '12345', '12345') + query.add_node(',', PHRASE_POSTCODE, 0.1, 'xz', 'xz') + query.add_node('>', PHRASE_POSTCODE, 0.1, '4444', '4444') + + assert parser.parse(query) == {(2, 3, '4444')} + + +def test_partial_postcode_in_postcode_phrase(pc_config): + parser = PostcodeParser(pc_config) + + query = QueryStruct([]) + query.nodes[-1].ptype = PHRASE_POSTCODE + query.add_node(' ', PHRASE_POSTCODE, 0.1, '2224', '2224') + query.add_node('>', PHRASE_POSTCODE, 0.1, '12345', '12345') + + assert not parser.parse(query) diff --git a/test/python/api/search/test_query.py b/test/python/api/search/test_query.py index c39094f0..bfed38df 100644 --- a/test/python/api/search/test_query.py +++ b/test/python/api/search/test_query.py @@ -46,3 +46,20 @@ def test_token_range_unimplemented_ops(): nq.TokenRange(1, 3) <= nq.TokenRange(10, 12) with pytest.raises(TypeError): nq.TokenRange(1, 3) >= nq.TokenRange(10, 12) + + +def test_query_extract_words(): + q = nq.QueryStruct([]) + q.add_node(nq.BREAK_WORD, 
nq.PHRASE_ANY, 0.1, '12', '') + q.add_node(nq.BREAK_TOKEN, nq.PHRASE_ANY, 0.0, 'ab', '') + q.add_node(nq.BREAK_PHRASE, nq.PHRASE_ANY, 0.0, '12', '') + q.add_node(nq.BREAK_END, nq.PHRASE_ANY, 0.5, 'hallo', '') + + words = q.extract_words(base_penalty=1.0) + + assert set(words.keys()) \ + == {'12', 'ab', 'hallo', '12 ab', 'ab 12', '12 ab 12'} + assert sorted(words['12']) == [nq.TokenRange(0, 1, 1.0), nq.TokenRange(2, 3, 1.0)] + assert words['12 ab'] == [nq.TokenRange(0, 2, 1.1)] + assert words['hallo'] == [nq.TokenRange(3, 4, 1.0)] + diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py index a2bf6766..06a3cd6c 100644 --- a/test/python/tokenizer/test_icu.py +++ b/test/python/tokenizer/test_icu.py @@ -265,37 +265,13 @@ class TestPostcodes: 'address': {'postcode': postcode}})) - def test_update_postcodes_from_db_empty(self, table_factory, word_table): - table_factory('location_postcode', 'country_code TEXT, postcode TEXT', - content=(('de', '12345'), ('se', '132 34'), - ('bm', 'AB23'), ('fr', '12345'))) - - self.analyzer.update_postcodes_from_db() - - assert word_table.count() == 5 - assert word_table.get_postcodes() == {'12345', '132 34@132 34', 'AB 23@AB 23'} - - - def test_update_postcodes_from_db_ambigious(self, table_factory, word_table): - table_factory('location_postcode', 'country_code TEXT, postcode TEXT', - content=(('in', '123456'), ('sg', '123456'))) - - self.analyzer.update_postcodes_from_db() - - assert word_table.count() == 3 - assert word_table.get_postcodes() == {'123456', '123456@123 456'} - - - def test_update_postcodes_from_db_add_and_remove(self, table_factory, word_table): - table_factory('location_postcode', 'country_code TEXT, postcode TEXT', - content=(('ch', '1234'), ('bm', 'BC 45'), ('bm', 'XX45'))) + def test_update_postcodes_deleted(self, word_table): word_table.add_postcode(' 1234', '1234') word_table.add_postcode(' 5678', '5678') self.analyzer.update_postcodes_from_db() - assert word_table.count() == 5 - assert word_table.get_postcodes() == {'1234', 'BC 45@BC 45', 'XX 45@XX 45'} + assert word_table.count() == 0 def test_process_place_postcode_simple(self, word_table): @@ -303,16 +279,12 @@ class TestPostcodes: assert info['postcode'] == '12345' - assert word_table.get_postcodes() == {'12345', } - def test_process_place_postcode_with_space(self, word_table): info = self.process_postcode('in', '123 567') assert info['postcode'] == '123567' - assert word_table.get_postcodes() == {'123567@123 567', } - def test_update_special_phrase_empty_table(analyzer, word_table): @@ -477,9 +449,9 @@ class TestPlaceAddress: @pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345']) def test_process_place_postcode(self, word_table, pcode): - self.process_address(postcode=pcode) + info = self.process_address(postcode=pcode) - assert word_table.get_postcodes() == {pcode, } + assert info['postcode'] == pcode @pytest.mark.parametrize('hnr', ['123a', '1', '101'])
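
The query-time postcode handling introduced above relies entirely on the per-country patterns in country_settings.yaml: 'd' stands for a digit, 'l' for a lower-case letter, and an optional output template rewrites the matched groups. As a minimal standalone sketch (plain re only, no Nominatim imports; pattern and output values copied from the changed us entry), this is how such a pattern shortens a ZIP+4 code to its five-digit form:

    import re

    # us entry from country_settings.yaml (see the hunk at the top of the diff)
    pattern = "(ddddd)(?:-dddd)?"
    output = r"\1"

    # PostcodeParser expands the shorthand into a character-class regex
    regex = re.compile(pattern.replace('d', '[0-9]').replace('l', '[a-z]'))

    m = regex.fullmatch("36067-1111")
    if m:
        print(m.expand(output))   # prints 36067, the ZIP+4 suffix is dropped

In the parser itself this happens in two stages: the combined global pattern (all country patterns joined by alternation, preceded by an optional two-letter country prefix) first detects that a postcode candidate is present at all, and only then is each country pattern matched separately so that the matching country's output template can be applied.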
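
The word extraction that moved from the ICU tokenizer into QueryStruct.extract_words builds every lookup word from runs of consecutive terms, accumulating the break penalties of the nodes a word crosses and never extending across a phrase break. A simplified, self-contained sketch of that logic (note: Node here is a stand-in for the real QueryNode, not the actual API):

    from collections import defaultdict
    from dataclasses import dataclass

    @dataclass
    class Node:
        term: str              # transliterated term ending at this node
        penalty: float         # break penalty of this node
        phrase_break: bool = False

    def extract_words(nodes, base_penalty=0.0):
        words = defaultdict(list)
        for first in range(len(nodes)):
            word = nodes[first].term
            penalty = base_penalty
            words[word].append((first, first + 1, penalty))
            if nodes[first].phrase_break:
                continue
            # multi-term words, capped at roughly 20 terms as in the original
            for last in range(first + 1, min(first + 19, len(nodes))):
                word = ' '.join((word, nodes[last].term))
                penalty += nodes[last - 1].penalty
                words[word].append((first, last + 1, penalty))
                if nodes[last].phrase_break:
                    break
        return words

    # mirrors the expectations in test_query_extract_words:
    # '12 ab' picks up the 0.1 break penalty of the first node
    words = extract_words([Node('12', 0.1), Node('ab', 0.0),
                           Node('12', 0.0, phrase_break=True),
                           Node('hallo', 0.5)], base_penalty=1.0)
    print(words['12 ab'])      # [(0, 2, 1.1)]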