From 499110f549cc2c369ea3a0fb55a79a11bdc0352f Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Mon, 6 Jan 2025 17:10:24 +0100 Subject: [PATCH] add SOFT_PHRASE break and enable parsing Also enables parsing of PART breaks. --- src/nominatim_api/search/db_search_builder.py | 1 + src/nominatim_api/search/icu_tokenizer.py | 14 +++++++++++--- src/nominatim_api/search/query.py | 8 +++++++- src/nominatim_api/search/token_assignment.py | 1 + 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/nominatim_api/search/db_search_builder.py b/src/nominatim_api/search/db_search_builder.py index 632270ef..a6335c13 100644 --- a/src/nominatim_api/search/db_search_builder.py +++ b/src/nominatim_api/search/db_search_builder.py @@ -433,6 +433,7 @@ PENALTY_WORDCHANGE = { BreakType.START: 0.0, BreakType.END: 0.0, BreakType.PHRASE: 0.0, + BreakType.SOFT_PHRASE: 0.0, BreakType.WORD: 0.1, BreakType.PART: 0.2, BreakType.TOKEN: 0.4 diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py index 5976fbec..d52614fd 100644 --- a/src/nominatim_api/search/icu_tokenizer.py +++ b/src/nominatim_api/search/icu_tokenizer.py @@ -11,6 +11,8 @@ from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast from collections import defaultdict import dataclasses import difflib +import re +from itertools import zip_longest from icu import Transliterator @@ -242,16 +244,22 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): wordnr = 0 for phrase in query.source: query.nodes[-1].ptype = phrase.ptype - for word in phrase.text.split(' '): + phrase_split = re.split('([ :-])', phrase.text) + # The zip construct will give us the pairs of word/break from + # the regular expression split. As the split array ends on the + # final word, we simply use the fillvalue to even out the list and + # add the phrase break at the end. + for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue=','): + if not word: + continue trans = self.transliterator.transliterate(word) if trans: for term in trans.split(' '): if term: parts.append(QueryPart(term, word, wordnr)) query.add_node(qmod.BreakType.TOKEN, phrase.ptype) - query.nodes[-1].btype = qmod.BreakType.WORD + query.nodes[-1].btype = qmod.BreakType(breakchar) wordnr += 1 - query.nodes[-1].btype = qmod.BreakType.PHRASE for word, wrange in yield_words(parts, phrase_start): words[word].append(wrange) diff --git a/src/nominatim_api/search/query.py b/src/nominatim_api/search/query.py index 02ebbb5b..b2e18337 100644 --- a/src/nominatim_api/search/query.py +++ b/src/nominatim_api/search/query.py @@ -21,7 +21,13 @@ class BreakType(enum.Enum): END = '>' """ End of the query. """ PHRASE = ',' - """ Break between two phrases. """ + """ Hard break between two phrases. Address parts cannot cross hard + phrase boundaries.""" + SOFT_PHRASE = ':' + """ Likely break between two phrases. Address parts should not cross soft + phrase boundaries. Soft breaks can be inserted by a preprocessor + that is analysing the input string. + """ WORD = ' ' """ Break between words. """ PART = '-' diff --git a/src/nominatim_api/search/token_assignment.py b/src/nominatim_api/search/token_assignment.py index a2e1804c..0983fd13 100644 --- a/src/nominatim_api/search/token_assignment.py +++ b/src/nominatim_api/search/token_assignment.py @@ -27,6 +27,7 @@ PENALTY_TOKENCHANGE = { qmod.BreakType.START: 0.0, qmod.BreakType.END: 0.0, qmod.BreakType.PHRASE: 0.0, + qmod.BreakType.SOFT_PHRASE: 0.0, qmod.BreakType.WORD: 0.1, qmod.BreakType.PART: 0.2, qmod.BreakType.TOKEN: 0.4 -- 2.39.5