X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/9bc5be837b0bf366306ce526a1a15a2835d8db85..0f196952259baddb77bd1c60ffc3b5ef214da179:/nominatim/api/search/icu_tokenizer.py?ds=inline diff --git a/nominatim/api/search/icu_tokenizer.py b/nominatim/api/search/icu_tokenizer.py index 9bd16e1d..b68e8d10 100644 --- a/nominatim/api/search/icu_tokenizer.py +++ b/nominatim/api/search/icu_tokenizer.py @@ -83,7 +83,7 @@ class ICUToken(qmod.Token): seq = difflib.SequenceMatcher(a=self.lookup_word, b=norm) distance = 0 for tag, afrom, ato, bfrom, bto in seq.get_opcodes(): - if tag == 'delete' and (afrom == 0 or ato == len(self.lookup_word)): + if tag in ('delete', 'insert') and (afrom == 0 or ato == len(self.lookup_word)): distance += 1 elif tag == 'replace': distance += max((ato-afrom), (bto-bfrom)) @@ -133,10 +133,19 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): async def setup(self) -> None: """ Set up static data structures needed for the analysis. """ - rules = await self.conn.get_property('tokenizer_import_normalisation') - self.normalizer = Transliterator.createFromRules("normalization", rules) - rules = await self.conn.get_property('tokenizer_import_transliteration') - self.transliterator = Transliterator.createFromRules("transliteration", rules) + async def _make_normalizer() -> Any: + rules = await self.conn.get_property('tokenizer_import_normalisation') + return Transliterator.createFromRules("normalization", rules) + + self.normalizer = await self.conn.get_cached_value('ICUTOK', 'normalizer', + _make_normalizer) + + async def _make_transliterator() -> Any: + rules = await self.conn.get_property('tokenizer_import_transliteration') + return Transliterator.createFromRules("transliteration", rules) + + self.transliterator = await self.conn.get_cached_value('ICUTOK', 'transliterator', + _make_transliterator) if 'word' not in self.conn.t.meta.tables: sa.Table('word', self.conn.t.meta, @@ -153,7 +162,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): """ log().section('Analyze query (using ICU tokenizer)') normalized = list(filter(lambda p: p.text, - (qmod.Phrase(p.ptype, self.normalizer.transliterate(p.text)) + (qmod.Phrase(p.ptype, self.normalize_text(p.text)) for p in phrases))) query = qmod.QueryStruct(normalized) log().var_dump('Normalized query', query.source) @@ -187,6 +196,14 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): return query + def normalize_text(self, text: str) -> str: + """ Bring the given text into a normalized form. That is the + standardized form search will work with. All information removed + at this stage is inevitably lost. + """ + return cast(str, self.normalizer.transliterate(text)) + + def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]: """ Transliterate the phrases and split them into tokens.