X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/16daa57e4757e4daeffec1e61630f989727dc563..33b611e24313db83b7a5e175a4e11c5f66c4d25d:/nominatim/tokenizer/icu_tokenizer.py

diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 87906d71..12d1eccd 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -13,6 +13,7 @@ from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.indexer.place_info import PlaceInfo
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
@@ -107,7 +108,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
 
             Analyzers are not thread-safe. You need to instantiate one per thread.
         """
-        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis())
+        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
+                                     self.loader.make_token_analysis())
 
 
     def _install_php(self, phpdir):
@@ -162,7 +164,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         """ Count the partial terms from the names in the place table.
         """
         words = Counter()
-        name_proc = self.loader.make_token_analysis()
+        analysis = self.loader.make_token_analysis()
 
         with conn.cursor(name="words") as cur:
             cur.execute(""" SELECT v, count(*) FROM
@@ -170,12 +172,10 @@ class LegacyICUTokenizer(AbstractTokenizer):
                             WHERE length(v) < 75 GROUP BY v""")
 
             for name, cnt in cur:
-                terms = set()
-                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
-                    if ' ' in word:
-                        terms.update(word.split())
-                for term in terms:
-                    words[term] += cnt
+                word = analysis.search.transliterate(name)
+                if word and ' ' in word:
+                    for term in set(word.split()):
+                        words[term] += cnt
 
         return words
 
@@ -187,10 +187,11 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         normalization.
     """
 
-    def __init__(self, dsn, name_proc):
+    def __init__(self, dsn, sanitizer, token_analysis):
         self.conn = connect(dsn).connection
         self.conn.autocommit = True
-        self.name_processor = name_proc
+        self.sanitizer = sanitizer
+        self.token_analysis = token_analysis
 
         self._cache = _TokenCache()
 
@@ -203,6 +204,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
             self.conn = None
 
 
+    def _search_normalized(self, name):
+        """ Return the search token transliteration of the given name.
+        """
+        return self.token_analysis.search.transliterate(name).strip()
+
+
+    def _normalized(self, name):
+        """ Return the normalized version of the given name with all
+            non-relevant information removed.
+        """
+        return self.token_analysis.normalizer.transliterate(name).strip()
+
+
     def get_word_token_info(self, words):
         """ Return token information for the given list of words.
             If a word starts with # it is assumed to be a full name
@@ -218,9 +232,9 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         partial_tokens = {}
         for word in words:
             if word.startswith('#'):
-                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
+                full_tokens[word] = self._search_normalized(word[1:])
             else:
-                partial_tokens[word] = self.name_processor.get_search_normalized(word)
+                partial_tokens[word] = self._search_normalized(word)
 
         with self.conn.cursor() as cur:
             cur.execute("""SELECT word_token, word_id
@@ -251,7 +265,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
             This function takes minor shortcuts on transliteration.
         """
-        return self.name_processor.get_search_normalized(hnr)
+        return self._search_normalized(hnr)
 
     def update_postcodes_from_db(self):
         """ Update postcode tokens in the word table from the location_postcode
@@ -274,7 +288,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                     if postcode is None:
                         to_delete.append(word)
                     else:
-                        copystr.add(self.name_processor.get_search_normalized(postcode),
+                        copystr.add(self._search_normalized(postcode),
                                     'P', postcode)
 
             if to_delete:
@@ -292,7 +306,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
            completely replaced. Otherwise the phrases are added to the already
            existing ones.
        """
-        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                             for p in phrases))
 
        with self.conn.cursor() as cur:
@@ -322,7 +336,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
-                term = self.name_processor.get_search_normalized(word)
+                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
@@ -356,9 +370,21 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def add_country_names(self, country_code, names):
         """ Add names for the given country to the search index.
         """
+        # Make sure any name preprocessing for country names applies.
+        info = PlaceInfo({'name': names, 'country_code': country_code,
+                          'rank_address': 4, 'class': 'boundary',
+                          'type': 'administrative'})
+        self._add_country_full_names(country_code,
+                                     self.sanitizer.process_names(info)[0])
+
+
+    def _add_country_full_names(self, country_code, names):
+        """ Add names for the given country from an already sanitized
+            name list.
+        """
         word_tokens = set()
-        for name in self._compute_full_names(names):
-            norm_name = self.name_processor.get_search_normalized(name)
+        for name in names:
+            norm_name = self._search_normalized(name.name)
             if norm_name:
                 word_tokens.add(norm_name)
 
@@ -384,12 +410,12 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def process_place(self, place):
         """ Determine tokenizer information about the given place.
 
-            Returns a JSON-serialisable structure that will be handed into
+            Returns a JSON-serializable structure that will be handed into
             the database via the token_info field.
         """
         token_info = _TokenInfo(self._cache)
 
-        names = place.name
+        names, address = self.sanitizer.process_names(place)
         if names:
             fulls, partials = self._compute_name_tokens(names)
 
@@ -397,9 +423,8 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
             token_info.add_names(fulls, partials)
 
             if place.is_country():
-                self.add_country_names(place.country_code, names)
+                self._add_country_full_names(place.country_code, names)
 
-        address = place.address
         if address:
             self._process_place_address(token_info, address)
 
@@ -409,18 +434,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def _process_place_address(self, token_info, address):
         hnrs = []
         addr_terms = []
-        for key, value in address.items():
-            if key == 'postcode':
-                self._add_postcode(value)
-            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                hnrs.append(value)
-            elif key == 'street':
-                token_info.add_street(self._compute_partial_tokens(value))
-            elif key == 'place':
-                token_info.add_place(self._compute_partial_tokens(value))
-            elif not key.startswith('_') and \
-                 key not in ('country', 'full'):
-                addr_terms.append((key, self._compute_partial_tokens(value)))
+        for item in address:
+            if item.kind == 'postcode':
+                self._add_postcode(item.name)
+            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                hnrs.append(item.name)
+            elif item.kind == 'street':
+                token_info.add_street(self._compute_partial_tokens(item.name))
+            elif item.kind == 'place':
+                token_info.add_place(self._compute_partial_tokens(item.name))
+            elif not item.kind.startswith('_') and \
+                 item.kind not in ('country', 'full'):
+                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
 
         if hnrs:
             hnrs = self._split_housenumbers(hnrs)
@@ -429,11 +454,12 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         if addr_terms:
             token_info.add_address_terms(addr_terms)
 
+
     def _compute_partial_tokens(self, name):
         """ Normalize the given term, split it into partial words and return
             then token list for them.
         """
-        norm_name = self.name_processor.get_search_normalized(name)
+        norm_name = self._search_normalized(name)
 
         tokens = []
         need_lookup = []
@@ -456,28 +482,34 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
         return tokens
 
+
     def _compute_name_tokens(self, names):
         """ Computes the full name and partial name tokens for the given
             dictionary of names.
         """
-        full_names = self._compute_full_names(names)
         full_tokens = set()
         partial_tokens = set()
 
-        for name in full_names:
-            norm_name = self.name_processor.get_normalized(name)
-            full, part = self._cache.names.get(norm_name, (None, None))
+        for name in names:
+            analyzer_id = name.get_attr('analyzer')
+            norm_name = self._normalized(name.name)
+            if analyzer_id is None:
+                token_id = norm_name
+            else:
+                token_id = f'{norm_name}@{analyzer_id}'
+
+            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
-                variants = self.name_processor.get_variants_ascii(norm_name)
+                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
                if not variants:
                    continue
 
                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
-                                (norm_name, variants))
+                                (token_id, variants))
                    full, part = cur.fetchone()
 
-                self._cache.names[norm_name] = (full, part)
+                self._cache.names[token_id] = (full, part)
 
            full_tokens.add(full)
            partial_tokens.update(part)
@@ -485,23 +517,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
        return full_tokens, partial_tokens
 
 
-    @staticmethod
-    def _compute_full_names(names):
-        """ Return the set of all full name word ids to be used with the
-            given dictionary of names.
-        """
-        full_names = set()
-        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
-            if name:
-                full_names.add(name)
-
-                brace_idx = name.find('(')
-                if brace_idx >= 0:
-                    full_names.add(name[:brace_idx].strip())
-
-        return full_names
-
-
     def _add_postcode(self, postcode):
         """ Make sure the normalized postcode is present in the word table.
         """
@@ -509,7 +524,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
             postcode = self.normalize_postcode(postcode)
 
             if postcode not in self._cache.postcodes:
-                term = self.name_processor.get_search_normalized(postcode)
+                term = self._search_normalized(postcode)
                 if not term:
                     return