X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/243725aae1aa35e4cd3bf2dc241828ca9e9d79e5..cfbd3652eff7b962424de8f5d078cd930b798e86:/nominatim/tokenizer/icu_tokenizer.py diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 9600e65b..1799ae86 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -119,12 +119,13 @@ class LegacyICUTokenizer(AbstractTokenizer): if not conn.table_exists('search_name'): return with conn.cursor(name="hnr_counter") as cur: - cur.execute("""SELECT word_id, word_token FROM word + cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token) + FROM word WHERE type = 'H' AND NOT EXISTS(SELECT * FROM search_name WHERE ARRAY[word.word_id] && name_vector) - AND (char_length(word_token) > 6 - OR word_token not similar to '\\d+') + AND (char_length(coalesce(word, word_token)) > 6 + OR coalesce(word, word_token) not similar to '\\d+') """) candidates = {token: wid for wid, token in cur} with conn.cursor(name="hnr_counter") as cur: @@ -137,6 +138,7 @@ class LegacyICUTokenizer(AbstractTokenizer): for hnr in row[0].split(';'): candidates.pop(hnr, None) LOG.info("There are %s outdated housenumbers.", len(candidates)) + LOG.debug("Outdated housenumbers: %s", candidates.keys()) if candidates: with conn.cursor() as cur: cur.execute("""DELETE FROM word WHERE word_id = any(%s)""", @@ -454,9 +456,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): names, address = self.sanitizer.process_names(place) if names: - fulls, partials = self._compute_name_tokens(names) - - token_info.add_names(fulls, partials) + token_info.set_names(*self._compute_name_tokens(names)) if place.is_country(): self._add_country_full_names(place.country_code, names) @@ -464,57 +464,59 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): if address: self._process_place_address(token_info, address) - return token_info.data + return token_info.to_dict() def _process_place_address(self, token_info, address): - hnr_tokens = set() - hnrs = set() - addr_terms = [] - streets = [] for item in address: if item.kind == 'postcode': self._add_postcode(item.name) elif item.kind == 'housenumber': - token, hnr = self._compute_housenumber_token(item) - if token is not None: - hnr_tokens.add(token) - hnrs.add(hnr) + token_info.add_housenumber(*self._compute_housenumber_token(item)) elif item.kind == 'street': - streets.extend(self._retrieve_full_tokens(item.name)) + token_info.add_street(self._retrieve_full_tokens(item.name)) elif item.kind == 'place': if not item.suffix: token_info.add_place(self._compute_partial_tokens(item.name)) elif not item.kind.startswith('_') and not item.suffix and \ item.kind not in ('country', 'full'): - addr_terms.append((item.kind, self._compute_partial_tokens(item.name))) - - if hnrs: - token_info.add_housenumbers(hnr_tokens, hnrs) - - if addr_terms: - token_info.add_address_terms(addr_terms) - - if streets: - token_info.add_street(streets) + token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name)) def _compute_housenumber_token(self, hnr): """ Normalize the housenumber and return the word token and the canonical form. """ - norm_name = self._search_normalized(hnr.name) - if not norm_name: - return None, None + analyzer = self.token_analysis.analysis.get('@housenumber') + result = None, None - token = self._cache.housenumbers.get(norm_name) - if token is None: - with self.conn.cursor() as cur: - cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, )) - token = cur.fetchone()[0] - self._cache.housenumbers[norm_name] = token + if analyzer is None: + # When no custom analyzer is set, simply normalize and transliterate + norm_name = self._search_normalized(hnr.name) + if norm_name: + result = self._cache.housenumbers.get(norm_name, result) + if result[0] is None: + with self.conn.cursor() as cur: + cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, )) + result = cur.fetchone()[0], norm_name + self._cache.housenumbers[norm_name] = result + else: + # Otherwise use the analyzer to determine the canonical name. + # Per convention we use the first variant as the 'lookup name', the + # name that gets saved in the housenumber field of the place. + norm_name = analyzer.normalize(hnr.name) + if norm_name: + result = self._cache.housenumbers.get(norm_name, result) + if result[0] is None: + variants = analyzer.get_variants_ascii(norm_name) + if variants: + with self.conn.cursor() as cur: + cur.execute("SELECT create_analyzed_hnr_id(%s, %s)", + (norm_name, list(variants))) + result = cur.fetchone()[0], variants[0] + self._cache.housenumbers[norm_name] = result - return token, norm_name + return result def _compute_partial_tokens(self, name): @@ -588,7 +590,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): continue with self.conn.cursor() as cur: - cur.execute("SELECT (getorcreate_full_word(%s, %s)).*", + cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)", (token_id, variants)) full, part = cur.fetchone() @@ -626,48 +628,75 @@ class _TokenInfo: """ Collect token information to be sent back to the database. """ def __init__(self): - self.data = {} + self.names = None + self.housenumbers = set() + self.housenumber_tokens = set() + self.street_tokens = set() + self.place_tokens = set() + self.address_tokens = {} + @staticmethod def _mk_array(tokens): - return '{%s}' % ','.join((str(s) for s in tokens)) + return f"{{{','.join((str(s) for s in tokens))}}}" + + def to_dict(self): + """ Return the token information in database importable format. + """ + out = {} + + if self.names: + out['names'] = self.names + + if self.housenumbers: + out['hnr'] = ';'.join(self.housenumbers) + out['hnr_tokens'] = self._mk_array(self.housenumber_tokens) + + if self.street_tokens: + out['street'] = self._mk_array(self.street_tokens) + + if self.place_tokens: + out['place'] = self._mk_array(self.place_tokens) - def add_names(self, fulls, partials): + if self.address_tokens: + out['addr'] = self.address_tokens + + return out + + + def set_names(self, fulls, partials): """ Adds token information for the normalised names. """ - self.data['names'] = self._mk_array(itertools.chain(fulls, partials)) + self.names = self._mk_array(itertools.chain(fulls, partials)) - def add_housenumbers(self, tokens, hnrs): + def add_housenumber(self, token, hnr): """ Extract housenumber information from a list of normalised housenumbers. """ - self.data['hnr_tokens'] = self._mk_array(tokens) - self.data['hnr'] = ';'.join(hnrs) + if token: + self.housenumbers.add(hnr) + self.housenumber_tokens.add(token) def add_street(self, tokens): """ Add addr:street match terms. """ - self.data['street'] = self._mk_array(tokens) + self.street_tokens.update(tokens) def add_place(self, tokens): """ Add addr:place search and match terms. """ - if tokens: - self.data['place'] = self._mk_array(tokens) + self.place_tokens.update(tokens) - def add_address_terms(self, terms): + def add_address_term(self, key, partials): """ Add additional address terms. """ - tokens = {key: self._mk_array(partials) - for key, partials in terms if partials} - - if tokens: - self.data['addr'] = tokens + if partials: + self.address_tokens[key] = self._mk_array(partials) class _TokenCache: