diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index ea6e5d3c..9c25b6d7 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -1,3 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
 """
 Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
@@ -106,6 +112,47 @@ class LegacyICUTokenizer(AbstractTokenizer):
             conn.commit()
 
 
+    def _cleanup_housenumbers(self):
+        """ Remove unused house numbers.
+        """
+        with connect(self.dsn) as conn:
+            if not conn.table_exists('search_name'):
+                return
+            with conn.cursor(name="hnr_counter") as cur:
+                cur.execute("""SELECT word_id, word_token FROM word
+                               WHERE type = 'H'
+                                 AND NOT EXISTS(SELECT * FROM search_name
+                                                WHERE ARRAY[word.word_id] && name_vector)
+                                 AND (char_length(word_token) > 6
+                                      OR word_token not similar to '\\d+')
+                            """)
+                candidates = {token: wid for wid, token in cur}
+            with conn.cursor(name="hnr_counter") as cur:
+                cur.execute("""SELECT housenumber FROM placex
+                               WHERE housenumber is not null
+                                     AND (char_length(housenumber) > 6
+                                          OR housenumber not similar to '\\d+')
+                            """)
+                for row in cur:
+                    for hnr in row[0].split(';'):
+                        candidates.pop(hnr, None)
+            LOG.info("There are %s outdated housenumbers.", len(candidates))
+            if candidates:
+                with conn.cursor() as cur:
+                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
+                                (list(candidates.values()), ))
+                conn.commit()
+
+
+
+    def update_word_tokens(self):
+        """ Remove unused tokens.
+        """
+        LOG.warning("Cleaning up housenumber tokens.")
+        self._cleanup_housenumbers()
+        LOG.warning("Tokenizer house-keeping done.")
+
+
     def name_analyzer(self):
         """ Create a new analyzer for tokenizing names and queries
             using this tokenizer. Analyzers are context managers and should
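
The cleanup above collects deletion candidates from the word table and then strikes off every house number still referenced by a placex row. For illustration, the pruning step amounts to the following self-contained sketch; plain Python data stands in for the word and placex tables and all values are hypothetical:

    def prune_housenumber_candidates(word_rows, placex_housenumbers):
        """ Return the word_ids of housenumber tokens that no longer
            occur in any placex row.

            word_rows: iterable of (word_id, word_token) pairs
            placex_housenumbers: iterable of ';'-separated housenumber strings
        """
        candidates = {token: wid for wid, token in word_rows}
        for value in placex_housenumbers:
            for hnr in value.split(';'):
                candidates.pop(hnr, None)  # still referenced, keep its token
        return list(candidates.values())

    # 'block 7' is no longer used by any place, so only token 2 is deleted
    assert prune_housenumber_candidates([(1, '12 a'), (2, 'block 7')],
                                        ['12 a;14 b']) == [2]
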
@@ -343,17 +390,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
 
     def add_country_names(self, country_code, names):
-        """ Add names for the given country to the search index.
+        """ Add default names for the given country to the search index.
         """
         # Make sure any name preprocessing for country names applies.
         info = PlaceInfo({'name': names, 'country_code': country_code,
                           'rank_address': 4, 'class': 'boundary',
                           'type': 'administrative'})
         self._add_country_full_names(country_code,
-                                     self.sanitizer.process_names(info)[0])
+                                     self.sanitizer.process_names(info)[0],
+                                     internal=True)
 
 
-    def _add_country_full_names(self, country_code, names):
+    def _add_country_full_names(self, country_code, names, internal=False):
         """ Add names for the given country from an already sanitized
             name list.
         """
@@ -365,21 +413,41 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
         with self.conn.cursor() as cur:
             # Get existing names
-            cur.execute("""SELECT word_token FROM word
-                           WHERE type = 'C' and word = %s""",
+            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
+                           FROM word
+                           WHERE type = 'C' and word = %s""",
                         (country_code, ))
-            word_tokens.difference_update((t[0] for t in cur))
+            existing_tokens = {True: set(), False: set()}  # internal/external names
+            for word in cur:
+                existing_tokens[word[1]].add(word[0])
+
+            # Delete names that no longer exist.
+            gone_tokens = existing_tokens[internal] - word_tokens
+            if internal:
+                gone_tokens.update(existing_tokens[False] & word_tokens)
+            if gone_tokens:
+                cur.execute("""DELETE FROM word
+                               USING unnest(%s) as token
+                               WHERE type = 'C' and word = %s
+                                     and word_token = token""",
+                            (list(gone_tokens), country_code))
 
             # Only add those names that are not yet in the list.
-            if word_tokens:
-                cur.execute("""INSERT INTO word (word_token, type, word)
-                               (SELECT token, 'C', %s
-                                FROM unnest(%s) as token)
-                            """, (country_code, list(word_tokens)))
-
-            # No names are deleted at the moment.
-            # If deletion is made possible, then the static names from the
-            # initial 'country_name' table should be kept.
+            new_tokens = word_tokens - existing_tokens[True]
+            if not internal:
+                new_tokens -= existing_tokens[False]
+            if new_tokens:
+                if internal:
+                    sql = """INSERT INTO word (word_token, type, word, info)
+                             (SELECT token, 'C', %s, '{"internal": "yes"}'
+                                FROM unnest(%s) as token)
+                          """
+                else:
+                    sql = """INSERT INTO word (word_token, type, word)
+                             (SELECT token, 'C', %s
+                                FROM unnest(%s) as token)
+                          """
+                cur.execute(sql, (country_code, list(new_tokens)))
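
The update above distinguishes internally generated country names (marked with "internal" in the info column) from externally supplied ones and keeps both classes in sync: an internal import may replace external duplicates, while an external update never touches internal rows. A pure-Python sketch of the set arithmetic, with sets standing in for the type = 'C' rows of the word table (hypothetical data, no database access):

    def diff_country_tokens(existing_internal, existing_external,
                            word_tokens, internal):
        """ Return (tokens_to_delete, tokens_to_insert) for one country. """
        # Delete tokens of the same class that are no longer in the new list.
        gone = set(existing_internal if internal else existing_external)
        gone -= word_tokens
        if internal:
            # An internal import also replaces matching external entries.
            gone |= existing_external & word_tokens
        # Insert only tokens that are not yet present.
        new = word_tokens - existing_internal
        if not internal:
            new -= existing_external
        return gone, new

    # An internal import of {'germany', 'de'} deletes the stale internal
    # token and the external duplicate, then re-inserts both as internal.
    assert diff_country_tokens({'deutschland'}, {'germany'},
                               {'germany', 'de'}, internal=True) \
           == ({'deutschland', 'germany'}, {'germany', 'de'})
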
@@ -407,28 +475,34 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
 
     def _process_place_address(self, token_info, address):
-        hnrs = []
+        hnrs = set()
         addr_terms = []
+        streets = []
         for item in address:
             if item.kind == 'postcode':
                 self._add_postcode(item.name)
-            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                hnrs.append(item.name)
+            elif item.kind == 'housenumber':
+                norm_name = self._make_standard_hnr(item.name)
+                if norm_name:
+                    hnrs.add(norm_name)
             elif item.kind == 'street':
-                token_info.add_street(self._compute_partial_tokens(item.name))
+                streets.extend(self._retrieve_full_tokens(item.name))
             elif item.kind == 'place':
-                token_info.add_place(self._compute_partial_tokens(item.name))
-            elif not item.kind.startswith('_') and \
+                if not item.suffix:
+                    token_info.add_place(self._compute_partial_tokens(item.name))
+            elif not item.kind.startswith('_') and not item.suffix and \
                  item.kind not in ('country', 'full'):
                 addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
 
         if hnrs:
-            hnrs = self._split_housenumbers(hnrs)
-            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
+            token_info.add_housenumbers(self.conn, hnrs)
 
         if addr_terms:
             token_info.add_address_terms(addr_terms)
 
+        if streets:
+            token_info.add_street(streets)
+
 
     def _compute_partial_tokens(self, name):
         """ Normalize the given term, split it into partial words and return
@@ -458,6 +532,26 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         return tokens
 
 
+    def _retrieve_full_tokens(self, name):
+        """ Get the full name token for the given name, if it exists.
+            The name is only retrieved for the standard analyser.
+        """
+        norm_name = self._search_normalized(name)
+
+        # return cached if possible
+        if norm_name in self._cache.fulls:
+            return self._cache.fulls[norm_name]
+
+        with self.conn.cursor() as cur:
+            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
+                        (norm_name, ))
+            full = [row[0] for row in cur]
+
+        self._cache.fulls[norm_name] = full
+
+        return full
+
+
     def _compute_name_tokens(self, names):
         """ Computes the full name and partial name tokens for
             the given dictionary of names.
@@ -514,24 +608,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
             self._cache.postcodes.add(postcode)
 
-
-    @staticmethod
-    def _split_housenumbers(hnrs):
-        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
-            # split numbers if necessary
-            simple_list = []
-            for hnr in hnrs:
-                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
-
-            if len(simple_list) > 1:
-                hnrs = list(set(simple_list))
-            else:
-                hnrs = simple_list
-
-        return hnrs
-
-
-
 class _TokenInfo:
     """ Collect token information to be sent back to the database.
     """
@@ -561,8 +637,7 @@ class _TokenInfo:
     def add_street(self, tokens):
         """ Add addr:street match terms.
         """
-        if tokens:
-            self.data['street'] = self._mk_array(tokens)
+        self.data['street'] = self._mk_array(tokens)
 
 
     def add_place(self, tokens):
@@ -591,6 +666,7 @@ class _TokenCache:
     def __init__(self):
         self.names = {}
         self.partials = {}
+        self.fulls = {}
         self.postcodes = set()
         self.housenumbers = {}
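
The new fulls dictionary gives _retrieve_full_tokens a per-analyzer read-through cache alongside the existing names and partials caches. A self-contained sketch of the pattern, where db_lookup is a hypothetical stand-in for the SELECT on the word table:

    class FullTokenCache:
        def __init__(self, db_lookup):
            self.fulls = {}            # normalized name -> list of word_ids
            self._db_lookup = db_lookup

        def get(self, norm_name):
            if norm_name not in self.fulls:
                # first lookup for this name: go to the database once
                self.fulls[norm_name] = self._db_lookup(norm_name)
            return self.fulls[norm_name]

    calls = []
    cache = FullTokenCache(lambda name: calls.append(name) or [42])

    assert cache.get('main street') == [42]
    assert cache.get('main street') == [42]  # served from the cache
    assert calls == ['main street']          # the database was hit only once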