X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/b8c544cc98bec8174d28cf6895dd47ddb9c7b57b..1fcc9717bb6c543aa6e6cd7b5d0a65971dfec409:/nominatim/tokenizer/icu_tokenizer.py diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 3ce4895b..1799ae86 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -119,12 +119,13 @@ class LegacyICUTokenizer(AbstractTokenizer): if not conn.table_exists('search_name'): return with conn.cursor(name="hnr_counter") as cur: - cur.execute("""SELECT word_id, word_token FROM word + cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token) + FROM word WHERE type = 'H' AND NOT EXISTS(SELECT * FROM search_name WHERE ARRAY[word.word_id] && name_vector) - AND (char_length(word_token) > 6 - OR word_token not similar to '\\d+') + AND (char_length(coalesce(word, word_token)) > 6 + OR coalesce(word, word_token) not similar to '\\d+') """) candidates = {token: wid for wid, token in cur} with conn.cursor(name="hnr_counter") as cur: @@ -137,6 +138,7 @@ class LegacyICUTokenizer(AbstractTokenizer): for hnr in row[0].split(';'): candidates.pop(hnr, None) LOG.info("There are %s outdated housenumbers.", len(candidates)) + LOG.debug("Outdated housenumbers: %s", candidates.keys()) if candidates: with conn.cursor() as cur: cur.execute("""DELETE FROM word WHERE word_id = any(%s)""", @@ -485,18 +487,36 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): """ Normalize the housenumber and return the word token and the canonical form. """ - norm_name = self._search_normalized(hnr.name) - if not norm_name: - return None, None + analyzer = self.token_analysis.analysis.get('@housenumber') + result = None, None - token = self._cache.housenumbers.get(norm_name) - if token is None: - with self.conn.cursor() as cur: - cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, )) - token = cur.fetchone()[0] - self._cache.housenumbers[norm_name] = token + if analyzer is None: + # When no custom analyzer is set, simply normalize and transliterate + norm_name = self._search_normalized(hnr.name) + if norm_name: + result = self._cache.housenumbers.get(norm_name, result) + if result[0] is None: + with self.conn.cursor() as cur: + cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, )) + result = cur.fetchone()[0], norm_name + self._cache.housenumbers[norm_name] = result + else: + # Otherwise use the analyzer to determine the canonical name. + # Per convention we use the first variant as the 'lookup name', the + # name that gets saved in the housenumber field of the place. + norm_name = analyzer.normalize(hnr.name) + if norm_name: + result = self._cache.housenumbers.get(norm_name, result) + if result[0] is None: + variants = analyzer.get_variants_ascii(norm_name) + if variants: + with self.conn.cursor() as cur: + cur.execute("SELECT create_analyzed_hnr_id(%s, %s)", + (norm_name, list(variants))) + result = cur.fetchone()[0], variants[0] + self._cache.housenumbers[norm_name] = result - return token, norm_name + return result def _compute_partial_tokens(self, name): @@ -570,7 +590,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): continue with self.conn.cursor() as cur: - cur.execute("SELECT (getorcreate_full_word(%s, %s)).*", + cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)", (token_id, variants)) full, part = cur.fetchone()