X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/ca7b46511d41d67e229f758e638367c241815c11..a6dab5e300de9c5664f714eb2c9290d18f01067f:/nominatim/tokenizer/icu_tokenizer.py diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index e9812ba0..171d4392 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -16,7 +16,7 @@ from textwrap import dedent from nominatim.db.connection import connect from nominatim.db.utils import CopyBuffer from nominatim.db.sql_preprocessor import SQLPreprocessor -from nominatim.indexer.place_info import PlaceInfo +from nominatim.data.place_info import PlaceInfo from nominatim.tokenizer.icu_rule_loader import ICURuleLoader from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer @@ -290,33 +290,72 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): """ Update postcode tokens in the word table from the location_postcode table. """ - to_delete = [] + analyzer = self.token_analysis.analysis.get('@postcode') + with self.conn.cursor() as cur: - # This finds us the rows in location_postcode and word that are - # missing in the other table. - cur.execute("""SELECT * FROM - (SELECT pc, word FROM - (SELECT distinct(postcode) as pc FROM location_postcode) p - FULL JOIN - (SELECT word FROM word WHERE type = 'P') w - ON pc = word) x - WHERE pc is null or word is null""") - - with CopyBuffer() as copystr: - for postcode, word in cur: - if postcode is None: - to_delete.append(word) - else: - copystr.add(self._search_normalized(postcode), - 'P', postcode) - - if to_delete: - cur.execute("""DELETE FROM WORD - WHERE type ='P' and word = any(%s) - """, (to_delete, )) - - copystr.copy_out(cur, 'word', - columns=['word_token', 'type', 'word']) + # First get all postcode names currently in the word table. + cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'") + word_entries = set((entry[0] for entry in cur)) + + # Then compute the required postcode names from the postcode table. + needed_entries = set() + cur.execute("SELECT country_code, postcode FROM location_postcode") + for cc, postcode in cur: + info = PlaceInfo({'country_code': cc, + 'class': 'place', 'type': 'postcode', + 'address': {'postcode': postcode}}) + address = self.sanitizer.process_names(info)[1] + for place in address: + if place.kind == 'postcode': + if analyzer is None: + postcode_name = place.name.strip().upper() + variant_base = None + else: + postcode_name = analyzer.normalize(place.name) + variant_base = place.get_attr("variant") + + if variant_base: + needed_entries.add(f'{postcode_name}@{variant_base}') + else: + needed_entries.add(postcode_name) + break + + # Now update the word table. + self._delete_unused_postcode_words(word_entries - needed_entries) + self._add_missing_postcode_words(needed_entries - word_entries) + + def _delete_unused_postcode_words(self, tokens): + if tokens: + with self.conn.cursor() as cur: + cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)", + (list(tokens), )) + + def _add_missing_postcode_words(self, tokens): + if not tokens: + return + + analyzer = self.token_analysis.analysis.get('@postcode') + terms = [] + + for postcode_name in tokens: + if '@' in postcode_name: + term, variant = postcode_name.split('@', 2) + term = self._search_normalized(term) + variants = {term} + if analyzer is not None: + variants.update(analyzer.get_variants_ascii(variant)) + variants = list(variants) + else: + variants = [self._search_normalized(postcode_name)] + terms.append((postcode_name, variants)) + + if terms: + with self.conn.cursor() as cur: + cur.execute_values("""SELECT create_postcode_word(pc, var) + FROM (VALUES %s) AS v(pc, var)""", + terms) + + def update_special_phrases(self, phrases, should_replace): @@ -607,7 +646,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): def _add_postcode(self, item): """ Make sure the normalized postcode is present in the word table. """ - analyzer = self.token_analysis.get_analyzer('@postcode') + analyzer = self.token_analysis.analysis.get('@postcode') if analyzer is None: postcode_name = item.name.strip().upper() @@ -616,7 +655,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): postcode_name = analyzer.normalize(item.name) variant_base = item.get_attr("variant") - if variant_base is not None: + if variant_base: postcode = f'{postcode_name}@{variant_base}' else: postcode = postcode_name @@ -624,10 +663,10 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): if postcode not in self._cache.postcodes: term = self._search_normalized(postcode_name) if not term: - return + return None variants = {term} - if analyzer is not None and variant_base is not None: + if analyzer is not None and variant_base: variants.update(analyzer.get_variants_ascii(variant_base)) with self.conn.cursor() as cur: @@ -635,6 +674,8 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): (postcode, list(variants))) self._cache.postcodes.add(postcode) + return postcode_name + class _TokenInfo: """ Collect token information to be sent back to the database. @@ -675,6 +716,9 @@ class _TokenInfo: if self.address_tokens: out['addr'] = self.address_tokens + if self.postcode: + out['postcode'] = self.postcode + return out