From: Sarah Hoffmann Date: Sat, 1 Mar 2025 09:20:33 +0000 (+0100) Subject: remove postcode computation for word table during import X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/a574b98e4a59ae2460cce406e083c5c25fa15caa remove postcode computation for word table during import --- diff --git a/src/nominatim_db/tokenizer/icu_tokenizer.py b/src/nominatim_db/tokenizer/icu_tokenizer.py index 5595fcb2..3da1171f 100644 --- a/src/nominatim_db/tokenizer/icu_tokenizer.py +++ b/src/nominatim_db/tokenizer/icu_tokenizer.py @@ -381,76 +381,15 @@ class ICUNameAnalyzer(AbstractAnalyzer): return postcode.strip().upper() def update_postcodes_from_db(self) -> None: - """ Update postcode tokens in the word table from the location_postcode - table. + """ Postcode update. + + Removes all postcodes from the word table because they are not + needed. Postcodes are recognised by pattern. """ assert self.conn is not None - analyzer = self.token_analysis.analysis.get('@postcode') with self.conn.cursor() as cur: - # First get all postcode names currently in the word table. - cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'") - word_entries = set((entry[0] for entry in cur)) - - # Then compute the required postcode names from the postcode table. - needed_entries = set() - cur.execute("SELECT country_code, postcode FROM location_postcode") - for cc, postcode in cur: - info = PlaceInfo({'country_code': cc, - 'class': 'place', 'type': 'postcode', - 'address': {'postcode': postcode}}) - address = self.sanitizer.process_names(info)[1] - for place in address: - if place.kind == 'postcode': - if analyzer is None: - postcode_name = place.name.strip().upper() - variant_base = None - else: - postcode_name = analyzer.get_canonical_id(place) - variant_base = place.get_attr("variant") - - if variant_base: - needed_entries.add(f'{postcode_name}@{variant_base}') - else: - needed_entries.add(postcode_name) - break - - # Now update the word table. - self._delete_unused_postcode_words(word_entries - needed_entries) - self._add_missing_postcode_words(needed_entries - word_entries) - - def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None: - assert self.conn is not None - if tokens: - with self.conn.cursor() as cur: - cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)", - (list(tokens), )) - - def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None: - assert self.conn is not None - if not tokens: - return - - analyzer = self.token_analysis.analysis.get('@postcode') - terms = [] - - for postcode_name in tokens: - if '@' in postcode_name: - term, variant = postcode_name.split('@', 2) - term = self._search_normalized(term) - if analyzer is None: - variants = [term] - else: - variants = analyzer.compute_variants(variant) - if term not in variants: - variants.append(term) - else: - variants = [self._search_normalized(postcode_name)] - terms.append((postcode_name, variants)) - - if terms: - with self.conn.cursor() as cur: - cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms) + cur.execute("DELETE FROM word WHERE type = 'P'") def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]], should_replace: bool) -> None: @@ -718,32 +657,9 @@ class ICUNameAnalyzer(AbstractAnalyzer): analyzer = self.token_analysis.analysis.get('@postcode') if analyzer is None: - postcode_name = item.name.strip().upper() - variant_base = None - else: - postcode_name = analyzer.get_canonical_id(item) - variant_base = item.get_attr("variant") - - if variant_base: - postcode = f'{postcode_name}@{variant_base}' + return item.name.strip().upper() else: - postcode = postcode_name - - if postcode not in self._cache.postcodes: - term = self._search_normalized(postcode_name) - if not term: - return None - - variants = {term} - if analyzer is not None and variant_base: - variants.update(analyzer.compute_variants(variant_base)) - - with self.conn.cursor() as cur: - cur.execute("SELECT create_postcode_word(%s, %s)", - (postcode, list(variants))) - self._cache.postcodes.add(postcode) - - return postcode_name + return analyzer.get_canonical_id(item) class _TokenInfo: @@ -836,5 +752,4 @@ class _TokenCache: self.names: Dict[str, Tuple[int, List[int]]] = {} self.partials: Dict[str, int] = {} self.fulls: Dict[str, List[int]] = {} - self.postcodes: Set[str] = set() self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {} diff --git a/test/python/api/search/test_icu_query_analyzer.py b/test/python/api/search/test_icu_query_analyzer.py index eb453fda..fc200bca 100644 --- a/test/python/api/search/test_icu_query_analyzer.py +++ b/test/python/api/search/test_icu_query_analyzer.py @@ -102,12 +102,11 @@ async def test_splitting_in_transliteration(conn): @pytest.mark.asyncio @pytest.mark.parametrize('term,order', [('23456', ['P', 'H', 'W', 'w']), - ('3', ['H', 'P', 'W', 'w']) + ('3', ['H', 'W', 'w']) ]) async def test_penalty_postcodes_and_housenumbers(conn, term, order): ana = await tok.create_query_analyzer(conn) - await add_word(conn, 1, term, 'P', None) await add_word(conn, 2, term, 'H', term) await add_word(conn, 3, term, 'w', term) await add_word(conn, 4, term, 'W', term) @@ -179,8 +178,10 @@ async def test_add_unknown_housenumbers(conn): assert query.nodes[1].starting[0].ttype == qmod.TOKEN_HOUSENUMBER assert len(query.nodes[1].starting[0].tokens) == 1 assert query.nodes[1].starting[0].tokens[0].token == 1 - assert not query.nodes[2].starting - assert not query.nodes[3].starting + assert query.nodes[2].has_tokens(3, qmod.TOKEN_POSTCODE) + assert not query.nodes[2].has_tokens(3, qmod.TOKEN_HOUSENUMBER) + assert not query.nodes[2].has_tokens(4, qmod.TOKEN_HOUSENUMBER) + assert not query.nodes[3].has_tokens(4, qmod.TOKEN_HOUSENUMBER) @pytest.mark.asyncio diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py index a2bf6766..06a3cd6c 100644 --- a/test/python/tokenizer/test_icu.py +++ b/test/python/tokenizer/test_icu.py @@ -265,37 +265,13 @@ class TestPostcodes: 'address': {'postcode': postcode}})) - def test_update_postcodes_from_db_empty(self, table_factory, word_table): - table_factory('location_postcode', 'country_code TEXT, postcode TEXT', - content=(('de', '12345'), ('se', '132 34'), - ('bm', 'AB23'), ('fr', '12345'))) - - self.analyzer.update_postcodes_from_db() - - assert word_table.count() == 5 - assert word_table.get_postcodes() == {'12345', '132 34@132 34', 'AB 23@AB 23'} - - - def test_update_postcodes_from_db_ambigious(self, table_factory, word_table): - table_factory('location_postcode', 'country_code TEXT, postcode TEXT', - content=(('in', '123456'), ('sg', '123456'))) - - self.analyzer.update_postcodes_from_db() - - assert word_table.count() == 3 - assert word_table.get_postcodes() == {'123456', '123456@123 456'} - - - def test_update_postcodes_from_db_add_and_remove(self, table_factory, word_table): - table_factory('location_postcode', 'country_code TEXT, postcode TEXT', - content=(('ch', '1234'), ('bm', 'BC 45'), ('bm', 'XX45'))) + def test_update_postcodes_deleted(self, word_table): word_table.add_postcode(' 1234', '1234') word_table.add_postcode(' 5678', '5678') self.analyzer.update_postcodes_from_db() - assert word_table.count() == 5 - assert word_table.get_postcodes() == {'1234', 'BC 45@BC 45', 'XX 45@XX 45'} + assert word_table.count() == 0 def test_process_place_postcode_simple(self, word_table): @@ -303,16 +279,12 @@ class TestPostcodes: assert info['postcode'] == '12345' - assert word_table.get_postcodes() == {'12345', } - def test_process_place_postcode_with_space(self, word_table): info = self.process_postcode('in', '123 567') assert info['postcode'] == '123567' - assert word_table.get_postcodes() == {'123567@123 567', } - def test_update_special_phrase_empty_table(analyzer, word_table): @@ -477,9 +449,9 @@ class TestPlaceAddress: @pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345']) def test_process_place_postcode(self, word_table, pcode): - self.process_address(postcode=pcode) + info = self.process_address(postcode=pcode) - assert word_table.get_postcodes() == {pcode, } + assert info['postcode'] == pcode @pytest.mark.parametrize('hnr', ['123a', '1', '101'])