X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/b90e719da595b6760b39b7cd64ee29447de2d5e8..efafa5271957fb54b356ec1c90e8613f14de40d4:/test/python/tokenizer/test_icu.py diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py index 5dbe292e..a19578c9 100644 --- a/test/python/tokenizer/test_icu.py +++ b/test/python/tokenizer/test_icu.py @@ -1,5 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. """ -Tests for Legacy ICU tokenizer. +Tests for ICU tokenizer. """ import shutil import yaml @@ -20,20 +26,17 @@ def word_table(temp_db_conn): @pytest.fixture -def test_config(def_config, tmp_path): - def_config.project_dir = tmp_path / 'project' - def_config.project_dir.mkdir() - +def test_config(project_env, tmp_path): sqldir = tmp_path / 'sql' sqldir.mkdir() (sqldir / 'tokenizer').mkdir() (sqldir / 'tokenizer' / 'icu_tokenizer.sql').write_text("SELECT 'a'") - shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer_tables.sql'), + shutil.copy(str(project_env.lib_dir.sql / 'tokenizer' / 'icu_tokenizer_tables.sql'), str(sqldir / 'tokenizer' / 'icu_tokenizer_tables.sql')) - def_config.lib_dir.sql = sqldir + project_env.lib_dir.sql = sqldir - return def_config + return project_env @pytest.fixture @@ -144,12 +147,6 @@ LANGUAGE plpgsql; """) -@pytest.fixture -def getorcreate_hnr_id(temp_db_cursor): - temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT) - RETURNS INTEGER AS $$ - SELECT -nextval('seq_word')::INTEGER; $$ LANGUAGE SQL""") - def test_init_new(tokenizer_factory, test_config, db_prop): tok = tokenizer_factory() @@ -197,6 +194,47 @@ def test_update_sql_functions(db_prop, temp_db_cursor, assert test_content == set((('1133', ), )) +def test_finalize_import(tokenizer_factory, temp_db_conn, + temp_db_cursor, test_config, sql_preprocessor_cfg): + func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_indices.sql' + func_file.write_text("""CREATE FUNCTION test() RETURNS TEXT + AS $$ SELECT 'b'::text $$ LANGUAGE SQL""") + + tok = tokenizer_factory() + tok.init_new_db(test_config) + + tok.finalize_import(test_config) + + temp_db_cursor.scalar('SELECT test()') == 'b' + + +def test_check_database(test_config, tokenizer_factory, + temp_db_cursor, sql_preprocessor_cfg): + tok = tokenizer_factory() + tok.init_new_db(test_config) + + assert tok.check_database(test_config) is None + + +def test_update_statistics_reverse_only(word_table, tokenizer_factory): + tok = tokenizer_factory() + tok.update_statistics() + + +def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_factory): + word_table.add_full_word(1000, 'hello') + table_factory('search_name', + 'place_id BIGINT, name_vector INT[]', + [(12, [1000])]) + tok = tokenizer_factory() + + tok.update_statistics() + + assert temp_db_cursor.scalar("""SELECT count(*) FROM word + WHERE type = 'W' and + (info->>'count')::int > 0""") > 0 + + def test_normalize_postcode(analyzer): with analyzer() as anl: anl.normalize_postcode('123') == '123' @@ -367,6 +405,13 @@ class TestPlaceAddress: yield anl + @pytest.fixture + def getorcreate_hnr_id(self, temp_db_cursor): + temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT) + RETURNS INTEGER AS $$ + SELECT -nextval('seq_word')::INTEGER; $$ LANGUAGE SQL""") + + def process_address(self, **kwargs): return self.analyzer.process_place(PlaceInfo({'address': kwargs})) @@ -432,9 +477,25 @@ class TestPlaceAddress: def test_process_place_street(self): + self.analyzer.process_place(PlaceInfo({'name': {'name' : 'Grand Road'}})) + info = self.process_address(street='Grand Road') + + assert eval(info['street']) == self.name_token_set('#Grand Road') + + + def test_process_place_nonexisting_street(self): info = self.process_address(street='Grand Road') - assert eval(info['street']) == self.name_token_set('GRAND', 'ROAD') + assert 'street' not in info + + + def test_process_place_multiple_street_tags(self): + self.analyzer.process_place(PlaceInfo({'name': {'name' : 'Grand Road', + 'ref': '05989'}})) + info = self.process_address(**{'street': 'Grand Road', + 'street:sym_ul': '05989'}) + + assert eval(info['street']) == self.name_token_set('#Grand Road', '#05989') def test_process_place_street_empty(self): @@ -443,12 +504,28 @@ class TestPlaceAddress: assert 'street' not in info + def test_process_place_street_from_cache(self): + self.analyzer.process_place(PlaceInfo({'name': {'name' : 'Grand Road'}})) + self.process_address(street='Grand Road') + + # request address again + info = self.process_address(street='Grand Road') + + assert eval(info['street']) == self.name_token_set('#Grand Road') + + def test_process_place_place(self): info = self.process_address(place='Honu Lulu') assert eval(info['place']) == self.name_token_set('HONU', 'LULU') + def test_process_place_place_extra(self): + info = self.process_address(**{'place:en': 'Honu Lulu'}) + + assert 'place' not in info + + def test_process_place_place_empty(self): info = self.process_address(place='🜵') @@ -468,6 +545,14 @@ class TestPlaceAddress: assert result == {'city': city, 'suburb': city, 'state': state} + def test_process_place_multiple_address_terms(self): + info = self.process_address(**{'city': 'Bruxelles', 'city:de': 'Brüssel'}) + + result = {k: eval(v) for k,v in info['addr'].items()} + + assert result == {'city': self.name_token_set('Bruxelles')} + + def test_process_place_address_terms_empty(self): info = self.process_address(country='de', city=' ', street='Hauptstr', full='right behind the church')