X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/4abaf712341758f50484fe1fe2764a7a5216de78..25318e8a69512c9543ca5c7fee57839d5b417cb0:/test/python/test_tokenizer_legacy.py?ds=sidebyside diff --git a/test/python/test_tokenizer_legacy.py b/test/python/test_tokenizer_legacy.py index 15ae50a4..35e24ca4 100644 --- a/test/python/test_tokenizer_legacy.py +++ b/test/python/test_tokenizer_legacy.py @@ -46,7 +46,7 @@ def tokenizer_factory(dsn, tmp_path, property_table): @pytest.fixture def tokenizer_setup(tokenizer_factory, test_config, monkeypatch, sql_preprocessor): - monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None) + monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None) tok = tokenizer_factory() tok.init_new_db(test_config) @@ -60,7 +60,7 @@ def analyzer(tokenizer_factory, test_config, monkeypatch, sql_preprocessor, RETURNS INTEGER AS $$ SELECT 342; $$ LANGUAGE SQL; """) - monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None) + monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None) monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();') tok = tokenizer_factory() tok.init_new_db(test_config) @@ -73,7 +73,7 @@ def analyzer(tokenizer_factory, test_config, monkeypatch, sql_preprocessor, @pytest.fixture def make_standard_name(temp_db_cursor): temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) - RETURNS TEXT AS $$ SELECT ' ' || name; $$ LANGUAGE SQL""") + RETURNS TEXT AS $$ SELECT '#' || lower(name) || '#'; $$ LANGUAGE SQL""") @pytest.fixture @@ -86,16 +86,6 @@ def create_postcode_id(temp_db_cursor): $$ LANGUAGE SQL""") -@pytest.fixture -def create_housenumbers(temp_db_cursor): - temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION create_housenumbers( - housenumbers TEXT[], - OUT tokens TEXT, OUT normtext TEXT) - AS $$ - SELECT housenumbers::TEXT, array_to_string(housenumbers, ';') - $$ LANGUAGE SQL""") - - @pytest.fixture def make_keywords(temp_db_cursor, temp_db_with_extensions): temp_db_cursor.execute( @@ -105,7 +95,7 @@ def make_keywords(temp_db_cursor, temp_db_with_extensions): def test_init_new(tokenizer_factory, test_config, monkeypatch, temp_db_conn, sql_preprocessor): monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', 'xxvv') - monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None) + monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None) tok = tokenizer_factory() tok.init_new_db(test_config) @@ -119,8 +109,7 @@ def test_init_new(tokenizer_factory, test_config, monkeypatch, assert outfile.stat().st_mode == 33261 -def test_init_module_load_failed(tokenizer_factory, test_config, - monkeypatch, temp_db_conn): +def test_init_module_load_failed(tokenizer_factory, test_config): tok = tokenizer_factory() with pytest.raises(UsageError): @@ -134,7 +123,7 @@ def test_init_module_custom(tokenizer_factory, test_config, (module_dir/ 'nominatim.so').write_text('CUSTOM nomiantim.so') monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', str(module_dir)) - monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None) + monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None) tok = tokenizer_factory() tok.init_new_db(test_config) @@ -154,7 +143,7 @@ def test_update_sql_functions(sql_preprocessor, temp_db_conn, tokenizer_factory, test_config, table_factory, monkeypatch, temp_db_cursor): monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '1133') - monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None) + monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None) tok = tokenizer_factory() tok.init_new_db(test_config) monkeypatch.undo() @@ -174,7 +163,7 @@ def test_update_sql_functions(sql_preprocessor, temp_db_conn, def test_migrate_database(tokenizer_factory, test_config, temp_db_conn, monkeypatch): - monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None) + monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None) tok = tokenizer_factory() tok.migrate_database(test_config) @@ -220,83 +209,131 @@ def test_update_special_phrase_empty_table(analyzer, word_table, make_standard_n analyzer.update_special_phrases([ ("König bei", "amenity", "royal", "near"), ("Könige", "amenity", "royal", "-"), + ("könige", "amenity", "royal", "-"), ("strasse", "highway", "primary", "in") - ]) + ], True) assert word_table.get_special() \ - == set(((' könig bei', 'könig bei', 'amenity', 'royal', 'near'), - (' könige', 'könige', 'amenity', 'royal', None), - (' strasse', 'strasse', 'highway', 'primary', 'in'))) + == set(((' #könig bei#', 'könig bei', 'amenity', 'royal', 'near'), + (' #könige#', 'könige', 'amenity', 'royal', None), + (' #strasse#', 'strasse', 'highway', 'primary', 'in'))) -def test_update_special_phrase_delete_all(analyzer, word_table, temp_db_cursor, - make_standard_name): - word_table.add_special(' foo', 'foo', 'amenity', 'prison', 'in') - word_table.add_special(' bar', 'bar', 'highway', 'road', None) +def test_update_special_phrase_delete_all(analyzer, word_table, make_standard_name): + word_table.add_special(' #foo#', 'foo', 'amenity', 'prison', 'in') + word_table.add_special(' #bar#', 'bar', 'highway', 'road', None) assert word_table.count_special() == 2 - analyzer.update_special_phrases([]) + analyzer.update_special_phrases([], True) assert word_table.count_special() == 0 +def test_update_special_phrases_no_replace(analyzer, word_table, make_standard_name): + word_table.add_special(' #foo#', 'foo', 'amenity', 'prison', 'in') + word_table.add_special(' #bar#', 'bar', 'highway', 'road', None) + + assert word_table.count_special() == 2 + + analyzer.update_special_phrases([], False) + + assert word_table.count_special() == 2 + + def test_update_special_phrase_modify(analyzer, word_table, make_standard_name): - word_table.add_special(' foo', 'foo', 'amenity', 'prison', 'in') - word_table.add_special(' bar', 'bar', 'highway', 'road', None) + word_table.add_special(' #foo#', 'foo', 'amenity', 'prison', 'in') + word_table.add_special(' #bar#', 'bar', 'highway', 'road', None) assert word_table.count_special() == 2 analyzer.update_special_phrases([ - ('prison', 'amenity', 'prison', 'in'), - ('bar', 'highway', 'road', '-'), - ('garden', 'leisure', 'garden', 'near') - ]) + ('prison', 'amenity', 'prison', 'in'), + ('bar', 'highway', 'road', '-'), + ('garden', 'leisure', 'garden', 'near') + ], True) assert word_table.get_special() \ - == set(((' prison', 'prison', 'amenity', 'prison', 'in'), - (' bar', 'bar', 'highway', 'road', None), - (' garden', 'garden', 'leisure', 'garden', 'near'))) + == set(((' #prison#', 'prison', 'amenity', 'prison', 'in'), + (' #bar#', 'bar', 'highway', 'road', None), + (' #garden#', 'garden', 'leisure', 'garden', 'near'))) -def test_process_place_names(analyzer, make_keywords): +def test_add_country_names(analyzer, word_table, make_standard_name): + analyzer.add_country_names('de', ['Germany', 'Deutschland', 'germany']) + + assert word_table.get_country() \ + == {('de', ' #germany#'), + ('de', ' #deutschland#')} + + +def test_add_more_country_names(analyzer, word_table, make_standard_name): + word_table.add_country('fr', ' #france#') + word_table.add_country('it', ' #italy#') + word_table.add_country('it', ' #itala#') + + analyzer.add_country_names('it', ['Italy', 'IT']) + + assert word_table.get_country() \ + == {('fr', ' #france#'), + ('it', ' #italy#'), + ('it', ' #itala#'), + ('it', ' #it#')} + +def test_process_place_names(analyzer, make_keywords): info = analyzer.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}}) assert info['names'] == '{1,2,3}' -@pytest.mark.parametrize('pc', ['12345', 'AB 123', '34-345']) -def test_process_place_postcode(analyzer, create_postcode_id, word_table, pc): - info = analyzer.process_place({'address': {'postcode' : pc}}) +@pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345']) +def test_process_place_postcode(analyzer, create_postcode_id, word_table, pcode): + analyzer.process_place({'address': {'postcode' : pcode}}) - assert word_table.get_postcodes() == {pc, } + assert word_table.get_postcodes() == {pcode, } -@pytest.mark.parametrize('pc', ['12:23', 'ab;cd;f', '123;836']) -def test_process_place_bad_postcode(analyzer, create_postcode_id, word_table, pc): - info = analyzer.process_place({'address': {'postcode' : pc}}) +@pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836']) +def test_process_place_bad_postcode(analyzer, create_postcode_id, word_table, pcode): + analyzer.process_place({'address': {'postcode' : pcode}}) assert not word_table.get_postcodes() -@pytest.mark.parametrize('hnr', ['123a', '1', '101']) -def test_process_place_housenumbers_simple(analyzer, create_housenumbers, hnr): - info = analyzer.process_place({'address': {'housenumber' : hnr}}) +class TestHousenumberName: + + @staticmethod + @pytest.fixture(autouse=True) + def setup_create_housenumbers(temp_db_cursor): + temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION create_housenumbers( + housenumbers TEXT[], + OUT tokens TEXT, OUT normtext TEXT) + AS $$ + SELECT housenumbers::TEXT, array_to_string(housenumbers, ';') + $$ LANGUAGE SQL""") + + + @staticmethod + @pytest.mark.parametrize('hnr', ['123a', '1', '101']) + def test_process_place_housenumbers_simple(analyzer, hnr): + info = analyzer.process_place({'address': {'housenumber' : hnr}}) - assert info['hnr'] == hnr - assert info['hnr_tokens'].startswith("{") + assert info['hnr'] == hnr + assert info['hnr_tokens'].startswith("{") -def test_process_place_housenumbers_lists(analyzer, create_housenumbers): - info = analyzer.process_place({'address': {'conscriptionnumber' : '1; 2;3'}}) + @staticmethod + def test_process_place_housenumbers_lists(analyzer): + info = analyzer.process_place({'address': {'conscriptionnumber' : '1; 2;3'}}) - assert set(info['hnr'].split(';')) == set(('1', '2', '3')) + assert set(info['hnr'].split(';')) == set(('1', '2', '3')) -def test_process_place_housenumbers_duplicates(analyzer, create_housenumbers): - info = analyzer.process_place({'address': {'housenumber' : '134', - 'conscriptionnumber' : '134', - 'streetnumber' : '99a'}}) + @staticmethod + def test_process_place_housenumbers_duplicates(analyzer): + info = analyzer.process_place({'address': {'housenumber' : '134', + 'conscriptionnumber' : '134', + 'streetnumber' : '99a'}}) - assert set(info['hnr'].split(';')) == set(('134', '99a')) + assert set(info['hnr'].split(';')) == set(('134', '99a'))