test/python/test_tokenizer_legacy.py

   1 """
   2 Test for legacy tokenizer.
   3 """
   4 import shutil
   5
   6 import pytest
   7
   8 from nominatim.tokenizer import legacy_tokenizer
   9 from nominatim.db import properties
  10 from nominatim.errors import UsageError
  11
  12 @pytest.fixture
  13 def test_config(def_config, tmp_path):
  14     def_config.project_dir = tmp_path / 'project'
  15     def_config.project_dir.mkdir()
  16
  17     module_dir = tmp_path / 'module_src'
  18     module_dir.mkdir()
  19     (module_dir / 'nominatim.so').write_text('TEST nomiantim.so')
  20
  21     def_config.lib_dir.module = module_dir
  22
  23     sqldir = tmp_path / 'sql'
  24     sqldir.mkdir()
  25     (sqldir / 'tokenizer').mkdir()
  26     (sqldir / 'tokenizer' / 'legacy_tokenizer.sql').write_text("SELECT 'a'")
  27     (sqldir / 'words.sql').write_text("SELECT 'a'")
  28     shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_tables.sql'),
  29                 str(sqldir / 'tokenizer' / 'legacy_tokenizer_tables.sql'))
  30
  31     def_config.lib_dir.sql = sqldir
  32     def_config.lib_dir.data = sqldir
  33
  34     return def_config
  35
  36
  37 @pytest.fixture
  38 def tokenizer_factory(dsn, tmp_path, monkeypatch, property_table):
  39
  40     def _maker():
  41         return legacy_tokenizer.create(dsn, tmp_path / 'tokenizer')
  42
  43     return _maker
  44
  45 @pytest.fixture
  46 def tokenizer_setup(tokenizer_factory, test_config, monkeypatch, sql_preprocessor):
  47     monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
  48     tok = tokenizer_factory()
  49     tok.init_new_db(test_config)
  50
  51
  52 @pytest.fixture
  53 def analyzer(tokenizer_factory, test_config, monkeypatch, sql_preprocessor,
  54              word_table, temp_db_with_extensions, tmp_path):
  55     sql = tmp_path / 'sql' / 'tokenizer' / 'legacy_tokenizer.sql'
  56     sql.write_text("""
  57         CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
  58           RETURNS INTEGER AS $$ SELECT 342; $$ LANGUAGE SQL;
  59         """)
  60
  61     monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
  62     monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
  63     tok = tokenizer_factory()
  64     tok.init_new_db(test_config)
  65     monkeypatch.undo()
  66
  67     with tok.name_analyzer() as analyzer:
  68         yield analyzer
  69
  70
  71 def test_init_new(tokenizer_factory, test_config, monkeypatch,
  72                   temp_db_conn, sql_preprocessor):
  73     monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', 'xxvv')
  74     monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
  75
  76     tok = tokenizer_factory()
  77     tok.init_new_db(test_config)
  78
  79     assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_NORMALIZATION) == 'xxvv'
  80
  81     outfile = test_config.project_dir / 'module' / 'nominatim.so'
  82
  83     assert outfile.exists()
  84     assert outfile.read_text() == 'TEST nomiantim.so'
  85     assert outfile.stat().st_mode == 33261
  86
  87
  88 def test_init_module_load_failed(tokenizer_factory, test_config,
  89                                  monkeypatch, temp_db_conn):
  90     tok = tokenizer_factory()
  91
  92     with pytest.raises(UsageError):
  93         tok.init_new_db(test_config)
  94
  95
  96 def test_init_module_custom(tokenizer_factory, test_config,
  97                             monkeypatch, tmp_path, sql_preprocessor):
  98     module_dir = (tmp_path / 'custom').resolve()
  99     module_dir.mkdir()
 100     (module_dir/ 'nominatim.so').write_text('CUSTOM nomiantim.so')
 101
 102     monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', str(module_dir))
 103     monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
 104
 105     tok = tokenizer_factory()
 106     tok.init_new_db(test_config)
 107
 108     assert not (test_config.project_dir / 'module').exists()
 109
 110
 111 def test_init_from_project(tokenizer_setup, tokenizer_factory):
 112     tok = tokenizer_factory()
 113
 114     tok.init_from_project()
 115
 116     assert tok.normalization is not None
 117
 118
 119 def test_update_sql_functions(sql_preprocessor, temp_db_conn,
 120                               tokenizer_factory, test_config, table_factory,
 121                               monkeypatch, temp_db_cursor):
 122     monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '1133')
 123     monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
 124     tok = tokenizer_factory()
 125     tok.init_new_db(test_config)
 126     monkeypatch.undo()
 127
 128     assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_MAXWORDFREQ) == '1133'
 129
 130     table_factory('test', 'txt TEXT')
 131
 132     func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer.sql'
 133     func_file.write_text("""INSERT INTO test VALUES ('{{max_word_freq}}'),
 134                                                    ('{{modulepath}}')""")
 135
 136     tok.update_sql_functions(test_config)
 137
 138     test_content = temp_db_cursor.row_set('SELECT * FROM test')
 139     assert test_content == set((('1133', ), (str(test_config.project_dir / 'module'), )))
 140
 141
 142 def test_migrate_database(tokenizer_factory, test_config, temp_db_conn, monkeypatch):
 143     monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
 144     tok = tokenizer_factory()
 145     tok.migrate_database(test_config)
 146
 147     assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_MAXWORDFREQ) is not None
 148     assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_NORMALIZATION) is not None
 149
 150     outfile = test_config.project_dir / 'module' / 'nominatim.so'
 151
 152     assert outfile.exists()
 153     assert outfile.read_text() == 'TEST nomiantim.so'
 154     assert outfile.stat().st_mode == 33261
 155
 156
 157 def test_normalize(analyzer):
 158     assert analyzer.normalize('TEsT') == 'test'