test/python/test_tokenizer_legacy.py

   1 """
   2 Test for legacy tokenizer.
   3 """
   4 import shutil
   5
   6 import pytest
   7
   8 from nominatim.tokenizer import legacy_tokenizer
   9 from nominatim.db import properties
  10 from nominatim.errors import UsageError
  11
  12 @pytest.fixture
  13 def test_config(def_config, tmp_path):
  14     def_config.project_dir = tmp_path / 'project'
  15     def_config.project_dir.mkdir()
  16
  17     module_dir = tmp_path / 'module_src'
  18     module_dir.mkdir()
  19     (module_dir / 'nominatim.so').write_text('TEST nomiantim.so')
  20
  21     def_config.lib_dir.module = module_dir
  22
  23     sqldir = tmp_path / 'sql'
  24     sqldir.mkdir()
  25     (sqldir / 'tokenizer').mkdir()
  26     (sqldir / 'tokenizer' / 'legacy_tokenizer.sql').write_text("SELECT 'a'")
  27     (sqldir / 'words.sql').write_text("SELECT 'a'")
  28     shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_tables.sql'),
  29                 str(sqldir / 'tokenizer' / 'legacy_tokenizer_tables.sql'))
  30
  31     def_config.lib_dir.sql = sqldir
  32     def_config.lib_dir.data = sqldir
  33
  34     return def_config
  35
  36
  37 @pytest.fixture
  38 def tokenizer_factory(dsn, tmp_path, monkeypatch, property_table):
  39     (tmp_path / 'tokenizer').mkdir()
  40
  41     def _maker():
  42         return legacy_tokenizer.create(dsn, tmp_path / 'tokenizer')
  43
  44     return _maker
  45
  46 @pytest.fixture
  47 def tokenizer_setup(tokenizer_factory, test_config, monkeypatch, sql_preprocessor):
  48     monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
  49     tok = tokenizer_factory()
  50     tok.init_new_db(test_config)
  51
  52
  53 @pytest.fixture
  54 def analyzer(tokenizer_factory, test_config, monkeypatch, sql_preprocessor,
  55              word_table, temp_db_with_extensions, tmp_path):
  56     sql = tmp_path / 'sql' / 'tokenizer' / 'legacy_tokenizer.sql'
  57     sql.write_text("""
  58         CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
  59           RETURNS INTEGER AS $$ SELECT 342; $$ LANGUAGE SQL;
  60         """)
  61
  62     monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
  63     monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
  64     tok = tokenizer_factory()
  65     tok.init_new_db(test_config)
  66     monkeypatch.undo()
  67
  68     with tok.name_analyzer() as analyzer:
  69         yield analyzer
  70
  71
  72 def test_init_new(tokenizer_factory, test_config, monkeypatch,
  73                   temp_db_conn, sql_preprocessor):
  74     monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', 'xxvv')
  75     monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
  76
  77     tok = tokenizer_factory()
  78     tok.init_new_db(test_config)
  79
  80     assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_NORMALIZATION) == 'xxvv'
  81
  82     outfile = test_config.project_dir / 'module' / 'nominatim.so'
  83
  84     assert outfile.exists()
  85     assert outfile.read_text() == 'TEST nomiantim.so'
  86     assert outfile.stat().st_mode == 33261
  87
  88
  89 def test_init_module_load_failed(tokenizer_factory, test_config,
  90                                  monkeypatch, temp_db_conn):
  91     tok = tokenizer_factory()
  92
  93     with pytest.raises(UsageError):
  94         tok.init_new_db(test_config)
  95
  96
  97 def test_init_module_custom(tokenizer_factory, test_config,
  98                             monkeypatch, tmp_path, sql_preprocessor):
  99     module_dir = (tmp_path / 'custom').resolve()
 100     module_dir.mkdir()
 101     (module_dir/ 'nominatim.so').write_text('CUSTOM nomiantim.so')
 102
 103     monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', str(module_dir))
 104     monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
 105
 106     tok = tokenizer_factory()
 107     tok.init_new_db(test_config)
 108
 109     assert not (test_config.project_dir / 'module').exists()
 110
 111
 112 def test_init_from_project(tokenizer_setup, tokenizer_factory):
 113     tok = tokenizer_factory()
 114
 115     tok.init_from_project()
 116
 117     assert tok.normalization is not None
 118
 119
 120 def test_update_sql_functions(sql_preprocessor, temp_db_conn,
 121                               tokenizer_factory, test_config, table_factory,
 122                               monkeypatch, temp_db_cursor):
 123     monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '1133')
 124     monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
 125     tok = tokenizer_factory()
 126     tok.init_new_db(test_config)
 127     monkeypatch.undo()
 128
 129     assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_MAXWORDFREQ) == '1133'
 130
 131     table_factory('test', 'txt TEXT')
 132
 133     func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer.sql'
 134     func_file.write_text("""INSERT INTO test VALUES ('{{max_word_freq}}'),
 135                                                    ('{{modulepath}}')""")
 136
 137     tok.update_sql_functions(test_config)
 138
 139     test_content = temp_db_cursor.row_set('SELECT * FROM test')
 140     assert test_content == set((('1133', ), (str(test_config.project_dir / 'module'), )))
 141
 142
 143 def test_migrate_database(tokenizer_factory, test_config, temp_db_conn, monkeypatch):
 144     monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None)
 145     tok = tokenizer_factory()
 146     tok.migrate_database(test_config)
 147
 148     assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_MAXWORDFREQ) is not None
 149     assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_NORMALIZATION) is not None
 150
 151     outfile = test_config.project_dir / 'module' / 'nominatim.so'
 152
 153     assert outfile.exists()
 154     assert outfile.read_text() == 'TEST nomiantim.so'
 155     assert outfile.stat().st_mode == 33261
 156
 157
 158 def test_normalize(analyzer):
 159     assert analyzer.normalize('TEsT') == 'test'