require_once(CONST_LibDir.'/SearchDescription.php');
require_once(CONST_LibDir.'/SearchContext.php');
require_once(CONST_LibDir.'/TokenList.php');
+require_once(CONST_TokenizerDir.'/tokenizer.php');
class Geocode
{
@define('CONST_LibDir', dirname(dirname(__FILE__)));
require_once(CONST_LibDir.'/init-cmd.php');
-require_once(CONST_LibDir.'/Geocode.php');
require_once(CONST_LibDir.'/ParameterParser.php');
ini_set('memory_limit', '800M');
@define('CONST_Default_Language', getSetting('DEFAULT_LANGUAGE', false));
@define('CONST_Log_DB', getSettingBool('LOG_DB'));
@define('CONST_Log_File', getSetting('LOG_FILE', false));
-@define('CONST_Max_Word_Frequency', getSetting('MAX_WORD_FREQUENCY'));
@define('CONST_NoAccessControl', getSettingBool('CORS_NOACCESSCONTROL'));
@define('CONST_Places_Max_ID_count', getSetting('LOOKUP_MAX_COUNT'));
@define('CONST_PolygonOutput_MaximumTypes', getSetting('POLYGON_OUTPUT_MAX_TYPES'));
@define('CONST_Search_BatchMode', getSettingBool('SEARCH_BATCH_MODE'));
@define('CONST_Search_NameOnlySearchFrequencyThreshold', getSetting('SEARCH_NAME_ONLY_THRESHOLD'));
-@define('CONST_Term_Normalization_Rules', getSetting('TERM_NORMALIZATION'));
@define('CONST_Use_US_Tiger_Data', getSettingBool('USE_US_TIGER_DATA'));
@define('CONST_MapIcon_URL', getSetting('MAPICON_URL', false));
+@define('CONST_TokenizerDir', CONST_InstallDir.'/tokenizer');
+require_once(CONST_LibDir.'/Geocode.php');
$oDB = new Nominatim\DB;
$oDB->connect();
require_once(CONST_LibDir.'/init-cmd.php');
require_once(CONST_LibDir.'/log.php');
-require_once(CONST_LibDir.'/Geocode.php');
require_once(CONST_LibDir.'/PlaceLookup.php');
require_once(CONST_LibDir.'/ReverseGeocode.php');
@define('CONST_Default_Language', getSetting('DEFAULT_LANGUAGE', false));
@define('CONST_Log_DB', getSettingBool('LOG_DB'));
@define('CONST_Log_File', getSetting('LOG_FILE', false));
-@define('CONST_Max_Word_Frequency', getSetting('MAX_WORD_FREQUENCY'));
@define('CONST_NoAccessControl', getSettingBool('CORS_NOACCESSCONTROL'));
@define('CONST_Places_Max_ID_count', getSetting('LOOKUP_MAX_COUNT'));
@define('CONST_PolygonOutput_MaximumTypes', getSetting('POLYGON_OUTPUT_MAX_TYPES'));
@define('CONST_Search_BatchMode', getSettingBool('SEARCH_BATCH_MODE'));
@define('CONST_Search_NameOnlySearchFrequencyThreshold', getSetting('SEARCH_NAME_ONLY_THRESHOLD'));
-@define('CONST_Term_Normalization_Rules', getSetting('TERM_NORMALIZATION'));
@define('CONST_Use_US_Tiger_Data', getSettingBool('USE_US_TIGER_DATA'));
@define('CONST_MapIcon_URL', getSetting('MAPICON_URL', false));
+@define('CONST_TokenizerDir', CONST_InstallDir.'/tokenizer');
+require_once(CONST_LibDir.'/Geocode.php');
$oDB = new Nominatim\DB();
$oDB->connect();
tokenizer_module = _import_tokenizer(module_name)
tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
- if init_db:
- tokenizer.init_new_db(config)
+ tokenizer.init_new_db(config, init_db=init_db)
with connect(config.get_libpq_dsn()) as conn:
properties.set_property(conn, 'tokenizer', module_name)
import logging
import re
import shutil
+from textwrap import dedent
from icu import Transliterator
import psycopg2
self.normalization = None
- def init_new_db(self, config):
+ def init_new_db(self, config, init_db=True):
""" Set up a new tokenizer for the database.
This copies all necessary data in the project directory to make
self.normalization = config.TERM_NORMALIZATION
+ self._install_php(config)
+
with connect(self.dsn) as conn:
_check_module(module_dir, conn)
self._save_config(conn, config)
conn.commit()
- self.update_sql_functions(config)
- self._init_db_tables(config)
+ if init_db:
+ self.update_sql_functions(config)
+ self._init_db_tables(config)
def init_from_project(self):
return LegacyNameAnalyzer(self.dsn, normalizer)
+ def _install_php(self, config):
+ """ Install the php script for the tokenizer.
+ """
+ php_file = self.data_dir / "tokenizer.php"
+ php_file.write_text(dedent("""\
+ <?php
+ @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
+ @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
+ require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
+ """.format(config)))
+
+
def _init_db_tables(self, config):
""" Set up the word table and fill it with pre-computed word
frequencies.
('Default_Language', 'DEFAULT_LANGUAGE', str),
('Log_DB', 'LOG_DB', bool),
('Log_File', 'LOG_FILE', str),
- ('Max_Word_Frequency', 'MAX_WORD_FREQUENCY', int),
('NoAccessControl', 'CORS_NOACCESSCONTROL', bool),
('Places_Max_ID_count', 'LOOKUP_MAX_COUNT', int),
('PolygonOutput_MaximumTypes', 'POLYGON_OUTPUT_MAX_TYPES', int),
('Search_BatchMode', 'SEARCH_BATCH_MODE', bool),
('Search_NameOnlySearchFrequencyThreshold', 'SEARCH_NAME_ONLY_THRESHOLD', str),
- ('Term_Normalization_Rules', 'TERM_NORMALIZATION', str),
('Use_US_Tiger_Data', 'USE_US_TIGER_DATA', bool),
('MapIcon_URL', 'MAPICON_URL', str),
)
@define('CONST_Debug', $_GET['debug'] ?? false);
@define('CONST_LibDir', '{0}');
+ @define('CONST_TokenizerDir', '{2}');
@define('CONST_NominatimVersion', '{1[0]}.{1[1]}.{1[2]}-{1[3]}');
- """.format(config.lib_dir.php, NOMINATIM_VERSION))
+ """.format(config.lib_dir.php, NOMINATIM_VERSION,
+ config.project_dir / 'tokenizer'))
for php_name, conf_name, var_type in PHP_CONST_DEFS:
if var_type == bool:
from nominatim import cli
from nominatim.config import Configuration
from nominatim.tools import refresh
+from nominatim.tokenizer import factory as tokenizer_factory
from steps.utils import run_script
class NominatimEnvironment:
"""
self.write_nominatim_config(self.api_test_db)
- if self.api_db_done:
- return
-
- self.api_db_done = True
+ if not self.api_db_done:
+ self.api_db_done = True
- if self._reuse_or_drop_db(self.api_test_db):
- return
+ if not self._reuse_or_drop_db(self.api_test_db):
+ testdata = Path('__file__') / '..' / '..' / 'testdb'
+ self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve())
- testdata = Path('__file__') / '..' / '..' / 'testdb'
- self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve())
+ try:
+ self.run_nominatim('import', '--osm-file', str(self.api_test_file))
+ self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
+ self.run_nominatim('freeze')
- try:
- self.run_nominatim('import', '--osm-file', str(self.api_test_file))
- self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
- self.run_nominatim('freeze')
+ phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
+ run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
+ except:
+ self.db_drop_database(self.api_test_db)
+ raise
- phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
- run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
- except:
- self.db_drop_database(self.api_test_db)
- raise
+ tokenizer_factory.create_tokenizer(self.get_test_config(), init_db=False)
def setup_unknown_db(self):
self.analyser_cache = {}
- def init_new_db(self, config):
+ def init_new_db(self, *args, **kwargs):
assert self.init_state == None
self.init_state = "new"
@pytest.fixture
def tokenizer_factory(dsn, tmp_path, monkeypatch, property_table):
+ (tmp_path / 'tokenizer').mkdir()
def _maker():
return legacy_tokenizer.create(dsn, tmp_path / 'tokenizer')
def run_website_script(envdir, config):
config.lib_dir.php = envdir / 'php'
+ config.project_dir = envdir
refresh.setup_website(envdir, config)
proc = subprocess.run(['/usr/bin/env', 'php', '-Cq',