git.openstreetmap.org Git - nominatim.git/commitdiff
boilerplate for PHP code of tokenizer
author: Sarah Hoffmann <lonvia@denofr.de>
Wed, 28 Apr 2021 08:59:07 +0000 (10:59 +0200)
committer: Sarah Hoffmann <lonvia@denofr.de>
Fri, 30 Apr 2021 09:31:52 +0000 (11:31 +0200)
This adds an installation step for the PHP code of the tokenizer. The
PHP code is split into two parts. The updateable code is found in
lib-php. The tokenizer installs an additional script in the
project directory which then includes the code from lib-php and
defines all settings that are static to the database. The website
code then always includes the PHP from the project directory.

lib-php/Geocode.php
lib-php/admin/query.php
lib-php/admin/warm.php
lib-php/tokenizer/legacy_tokenizer.php [new file with mode: 0644]
nominatim/tokenizer/factory.py
nominatim/tokenizer/legacy_tokenizer.py
nominatim/tools/refresh.py
test/bdd/steps/nominatim_environment.py
test/python/dummy_tokenizer.py
test/python/test_tokenizer_legacy.py
test/python/test_tools_refresh_setup_website.py

index 6cec6a85e088ee40c69917eb152e54bf571fc1a7..d9c1b3c08a11823656e54053c734fe72093ba7b9 100644 (file)
@@ -8,6 +8,7 @@ require_once(CONST_LibDir.'/ReverseGeocode.php');
 require_once(CONST_LibDir.'/SearchDescription.php');
 require_once(CONST_LibDir.'/SearchContext.php');
 require_once(CONST_LibDir.'/TokenList.php');
+require_once(CONST_TokenizerDir.'/tokenizer.php');
 
 class Geocode
 {
index 268b87cc0e0bbe1e2e895cf655736e5cb53c80d9..21121fbd316ac2269c137c8786361ac8da5735d1 100644 (file)
@@ -2,7 +2,6 @@
 @define('CONST_LibDir', dirname(dirname(__FILE__)));
 
 require_once(CONST_LibDir.'/init-cmd.php');
-require_once(CONST_LibDir.'/Geocode.php');
 require_once(CONST_LibDir.'/ParameterParser.php');
 ini_set('memory_limit', '800M');
 
@@ -41,16 +40,16 @@ loadSettings($aCMDResult['project-dir'] ?? getcwd());
 @define('CONST_Default_Language', getSetting('DEFAULT_LANGUAGE', false));
 @define('CONST_Log_DB', getSettingBool('LOG_DB'));
 @define('CONST_Log_File', getSetting('LOG_FILE', false));
-@define('CONST_Max_Word_Frequency', getSetting('MAX_WORD_FREQUENCY'));
 @define('CONST_NoAccessControl', getSettingBool('CORS_NOACCESSCONTROL'));
 @define('CONST_Places_Max_ID_count', getSetting('LOOKUP_MAX_COUNT'));
 @define('CONST_PolygonOutput_MaximumTypes', getSetting('POLYGON_OUTPUT_MAX_TYPES'));
 @define('CONST_Search_BatchMode', getSettingBool('SEARCH_BATCH_MODE'));
 @define('CONST_Search_NameOnlySearchFrequencyThreshold', getSetting('SEARCH_NAME_ONLY_THRESHOLD'));
-@define('CONST_Term_Normalization_Rules', getSetting('TERM_NORMALIZATION'));
 @define('CONST_Use_US_Tiger_Data', getSettingBool('USE_US_TIGER_DATA'));
 @define('CONST_MapIcon_URL', getSetting('MAPICON_URL', false));
+@define('CONST_TokenizerDir', CONST_InstallDir.'/tokenizer');
 
+require_once(CONST_LibDir.'/Geocode.php');
 
 $oDB = new Nominatim\DB;
 $oDB->connect();
index d7950af9b1912b91d70c2f29b691c06a328fa15c..d6aa3d9b0d5f0978045b037427520a059a936c3b 100644 (file)
@@ -3,7 +3,6 @@
 
 require_once(CONST_LibDir.'/init-cmd.php');
 require_once(CONST_LibDir.'/log.php');
-require_once(CONST_LibDir.'/Geocode.php');
 require_once(CONST_LibDir.'/PlaceLookup.php');
 require_once(CONST_LibDir.'/ReverseGeocode.php');
 
@@ -26,16 +25,16 @@ loadSettings($aCMDResult['project-dir'] ?? getcwd());
 @define('CONST_Default_Language', getSetting('DEFAULT_LANGUAGE', false));
 @define('CONST_Log_DB', getSettingBool('LOG_DB'));
 @define('CONST_Log_File', getSetting('LOG_FILE', false));
-@define('CONST_Max_Word_Frequency', getSetting('MAX_WORD_FREQUENCY'));
 @define('CONST_NoAccessControl', getSettingBool('CORS_NOACCESSCONTROL'));
 @define('CONST_Places_Max_ID_count', getSetting('LOOKUP_MAX_COUNT'));
 @define('CONST_PolygonOutput_MaximumTypes', getSetting('POLYGON_OUTPUT_MAX_TYPES'));
 @define('CONST_Search_BatchMode', getSettingBool('SEARCH_BATCH_MODE'));
 @define('CONST_Search_NameOnlySearchFrequencyThreshold', getSetting('SEARCH_NAME_ONLY_THRESHOLD'));
-@define('CONST_Term_Normalization_Rules', getSetting('TERM_NORMALIZATION'));
 @define('CONST_Use_US_Tiger_Data', getSettingBool('USE_US_TIGER_DATA'));
 @define('CONST_MapIcon_URL', getSetting('MAPICON_URL', false));
+@define('CONST_TokenizerDir', CONST_InstallDir.'/tokenizer');
 
+require_once(CONST_LibDir.'/Geocode.php');
 
 $oDB = new Nominatim\DB();
 $oDB->connect();
diff --git a/lib-php/tokenizer/legacy_tokenizer.php b/lib-php/tokenizer/legacy_tokenizer.php
new file mode 100644 (file)
index 0000000..b3d9bbc
--- /dev/null
@@ -0,0 +1 @@
+<?php
index 5f03ba582fad9955b2990ffb307e4bb710d518b4..e0c06293ff56e22b9d90938bb5af923b0e279bdb 100644 (file)
@@ -54,8 +54,7 @@ def create_tokenizer(config, init_db=True, module_name=None):
     tokenizer_module = _import_tokenizer(module_name)
 
     tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
-    if init_db:
-        tokenizer.init_new_db(config)
+    tokenizer.init_new_db(config, init_db=init_db)
 
     with connect(config.get_libpq_dsn()) as conn:
         properties.set_property(conn, 'tokenizer', module_name)
index ac26c5cd5d160257ad66f30fe289d6bd9b791e12..d6755835cae40fb496c287b44f15166917040bf5 100644 (file)
@@ -5,6 +5,7 @@ from collections import OrderedDict
 import logging
 import re
 import shutil
+from textwrap import dedent
 
 from icu import Transliterator
 import psycopg2
@@ -87,7 +88,7 @@ class LegacyTokenizer:
         self.normalization = None
 
 
-    def init_new_db(self, config):
+    def init_new_db(self, config, init_db=True):
         """ Set up a new tokenizer for the database.
 
             This copies all necessary data in the project directory to make
@@ -99,13 +100,16 @@ class LegacyTokenizer:
 
         self.normalization = config.TERM_NORMALIZATION
 
+        self._install_php(config)
+
         with connect(self.dsn) as conn:
             _check_module(module_dir, conn)
             self._save_config(conn, config)
             conn.commit()
 
-        self.update_sql_functions(config)
-        self._init_db_tables(config)
+        if init_db:
+            self.update_sql_functions(config)
+            self._init_db_tables(config)
 
 
     def init_from_project(self):
@@ -165,6 +169,18 @@ class LegacyTokenizer:
         return LegacyNameAnalyzer(self.dsn, normalizer)
 
 
+    def _install_php(self, config):
+        """ Install the php script for the tokenizer.
+        """
+        php_file = self.data_dir / "tokenizer.php"
+        php_file.write_text(dedent("""\
+            <?php
+            @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
+            @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
+            require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
+            """.format(config)))
+
+
     def _init_db_tables(self, config):
         """ Set up the word table and fill it with pre-computed word
             frequencies.
index 8fc0c69dd16dca16ec680daebf6544d25f50d847..6720465fd9220387781b8df939742a1aee482a0e 100644 (file)
@@ -104,13 +104,11 @@ PHP_CONST_DEFS = (
     ('Default_Language', 'DEFAULT_LANGUAGE', str),
     ('Log_DB', 'LOG_DB', bool),
     ('Log_File', 'LOG_FILE', str),
-    ('Max_Word_Frequency', 'MAX_WORD_FREQUENCY', int),
     ('NoAccessControl', 'CORS_NOACCESSCONTROL', bool),
     ('Places_Max_ID_count', 'LOOKUP_MAX_COUNT', int),
     ('PolygonOutput_MaximumTypes', 'POLYGON_OUTPUT_MAX_TYPES', int),
     ('Search_BatchMode', 'SEARCH_BATCH_MODE', bool),
     ('Search_NameOnlySearchFrequencyThreshold', 'SEARCH_NAME_ONLY_THRESHOLD', str),
-    ('Term_Normalization_Rules', 'TERM_NORMALIZATION', str),
     ('Use_US_Tiger_Data', 'USE_US_TIGER_DATA', bool),
     ('MapIcon_URL', 'MAPICON_URL', str),
 )
@@ -175,9 +173,11 @@ def setup_website(basedir, config):
 
                       @define('CONST_Debug', $_GET['debug'] ?? false);
                       @define('CONST_LibDir', '{0}');
+                      @define('CONST_TokenizerDir', '{2}');
                       @define('CONST_NominatimVersion', '{1[0]}.{1[1]}.{1[2]}-{1[3]}');
 
-                      """.format(config.lib_dir.php, NOMINATIM_VERSION))
+                      """.format(config.lib_dir.php, NOMINATIM_VERSION,
+                                 config.project_dir / 'tokenizer'))
 
     for php_name, conf_name, var_type in PHP_CONST_DEFS:
         if var_type == bool:
index 345e134975db02b77425634d4ecb4a08b8242661..24f777f6c834952ea5ea2cd341ce175216fb8562 100644 (file)
@@ -10,6 +10,7 @@ sys.path.insert(1, str((Path(__file__) / '..' / '..' / '..' / '..').resolve()))
 from nominatim import cli
 from nominatim.config import Configuration
 from nominatim.tools import refresh
+from nominatim.tokenizer import factory as tokenizer_factory
 from steps.utils import run_script
 
 class NominatimEnvironment:
@@ -179,27 +180,25 @@ class NominatimEnvironment:
         """
         self.write_nominatim_config(self.api_test_db)
 
-        if self.api_db_done:
-            return
-
-        self.api_db_done = True
+        if not self.api_db_done:
+            self.api_db_done = True
 
-        if self._reuse_or_drop_db(self.api_test_db):
-            return
+            if not self._reuse_or_drop_db(self.api_test_db):
+                testdata = Path('__file__') / '..' / '..' / 'testdb'
+                self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve())
 
-        testdata = Path('__file__') / '..' / '..' / 'testdb'
-        self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve())
+                try:
+                    self.run_nominatim('import', '--osm-file', str(self.api_test_file))
+                    self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
+                    self.run_nominatim('freeze')
 
-        try:
-            self.run_nominatim('import', '--osm-file', str(self.api_test_file))
-            self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
-            self.run_nominatim('freeze')
+                    phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
+                    run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
+                except:
+                    self.db_drop_database(self.api_test_db)
+                    raise
 
-            phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
-            run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
-        except:
-            self.db_drop_database(self.api_test_db)
-            raise
+        tokenizer_factory.create_tokenizer(self.get_test_config(), init_db=False)
 
 
     def setup_unknown_db(self):
index 0868b57db18956979019f8c7c32422597b02a5be..d3f006deb3b029ffe05db5b9b6d7f13ba3d20b91 100644 (file)
@@ -16,7 +16,7 @@ class DummyTokenizer:
         self.analyser_cache = {}
 
 
-    def init_new_db(self, config):
+    def init_new_db(self, *args, **kwargs):
         assert self.init_state == None
         self.init_state = "new"
 
index 0d1169ada4ef429edd6de90223638d25b9af9ee3..45c400fe730f7325fcbaba7e48b5a10788fb8e1e 100644 (file)
@@ -36,6 +36,7 @@ def test_config(def_config, tmp_path):
 
 @pytest.fixture
 def tokenizer_factory(dsn, tmp_path, monkeypatch, property_table):
+    (tmp_path / 'tokenizer').mkdir()
 
     def _maker():
         return legacy_tokenizer.create(dsn, tmp_path / 'tokenizer')
index 18b146fc22b4b07b5c4696bf2f0c8a7cfea30bd6..dc822e3c166051bf6812b75d41fb64703427ae97 100644 (file)
@@ -26,6 +26,7 @@ def test_script(envdir):
 
 def run_website_script(envdir, config):
     config.lib_dir.php = envdir / 'php'
+    config.project_dir = envdir
     refresh.setup_website(envdir, config)
 
     proc = subprocess.run(['/usr/bin/env', 'php', '-Cq',