]> git.openstreetmap.org Git - nominatim.git/commitdiff
Merge pull request #2707 from lonvia/make-icu-tokenizer-the-default
author Sarah Hoffmann <lonvia@denofr.de>
Wed, 11 May 2022 06:52:49 +0000 (08:52 +0200)
committer GitHub <noreply@github.com>
Wed, 11 May 2022 06:52:49 +0000 (08:52 +0200)
Make ICU tokenizer the default

21 files changed:
.github/actions/build-nominatim/action.yml
.github/actions/setup-postgresql/action.yml
.github/workflows/ci-tests.yml
CMakeLists.txt
docs/admin/Installation.md
docs/admin/Migration.md
docs/customize/Tokenizers.md
nominatim/config.py
nominatim/tokenizer/icu_tokenizer.py
nominatim/tokenizer/legacy_tokenizer.py
settings/env.defaults
test/bdd/environment.py
test/bdd/steps/nominatim_environment.py
test/bdd/steps/steps_db_ops.py
test/python/conftest.py
test/python/mocks.py
test/python/tools/test_database_import.py
test/python/tools/test_migration.py
test/python/tools/test_postcodes.py
vagrant/Install-on-Ubuntu-18.sh
vagrant/Install-on-Ubuntu-20.sh

index 757decd4f63c7379033ac956acb38cf6c12b0cad..042166ad7fc357a8b4b129961290e2e3ae0cf147 100644 (file)
@@ -5,6 +5,10 @@ inputs:
         description: 'Version of Ubuntu to install on'
         required: false
         default: '20'
+    cmake-args:
+        description: 'Additional options to hand to cmake'
+        required: false
+        default: ''
 
 runs:
     using: "composite"
@@ -21,18 +25,13 @@ runs:
           shell: bash
           env:
             UBUNTUVER: ${{ inputs.ubuntu }}
-
-        - name: Download dependencies
-          run: |
-              if [ ! -f country_grid.sql.gz ]; then
-                  wget --no-verbose https://www.nominatim.org/data/country_grid.sql.gz
-              fi
-              cp country_grid.sql.gz Nominatim/data/country_osm_grid.sql.gz
-          shell: bash
+            CMAKE_ARGS: ${{ inputs.cmake-args }}
 
         - name: Configure
-          run: mkdir build && cd build && cmake ../Nominatim
+          run: mkdir build && cd build && cmake $CMAKE_ARGS ../Nominatim
           shell: bash
+          env:
+            CMAKE_ARGS: ${{ inputs.cmake-args }}
 
         - name: Build
           run: |
index 060a678941b2610c5bfc29c4bada275812a7ccd3..19a19e17e12c65c06044a381045d523e160ab16a 100644 (file)
@@ -22,7 +22,7 @@ runs:
 
         - name: Install PostgreSQL
           run: |
-              sudo apt-get install -y -qq --no-install-suggests --no-install-recommends postgresql-client-${PGVER} postgresql-${PGVER}-postgis-${POSTGISVER} postgresql-${PGVER}-postgis-${POSTGISVER}-scripts postgresql-contrib-${PGVER} postgresql-${PGVER} postgresql-server-dev-${PGVER}
+              sudo apt-get install -y -qq --no-install-suggests --no-install-recommends postgresql-client-${PGVER} postgresql-${PGVER}-postgis-${POSTGISVER} postgresql-${PGVER}-postgis-${POSTGISVER}-scripts postgresql-contrib-${PGVER} postgresql-${PGVER}
           shell: bash
           env:
               PGVER: ${{ inputs.postgresql-version }}
index 6ebf1ab97d154c7d5841d77f35e2cbdf3a1d8d56..a08a995f9c5459c9a3bee7bc214db019887b7eb0 100644 (file)
@@ -113,19 +113,9 @@ jobs:
               working-directory: Nominatim/test/bdd
 
 
-    icu-test:
+    legacy-test:
         needs: create-archive
-        strategy:
-            matrix:
-                ubuntu: [20]
-                include:
-                    - ubuntu: 20
-                      postgresql: 13
-                      postgis: 3
-                      pytest: py.test-3
-                      php: 7.4
-
-        runs-on: ubuntu-${{ matrix.ubuntu }}.04
+        runs-on: ubuntu-20.04
 
         steps:
             - uses: actions/download-artifact@v2
@@ -138,35 +128,27 @@ jobs:
             - name: Setup PHP
               uses: shivammathur/setup-php@v2
               with:
-                  php-version: ${{ matrix.php }}
-                  coverage: xdebug
-                  tools: phpunit, phpcs, composer
-
-            - uses: actions/setup-python@v2
-              with:
-                python-version: 3.6
-              if: matrix.ubuntu == 18
+                  php-version: 7.4
 
             - uses: ./Nominatim/.github/actions/setup-postgresql
               with:
-                  postgresql-version: ${{ matrix.postgresql }}
-                  postgis-version: ${{ matrix.postgis }}
+                  postgresql-version: 13
+                  postgis-version: 3
+
+            - name: Install Postgresql server dev
+              run: sudo apt-get install postgresql-server-dev-13
 
             - uses: ./Nominatim/.github/actions/build-nominatim
               with:
-                  ubuntu: ${{ matrix.ubuntu }}
+                  ubuntu: 20
+                  cmake-args: -DBUILD_MODULE=on
 
             - name: Install test prerequsites
               run: sudo apt-get install -y -qq python3-behave
-              if: matrix.ubuntu == 20
-
-            - name: Install test prerequsites
-              run: pip3 install behave==1.2.6
-              if: matrix.ubuntu == 18
 
-            - name: BDD tests (icu tokenizer)
+            - name: BDD tests (legacy tokenizer)
               run: |
-                  behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=icu --format=progress3
+                  behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=legacy --format=progress3
               working-directory: Nominatim/test/bdd
 
 
index af7dbc2a43de5f0577502b299a5d816bc981db03..8360d549d24ff707ea9d3cb8aa2d650c55ca7fad 100644 (file)
@@ -44,7 +44,7 @@ endif()
 
 set(BUILD_IMPORTER on CACHE BOOL "Build everything for importing/updating the database")
 set(BUILD_API on CACHE BOOL "Build everything for the API server")
-set(BUILD_MODULE on CACHE BOOL "Build PostgreSQL module")
+set(BUILD_MODULE off CACHE BOOL "Build PostgreSQL module for legacy tokenizer")
 set(BUILD_TESTS on CACHE BOOL "Build test suite")
 set(BUILD_DOCS on CACHE BOOL "Build documentation")
 set(BUILD_MANPAGE on CACHE BOOL "Build Manual Page")
index 8c4c670b39dc901c25285b3cd030c5f850d5bec2..f5411604ef4a5bfeae7b06ed94796b271e63ea8e 100644 (file)
@@ -158,6 +158,17 @@ make
 sudo make install
 ```
 
+!!! warning
+    The default installation no longer compiles the PostgreSQL module that
+    is needed for the legacy tokenizer from older Nominatim versions. If you
+    are upgrading an older database or want to run the
+    [legacy tokenizer](../customize/Tokenizers.md#legacy-tokenizer) for
+    some other reason, you need to enable the PostgreSQL module via
+    cmake: `cmake -DBUILD_MODULE=on ../Nominatim`. To compile the module
+    you need to have the server development headers for PostgreSQL installed.
+    On Ubuntu/Debian run: `sudo apt install postgresql-server-dev-<postgresql version>`
+
+
 Nominatim installs itself into `/usr/local` per default. To choose a different
 installation directory add `-DCMAKE_INSTALL_PREFIX=<install root>` to the
 cmake command. Make sure that the `bin` directory is available in your path
index 11ee7f0558c37406599d681334b602524492f363..950f7e193beee883a5fd4d06c2ea65bcd1e500a3 100644 (file)
@@ -17,6 +17,14 @@ breaking changes. **Please read them before running the migration.**
 
 ## 4.0.0 -> master
 
+### ICU tokenizer is the new default
+
+Nominatim now installs the [ICU tokenizer](../customize/Tokenizers.md#icu-tokenizer)
+by default. This only has an effect on newly installed databases. An
+older database keeps its installed tokenizer when it is updated. If you still
+run with the legacy tokenizer, make sure to compile Nominatim with the
+PostgreSQL module, see [Installation](Installation.md#building-nominatim).
+
 ### geocodejson output changed
 
 The `type` field of the geocodejson output has changed. It now contains
index d849eb48c0d457c7c57b27e2807ff55675e2fa33..19d867ddd800063494d72ad6ac078025d7ce2347 100644 (file)
@@ -19,7 +19,22 @@ they can be configured.
 
 The legacy tokenizer implements the analysis algorithms of older Nominatim
 versions. It uses a special Postgresql module to normalize names and queries.
-This tokenizer is currently the default.
+This tokenizer is automatically installed and used when upgrading an older
+database. It should not be used for new installations anymore.
+
+### Compiling the PostgreSQL module
+
+The tokenizer needs a special C module for PostgreSQL which is not compiled
+by default. If you need the legacy tokenizer, compile Nominatim as follows:
+
+```
+mkdir build
+cd build
+cmake -DBUILD_MODULE=on ../Nominatim
+make
+```
+
+### Enabling the tokenizer
 
 To enable the tokenizer add the following line to your project configuration:
 
@@ -47,6 +62,7 @@ normalization functions are hard-coded.
 The ICU tokenizer uses the [ICU library](http://site.icu-project.org/) to
 normalize names and queries. It also offers configurable decomposition and
 abbreviation handling.
+This tokenizer is currently the default.
 
 To enable the tokenizer add the following line to your project configuration:
 
index 13d9cd8a0d502e4b1f1b5ab58ca7e0761772cfaf..a3f91055fc76b37bf338291c6b7aa2350afb4d21 100644 (file)
@@ -187,7 +187,7 @@ class Configuration:
         if configfile.suffix in ('.yaml', '.yml'):
             result = self._load_from_yaml(configfile)
         elif configfile.suffix == '.json':
-            with configfile.open('r') as cfg:
+            with configfile.open('r', encoding='utf-8') as cfg:
                 result = json.load(cfg)
         else:
             raise UsageError(f"Config file '{configfile}' has unknown format.")
index b553dbc641d708175e8f7281f05cf14cf4673484..9c7138ce67fa5174d0e947c72bf7a71313fe3435 100644 (file)
@@ -187,7 +187,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
                 @define('CONST_Max_Word_Frequency', 10000000);
                 @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                 @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
-                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
+                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
 
 
     def _save_config(self):
index 3b8f75692f964e9c2e84dc3ada92b156dd0afb7b..97ce6d16644cff6a19369c9db5a7a12af8387078 100644 (file)
@@ -255,7 +255,7 @@ class LegacyTokenizer(AbstractTokenizer):
                 @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
                 @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
                 require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
-                """.format(config)))
+                """.format(config)), encoding='utf-8')
 
 
     def _init_db_tables(self, config):
index e5dfe4a6094c01b9ea9605b5de06b8cfc666b274..3115f4382aacf582c5a1054e78c03130bde9f00f 100644 (file)
@@ -21,8 +21,8 @@ NOMINATIM_DATABASE_MODULE_PATH=
 # Tokenizer used for normalizing and parsing queries and names.
 # The tokenizer is set up during import and cannot be changed afterwards
 # without a reimport.
-# Currently available tokenizers: legacy
-NOMINATIM_TOKENIZER="legacy"
+# Currently available tokenizers: icu, legacy
+NOMINATIM_TOKENIZER="icu"
 
 # Number of occurrences of a word before it is considered frequent.
 # Similar to the concept of stop words. Frequent partial words get ignored
index 0acc73b436158f373474a63cf028166f87460b9a..ee07e6028c1ca67497082316821fb219a0c0d9f6 100644 (file)
@@ -59,5 +59,5 @@ def after_scenario(context, scenario):
 
 def before_tag(context, tag):
     if tag == 'fail-legacy':
-        if context.config.userdata['TOKENIZER'] in (None, 'legacy'):
+        if context.config.userdata['TOKENIZER'] == 'legacy':
             context.scenario.skip("Not implemented in legacy tokenizer")
index 7de32e484f50f1f645c79137a45541b2820538ca..70a03e6ec31e37a148f1ea0255c3b283913704b5 100644 (file)
@@ -207,7 +207,7 @@ class NominatimEnvironment:
                     self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
                     self.run_nominatim('freeze')
 
-                    if self.tokenizer != 'icu':
+                    if self.tokenizer == 'legacy':
                         phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
                         run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
                     else:
index e02cad8f4ac92c46d17ab4a4e7c74c1a3016cefc..4c711b725b34588599fa01fe27cbbf16698fde6a 100644 (file)
@@ -266,7 +266,7 @@ def check_word_table_for_postcodes(context, exclude, postcodes):
     plist.sort()
 
     with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-        if nctx.tokenizer == 'icu':
+        if nctx.tokenizer != 'legacy':
             cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
                         (plist,))
         else:
index f4581bf92aae0b879e362bccef351616029be118..405262950b78319d2924274ad497f933b03490f2 100644 (file)
@@ -211,11 +211,6 @@ def osmline_table(temp_db_with_extensions, table_factory):
                      country_code VARCHAR(2)""")
 
 
-@pytest.fixture
-def word_table(temp_db_conn):
-    return mocks.MockWordTable(temp_db_conn)
-
-
 @pytest.fixture
 def sql_preprocessor_cfg(tmp_path, table_factory, temp_db_with_extensions):
     table_factory('country_name', 'partition INT', ((0, ), (1, ), (2, )))
index 2cd2e3e38297f1469f82a3bf236fc2b6ac969eb2..9c6ef53215dd91c24368a6153385da9011f52445 100644 (file)
@@ -14,7 +14,7 @@ import psycopg2.extras
 from nominatim.db import properties
 
 # This must always point to the mock word table for the default tokenizer.
-from mock_legacy_word_table import MockLegacyWordTable as MockWordTable
+from mock_icu_word_table import MockIcuWordTable as MockWordTable
 
 class MockPlacexTable:
     """ A placex table for testing.
index 8ac31bc0f4a2f9c20e8e18637bc8a151a1184145..68d19a07beb78e0b3b4adffa298016d2da3eff4d 100644 (file)
@@ -179,7 +179,7 @@ def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory, w
 
 @pytest.mark.parametrize("threads", (1, 5))
 def test_load_data(dsn, place_row, placex_table, osmline_table,
-                   word_table, temp_db_cursor, threads):
+                   temp_db_cursor, threads):
     for func in ('precompute_words', 'getorcreate_housenumber_id', 'make_standard_name'):
         temp_db_cursor.execute(f"""CREATE FUNCTION {func} (src TEXT)
                                   RETURNS TEXT AS $$ SELECT 'a'::TEXT $$ LANGUAGE SQL
index 8fef0dc1a4025461545e0ca8d42af1e12db2d2fc..d102b97da9d3ab10392810b15336e8c7ab281ae0 100644 (file)
@@ -14,6 +14,8 @@ from nominatim.tools import migration
 from nominatim.errors import UsageError
 import nominatim.version
 
+from mock_legacy_word_table import MockLegacyWordTable
+
 class DummyTokenizer:
 
     def update_sql_functions(self, config):
@@ -26,6 +28,10 @@ def postprocess_mock(monkeypatch):
     monkeypatch.setattr(migration.tokenizer_factory, 'get_tokenizer_for_db',
                         lambda *args: DummyTokenizer())
 
+@pytest.fixture
+def legacy_word_table(temp_db_conn):
+    return MockLegacyWordTable(temp_db_conn)
+
 
 def test_no_migration_old_versions(temp_db_with_extensions, table_factory, def_config):
     table_factory('country_name', 'name HSTORE, country_code TEXT')
@@ -156,7 +162,7 @@ def test_add_nominatim_property_table_repeat(temp_db_conn, temp_db_cursor,
 
 
 def test_change_housenumber_transliteration(temp_db_conn, temp_db_cursor,
-                                            word_table, placex_table):
+                                            legacy_word_table, placex_table):
     placex_table.add(housenumber='3A')
 
     temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION make_standard_name(name TEXT)
index 556d6457a0e9c726e1bae855be318f4fe2f89d2f..bdfe309471f0995188c9fdd32cc13815e9cac9ee 100644 (file)
@@ -65,7 +65,7 @@ def tokenizer():
     return dummy_tokenizer.DummyTokenizer(None, None)
 
 @pytest.fixture
-def postcode_table(temp_db_conn, placex_table, word_table):
+def postcode_table(temp_db_conn, placex_table):
     return MockPostcodeTable(temp_db_conn)
 
 
index 40ee7ba8f63a5be109cfb0175389ce9f682e433b..3537bcf4486c0050298cb4afe2499a91c318907a 100755 (executable)
@@ -25,10 +25,10 @@ export DEBIAN_FRONTEND=noninteractive #DOCS:
     sudo apt install -y build-essential cmake g++ libboost-dev libboost-system-dev \
                         libboost-filesystem-dev libexpat1-dev zlib1g-dev\
                         libbz2-dev libpq-dev libproj-dev \
-                        postgresql-server-dev-10 postgresql-10-postgis-2.4 \
+                        postgresql-10-postgis-2.4 \
                         postgresql-contrib-10 postgresql-10-postgis-scripts \
                         php php-pgsql php-intl libicu-dev python3-pip \
-                        python3-psutil python3-jinja2 python3-icu git
+                        python3-psutil python3-jinja2 python3-yaml python3-icu git
 
 # Some of the Python packages that come with Ubuntu 18.04 are too old, so
 # install the latest version from pip:
index cdfb20f28f2d0a366f67c12a2217138265611156..1ea180e84b64a4b76455b458e8e5fdd82c3dafb2 100755 (executable)
@@ -24,11 +24,11 @@ export DEBIAN_FRONTEND=noninteractive #DOCS:
     sudo apt install -y build-essential cmake g++ libboost-dev libboost-system-dev \
                         libboost-filesystem-dev libexpat1-dev zlib1g-dev \
                         libbz2-dev libpq-dev libproj-dev \
-                        postgresql-server-dev-12 postgresql-12-postgis-3 \
+                        postgresql-12-postgis-3 \
                         postgresql-contrib-12 postgresql-12-postgis-3-scripts \
                         php php-pgsql php-intl libicu-dev python3-dotenv \
                         python3-psycopg2 python3-psutil python3-jinja2 \
-                        python3-icu python3-datrie git
+                        python3-icu python3-datrie python3-yaml git
 
 #
 # System Configuration