Merge pull request #2707 from lonvia/make-icu-tokenizer-the-default

author Sarah Hoffmann <lonvia@denofr.de>

Wed, 11 May 2022 06:52:49 +0000 (08:52 +0200)

committer GitHub <noreply@github.com>

Wed, 11 May 2022 06:52:49 +0000 (08:52 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Wed, 11 May 2022 06:52:49 +0000 (08:52 +0200)
committer GitHub <noreply@github.com>
Wed, 11 May 2022 06:52:49 +0000 (08:52 +0200)
diff --git a/.github/actions/build-nominatim/action.yml b/.github/actions/build-nominatim/action.yml

index 757decd4f63c7379033ac956acb38cf6c12b0cad..042166ad7fc357a8b4b129961290e2e3ae0cf147 100644 (file)
--- a/.github/actions/build-nominatim/action.yml
+++ b/.github/actions/build-nominatim/action.yml
@@ -5,6 +5,10 @@ inputs:
          description: 'Version of Ubuntu to install on'
          required: false
          default: '20'
+    cmake-args:
+        description: 'Additional options to hand to cmake'
+        required: false
+        default: ''
  
  runs:
      using: "composite"
@@ -21,18 +25,13 @@ runs:
            shell: bash
            env:
              UBUNTUVER: ${{ inputs.ubuntu }}
-
-        - name: Download dependencies
-          run: |
-              if [ ! -f country_grid.sql.gz ]; then
-                  wget --no-verbose https://www.nominatim.org/data/country_grid.sql.gz
-              fi
-              cp country_grid.sql.gz Nominatim/data/country_osm_grid.sql.gz
-          shell: bash
+            CMAKE_ARGS: ${{ inputs.cmake-args }}
  
          - name: Configure
-          run: mkdir build && cd build && cmake ../Nominatim
+          run: mkdir build && cd build && cmake $CMAKE_ARGS ../Nominatim
            shell: bash
+          env:
+            CMAKE_ARGS: ${{ inputs.cmake-args }}
  
          - name: Build
            run: |
diff --git a/.github/actions/setup-postgresql/action.yml b/.github/actions/setup-postgresql/action.yml

index 060a678941b2610c5bfc29c4bada275812a7ccd3..19a19e17e12c65c06044a381045d523e160ab16a 100644 (file)
--- a/.github/actions/setup-postgresql/action.yml
+++ b/.github/actions/setup-postgresql/action.yml
@@ -22,7 +22,7 @@ runs:
  
          - name: Install PostgreSQL
            run: |
-              sudo apt-get install -y -qq --no-install-suggests --no-install-recommends postgresql-client-${PGVER} postgresql-${PGVER}-postgis-${POSTGISVER} postgresql-${PGVER}-postgis-${POSTGISVER}-scripts postgresql-contrib-${PGVER} postgresql-${PGVER} postgresql-server-dev-${PGVER}
+              sudo apt-get install -y -qq --no-install-suggests --no-install-recommends postgresql-client-${PGVER} postgresql-${PGVER}-postgis-${POSTGISVER} postgresql-${PGVER}-postgis-${POSTGISVER}-scripts postgresql-contrib-${PGVER} postgresql-${PGVER}
            shell: bash
            env:
                PGVER: ${{ inputs.postgresql-version }}
diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml

index 6ebf1ab97d154c7d5841d77f35e2cbdf3a1d8d56..a08a995f9c5459c9a3bee7bc214db019887b7eb0 100644 (file)
--- a/.github/workflows/ci-tests.yml
+++ b/.github/workflows/ci-tests.yml
@@ -113,19 +113,9 @@ jobs:
                working-directory: Nominatim/test/bdd
  
  
-    icu-test:
+    legacy-test:
          needs: create-archive
-        strategy:
-            matrix:
-                ubuntu: [20]
-                include:
-                    - ubuntu: 20
-                      postgresql: 13
-                      postgis: 3
-                      pytest: py.test-3
-                      php: 7.4
-
-        runs-on: ubuntu-${{ matrix.ubuntu }}.04
+        runs-on: ubuntu-20.04
  
          steps:
              - uses: actions/download-artifact@v2
@@ -138,35 +128,27 @@ jobs:
              - name: Setup PHP
                uses: shivammathur/setup-php@v2
                with:
-                  php-version: ${{ matrix.php }}
-                  coverage: xdebug
-                  tools: phpunit, phpcs, composer
-
-            - uses: actions/setup-python@v2
-              with:
-                python-version: 3.6
-              if: matrix.ubuntu == 18
+                  php-version: 7.4
  
              - uses: ./Nominatim/.github/actions/setup-postgresql
                with:
-                  postgresql-version: ${{ matrix.postgresql }}
-                  postgis-version: ${{ matrix.postgis }}
+                  postgresql-version: 13
+                  postgis-version: 3
+
+            - name: Install Postgresql server dev
+              run: sudo apt-get install postgresql-server-dev-13
  
              - uses: ./Nominatim/.github/actions/build-nominatim
                with:
-                  ubuntu: ${{ matrix.ubuntu }}
+                  ubuntu: 20
+                  cmake-args: -DBUILD_MODULE=on
  
              - name: Install test prerequsites
                run: sudo apt-get install -y -qq python3-behave
-              if: matrix.ubuntu == 20
-
-            - name: Install test prerequsites
-              run: pip3 install behave==1.2.6
-              if: matrix.ubuntu == 18
  
-            - name: BDD tests (icu tokenizer)
+            - name: BDD tests (legacy tokenizer)
                run: |
-                  behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=icu --format=progress3
+                  behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=legacy --format=progress3
                working-directory: Nominatim/test/bdd
  
  
diff --git a/CMakeLists.txt b/CMakeLists.txt

index af7dbc2a43de5f0577502b299a5d816bc981db03..8360d549d24ff707ea9d3cb8aa2d650c55ca7fad 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,7 +44,7 @@ endif()
  
  set(BUILD_IMPORTER on CACHE BOOL "Build everything for importing/updating the database")
  set(BUILD_API on CACHE BOOL "Build everything for the API server")
-set(BUILD_MODULE on CACHE BOOL "Build PostgreSQL module")
+set(BUILD_MODULE off CACHE BOOL "Build PostgreSQL module for legacy tokenizer")
  set(BUILD_TESTS on CACHE BOOL "Build test suite")
  set(BUILD_DOCS on CACHE BOOL "Build documentation")
  set(BUILD_MANPAGE on CACHE BOOL "Build Manual Page")
diff --git a/docs/admin/Installation.md b/docs/admin/Installation.md

index 8c4c670b39dc901c25285b3cd030c5f850d5bec2..f5411604ef4a5bfeae7b06ed94796b271e63ea8e 100644 (file)
--- a/docs/admin/Installation.md
+++ b/docs/admin/Installation.md
@@ -158,6 +158,17 @@ make
  sudo make install
  ```
  
+!!! warning
+    The default installation no longer compiles the PostgreSQL module that
+    is needed for the legacy tokenizer from older Nominatim versions. If you
+    are upgrading an older database or want to run the
+    [legacy tokenizer](../customize/Tokenizers.md#legacy-tokenizer) for
+    some other reason, you need to enable the PostgreSQL module via
+    cmake: `cmake -DBUILD_MODULE=on ../Nominatim`. To compile the module
+    you need to have the server development headers for PostgreSQL installed.
+    On Ubuntu/Debian run: `sudo apt install postgresql-server-dev-<postgresql version>`
+
+
  Nominatim installs itself into `/usr/local` per default. To choose a different
  installation directory add `-DCMAKE_INSTALL_PREFIX=<install root>` to the
  cmake command. Make sure that the `bin` directory is available in your path
diff --git a/docs/admin/Migration.md b/docs/admin/Migration.md

index 11ee7f0558c37406599d681334b602524492f363..950f7e193beee883a5fd4d06c2ea65bcd1e500a3 100644 (file)
--- a/docs/admin/Migration.md
+++ b/docs/admin/Migration.md
@@ -17,6 +17,14 @@ breaking changes. **Please read them before running the migration.**
  
  ## 4.0.0 -> master
  
+### ICU tokenizer is the new default
+
+Nominatim now installs the [ICU tokenizer](../customize/Tokenizers.md#icu-tokenizer)
+by default. This only has an effect on newly installed databases. An older
+database keeps its installed tokenizer when it is updated. If you still
+run with the legacy tokenizer, make sure to compile Nominatim with the
+PostgreSQL module, see [Installation](Installation.md#building-nominatim).
+
  ### geocodejson output changed
  
  The `type` field of the geocodejson output has changed. It now contains
diff --git a/docs/customize/Tokenizers.md b/docs/customize/Tokenizers.md

index d849eb48c0d457c7c57b27e2807ff55675e2fa33..19d867ddd800063494d72ad6ac078025d7ce2347 100644 (file)
--- a/docs/customize/Tokenizers.md
+++ b/docs/customize/Tokenizers.md
@@ -19,7 +19,22 @@ they can be configured.
  
  The legacy tokenizer implements the analysis algorithms of older Nominatim
  versions. It uses a special Postgresql module to normalize names and queries.
-This tokenizer is currently the default.
+This tokenizer is automatically installed and used when upgrading an older
+database. It should not be used for new installations anymore.
+
+### Compiling the PostgreSQL module
+
+The tokenizer needs a special C module for PostgreSQL which is not compiled
+by default. If you need the legacy tokenizer, compile Nominatim as follows:
+
+```
+mkdir build
+cd build
+cmake -DBUILD_MODULE=on ../Nominatim
+make
+```
+
+### Enabling the tokenizer
  
  To enable the tokenizer add the following line to your project configuration:
  
@@ -47,6 +62,7 @@ normalization functions are hard-coded.
  The ICU tokenizer uses the [ICU library](http://site.icu-project.org/) to
  normalize names and queries. It also offers configurable decomposition and
  abbreviation handling.
+This tokenizer is currently the default.
  
  To enable the tokenizer add the following line to your project configuration:
  
diff --git a/nominatim/config.py b/nominatim/config.py

index 13d9cd8a0d502e4b1f1b5ab58ca7e0761772cfaf..a3f91055fc76b37bf338291c6b7aa2350afb4d21 100644 (file)
--- a/nominatim/config.py
+++ b/nominatim/config.py
@@ -187,7 +187,7 @@ class Configuration:
          if configfile.suffix in ('.yaml', '.yml'):
              result = self._load_from_yaml(configfile)
          elif configfile.suffix == '.json':
-            with configfile.open('r') as cfg:
+            with configfile.open('r', encoding='utf-8') as cfg:
                  result = json.load(cfg)
          else:
              raise UsageError(f"Config file '{configfile}' has unknown format.")
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py

index b553dbc641d708175e8f7281f05cf14cf4673484..9c7138ce67fa5174d0e947c72bf7a71313fe3435 100644 (file)
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -187,7 +187,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
                  @define('CONST_Max_Word_Frequency', 10000000);
                  @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                  @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
-                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
+                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
  
  
      def _save_config(self):
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py

index 3b8f75692f964e9c2e84dc3ada92b156dd0afb7b..97ce6d16644cff6a19369c9db5a7a12af8387078 100644 (file)
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -255,7 +255,7 @@ class LegacyTokenizer(AbstractTokenizer):
                  @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
                  @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
                  require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
-                """.format(config)))
+                """.format(config)), encoding='utf-8')
  
  
      def _init_db_tables(self, config):
diff --git a/settings/env.defaults b/settings/env.defaults

index e5dfe4a6094c01b9ea9605b5de06b8cfc666b274..3115f4382aacf582c5a1054e78c03130bde9f00f 100644 (file)
--- a/settings/env.defaults
+++ b/settings/env.defaults
@@ -21,8 +21,8 @@ NOMINATIM_DATABASE_MODULE_PATH=
  # Tokenizer used for normalizing and parsing queries and names.
  # The tokenizer is set up during import and cannot be changed afterwards
  # without a reimport.
-# Currently available tokenizers: legacy
-NOMINATIM_TOKENIZER="legacy"
+# Currently available tokenizers: icu, legacy
+NOMINATIM_TOKENIZER="icu"
  
  # Number of occurrences of a word before it is considered frequent.
  # Similar to the concept of stop words. Frequent partial words get ignored
diff --git a/test/bdd/environment.py b/test/bdd/environment.py

index 0acc73b436158f373474a63cf028166f87460b9a..ee07e6028c1ca67497082316821fb219a0c0d9f6 100644 (file)
--- a/test/bdd/environment.py
+++ b/test/bdd/environment.py
@@ -59,5 +59,5 @@ def after_scenario(context, scenario):
  
  def before_tag(context, tag):
      if tag == 'fail-legacy':
-        if context.config.userdata['TOKENIZER'] in (None, 'legacy'):
+        if context.config.userdata['TOKENIZER'] == 'legacy':
              context.scenario.skip("Not implemented in legacy tokenizer")
diff --git a/test/bdd/steps/nominatim_environment.py b/test/bdd/steps/nominatim_environment.py

index 7de32e484f50f1f645c79137a45541b2820538ca..70a03e6ec31e37a148f1ea0255c3b283913704b5 100644 (file)
--- a/test/bdd/steps/nominatim_environment.py
+++ b/test/bdd/steps/nominatim_environment.py
@@ -207,7 +207,7 @@ class NominatimEnvironment:
                      self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
                      self.run_nominatim('freeze')
  
-                    if self.tokenizer != 'icu':
+                    if self.tokenizer == 'legacy':
                          phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
                          run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
                      else:
diff --git a/test/bdd/steps/steps_db_ops.py b/test/bdd/steps/steps_db_ops.py

index e02cad8f4ac92c46d17ab4a4e7c74c1a3016cefc..4c711b725b34588599fa01fe27cbbf16698fde6a 100644 (file)
--- a/test/bdd/steps/steps_db_ops.py
+++ b/test/bdd/steps/steps_db_ops.py
@@ -266,7 +266,7 @@ def check_word_table_for_postcodes(context, exclude, postcodes):
      plist.sort()
  
      with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-        if nctx.tokenizer == 'icu':
+        if nctx.tokenizer != 'legacy':
              cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
                          (plist,))
          else:
diff --git a/test/python/conftest.py b/test/python/conftest.py

index f4581bf92aae0b879e362bccef351616029be118..405262950b78319d2924274ad497f933b03490f2 100644 (file)
--- a/test/python/conftest.py
+++ b/test/python/conftest.py
@@ -211,11 +211,6 @@ def osmline_table(temp_db_with_extensions, table_factory):
                       country_code VARCHAR(2)""")
  
  
-@pytest.fixture
-def word_table(temp_db_conn):
-    return mocks.MockWordTable(temp_db_conn)
-
-
  @pytest.fixture
  def sql_preprocessor_cfg(tmp_path, table_factory, temp_db_with_extensions):
      table_factory('country_name', 'partition INT', ((0, ), (1, ), (2, )))
diff --git a/test/python/mocks.py b/test/python/mocks.py

index 2cd2e3e38297f1469f82a3bf236fc2b6ac969eb2..9c6ef53215dd91c24368a6153385da9011f52445 100644 (file)
--- a/test/python/mocks.py
+++ b/test/python/mocks.py
@@ -14,7 +14,7 @@ import psycopg2.extras
  from nominatim.db import properties
  
  # This must always point to the mock word table for the default tokenizer.
-from mock_legacy_word_table import MockLegacyWordTable as MockWordTable
+from mock_icu_word_table import MockIcuWordTable as MockWordTable
  
  class MockPlacexTable:
      """ A placex table for testing.
diff --git a/test/python/tools/test_database_import.py b/test/python/tools/test_database_import.py

index 8ac31bc0f4a2f9c20e8e18637bc8a151a1184145..68d19a07beb78e0b3b4adffa298016d2da3eff4d 100644 (file)
--- a/test/python/tools/test_database_import.py
+++ b/test/python/tools/test_database_import.py
@@ -179,7 +179,7 @@ def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory, w
  
  @pytest.mark.parametrize("threads", (1, 5))
  def test_load_data(dsn, place_row, placex_table, osmline_table,
-                   word_table, temp_db_cursor, threads):
+                   temp_db_cursor, threads):
      for func in ('precompute_words', 'getorcreate_housenumber_id', 'make_standard_name'):
          temp_db_cursor.execute(f"""CREATE FUNCTION {func} (src TEXT)
                                    RETURNS TEXT AS $$ SELECT 'a'::TEXT $$ LANGUAGE SQL
diff --git a/test/python/tools/test_migration.py b/test/python/tools/test_migration.py

index 8fef0dc1a4025461545e0ca8d42af1e12db2d2fc..d102b97da9d3ab10392810b15336e8c7ab281ae0 100644 (file)
--- a/test/python/tools/test_migration.py
+++ b/test/python/tools/test_migration.py
@@ -14,6 +14,8 @@ from nominatim.tools import migration
  from nominatim.errors import UsageError
  import nominatim.version
  
+from mock_legacy_word_table import MockLegacyWordTable
+
  class DummyTokenizer:
  
      def update_sql_functions(self, config):
@@ -26,6 +28,10 @@ def postprocess_mock(monkeypatch):
      monkeypatch.setattr(migration.tokenizer_factory, 'get_tokenizer_for_db',
                          lambda *args: DummyTokenizer())
  
+@pytest.fixture
+def legacy_word_table(temp_db_conn):
+    return MockLegacyWordTable(temp_db_conn)
+
  
  def test_no_migration_old_versions(temp_db_with_extensions, table_factory, def_config):
      table_factory('country_name', 'name HSTORE, country_code TEXT')
@@ -156,7 +162,7 @@ def test_add_nominatim_property_table_repeat(temp_db_conn, temp_db_cursor,
  
  
  def test_change_housenumber_transliteration(temp_db_conn, temp_db_cursor,
-                                            word_table, placex_table):
+                                            legacy_word_table, placex_table):
      placex_table.add(housenumber='3A')
  
      temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION make_standard_name(name TEXT)
diff --git a/test/python/tools/test_postcodes.py b/test/python/tools/test_postcodes.py

index 556d6457a0e9c726e1bae855be318f4fe2f89d2f..bdfe309471f0995188c9fdd32cc13815e9cac9ee 100644 (file)
--- a/test/python/tools/test_postcodes.py
+++ b/test/python/tools/test_postcodes.py
@@ -65,7 +65,7 @@ def tokenizer():
      return dummy_tokenizer.DummyTokenizer(None, None)
  
  @pytest.fixture
-def postcode_table(temp_db_conn, placex_table, word_table):
+def postcode_table(temp_db_conn, placex_table):
      return MockPostcodeTable(temp_db_conn)
  
  
diff --git a/vagrant/Install-on-Ubuntu-18.sh b/vagrant/Install-on-Ubuntu-18.sh

index 40ee7ba8f63a5be109cfb0175389ce9f682e433b..3537bcf4486c0050298cb4afe2499a91c318907a 100755 (executable)
--- a/vagrant/Install-on-Ubuntu-18.sh
+++ b/vagrant/Install-on-Ubuntu-18.sh
@@ -25,10 +25,10 @@ export DEBIAN_FRONTEND=noninteractive #DOCS:
      sudo apt install -y build-essential cmake g++ libboost-dev libboost-system-dev \
                          libboost-filesystem-dev libexpat1-dev zlib1g-dev\
                          libbz2-dev libpq-dev libproj-dev \
-                        postgresql-server-dev-10 postgresql-10-postgis-2.4 \
+                        postgresql-10-postgis-2.4 \
                          postgresql-contrib-10 postgresql-10-postgis-scripts \
                          php php-pgsql php-intl libicu-dev python3-pip \
-                        python3-psutil python3-jinja2 python3-icu git
+                        python3-psutil python3-jinja2 python3-yaml python3-icu git
  
  # Some of the Python packages that come with Ubuntu 18.04 are too old, so
  # install the latest version from pip:
diff --git a/vagrant/Install-on-Ubuntu-20.sh b/vagrant/Install-on-Ubuntu-20.sh

index cdfb20f28f2d0a366f67c12a2217138265611156..1ea180e84b64a4b76455b458e8e5fdd82c3dafb2 100755 (executable)
--- a/vagrant/Install-on-Ubuntu-20.sh
+++ b/vagrant/Install-on-Ubuntu-20.sh
@@ -24,11 +24,11 @@ export DEBIAN_FRONTEND=noninteractive #DOCS:
      sudo apt install -y build-essential cmake g++ libboost-dev libboost-system-dev \
                          libboost-filesystem-dev libexpat1-dev zlib1g-dev \
                          libbz2-dev libpq-dev libproj-dev \
-                        postgresql-server-dev-12 postgresql-12-postgis-3 \
+                        postgresql-12-postgis-3 \
                          postgresql-contrib-12 postgresql-12-postgis-3-scripts \
                          php php-pgsql php-intl libicu-dev python3-dotenv \
                          python3-psycopg2 python3-psutil python3-jinja2 \
-                        python3-icu python3-datrie git
+                        python3-icu python3-datrie python3-yaml git
  
  #
  # System Configuration
author	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 11 May 2022 06:52:49 +0000 (08:52 +0200)
committer	GitHub <noreply@github.com>
	Wed, 11 May 2022 06:52:49 +0000 (08:52 +0200)
.github/actions/build-nominatim/action.yml		patch \| blob \| history
.github/actions/setup-postgresql/action.yml		patch \| blob \| history
.github/workflows/ci-tests.yml		patch \| blob \| history
CMakeLists.txt		patch \| blob \| history
docs/admin/Installation.md		patch \| blob \| history
docs/admin/Migration.md		patch \| blob \| history
docs/customize/Tokenizers.md		patch \| blob \| history
nominatim/config.py		patch \| blob \| history
nominatim/tokenizer/icu_tokenizer.py		patch \| blob \| history
nominatim/tokenizer/legacy_tokenizer.py		patch \| blob \| history
settings/env.defaults		patch \| blob \| history
test/bdd/environment.py		patch \| blob \| history
test/bdd/steps/nominatim_environment.py		patch \| blob \| history
test/bdd/steps/steps_db_ops.py		patch \| blob \| history
test/python/conftest.py		patch \| blob \| history
test/python/mocks.py		patch \| blob \| history
test/python/tools/test_database_import.py		patch \| blob \| history
test/python/tools/test_migration.py		patch \| blob \| history
test/python/tools/test_postcodes.py		patch \| blob \| history
vagrant/Install-on-Ubuntu-18.sh		patch \| blob \| history
vagrant/Install-on-Ubuntu-20.sh		patch \| blob \| history