From 4002bee0c160c4d8137d5f58b0134a37a66b91b3 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 10 May 2022 12:02:50 +0200 Subject: [PATCH] make ICU the default tokenizer --- CMakeLists.txt | 2 +- docs/admin/Installation.md | 9 +++++++++ docs/customize/Tokenizers.md | 18 +++++++++++++++++- settings/env.defaults | 4 ++-- 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index af7dbc2a..8360d549 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,7 +44,7 @@ endif() set(BUILD_IMPORTER on CACHE BOOL "Build everything for importing/updating the database") set(BUILD_API on CACHE BOOL "Build everything for the API server") -set(BUILD_MODULE on CACHE BOOL "Build PostgreSQL module") +set(BUILD_MODULE off CACHE BOOL "Build PostgreSQL module for legacy tokenizer") set(BUILD_TESTS on CACHE BOOL "Build test suite") set(BUILD_DOCS on CACHE BOOL "Build documentation") set(BUILD_MANPAGE on CACHE BOOL "Build Manual Page") diff --git a/docs/admin/Installation.md b/docs/admin/Installation.md index 8c4c670b..6b585579 100644 --- a/docs/admin/Installation.md +++ b/docs/admin/Installation.md @@ -158,6 +158,15 @@ make sudo make install ``` +!!! warning + The default installation no longer compiles the PostgreSQL module that + is needed for the legacy tokenizer from older Nominatim versions. If you + are upgrading an older database or want to run the + [legacy tokenizer](../customize/Tokenizers.md#legacy-tokenizer) for + some other reason, you need to enable the PostgreSQL module via + cmake: `cmake -DBUILD_MODULE=on ../Nominatim` + + Nominatim installs itself into `/usr/local` per default. To choose a different installation directory add `-DCMAKE_INSTALL_PREFIX=` to the cmake command. 
Make sure that the `bin` directory is available in your path diff --git a/docs/customize/Tokenizers.md b/docs/customize/Tokenizers.md index d849eb48..19d867dd 100644 --- a/docs/customize/Tokenizers.md +++ b/docs/customize/Tokenizers.md @@ -19,7 +19,22 @@ they can be configured. The legacy tokenizer implements the analysis algorithms of older Nominatim versions. It uses a special Postgresql module to normalize names and queries. -This tokenizer is currently the default. +This tokenizer is automatically installed and used when upgrading an older +database. It should not be used for new installations anymore. + +### Compiling the PostgreSQL module + +The tokenizer needs a special C module for PostgreSQL which is not compiled +by default. If you need the legacy tokenizer, compile Nominatim as follows: + +``` +mkdir build +cd build +cmake -DBUILD_MODULE=on +make +``` + +### Enabling the tokenizer To enable the tokenizer add the following line to your project configuration: @@ -47,6 +62,7 @@ normalization functions are hard-coded. The ICU tokenizer uses the [ICU library](http://site.icu-project.org/) to normalize names and queries. It also offers configurable decomposition and abbreviation handling. +This tokenizer is currently the default. To enable the tokenizer add the following line to your project configuration: diff --git a/settings/env.defaults b/settings/env.defaults index e5dfe4a6..3115f438 100644 --- a/settings/env.defaults +++ b/settings/env.defaults @@ -21,8 +21,8 @@ NOMINATIM_DATABASE_MODULE_PATH= # Tokenizer used for normalizing and parsing queries and names. # The tokenizer is set up during import and cannot be changed afterwards # without a reimport. -# Currently available tokenizers: legacy -NOMINATIM_TOKENIZER="legacy" +# Currently available tokenizers: icu, legacy +NOMINATIM_TOKENIZER="icu" # Number of occurrences of a word before it is considered frequent. # Similar to the concept of stop words. Frequent partial words get ignored -- 2.39.5