From 4002bee0c160c4d8137d5f58b0134a37a66b91b3 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Tue, 10 May 2022 12:02:50 +0200
Subject: [PATCH] make ICU the default tokenizer

---
 CMakeLists.txt               |  2 +-
 docs/admin/Installation.md   |  9 +++++++++
 docs/customize/Tokenizers.md | 18 +++++++++++++++++-
 settings/env.defaults        |  4 ++--
 4 files changed, 29 insertions(+), 4 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index af7dbc2a..8360d549 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,7 +44,7 @@ endif()
 
 set(BUILD_IMPORTER on CACHE BOOL "Build everything for importing/updating the database")
 set(BUILD_API on CACHE BOOL "Build everything for the API server")
-set(BUILD_MODULE on CACHE BOOL "Build PostgreSQL module")
+set(BUILD_MODULE off CACHE BOOL "Build PostgreSQL module for legacy tokenizer")
 set(BUILD_TESTS on CACHE BOOL "Build test suite")
 set(BUILD_DOCS on CACHE BOOL "Build documentation")
 set(BUILD_MANPAGE on CACHE BOOL "Build Manual Page")
diff --git a/docs/admin/Installation.md b/docs/admin/Installation.md
index 8c4c670b..6b585579 100644
--- a/docs/admin/Installation.md
+++ b/docs/admin/Installation.md
@@ -158,6 +158,15 @@ make
 sudo make install
 ```
 
+!!! warning
+    The default installation no longer compiles the PostgreSQL module that
+    is needed for the legacy tokenizer from older Nominatim versions. If you
+    are upgrading an older database or want to run the
+    [legacy tokenizer](../customize/Tokenizers.md#legacy-tokenizer) for
+    some other reason, you need to enable the PostgreSQL module via
+    cmake: `cmake -DBUILD_MODULE=on ../Nominatim`
+
+
 Nominatim installs itself into `/usr/local` per default. To choose a different
 installation directory add `-DCMAKE_INSTALL_PREFIX=<install root>` to the
 cmake command. Make sure that the `bin` directory is available in your path
diff --git a/docs/customize/Tokenizers.md b/docs/customize/Tokenizers.md
index d849eb48..19d867dd 100644
--- a/docs/customize/Tokenizers.md
+++ b/docs/customize/Tokenizers.md
@@ -19,7 +19,22 @@ they can be configured.
 
 The legacy tokenizer implements the analysis algorithms of older Nominatim
 versions. It uses a special Postgresql module to normalize names and queries.
-This tokenizer is currently the default.
+This tokenizer is automatically installed and used when upgrading an older
+database. It should not be used for new installations anymore.
+
+### Compiling the PostgreSQL module
+
+The tokenizer needs a special C module for PostgreSQL which is not compiled
+by default. If you need the legacy tokenizer, compile Nominatim as follows:
+
+```
+mkdir build
+cd build
+cmake -DBUILD_MODULE=on ../Nominatim
+make
+```
+
+### Enabling the tokenizer
 
 To enable the tokenizer add the following line to your project configuration:
 
@@ -47,6 +62,7 @@ normalization functions are hard-coded.
 The ICU tokenizer uses the [ICU library](http://site.icu-project.org/) to
 normalize names and queries. It also offers configurable decomposition and
 abbreviation handling.
+This tokenizer is currently the default.
 
 To enable the tokenizer add the following line to your project configuration:
 
diff --git a/settings/env.defaults b/settings/env.defaults
index e5dfe4a6..3115f438 100644
--- a/settings/env.defaults
+++ b/settings/env.defaults
@@ -21,8 +21,8 @@ NOMINATIM_DATABASE_MODULE_PATH=
 # Tokenizer used for normalizing and parsing queries and names.
 # The tokenizer is set up during import and cannot be changed afterwards
 # without a reimport.
-# Currently available tokenizers: legacy
-NOMINATIM_TOKENIZER="legacy"
+# Currently available tokenizers: icu, legacy
+NOMINATIM_TOKENIZER="icu"
 
 # Number of occurrences of a word before it is considered frequent.
 # Similar to the concept of stop words. Frequent partial words get ignored
-- 
2.39.5