From: Sarah Hoffmann
Date: Wed, 10 Nov 2021 16:14:13 +0000 (+0100)
Subject: avoid special characters in word tokens
X-Git-Tag: v4.0.1~4^2~1
X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/188695266699e2afbc30f25b0d91de6b9511e425

avoid special characters in word tokens

Transliteration should only consist of ASCII letters and numbers.
Avoid any other characters.
---

diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml
index 41760c49..e5cbeb6f 100644
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -21,8 +21,8 @@ transliteration:
     - !include icu-rules/extended-unicode-to-asccii.yaml
     - ":: Ascii ()"
     - ":: NFD ()"
-    - "[^[:Ascii:]] >"
     - ":: lower ()"
+    - "[^a-z0-9[:Space:]] >"
     - ":: NFC ()"
 sanitizers:
     - step: split-name-list