From 0e186835b914074e1784ccce398a6eca3e78bfb7 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 2 Dec 2022 10:15:02 +0100 Subject: [PATCH] contract duplicate spaces in transliteration string There are some pathological cases where an isolated letter may be deleted because it is in itself meaningless. If this happens in the middle of a sentence, then the transliteration contains two consecutive spaces. Add a final rule to fix this. See #2909. --- settings/icu_tokenizer.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index 16339970..f30578a2 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -24,6 +24,7 @@ transliteration: - ":: lower ()" - "[^a-z0-9[:Space:]] >" - ":: NFC ()" + - "[:Space:]+ > ' '" sanitizers: - step: clean-housenumbers filter-kind: -- 2.39.5