From 63dc4b39bc6bc0bf5a95d0c1a8298f5349637a9e Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Thu, 28 Apr 2022 17:20:56 +0200
Subject: [PATCH] ICU: better letter identification in normalization

The Letter class does not include non-spacing marks that can also
have a consonant or vowel meaning, especially in Indian languages.
Use the alnum property instead which includes them all. Also
include the vowel-canceling Virama, which is not a letter by itself
but changes the transliteration.
---
 settings/icu_tokenizer.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml
index bebd49e9..cd9c0d6d 100644
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -8,8 +8,8 @@ normalization:
     - "ª > a"
     - "º > o"
     - "[[:Punctuation:][:Symbol:]\u02bc]  > ' '"
-    - "ß > 'ss'" # German szet is unimbigiously equal to double ss
-    - "[^[:Letter:] [:Number:] [:Space:]] >"
+    - "ß > 'ss'" # German szet is unambiguously equal to double ss
+    - "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:]] >"
     - "[:Lm:] >"
     - ":: [[:Number:]] Latin ()"
     - ":: [[:Number:]] Ascii ();"
-- 
2.39.5