From fb54bd3fcff5a8a44ae59f0d552ed316b08966b3 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Mon, 10 Jan 2022 17:40:03 +0100 Subject: [PATCH] consider "modifier letter apostrophe" to be punctuation While technically being a letter, the apostrophe is often replaced with a normal apostrophe in writing which is a punctuation mark. This makes sure that the modifier letter apostrophe yields the same normalization results and thus is really interchangeable. Only has an effect after the next reimport. Fixes #2569. --- settings/icu_tokenizer.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index e5cbeb6f..a3c62e67 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -7,7 +7,7 @@ normalization: - "'nº' > 'no'" - "ª > a" - "º > o" - - "[[:Punctuation:][:Symbol:]] > ' '" + - "[[:Punctuation:][:Symbol:]\u02bc] > ' '" - "ß > 'ss'" # German szet is unimbigiously equal to double ss - "[^[:Letter:] [:Number:] [:Space:]] >" - "[:Lm:] >" -- 2.39.5