X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/b9fbfeff67b420905a4176f4f5e9312746d0c42e..4fd2e961b6daaabba02f6f720f01b918364e5500:/settings/legacy_icu_tokenizer.yaml

Summary of the single hunk below (text in this preamble is not part of the patch):
  normalization:
    * drops the leading ":: NFD ()" and mark-stripping rules and the trailing
      "[[:Punctuation:][:Space:]]+ > ' '" / ":: NFC ()" rules;
    * adds the icu-rules/unicode-digits-to-decimal.yaml include, the
      '№' / 'n°' / 'nº' -> 'no' and ª -> a / º -> o replacements, and a rule
      turning punctuation and symbols into a space;
    * adds removal of everything that is not a letter, number or space, removal
      of [:Lm:], Latin / Ascii / NFD transforms filtered to [[:Number:]],
      mark and [:Cf:] stripping, and a final "[:Space:]+ > ' '" rule.
  transliteration:
    * adds ":: Latin ()" before the extended-unicode include;
    * removes the "'' >", mark-stripping and punctuation/space rules;
    * adds "[:Space:]+ > ' '" after ":: NFC ()".
  The variants section appears only as unchanged context.

diff --git a/settings/legacy_icu_tokenizer.yaml b/settings/legacy_icu_tokenizer.yaml
index a3f1c027..7972b156 100644
--- a/settings/legacy_icu_tokenizer.yaml
+++ b/settings/legacy_icu_tokenizer.yaml
@@ -1,20 +1,29 @@
 normalization:
-    - ":: NFD ()"
-    - "[[:Nonspacing Mark:] [:Cf:]] >"
     - ":: lower ()"
+    - !include icu-rules/unicode-digits-to-decimal.yaml
+    - "'№' > 'no'"
+    - "'n°' > 'no'"
+    - "'nº' > 'no'"
+    - "ª > a"
+    - "º > o"
+    - "[[:Punctuation:][:Symbol:]] > ' '"
     - "ß > 'ss'" # German szet is unimbigiously equal to double ss
-    - "[[:Punctuation:][:Space:]]+ > ' '"
-    - ":: NFC ()"
+    - "[^[:Letter:] [:Number:] [:Space:]] >"
+    - "[:Lm:] >"
+    - ":: [[:Number:]] Latin ()"
+    - ":: [[:Number:]] Ascii ();"
+    - ":: [[:Number:]] NFD ();"
+    - "[[:Nonspacing Mark:] [:Cf:]] >;"
+    - "[:Space:]+ > ' '"
 transliteration:
+    - ":: Latin ()"
     - !include icu-rules/extended-unicode-to-asccii.yaml
     - ":: Ascii ()"
     - ":: NFD ()"
-    - "'' >"
-    - "[[:Nonspacing Mark:] [:Cf:]] >"
     - "[^[:Ascii:]] >"
     - ":: lower ()"
-    - "[[:Punctuation:][:Space:]]+ > ' '"
     - ":: NFC ()"
+    - "[:Space:]+ > ' '"
 variants:
     - words:
         - ~hal => hal