From: Sarah Hoffmann Date: Sat, 26 Jun 2021 17:38:08 +0000 (+0200) Subject: improve normalization X-Git-Tag: v4.0.0~58^2~6 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/4fd2e961b6daaabba02f6f720f01b918364e5500?ds=sidebyside improve normalization Make sure all special symbols are removed during normalization already. Those won't be interpreted in any way because they are unlikely to be searched for. --- diff --git a/settings/icu-rules/extended-unicode-to-asccii.yaml b/settings/icu-rules/extended-unicode-to-asccii.yaml index 921874f5..959774d2 100644 --- a/settings/icu-rules/extended-unicode-to-asccii.yaml +++ b/settings/icu-rules/extended-unicode-to-asccii.yaml @@ -1,4 +1,4 @@ -- ":: Latin ()" +- "'ł' > 'l'" - "'ª' > 'a'" - "'µ' > 'u'" - "'º' > 'o'" diff --git a/settings/icu-rules/unicode-digits-to-decimal.yaml b/settings/icu-rules/unicode-digits-to-decimal.yaml new file mode 100644 index 00000000..55b3274a --- /dev/null +++ b/settings/icu-rules/unicode-digits-to-decimal.yaml @@ -0,0 +1,24 @@ +- "[𞥐𐒠߀𖭐꤀𖩠𑓐𑑐𑋰𑄶꩐꘠᱀᭐᮰᠐០᥆༠໐꧰႐᪐᪀᧐𑵐꯰᱐𑱐𑜰𑛀𑙐𑇐꧐꣐෦𑁦0𝟶𝟘𝟬𝟎𝟢₀⓿⓪⁰] > 0" +- "[𞥑𐒡߁𖭑꤁𖩡𑓑𑑑𑋱𑄷꩑꘡᱁᭑᮱᠑១᥇༡໑꧱႑᪑᪁᧑𑵑꯱᱑𑱑𑜱𑛁𑙑𑇑꧑꣑෧𑁧1𝟷𝟙𝟭𝟏𝟣₁¹①⑴⒈❶➀➊⓵] > 1" +- "[𞥒𐒢߂𖭒꤂𖩢𑓒𑑒𑋲𑄸꩒꘢᱂᭒᮲᠒២᥈༢໒꧲႒᪒᪂᧒𑵒꯲᱒𑱒𑜲𑛂𑙒𑇒꧒꣒෨𑁨2𝟸𝟚𝟮𝟐𝟤₂²②⑵⒉❷➁➋⓶] > 2" +- "[𞥓𐒣߃𖭓꤃𖩣𑓓𑑓𑋳𑄹꩓꘣᱃᭓᮳᠓៣᥉༣໓꧳႓᪓᪃᧓𑵓꯳᱓𑱓𑜳𑛃𑙓𑇓꧓꣓෩𑁩3𝟹𝟛𝟯𝟑𝟥₃³③⑶⒊❸➂➌⓷] > 3" +- "[𞥔𐒤߄𖭔꤄𖩤𑓔𑑔𑋴𑄺꩔꘤᱄᭔᮴᠔៤᥊༤໔꧴႔᪔᪄᧔𑵔꯴᱔𑱔𑜴𑛄𑙔𑇔꧔꣔෪𑁪4𝟺𝟜𝟰𝟒𝟦₄⁴④⑷⒋❹➃➍⓸] > 4" +- "[𞥕𐒥߅𖭕꤅𖩥𑓕𑑕𑋵𑄻꩕꘥᱅᭕᮵᠕៥᥋༥໕꧵႕᪕᪅᧕𑵕꯵᱕𑱕𑜵𑛅𑙕𑇕꧕꣕෫𑁫5𝟻𝟝𝟱𝟓𝟧₅⁵⑤⑸⒌❺➄➎⓹] > 5" +- "[𞥖𐒦߆𖭖꤆𖩦𑓖𑑖𑋶𑄼꩖꘦᱆᭖᮶᠖៦᥌༦໖꧶႖᪖᪆᧖𑵖꯶᱖𑱖𑜶𑛆𑙖𑇖꧖꣖෬𑁬6𝟼𝟞𝟲𝟔𝟨₆⁶⑥⑹⒍❻➅➏⓺] > 6" +- "[𞥗𐒧߇𖭗꤇𖩧𑓗𑑗𑋷𑄽꩗꘧᱇᭗᮷᠗៧᥍༧໗꧷႗᪗᪇᧗𑵗꯷᱗𑱗𑜷𑛇𑙗𑇗꧗꣗෭𑁭7𝟽𝟟𝟳𝟕𝟩₇⁷⑦⑺⒎❼➆➐⓻] > 7" +- "[𞥘𐒨߈𖭘꤈𖩨𑓘𑑘𑋸𑄾꩘꘨᱈᭘᮸᠘៨᥎༨໘꧸႘᪘᪈᧘𑵘꯸᱘𑱘𑜸𑛈𑙘𑇘꧘꣘෮𑁮8𝟾𝟠𝟴𝟖𝟪₈⁸⑧⑻⒏❽➇➑⓼] > 8" +- "[𞥙𐒩߉𖭙꤉𖩩𑓙𑑙𑋹𑄿꩙꘩᱉᭙᮹᠙៩᥏༩໙꧹႙᪙᪉᧙𑵙꯹᱙𑱙𑜹𑛉𑙙𑇙꧙꣙෯𑁯9𝟿𝟡𝟵𝟗𝟫₉⁹⑨⑼⒐❾➈➒⓽] > 9" +- "[𑜺⑩⑽⒑❿➉➓⓾] > '10'" +- "[⑪⑾⒒⓫] > '11'" +- "[⑫⑿⒓⓬] > '12'" +- "[⑬⒀⒔⓭] > '13'" +- "[⑭⒁⒕⓮] > '14'" +- "[⑮⒂⒖⓯] > '15'" +- "[⑯⒃⒗⓰] > '16'" +- "[⑰⒄⒘⓱] > '17'" +- "[⑱⒅⒙⓲] > '18'" +- "[⑲⒆⒚⓳] > '19'" +- "[𑜻⑳⒇⒛⓴] > '20'" +- "⅐ > ' 1/7'" +- "⅑ > ' 1/9'" +- "⅒ > ' 1/10'" diff --git a/settings/legacy_icu_tokenizer.yaml 
b/settings/legacy_icu_tokenizer.yaml index a3f1c027..7972b156 100644 --- a/settings/legacy_icu_tokenizer.yaml +++ b/settings/legacy_icu_tokenizer.yaml @@ -1,20 +1,29 @@ normalization: - - ":: NFD ()" - - "[[:Nonspacing Mark:] [:Cf:]] >" - ":: lower ()" + - !include icu-rules/unicode-digits-to-decimal.yaml + - "'№' > 'no'" + - "'n°' > 'no'" + - "'nº' > 'no'" + - "ª > a" + - "º > o" + - "[[:Punctuation:][:Symbol:]] > ' '" - "ß > 'ss'" # German eszett is unambiguously equal to double ss - - "[[:Punctuation:][:Space:]]+ > ' '" - - ":: NFC ()" + - "[^[:Letter:] [:Number:] [:Space:]] >" + - "[:Lm:] >" + - ":: [[:Number:]] Latin ()" + - ":: [[:Number:]] Ascii ();" + - ":: [[:Number:]] NFD ();" + - "[[:Nonspacing Mark:] [:Cf:]] >;" + - "[:Space:]+ > ' '" transliteration: + - ":: Latin ()" - !include icu-rules/extended-unicode-to-asccii.yaml - ":: Ascii ()" - ":: NFD ()" - - "'' >" - - "[[:Nonspacing Mark:] [:Cf:]] >" - "[^[:Ascii:]] >" - ":: lower ()" - - "[[:Punctuation:][:Space:]]+ > ' '" - ":: NFC ()" + - "[:Space:]+ > ' '" variants: - words: - ~hal => hal