git.openstreetmap.org Git - nominatim.git/commitdiff
improve normalization
author Sarah Hoffmann <lonvia@denofr.de>
Sat, 26 Jun 2021 17:38:08 +0000 (19:38 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Sun, 4 Jul 2021 08:28:20 +0000 (10:28 +0200)
Make sure all special symbols are removed during normalization already.
Those won't be interpreted in any way because they are unlikely to be
searched for.

settings/icu-rules/extended-unicode-to-asccii.yaml
settings/icu-rules/unicode-digits-to-decimal.yaml [new file with mode: 0644]
settings/legacy_icu_tokenizer.yaml

index 921874f50d17674140acf511610870c5e678990e..959774d2b0511a130e038bbe9503d0de0b90e310 100644 (file)
@@ -1,4 +1,4 @@
-- ":: Latin ()"
+- "'ł' > 'l'"
 - "'ª' > 'a'"
 - "'µ' > 'u'"
 - "'º' > 'o'"
 - "'ª' > 'a'"
 - "'µ' > 'u'"
 - "'º' > 'o'"
diff --git a/settings/icu-rules/unicode-digits-to-decimal.yaml b/settings/icu-rules/unicode-digits-to-decimal.yaml
new file mode 100644 (file)
index 0000000..55b3274
--- /dev/null
@@ -0,0 +1,24 @@
+- "[𞥐𐒠߀𖭐꤀𖩠𑓐𑑐𑋰𑄶꩐꘠᱀᭐᮰᠐០᥆༠໐꧰႐᪐᪀᧐𑵐꯰᱐𑱐𑜰𑛀𑙐𑇐꧐꣐෦𑁦0𝟶𝟘𝟬𝟎𝟢₀⓿⓪⁰] > 0"
+- "[𞥑𐒡߁𖭑꤁𖩡𑓑𑑑𑋱𑄷꩑꘡᱁᭑᮱᠑១᥇༡໑꧱႑᪑᪁᧑𑵑꯱᱑𑱑𑜱𑛁𑙑𑇑꧑꣑෧𑁧1𝟷𝟙𝟭𝟏𝟣₁¹①⑴⒈❶➀➊⓵] > 1"
+- "[𞥒𐒢߂𖭒꤂𖩢𑓒𑑒𑋲𑄸꩒꘢᱂᭒᮲᠒២᥈༢໒꧲႒᪒᪂᧒𑵒꯲᱒𑱒𑜲𑛂𑙒𑇒꧒꣒෨𑁨2𝟸𝟚𝟮𝟐𝟤₂²②⑵⒉❷➁➋⓶] > 2"
+- "[𞥓𐒣߃𖭓꤃𖩣𑓓𑑓𑋳𑄹꩓꘣᱃᭓᮳᠓៣᥉༣໓꧳႓᪓᪃᧓𑵓꯳᱓𑱓𑜳𑛃𑙓𑇓꧓꣓෩𑁩3𝟹𝟛𝟯𝟑𝟥₃³③⑶⒊❸➂➌⓷] > 3"
+- "[𞥔𐒤߄𖭔꤄𖩤𑓔𑑔𑋴𑄺꩔꘤᱄᭔᮴᠔៤᥊༤໔꧴႔᪔᪄᧔𑵔꯴᱔𑱔𑜴𑛄𑙔𑇔꧔꣔෪𑁪4𝟺𝟜𝟰𝟒𝟦₄⁴④⑷⒋❹➃➍⓸] > 4"
+- "[𞥕𐒥߅𖭕꤅𖩥𑓕𑑕𑋵𑄻꩕꘥᱅᭕᮵᠕៥᥋༥໕꧵႕᪕᪅᧕𑵕꯵᱕𑱕𑜵𑛅𑙕𑇕꧕꣕෫𑁫5𝟻𝟝𝟱𝟓𝟧₅⁵⑤⑸⒌❺➄➎⓹] > 5"
+- "[𞥖𐒦߆𖭖꤆𖩦𑓖𑑖𑋶𑄼꩖꘦᱆᭖᮶᠖៦᥌༦໖꧶႖᪖᪆᧖𑵖꯶᱖𑱖𑜶𑛆𑙖𑇖꧖꣖෬𑁬6𝟼𝟞𝟲𝟔𝟨₆⁶⑥⑹⒍❻➅➏⓺] > 6"
+- "[𞥗𐒧߇𖭗꤇𖩧𑓗𑑗𑋷𑄽꩗꘧᱇᭗᮷᠗៧᥍༧໗꧷႗᪗᪇᧗𑵗꯷᱗𑱗𑜷𑛇𑙗𑇗꧗꣗෭𑁭7𝟽𝟟𝟳𝟕𝟩₇⁷⑦⑺⒎❼➆➐⓻] > 7"
+- "[𞥘𐒨߈𖭘꤈𖩨𑓘𑑘𑋸𑄾꩘꘨᱈᭘᮸᠘៨᥎༨໘꧸႘᪘᪈᧘𑵘꯸᱘𑱘𑜸𑛈𑙘𑇘꧘꣘෮𑁮8𝟾𝟠𝟴𝟖𝟪₈⁸⑧⑻⒏❽➇➑⓼] > 8"
+- "[𞥙𐒩߉𖭙꤉𖩩𑓙𑑙𑋹𑄿꩙꘩᱉᭙᮹᠙៩᥏༩໙꧹႙᪙᪉᧙𑵙꯹᱙𑱙𑜹𑛉𑙙𑇙꧙꣙෯𑁯9𝟿𝟡𝟵𝟗𝟫₉⁹⑨⑼⒐❾➈➒⓽] > 9"
+- "[𑜺⑩⑽⒑❿➉➓⓾] > '10'"
+- "[⑪⑾⒒⓫] > '11'"
+- "[⑫⑿⒓⓬] > '12'"
+- "[⑬⒀⒔⓭] > '13'"
+- "[⑭⒁⒕⓮] > '14'"
+- "[⑮⒂⒖⓯] > '15'"
+- "[⑯⒃⒗⓰] > '16'"
+- "[⑰⒄⒘⓱] > '17'"
+- "[⑱⒅⒙⓲] > '18'"
+- "[⑲⒆⒚⓳] > '19'"
+- "[𑜻⑳⒇⒛⓴] > '20'"
+- "⅐ > ' 1/7'"
+- "⅑ > ' 1/9'"
+- "⅒  > ' 1/10'"
index a3f1c02735238b21d96af8d9b5f3bbeaa7629749..7972b156d455d8841daa020328252edef9da9056 100644 (file)
@@ -1,20 +1,29 @@
 normalization:
 normalization:
-    - ":: NFD ()"
-    - "[[:Nonspacing Mark:] [:Cf:]] >"
     - ":: lower ()"
     - ":: lower ()"
+    - !include icu-rules/unicode-digits-to-decimal.yaml
+    - "'№' > 'no'"
+    - "'n°' > 'no'"
+    - "'nº' > 'no'"
+    - "ª > a"
+    - "º > o"
+    - "[[:Punctuation:][:Symbol:]]  > ' '"
     - "ß > 'ss'" # German eszett is unambiguously equal to double ss
     - "ß > 'ss'" # German eszett is unambiguously equal to double ss
-    - "[[:Punctuation:][:Space:]]+ > ' '"
-    - ":: NFC ()"
+    - "[^[:Letter:] [:Number:] [:Space:]] >"
+    - "[:Lm:] >"
+    - ":: [[:Number:]] Latin ()"
+    - ":: [[:Number:]] Ascii ();"
+    - ":: [[:Number:]] NFD ();"
+    - "[[:Nonspacing Mark:] [:Cf:]] >;"
+    - "[:Space:]+ > ' '"
 transliteration:
 transliteration:
+    - ":: Latin ()"
     - !include icu-rules/extended-unicode-to-asccii.yaml
     - ":: Ascii ()"
     - ":: NFD ()"
     - !include icu-rules/extended-unicode-to-asccii.yaml
     - ":: Ascii ()"
     - ":: NFD ()"
-    - "'' >"
-    - "[[:Nonspacing Mark:] [:Cf:]] >"
     - "[^[:Ascii:]] >"
     - ":: lower ()"
     - "[^[:Ascii:]] >"
     - ":: lower ()"
-    - "[[:Punctuation:][:Space:]]+ > ' '"
     - ":: NFC ()"
     - ":: NFC ()"
+    - "[:Space:]+ > ' '"
 variants:
   - words:
     - ~hal => hal
 variants:
   - words:
     - ~hal => hal