# nominatim.git / settings / icu_tokenizer.yaml (blob view on git.openstreetmap.org)
# Commit title shown on the page: Merge pull request #3122 from miku0/sanitizer-final
# Normalization rules, written in ICU transform rule syntax and listed in the
# order they appear here. Rule strings are kept exactly as authored.
normalization:
    # Lowercase everything first.
    - ":: lower ()"
    # Convert Simplified Chinese (Hans) to Traditional Chinese (Hant).
    - ":: Hans-Hant"
    # Additional rules pulled in from a separate file (digit conversion,
    # going by the file name).
    - !include icu-rules/unicode-digits-to-decimal.yaml
    # Spellings of the numero sign all become plain 'no'.
    - "'№' > 'no'"
    - "'n°' > 'no'"
    - "'nº' > 'no'"
    # Ordinal indicators become their base letters.
    - "ª > a"
    - "º > o"
    # Punctuation, symbols and U+02BC (modifier letter apostrophe) become a space.
    - "[[:Punctuation:][:Symbol:]\u02bc]  > ' '"
    - "ß > 'ss'" # German eszett is unambiguously equal to double ss
    # Drop everything that is not alphanumeric, a virama-class mark or a space.
    - "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:]] >"
    # Drop modifier letters (general category Lm).
    - "[:Lm:] >"
    # The next three transforms are filtered to characters in the Number class.
    - ":: [[:Number:]] Latin ()"
    - ":: [[:Number:]] Ascii ();"
    - ":: [[:Number:]] NFD ();"
    # Drop nonspacing marks and format characters (general category Cf).
    - "[[:Nonspacing Mark:] [:Cf:]] >;"
    # Collapse runs of whitespace into a single space.
    - "[:Space:]+ > ' '"
# Transliteration rules, written in ICU transform rule syntax and listed in
# the order they appear here. The last character-level rule leaves only
# a-z, 0-9 and whitespace.
transliteration:
    # Convert other scripts to Latin.
    - ":: Latin ()"
    # Additional rules from a separate file. The file name is spelled
    # 'asccii' here; it is a path, so it must match the file on disk.
    - !include icu-rules/extended-unicode-to-asccii.yaml
    # Reduce to ASCII, decompose, then lowercase.
    - ":: Ascii ()"
    - ":: NFD ()"
    - ":: lower ()"
    # Drop anything that is not a lowercase ASCII letter, a digit or whitespace.
    - "[^a-z0-9[:Space:]] >"
    # Recompose and collapse runs of whitespace into a single space.
    - ":: NFC ()"
    - "[:Space:]+ > ' '"
# Sanitizer steps and their options, in the order listed.
sanitizers:
    - step: clean-housenumbers
      # Kinds of tags this step is restricted to.
      filter-kind:
        - housenumber
        - conscriptionnumber
        - streetnumber
      # Pattern matching a comma-separated value in which at least one part
      # has three or more characters and contains no digit.
      convert-to-name:
        - (\A|.*,)[^\d,]{3,}(,.*|\Z)
    - step: clean-postcodes
      # Canonical boolean: 'yes' is only a boolean under YAML 1.1 loaders.
      convert-to-address: true
      default-pattern: "[A-Z0-9- ]{3,12}"
    - step: clean-tiger-tags
    - step: split-name-list
      delimiters: ;
    - step: strip-brace-terms
    - step: tag-analyzer-by-language
      filter-kind: [".*name.*"]
      # Language codes; each one has an entry with the same id under
      # 'token-analysis' below. "no" (Norwegian) is quoted because a bare
      # no is read as boolean false by YAML 1.1 loaders.
      whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,"no",pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
      use-defaults: all
      mode: append
    - step: tag-japanese
# Token analyzers. The first entry has no id (presumably the default analyzer;
# confirm against the consumer). The '@'-prefixed ids cover house numbers and
# postcodes. The two-letter ids match the language codes in the
# tag-analyzer-by-language whitelist above.
token-analysis:
    - analyzer: generic
    - id: "@housenumber"
      analyzer: housenumbers
    - id: "@postcode"
      analyzer: postcodes
    # Per-language entries: each uses the generic analyzer in variant-only
    # mode and includes its variants from icu-rules/variants-<id>.yaml.
    - id: bg
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-bg.yaml
    - id: ca
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ca.yaml
    - id: cs
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-cs.yaml
    - id: da
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-da.yaml
    - id: de
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-de.yaml
      # German umlauts: each pattern lists the umlaut itself and its
      # two-letter spelling as replacements.
      mutations:
          - pattern: ä
            replacements: ["ä", "ae"]
          - pattern: ö
            replacements: ["ö", "oe"]
          - pattern: ü
            replacements: ["ü", "ue"]
    - id: el
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-el.yaml
    - id: en
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-en.yaml
    - id: es
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-es.yaml
    - id: et
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-et.yaml
    - id: eu
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-eu.yaml
    - id: fi
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-fi.yaml
    - id: fr
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-fr.yaml
    - id: gl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-gl.yaml
    - id: hu
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-hu.yaml
    - id: it
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-it.yaml
    - id: ja
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ja.yaml
    - id: mg
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-mg.yaml
    - id: ms
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ms.yaml
    - id: nl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-nl.yaml
    # Quoted so the Norwegian id stays the string "no" rather than boolean
    # false; it must equal the whitelist entry in the sanitizer above.
    - id: "no"
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-no.yaml
    - id: pl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-pl.yaml
    - id: pt
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-pt.yaml
    - id: ro
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ro.yaml
    - id: ru
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ru.yaml
    - id: sk
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-sk.yaml
    - id: sl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-sl.yaml
    - id: sv
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-sv.yaml
    - id: tr
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-tr.yaml
    - id: uk
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-uk.yaml
    - id: vi
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-vi.yaml