git.openstreetmap.org Git - nominatim.git/blob - settings/icu_tokenizer.yaml
add ci-test for taginfo file generation
[nominatim.git] / settings / icu_tokenizer.yaml
1 normalization:  # list of ICU transform rules (inline rule strings plus one included rule file)
2     - ":: lower ()"  # lowercase the input
3     - ":: Hans-Hant"  # convert Simplified Han to Traditional Han
4     - !include icu-rules/unicode-digits-to-decimal.yaml  # rules kept in a separate file; presumably digit folding, judging by the name
5     - "'№' > 'no'"  # the numero sign and the two spelled variants below all become 'no'
6     - "'n°' > 'no'"
7     - "'nº' > 'no'"
8     - "ª > a"  # ordinal indicators become plain letters
9     - "º > o"
10     - "[[:Punctuation:][:Symbol:]\u02bc]  > ' '"  # punctuation, symbols and U+02BC become a space
11     - "ß > 'ss'" # German eszett (ß) is unambiguously equal to double s
12     - "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:]] >"  # drop everything that is not alphanumeric, a virama or whitespace
13     - "[:Lm:] >"  # drop modifier letters (general category Lm)
14     - ":: [[:Number:]] Latin ()"  # this and the next two transforms are filtered to number characters only
15     - ":: [[:Number:]] Ascii ();"
16     - ":: [[:Number:]] NFD ();"
17     - "[[:Nonspacing Mark:] [:Cf:]] >;"  # drop nonspacing marks and format characters
18     - "[:Space:]+ > ' '"  # collapse each run of whitespace into a single space
19 transliteration:  # list of ICU transform rules (inline rule strings plus one included rule file)
20     - ":: Latin ()"  # transliterate to Latin script
21     - !include icu-rules/extended-unicode-to-asccii.yaml  # NOTE(review): "asccii" has to match the rule file's name on disk; confirm before correcting the spelling
22     - ":: Ascii ()"  # convert to ASCII
23     - ":: NFD ()"  # decompose
24     - ":: lower ()"  # lowercase
25     - "[^a-z0-9[:Space:]] >"  # drop everything except a-z, 0-9 and whitespace
26     - ":: NFC ()"  # recompose
27     - "[:Space:]+ > ' '"  # collapse each run of whitespace into a single space
28 sanitizers:  # list of sanitizer steps; each entry names its step and carries that step's options
29     - step: clean-housenumbers
30       filter-kind:
31         - housenumber
32         - conscriptionnumber
33         - streetnumber
34       convert-to-name:  # regex: some comma-separated part consists of 3 or more characters, none of them a digit
35         - (\A|.*,)[^\d,]{3,}(,.*|\Z)
36     - step: clean-postcodes
37       convert-to-address: true
38       default-pattern: "[A-Z0-9- ]{3,12}"
39     - step: clean-tiger-tags
40     - step: split-name-list
41       delimiters: ;
42     - step: strip-brace-terms
43     - step: tag-analyzer-by-language
44       filter-kind: [".*name.*"]
45       whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,"no",pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]  # "no" must stay quoted: YAML 1.1 loaders read a bare no as boolean false
46       use-defaults: all
47       mode: append
48 token-analysis:  # list of analyzers; the two-letter ids below are the same codes as the tag-analyzer-by-language whitelist
49     - analyzer: generic  # entry without an id; presumably the default analyzer (confirm against the tokenizer docs)
50     - id: "@housenumber"
51       analyzer: housenumbers
52     - id: "@postcode"
53       analyzer: postcodes
54     - id: bg
55       analyzer: generic
56       mode: variant-only
57       variants:
58           - !include icu-rules/variants-bg.yaml
59     - id: ca
60       analyzer: generic
61       mode: variant-only
62       variants:
63           - !include icu-rules/variants-ca.yaml
64     - id: cs
65       analyzer: generic
66       mode: variant-only
67       variants:
68           - !include icu-rules/variants-cs.yaml
69     - id: da
70       analyzer: generic
71       mode: variant-only
72       variants:
73           - !include icu-rules/variants-da.yaml
74     - id: de
75       analyzer: generic
76       mode: variant-only
77       variants:
78           - !include icu-rules/variants-de.yaml
79       mutations:  # each umlaut is kept as is and also offered in its two-letter spelling
80           - pattern: ä
81             replacements: ["ä", "ae"]
82           - pattern: ö
83             replacements: ["ö", "oe"]
84           - pattern: ü
85             replacements: ["ü", "ue"]
86     - id: el
87       analyzer: generic
88       mode: variant-only
89       variants:
90           - !include icu-rules/variants-el.yaml
91     - id: en
92       analyzer: generic
93       mode: variant-only
94       variants:
95           - !include icu-rules/variants-en.yaml
96     - id: es
97       analyzer: generic
98       mode: variant-only
99       variants:
100           - !include icu-rules/variants-es.yaml
101     - id: et
102       analyzer: generic
103       mode: variant-only
104       variants:
105           - !include icu-rules/variants-et.yaml
106     - id: eu
107       analyzer: generic
108       mode: variant-only
109       variants:
110           - !include icu-rules/variants-eu.yaml
111     - id: fi
112       analyzer: generic
113       mode: variant-only
114       variants:
115           - !include icu-rules/variants-fi.yaml
116     - id: fr
117       analyzer: generic
118       mode: variant-only
119       variants:
120           - !include icu-rules/variants-fr.yaml
121     - id: gl
122       analyzer: generic
123       mode: variant-only
124       variants:
125           - !include icu-rules/variants-gl.yaml
126     - id: hu
127       analyzer: generic
128       mode: variant-only
129       variants:
130           - !include icu-rules/variants-hu.yaml
131     - id: it
132       analyzer: generic
133       mode: variant-only
134       variants:
135           - !include icu-rules/variants-it.yaml
136     - id: ja
137       analyzer: generic
138       mode: variant-only
139       variants:
140           - !include icu-rules/variants-ja.yaml
141     - id: mg
142       analyzer: generic
143       mode: variant-only
144       variants:
145           - !include icu-rules/variants-mg.yaml
146     - id: ms
147       analyzer: generic
148       mode: variant-only
149       variants:
150           - !include icu-rules/variants-ms.yaml
151     - id: nl
152       analyzer: generic
153       mode: variant-only
154       variants:
155           - !include icu-rules/variants-nl.yaml
156     - id: "no"  # must stay quoted: YAML 1.1 loaders read a bare no as boolean false
157       analyzer: generic
158       mode: variant-only
159       variants:
160           - !include icu-rules/variants-no.yaml
161     - id: pl
162       analyzer: generic
163       mode: variant-only
164       variants:
165           - !include icu-rules/variants-pl.yaml
166     - id: pt
167       analyzer: generic
168       mode: variant-only
169       variants:
170           - !include icu-rules/variants-pt.yaml
171     - id: ro
172       analyzer: generic
173       mode: variant-only
174       variants:
175           - !include icu-rules/variants-ro.yaml
176     - id: ru
177       analyzer: generic
178       mode: variant-only
179       variants:
180           - !include icu-rules/variants-ru.yaml
181     - id: sk
182       analyzer: generic
183       mode: variant-only
184       variants:
185           - !include icu-rules/variants-sk.yaml
186     - id: sl
187       analyzer: generic
188       mode: variant-only
189       variants:
190           - !include icu-rules/variants-sl.yaml
191     - id: sv
192       analyzer: generic
193       mode: variant-only
194       variants:
195           - !include icu-rules/variants-sv.yaml
196     - id: tr
197       analyzer: generic
198       mode: variant-only
199       variants:
200           - !include icu-rules/variants-tr.yaml
201     - id: uk
202       analyzer: generic
203       mode: variant-only
204       variants:
205           - !include icu-rules/variants-uk.yaml
206     - id: vi
207       analyzer: generic
208       mode: variant-only
209       variants:
210           - !include icu-rules/variants-vi.yaml