git.openstreetmap.org Git - nominatim.git/blob - settings/icu_tokenizer.yaml
create idx_place_interpolations for import already
[nominatim.git] / settings / icu_tokenizer.yaml
# ICU transform rules for normalization. Rule order matters in ICU rule sets, so keep the sequence intact.
1 normalization:
2     - ":: lower ()"  # lowercase the input
3     - ":: Hans-Hant"  # ICU transform: Simplified Han -> Traditional Han
4     - !include icu-rules/unicode-digits-to-decimal.yaml
5     - "'№' > 'no'"  # the numero sign and its spelled-out variants all become 'no'
6     - "'n°' > 'no'"
7     - "'nº' > 'no'"
8     - "ª > a"  # ordinal indicators become plain letters
9     - "º > o"
10     - "[[:Punctuation:][:Symbol:]\u02bc]  > ' '"  # punctuation, symbols and U+02BC become a space
11     - "ß > 'ss'" # German eszett (ß) is unambiguously equal to double ss
12     - "[^[:Letter:] [:Number:] [:Space:]] >"  # drop anything that is not a letter, number or space
13     - "[:Lm:] >"  # drop modifier letters (general category Lm)
14     - ":: [[:Number:]] Latin ()"  # this and the next two transforms are filtered to number characters only
15     - ":: [[:Number:]] Ascii ();"
16     - ":: [[:Number:]] NFD ();"
17     - "[[:Nonspacing Mark:] [:Cf:]] >;"  # drop combining marks and format characters
18     - "[:Space:]+ > ' '"  # collapse each run of whitespace into a single space
# ICU transform rules for transliteration; the filter rule near the end leaves only a-z, 0-9 and whitespace.
19 transliteration:
20     - ":: Latin ()"  # transliterate into Latin script
# NOTE(review): the 'asccii' spelling is part of the include path; presumably it matches the file on disk — confirm before renaming.
21     - !include icu-rules/extended-unicode-to-asccii.yaml
22     - ":: Ascii ()"
23     - ":: NFD ()"
24     - ":: lower ()"
25     - "[^a-z0-9[:Space:]] >"  # drop every character still outside a-z, 0-9 and whitespace
26     - ":: NFC ()"
# Sanitizer steps, listed in order (presumably executed in this order — verify against the tokenizer).
27 sanitizers:
28     - step: clean-housenumbers
29       filter-kind:
30         - housenumber
31         - conscriptionnumber
32         - streetnumber
33       convert-to-name:
34         - (\A|.*,)[^\d,]{3,}(,.*|\Z)  # matches when a comma-separated part has 3+ characters and no digit
35     - step: split-name-list
36     - step: strip-brace-terms
37     - step: tag-analyzer-by-language
38       filter-kind: [".*name.*"]
         # "no" (Norwegian) must be quoted: YAML 1.1 loaders such as PyYAML read a bare no as boolean false.
39       whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,"no",pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
40       use-defaults: all
41       mode: append
# Analyzer definitions. The first entry has no id; every other id is a language code that also appears
# in the tag-analyzer-by-language whitelist above, so the two lists must be kept in sync.
42 token-analysis:
43     - analyzer: generic
44     - id: bg
45       analyzer: generic
46       mode: variant-only
47       variants:
48           - !include icu-rules/variants-bg.yaml
49     - id: ca
50       analyzer: generic
51       mode: variant-only
52       variants:
53           - !include icu-rules/variants-ca.yaml
54     - id: cs
55       analyzer: generic
56       mode: variant-only
57       variants:
58           - !include icu-rules/variants-cs.yaml
59     - id: da
60       analyzer: generic
61       mode: variant-only
62       variants:
63           - !include icu-rules/variants-da.yaml
64     - id: de
65       analyzer: generic
66       mode: variant-only
67       variants:
68           - !include icu-rules/variants-de.yaml
69       mutations:  # each umlaut lists itself plus its two-letter spelling as replacements
70           - pattern: ä
71             replacements: ["ä", "ae"]
72           - pattern: ö
73             replacements: ["ö", "oe"]
74           - pattern: ü
75             replacements: ["ü", "ue"]
76     - id: el
77       analyzer: generic
78       mode: variant-only
79       variants:
80           - !include icu-rules/variants-el.yaml
81     - id: en
82       analyzer: generic
83       mode: variant-only
84       variants:
85           - !include icu-rules/variants-en.yaml
86     - id: es
87       analyzer: generic
88       mode: variant-only
89       variants:
90           - !include icu-rules/variants-es.yaml
91     - id: et
92       analyzer: generic
93       mode: variant-only
94       variants:
95           - !include icu-rules/variants-et.yaml
96     - id: eu
97       analyzer: generic
98       mode: variant-only
99       variants:
100           - !include icu-rules/variants-eu.yaml
101     - id: fi
102       analyzer: generic
103       mode: variant-only
104       variants:
105           - !include icu-rules/variants-fi.yaml
106     - id: fr
107       analyzer: generic
108       mode: variant-only
109       variants:
110           - !include icu-rules/variants-fr.yaml
111     - id: gl
112       analyzer: generic
113       mode: variant-only
114       variants:
115           - !include icu-rules/variants-gl.yaml
116     - id: hu
117       analyzer: generic
118       mode: variant-only
119       variants:
120           - !include icu-rules/variants-hu.yaml
121     - id: it
122       analyzer: generic
123       mode: variant-only
124       variants:
125           - !include icu-rules/variants-it.yaml
126     - id: ja
127       analyzer: generic
128       mode: variant-only
129       variants:
130           - !include icu-rules/variants-ja.yaml
131     - id: mg
132       analyzer: generic
133       mode: variant-only
134       variants:
135           - !include icu-rules/variants-mg.yaml
136     - id: ms
137       analyzer: generic
138       mode: variant-only
139       variants:
140           - !include icu-rules/variants-ms.yaml
141     - id: nl
142       analyzer: generic
143       mode: variant-only
144       variants:
145           - !include icu-rules/variants-nl.yaml
146     - id: "no"  # quoted so the id stays the string "no" instead of a YAML 1.1 boolean
147       analyzer: generic
148       mode: variant-only
149       variants:
150           - !include icu-rules/variants-no.yaml
151     - id: pl
152       analyzer: generic
153       mode: variant-only
154       variants:
155           - !include icu-rules/variants-pl.yaml
156     - id: pt
157       analyzer: generic
158       mode: variant-only
159       variants:
160           - !include icu-rules/variants-pt.yaml
161     - id: ro
162       analyzer: generic
163       mode: variant-only
164       variants:
165           - !include icu-rules/variants-ro.yaml
166     - id: ru
167       analyzer: generic
168       mode: variant-only
169       variants:
170           - !include icu-rules/variants-ru.yaml
171     - id: sk
172       analyzer: generic
173       mode: variant-only
174       variants:
175           - !include icu-rules/variants-sk.yaml
176     - id: sl
177       analyzer: generic
178       mode: variant-only
179       variants:
180           - !include icu-rules/variants-sl.yaml
181     - id: sv
182       analyzer: generic
183       mode: variant-only
184       variants:
185           - !include icu-rules/variants-sv.yaml
186     - id: tr
187       analyzer: generic
188       mode: variant-only
189       variants:
190           - !include icu-rules/variants-tr.yaml
191     - id: uk
192       analyzer: generic
193       mode: variant-only
194       variants:
195           - !include icu-rules/variants-uk.yaml
196     - id: vi
197       analyzer: generic
198       mode: variant-only
199       variants:
200           - !include icu-rules/variants-vi.yaml