git.openstreetmap.org Git - nominatim.git/blob - settings/icu_tokenizer.yaml
Merge pull request #2784 from lonvia/doscs-customizing-icu-tokenizer
[nominatim.git] / settings / icu_tokenizer.yaml
1 normalization:  # ordered list of ICU transform rules and rule includes
2     - ":: lower ()"  # lower-case the input
3     - ":: Hans-Hant"  # ICU Hans-Hant transform (Simplified to Traditional Han)
4     - !include icu-rules/unicode-digits-to-decimal.yaml
5     - "'№' > 'no'"  # numero sign and its 'n°' / 'nº' spellings all become 'no'
6     - "'n°' > 'no'"
7     - "'nº' > 'no'"
8     - "ª > a"  # ordinal indicators become plain letters
9     - "º > o"
10     - "[[:Punctuation:][:Symbol:]\u02bc]  > ' '"  # punctuation, symbols and U+02BC become a space
11     - "ß > 'ss'" # German Eszett is unambiguously equal to double ss
12     - "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:]] >"  # drop everything except alphanumerics, viramas and spaces
13     - "[:Lm:] >"  # drop characters of category Lm (modifier letters)
14     - ":: [[:Number:]] Latin ()"  # this and the next two transforms are filtered to [:Number:] characters only
15     - ":: [[:Number:]] Ascii ();"
16     - ":: [[:Number:]] NFD ();"
17     - "[[:Nonspacing Mark:] [:Cf:]] >;"  # drop nonspacing marks and format (Cf) characters
18     - "[:Space:]+ > ' '"  # collapse each run of whitespace into a single space
19 transliteration:  # ordered list of ICU transform rules and rule includes
20     - ":: Latin ()"  # transliterate to Latin script
21     - !include icu-rules/extended-unicode-to-asccii.yaml  # NOTE(review): 'asccii' is part of the path; confirm the file name on disk before changing the spelling
22     - ":: Ascii ()"  # transliterate to ASCII
23     - ":: NFD ()"  # canonical decomposition
24     - ":: lower ()"  # lower-case
25     - "[^a-z0-9[:Space:]] >"  # drop everything except a-z, 0-9 and whitespace
26     - ":: NFC ()"  # canonical recomposition
27 sanitizers:  # sequence of sanitizer steps; each entry names its step plus that step's options
28     - step: clean-housenumbers
29       filter-kind:
30         - housenumber
31         - conscriptionnumber
32         - streetnumber
33       convert-to-name:
34         - (\A|.*,)[^\d,]{3,}(,.*|\Z)
35     - step: clean-postcodes
36       convert-to-address: true  # canonical boolean spelling (previously the YAML 1.1 truthy 'yes')
37       default-pattern: "[A-Z0-9- ]{3,12}"
38     - step: split-name-list
39     - step: strip-brace-terms
40     - step: tag-analyzer-by-language  # the whitelist holds the same language codes as the per-language ids under token-analysis
41       filter-kind: [".*name.*"]
42       whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,"no",pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
43       use-defaults: all
44       mode: append
45 token-analysis:  # sequence of analyzer definitions; the first entry has no id, the others are keyed by id
46     - analyzer: generic
47     - id: "@housenumber"
48       analyzer: housenumbers
49     - id: "@postcode"
50       analyzer: postcodes
51     - id: bg
52       analyzer: generic
53       mode: variant-only
54       variants:
55           - !include icu-rules/variants-bg.yaml
56     - id: ca
57       analyzer: generic
58       mode: variant-only
59       variants:
60           - !include icu-rules/variants-ca.yaml
61     - id: cs
62       analyzer: generic
63       mode: variant-only
64       variants:
65           - !include icu-rules/variants-cs.yaml
66     - id: da
67       analyzer: generic
68       mode: variant-only
69       variants:
70           - !include icu-rules/variants-da.yaml
71     - id: de
72       analyzer: generic
73       mode: variant-only
74       variants:
75           - !include icu-rules/variants-de.yaml
76       mutations:  # each umlaut is listed with two replacements: itself and its two-letter spelling
77           - pattern: ä
78             replacements: ["ä", "ae"]
79           - pattern: ö
80             replacements: ["ö", "oe"]
81           - pattern: ü
82             replacements: ["ü", "ue"]
83     - id: el
84       analyzer: generic
85       mode: variant-only
86       variants:
87           - !include icu-rules/variants-el.yaml
88     - id: en
89       analyzer: generic
90       mode: variant-only
91       variants:
92           - !include icu-rules/variants-en.yaml
93     - id: es
94       analyzer: generic
95       mode: variant-only
96       variants:
97           - !include icu-rules/variants-es.yaml
98     - id: et
99       analyzer: generic
100       mode: variant-only
101       variants:
102           - !include icu-rules/variants-et.yaml
103     - id: eu
104       analyzer: generic
105       mode: variant-only
106       variants:
107           - !include icu-rules/variants-eu.yaml
108     - id: fi
109       analyzer: generic
110       mode: variant-only
111       variants:
112           - !include icu-rules/variants-fi.yaml
113     - id: fr
114       analyzer: generic
115       mode: variant-only
116       variants:
117           - !include icu-rules/variants-fr.yaml
118     - id: gl
119       analyzer: generic
120       mode: variant-only
121       variants:
122           - !include icu-rules/variants-gl.yaml
123     - id: hu
124       analyzer: generic
125       mode: variant-only
126       variants:
127           - !include icu-rules/variants-hu.yaml
128     - id: it
129       analyzer: generic
130       mode: variant-only
131       variants:
132           - !include icu-rules/variants-it.yaml
133     - id: ja
134       analyzer: generic
135       mode: variant-only
136       variants:
137           - !include icu-rules/variants-ja.yaml
138     - id: mg
139       analyzer: generic
140       mode: variant-only
141       variants:
142           - !include icu-rules/variants-mg.yaml
143     - id: ms
144       analyzer: generic
145       mode: variant-only
146       variants:
147           - !include icu-rules/variants-ms.yaml
148     - id: nl
149       analyzer: generic
150       mode: variant-only
151       variants:
152           - !include icu-rules/variants-nl.yaml
153     - id: "no"  # quoted: a bare 'no' is resolved to boolean false by YAML 1.1 loaders
154       analyzer: generic
155       mode: variant-only
156       variants:
157           - !include icu-rules/variants-no.yaml
158     - id: pl
159       analyzer: generic
160       mode: variant-only
161       variants:
162           - !include icu-rules/variants-pl.yaml
163     - id: pt
164       analyzer: generic
165       mode: variant-only
166       variants:
167           - !include icu-rules/variants-pt.yaml
168     - id: ro
169       analyzer: generic
170       mode: variant-only
171       variants:
172           - !include icu-rules/variants-ro.yaml
173     - id: ru
174       analyzer: generic
175       mode: variant-only
176       variants:
177           - !include icu-rules/variants-ru.yaml
178     - id: sk
179       analyzer: generic
180       mode: variant-only
181       variants:
182           - !include icu-rules/variants-sk.yaml
183     - id: sl
184       analyzer: generic
185       mode: variant-only
186       variants:
187           - !include icu-rules/variants-sl.yaml
188     - id: sv
189       analyzer: generic
190       mode: variant-only
191       variants:
192           - !include icu-rules/variants-sv.yaml
193     - id: tr
194       analyzer: generic
195       mode: variant-only
196       variants:
197           - !include icu-rules/variants-tr.yaml
198     - id: uk
199       analyzer: generic
200       mode: variant-only
201       variants:
202           - !include icu-rules/variants-uk.yaml
203     - id: vi
204       analyzer: generic
205       mode: variant-only
206       variants:
207           - !include icu-rules/variants-vi.yaml