# git.openstreetmap.org Git - nominatim.git/blob - settings/icu_tokenizer.yaml
# postcodes: strip leading country codes
# [nominatim.git] / settings / icu_tokenizer.yaml
# ICU transform rules for the `normalization` step, listed in the order given.
# Entries starting with `::` name an ICU transform; entries of the form
# `x > y` are replacement rules (an empty right-hand side deletes the match).
normalization:
    # Lowercase everything first; the rules below are written for lowercase.
    - ":: lower ()"
    # Convert Simplified Chinese characters to Traditional Chinese.
    - ":: Hans-Hant"
    - !include icu-rules/unicode-digits-to-decimal.yaml
    # Spell the numero sign and its two-character look-alikes as 'no'.
    - "'№' > 'no'"
    - "'n°' > 'no'"
    - "'nº' > 'no'"
    # Remaining ordinal indicators become their base letters.
    - "ª > a"
    - "º > o"
    # Punctuation, symbols and U+02BC (modifier letter apostrophe) turn
    # into a space.
    - "[[:Punctuation:][:Symbol:]\u02bc]  > ' '"
    - "ß > 'ss'"  # German eszett is unambiguously equal to double ss
    # Delete whatever is neither alphanumeric, a virama nor whitespace.
    - "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:]] >"
    # Delete modifier letters (general category Lm).
    - "[:Lm:] >"
    # The next three transforms carry a [:Number:] filter, so they only
    # touch number characters.
    - ":: [[:Number:]] Latin ()"
    - ":: [[:Number:]] Ascii ();"
    - ":: [[:Number:]] NFD ();"
    # Delete non-spacing marks and format characters (general category Cf).
    - "[[:Nonspacing Mark:] [:Cf:]] >;"
    # Collapse every run of whitespace into one space.
    - "[:Space:]+ > ' '"
# ICU transform rules for the `transliteration` step, listed in the order
# given. The final replacement rule leaves only a-z, 0-9 and whitespace.
transliteration:
    # Transliterate into Latin script.
    - ":: Latin ()"
    - !include icu-rules/extended-unicode-to-asccii.yaml
    # Reduce to ASCII, decompose, and lowercase.
    - ":: Ascii ()"
    - ":: NFD ()"
    - ":: lower ()"
    # Delete every character that is not a-z, 0-9 or whitespace.
    - "[^a-z0-9[:Space:]] >"
    # Recompose to NFC.
    - ":: NFC ()"
# Sanitizer steps, listed in the order given. Each entry names a step and
# passes the remaining keys to it as options.
sanitizers:
    - step: clean-housenumbers
      # Kinds of tags this step is restricted to.
      filter-kind:
        - housenumber
        - conscriptionnumber
        - streetnumber
      # Pattern: some comma-separated part of the value consists of three or
      # more characters, none of which is a digit.
      convert-to-name:
        - '(\A|.*,)[^\d,]{3,}(,.*|\Z)'
    - step: clean-postcodes
      convert-to-address: true
    - step: split-name-list
    - step: strip-brace-terms
    - step: tag-analyzer-by-language
      filter-kind: [".*name.*"]
      # Every code in this list has an entry with the same id under
      # `token-analysis` below; keep the two in sync.
      # "no" (Norwegian) must stay quoted: YAML 1.1 loaders such as PyYAML
      # read a bare `no` as boolean false instead of the string.
      whitelist: [bg, ca, cs, da, de, el, en, es, et, eu, fi, fr, gl, hu, it,
                  ja, mg, ms, nl, "no", pl, pt, ro, ru, sk, sl, sv, tr, uk, vi]
      use-defaults: all
      mode: append
# Token analyzers. The first entry has no id; the "@housenumber" entry uses
# the housenumbers analyzer; all remaining entries are per-language generic
# analyzers in variant-only mode that include a language-specific variants
# file.
token-analysis:
    - analyzer: generic
    - id: "@housenumber"
      analyzer: housenumbers
    - id: bg
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-bg.yaml
    - id: ca
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ca.yaml
    - id: cs
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-cs.yaml
    - id: da
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-da.yaml
    - id: de
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-de.yaml
      # Each umlaut is listed with itself and its two-letter spelling as
      # replacements.
      mutations:
          - pattern: ä
            replacements: ["ä", "ae"]
          - pattern: ö
            replacements: ["ö", "oe"]
          - pattern: ü
            replacements: ["ü", "ue"]
    - id: el
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-el.yaml
    - id: en
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-en.yaml
    - id: es
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-es.yaml
    - id: et
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-et.yaml
    - id: eu
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-eu.yaml
    - id: fi
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-fi.yaml
    - id: fr
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-fr.yaml
    - id: gl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-gl.yaml
    - id: hu
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-hu.yaml
    - id: it
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-it.yaml
    - id: ja
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ja.yaml
    - id: mg
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-mg.yaml
    - id: ms
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ms.yaml
    - id: nl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-nl.yaml
    # Quoted for the same reason as in the whitelist above: a bare `no`
    # would be loaded as boolean false by YAML 1.1 loaders.
    - id: "no"
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-no.yaml
    - id: pl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-pl.yaml
    - id: pt
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-pt.yaml
    - id: ro
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ro.yaml
    - id: ru
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ru.yaml
    - id: sk
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-sk.yaml
    - id: sl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-sl.yaml
    - id: sv
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-sv.yaml
    - id: tr
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-tr.yaml
    - id: uk
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-uk.yaml
    - id: vi
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-vi.yaml