# git.openstreetmap.org Git - nominatim.git/blob - settings/icu_tokenizer.yaml
# flex: add combining clean function
# [nominatim.git] / settings / icu_tokenizer.yaml
# ICU transform rules for the normalization stage. This is an ordered list;
# the entries are kept in their original sequence because later rules see the
# output of earlier ones.
normalization:
    - ":: lower ()"
    - ":: Hans-Hant"
    - !include icu-rules/unicode-digits-to-decimal.yaml
    # The different spellings of the "number" sign all become plain 'no'.
    - "'№' > 'no'"
    - "'n°' > 'no'"
    - "'nº' > 'no'"
    # Remaining ordinal indicators are mapped to their base letter.
    - "ª > a"
    - "º > o"
    # Punctuation, symbols and U+02BC are each replaced by a space. The string
    # must stay double-quoted so that YAML expands the \u02bc escape.
    - "[[:Punctuation:][:Symbol:]\u02bc]  > ' '"
    - "ß > 'ss'"  # German eszett is unambiguously equal to double ss
    # Rules with an empty right-hand side delete whatever they match.
    - "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:]] >"
    - "[:Lm:] >"
    # The next three transforms are restricted to characters in [:Number:].
    - ":: [[:Number:]] Latin ()"
    - ":: [[:Number:]] Ascii ();"
    - ":: [[:Number:]] NFD ();"
    - "[[:Nonspacing Mark:] [:Cf:]] >;"
    # Collapse every run of whitespace into a single space.
    - "[:Space:]+ > ' '"
# ICU transform rules for the transliteration stage, applied in list order.
# After these rules only a-z, 0-9 and single spaces remain.
transliteration:
    - ":: Latin ()"
    # NOTE(review): "asccii" is spelled this way in the include path. It is a
    # file reference, so it must match the name on disk - confirm before
    # "correcting" it here.
    - !include icu-rules/extended-unicode-to-asccii.yaml
    - ":: Ascii ()"
    - ":: NFD ()"
    - ":: lower ()"
    # Delete every character that is not a-z, 0-9 or whitespace.
    - "[^a-z0-9[:Space:]] >"
    - ":: NFC ()"
    # Collapse every run of whitespace into a single space.
    - "[:Space:]+ > ' '"
# Sanitizer steps, listed in order. Each entry names a step and, where
# needed, the options for that step.
sanitizers:
    - step: clean-housenumbers
      filter-kind:
        - housenumber
        - conscriptionnumber
        - streetnumber
      # Regular expression matching values in which one comma-separated part
      # is at least three characters long and contains no digit.
      convert-to-name:
        - (\A|.*,)[^\d,]{3,}(,.*|\Z)
    - step: clean-postcodes
      # Written as a canonical boolean; `yes` is only a boolean in YAML 1.1.
      convert-to-address: true
      default-pattern: "[A-Z0-9- ]{3,12}"
    - step: clean-tiger-tags
    - step: split-name-list
    - step: strip-brace-terms
    - step: tag-analyzer-by-language
      filter-kind: [".*name.*"]
      # Language codes; they mirror the per-language ids listed under
      # token-analysis. "no" (Norwegian) must be quoted: YAML 1.1 loaders
      # such as PyYAML resolve a bare `no` to the boolean false.
      whitelist: [bg, ca, cs, da, de, el, en, es, et, eu, fi, fr, gl, hu, it,
                  ja, mg, ms, nl, "no", pl, pt, ro, ru, sk, sl, sv, tr, uk, vi]
      use-defaults: all
      mode: append
# Token analyzers. The first entry has no id; the "@housenumber" and
# "@postcode" entries select dedicated analyzers; every remaining entry is
# keyed by a language code and pulls its variant rules from an include file.
token-analysis:
    - analyzer: generic
    - id: "@housenumber"
      analyzer: housenumbers
    - id: "@postcode"
      analyzer: postcodes
    - id: bg
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-bg.yaml
    - id: ca
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ca.yaml
    - id: cs
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-cs.yaml
    - id: da
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-da.yaml
    - id: de
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-de.yaml
      # Each umlaut is offered both unchanged and in its two-letter spelling.
      mutations:
          - pattern: ä
            replacements: ["ä", "ae"]
          - pattern: ö
            replacements: ["ö", "oe"]
          - pattern: ü
            replacements: ["ü", "ue"]
    - id: el
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-el.yaml
    - id: en
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-en.yaml
    - id: es
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-es.yaml
    - id: et
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-et.yaml
    - id: eu
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-eu.yaml
    - id: fi
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-fi.yaml
    - id: fr
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-fr.yaml
    - id: gl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-gl.yaml
    - id: hu
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-hu.yaml
    - id: it
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-it.yaml
    - id: ja
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ja.yaml
    - id: mg
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-mg.yaml
    - id: ms
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ms.yaml
    - id: nl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-nl.yaml
    # Quoted on purpose: YAML 1.1 loaders such as PyYAML resolve a bare `no`
    # to the boolean false instead of the Norwegian language code.
    - id: "no"
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-no.yaml
    - id: pl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-pl.yaml
    - id: pt
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-pt.yaml
    - id: ro
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ro.yaml
    - id: ru
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ru.yaml
    - id: sk
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-sk.yaml
    - id: sl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-sl.yaml
    - id: sv
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-sv.yaml
    - id: tr
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-tr.yaml
    - id: uk
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-uk.yaml
    - id: vi
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-vi.yaml