# git.openstreetmap.org Git - nominatim.git/blob - settings/icu_tokenizer.yaml
# keep break indicators [:-] during normalisation
# [nominatim.git] / settings / icu_tokenizer.yaml
# Pre-processing steps, listed in order. Only the `normalize` step is
# configured here.
# NOTE(review): presumably applied to incoming search queries before
# tokenization - confirm against the tokenizer documentation.
query-preprocessing:
    - step: normalize
# ICU transform rules used for normalization. The rules are order-sensitive:
# each one operates on the output of the rules before it.
# The break indicators '-' and ':' are deliberately kept by this rule set.
normalization:
    # Lowercase everything, then map Simplified to Traditional Chinese.
    - ":: lower ()"
    - ":: Hans-Hant"
    # Additional digit rules pulled in from a separate file.
    - !include icu-rules/unicode-digits-to-decimal.yaml
    # Spellings of the numero sign all become plain 'no'.
    - "'№' > 'no'"
    - "'n°' > 'no'"
    - "'nº' > 'no'"
    # Remaining ordinal indicators become their base letters.
    - "ª > a"
    - "º > o"
    # Any run of punctuation/symbols (plus U+02BC, modifier apostrophe),
    # excluding '-' and ':' themselves, collapses into a single '-'.
    # Must stay double-quoted: \u02bc is a YAML escape sequence.
    - "[[:Punctuation:][:Symbol:][\u02bc] - [-:]]+  > '-'"
    - "ß > 'ss'" # German eszett is unambiguously equal to double ss
    # Drop everything that is not alphanumeric, a virama, whitespace,
    # '-' or ':'.
    - "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:] [-:]] >"
    # Drop modifier letters.
    - "[:Lm:] >"
    # Transforms restricted to number characters only.
    - ":: [[:Number:]] Latin ()"
    - ":: [[:Number:]] Ascii ();"
    - ":: [[:Number:]] NFD ();"
    # Drop nonspacing marks and format characters.
    - "[[:Nonspacing Mark:] [:Cf:]] >;"
    # Whitespace runs, including a directly adjacent '-' or ':' on either
    # side, become one space.
    - "[-:]?[:Space:]+[-:]? > ' '"
# ICU transform rules used for transliteration. The rules are order-sensitive.
# The visible end result is restricted to [a-z0-9] and single spaces.
transliteration:
    # Break indicators '-' and ':' become spaces at this stage.
    - "[-:]  > ' '"
    - ":: Latin ()"
    # Extra mapping rules from a separate file (file name kept verbatim).
    - !include icu-rules/extended-unicode-to-asccii.yaml
    - ":: Ascii ()"
    - ":: NFD ()"
    - ":: lower ()"
    # Remove anything that is not a-z, 0-9 or whitespace.
    - "[^a-z0-9[:Space:]] >"
    - ":: NFC ()"
    # Collapse whitespace runs into one space.
    - "[:Space:]+ > ' '"
# Sanitizer steps, listed in order.
sanitizers:
    - step: clean-housenumbers
      filter-kind:
        - housenumber
        - conscriptionnumber
        - streetnumber
      # Pattern matches values containing a comma-separated part that has
      # at least three characters and no digit.
      convert-to-name:
        - (\A|.*,)[^\d,]{3,}(,.*|\Z)
    - step: clean-postcodes
      convert-to-address: true
      default-pattern: "[A-Z0-9- ]{3,12}"
    - step: clean-tiger-tags
    - step: split-name-list
      delimiters: ";"
    - step: strip-brace-terms
    - step: tag-analyzer-by-language
      filter-kind: [".*name.*"]
      # Every language code listed here has an entry with the same id under
      # `token-analysis` below; keep the two lists in sync.
      # "no" (Norwegian) must stay quoted: a bare `no` is read as boolean
      # false by YAML 1.1 loaders.
      whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,"no",pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
      use-defaults: all
      mode: append
    - step: tag-japanese

# Token analyzers. The first entry has no id; the "@housenumber" and
# "@postcode" entries use dedicated analyzers. The remaining entries are
# per-language generic analyzers in variant-only mode, each loading its
# variants from icu-rules/variants-<id>.yaml.
# NOTE(review): the entry without an id is presumably the default analyzer -
# confirm against the tokenizer documentation.
token-analysis:
    - analyzer: generic
    - id: "@housenumber"
      analyzer: housenumbers
    - id: "@postcode"
      analyzer: postcodes
    - id: bg
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-bg.yaml
    - id: ca
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ca.yaml
    - id: cs
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-cs.yaml
    - id: da
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-da.yaml
    - id: de
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-de.yaml
      # Each umlaut is replaced by itself and by its two-letter spelling.
      mutations:
          - pattern: ä
            replacements: ["ä", "ae"]
          - pattern: ö
            replacements: ["ö", "oe"]
          - pattern: ü
            replacements: ["ü", "ue"]
    - id: el
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-el.yaml
    - id: en
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-en.yaml
    - id: es
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-es.yaml
    - id: et
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-et.yaml
    - id: eu
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-eu.yaml
    - id: fi
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-fi.yaml
    - id: fr
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-fr.yaml
    - id: gl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-gl.yaml
    - id: hu
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-hu.yaml
    - id: it
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-it.yaml
    - id: ja
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ja.yaml
    - id: mg
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-mg.yaml
    - id: ms
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ms.yaml
    - id: nl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-nl.yaml
    # Quoted on purpose: a bare `no` is read as boolean false by YAML 1.1
    # loaders, so the id would not be the string "no".
    - id: "no"
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-no.yaml
    - id: pl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-pl.yaml
    - id: pt
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-pt.yaml
    - id: ro
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ro.yaml
    - id: ru
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-ru.yaml
    - id: sk
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-sk.yaml
    - id: sl
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-sl.yaml
    - id: sv
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-sv.yaml
    - id: tr
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-tr.yaml
    - id: uk
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-uk.yaml
    - id: vi
      analyzer: generic
      mode: variant-only
      variants:
          - !include icu-rules/variants-vi.yaml