]> git.openstreetmap.org Git - nominatim.git/blob - settings/icu_tokenizer.yaml
Merge remote-tracking branch 'upstream/master'
[nominatim.git] / settings / icu_tokenizer.yaml
1 query-preprocessing:  # ordered list of steps; NOTE(review): presumably run on incoming queries before tokenization - confirm
2     - step: split_japanese_phrases
3     - step: normalize
4 normalization:  # ICU transform rules, listed in the order they are written
5     - ":: lower ()"  # lowercase everything
6     - ":: Hans-Hant"  # Simplified Han to Traditional Han
7     - !include icu-rules/unicode-digits-to-decimal.yaml
8     - "'№' > 'no'"  # the numero sign and its two-letter spellings all become 'no'
9     - "'n°' > 'no'"
10     - "'nº' > 'no'"
11     - "ª > a"  # ordinal indicators become plain letters
12     - "º > o"
13     - "[[:Punctuation:][:Symbol:][\u02bc] - [-:]]+  > '-'"  # runs of punctuation/symbols/U+02BC, except '-' and ':', collapse to one '-'
14     - "ß > 'ss'"  # German eszett (ß) is unambiguously equal to double ss
15     - "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:] [-:]] >"  # delete all but alphanumerics, viramas, spaces, '-' and ':'
16     - "[:Lm:] >"  # delete modifier letters
17     - ":: [[:Number:]] Latin ()"  # the next three transforms are restricted to number characters
18     - ":: [[:Number:]] Ascii ();"
19     - ":: [[:Number:]] NFD ();"
20     - "[[:Nonspacing Mark:] [:Cf:]] >;"  # delete nonspacing marks and format characters
21     - "[-:]?[:Space:]+[-:]? > ' '"  # whitespace runs, with an optional '-' or ':' on either side, become one space
22 transliteration:  # ICU transform rules, listed in the order they are written
23     - "[-:]  > ' '"  # '-' and ':' become spaces
24     - ":: Latin ()"  # convert to Latin script
25     - !include icu-rules/extended-unicode-to-asccii.yaml
26     - ":: Ascii ()"
27     - ":: NFD ()"
28     - ":: lower ()"
29     - "[^a-z0-9[:Space:]] >"  # delete everything except a-z, 0-9 and whitespace
30     - ":: NFC ()"
31     - "[:Space:]+ > ' '"  # collapse whitespace runs to a single space
32 sanitizers:  # ordered list of sanitizer steps with their options
33     - step: clean-housenumbers
34       filter-kind:
35         - housenumber
36         - conscriptionnumber
37         - streetnumber
38       convert-to-name:  # the pattern matches values with a comma-separated part of 3+ characters containing no digit
39         - (\A|.*,)[^\d,]{3,}(,.*|\Z)
40     - step: clean-postcodes
41       convert-to-address: true
42       default-pattern: "[A-Z0-9- ]{3,12}"  # 3 to 12 characters out of A-Z, 0-9, '-' and space
43     - step: clean-tiger-tags
44     - step: split-name-list
45       delimiters: ;
46     - step: strip-brace-terms
47     - step: tag-analyzer-by-language
48       filter-kind: [".*name.*"]
49       whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,"no",pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]  # "no" is quoted: YAML 1.1 loaders read a bare no as boolean false
50       use-defaults: all
51       mode: append
52     - step: tag-japanese
53 token-analysis:  # one entry without id, two "@" entries, then one entry per whitelisted language code
54     - analyzer: generic
55     - id: "@housenumber"
56       analyzer: housenumbers
57     - id: "@postcode"
58       analyzer: postcodes
59     - id: bg
60       analyzer: generic
61       mode: variant-only
62       variants:
63           - !include icu-rules/variants-bg.yaml
64     - id: ca
65       analyzer: generic
66       mode: variant-only
67       variants:
68           - !include icu-rules/variants-ca.yaml
69     - id: cs
70       analyzer: generic
71       mode: variant-only
72       variants:
73           - !include icu-rules/variants-cs.yaml
74     - id: da
75       analyzer: generic
76       mode: variant-only
77       variants:
78           - !include icu-rules/variants-da.yaml
79     - id: de
80       analyzer: generic
81       mode: variant-only
82       variants:
83           - !include icu-rules/variants-de.yaml
84       mutations:  # each umlaut is paired with its two-letter spelling (ae/oe/ue)
85           - pattern: ä
86             replacements: ["ä", "ae"]
87           - pattern: ö
88             replacements: ["ö", "oe"]
89           - pattern: ü
90             replacements: ["ü", "ue"]
91     - id: el
92       analyzer: generic
93       mode: variant-only
94       variants:
95           - !include icu-rules/variants-el.yaml
96     - id: en
97       analyzer: generic
98       mode: variant-only
99       variants:
100           - !include icu-rules/variants-en.yaml
101     - id: es
102       analyzer: generic
103       mode: variant-only
104       variants:
105           - !include icu-rules/variants-es.yaml
106     - id: et
107       analyzer: generic
108       mode: variant-only
109       variants:
110           - !include icu-rules/variants-et.yaml
111     - id: eu
112       analyzer: generic
113       mode: variant-only
114       variants:
115           - !include icu-rules/variants-eu.yaml
116     - id: fi
117       analyzer: generic
118       mode: variant-only
119       variants:
120           - !include icu-rules/variants-fi.yaml
121     - id: fr
122       analyzer: generic
123       mode: variant-only
124       variants:
125           - !include icu-rules/variants-fr.yaml
126     - id: gl
127       analyzer: generic
128       mode: variant-only
129       variants:
130           - !include icu-rules/variants-gl.yaml
131     - id: hu
132       analyzer: generic
133       mode: variant-only
134       variants:
135           - !include icu-rules/variants-hu.yaml
136     - id: it
137       analyzer: generic
138       mode: variant-only
139       variants:
140           - !include icu-rules/variants-it.yaml
141     - id: ja
142       analyzer: generic
143       mode: variant-only
144       variants:
145           - !include icu-rules/variants-ja.yaml
146     - id: mg
147       analyzer: generic
148       mode: variant-only
149       variants:
150           - !include icu-rules/variants-mg.yaml
151     - id: ms
152       analyzer: generic
153       mode: variant-only
154       variants:
155           - !include icu-rules/variants-ms.yaml
156     - id: nl
157       analyzer: generic
158       mode: variant-only
159       variants:
160           - !include icu-rules/variants-nl.yaml
161     - id: "no"  # quoted so the id stays the string "no" instead of boolean false under YAML 1.1
162       analyzer: generic
163       mode: variant-only
164       variants:
165           - !include icu-rules/variants-no.yaml
166     - id: pl
167       analyzer: generic
168       mode: variant-only
169       variants:
170           - !include icu-rules/variants-pl.yaml
171     - id: pt
172       analyzer: generic
173       mode: variant-only
174       variants:
175           - !include icu-rules/variants-pt.yaml
176     - id: ro
177       analyzer: generic
178       mode: variant-only
179       variants:
180           - !include icu-rules/variants-ro.yaml
181     - id: ru
182       analyzer: generic
183       mode: variant-only
184       variants:
185           - !include icu-rules/variants-ru.yaml
186     - id: sk
187       analyzer: generic
188       mode: variant-only
189       variants:
190           - !include icu-rules/variants-sk.yaml
191     - id: sl
192       analyzer: generic
193       mode: variant-only
194       variants:
195           - !include icu-rules/variants-sl.yaml
196     - id: sv
197       analyzer: generic
198       mode: variant-only
199       variants:
200           - !include icu-rules/variants-sv.yaml
201     - id: tr
202       analyzer: generic
203       mode: variant-only
204       variants:
205           - !include icu-rules/variants-tr.yaml
206     - id: uk
207       analyzer: generic
208       mode: variant-only
209       variants:
210           - !include icu-rules/variants-uk.yaml
211     - id: vi
212       analyzer: generic
213       mode: variant-only
214       variants:
215           - !include icu-rules/variants-vi.yaml