]> git.openstreetmap.org Git - nominatim.git/blob - test/python/tokenizer/sanitizers/test_clean_postcodes.py
Merge remote-tracking branch 'upstream/master'
[nominatim.git] / test / python / tokenizer / sanitizers / test_clean_postcodes.py
1 # SPDX-License-Identifier: GPL-3.0-or-later
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2025 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Tests for the sanitizer that normalizes postcodes.
9 """
10 import pytest
11
12 from nominatim_db.tokenizer.place_sanitizer import PlaceSanitizer
13 from nominatim_db.data.place_info import PlaceInfo
14 from nominatim_db.data import country_info
15
16
@pytest.fixture
def sanitize(def_config, request):
    """Provide a function that runs the 'clean-postcodes' sanitizer.

    Keyword arguments of every ``sanitizer_params`` marker on the
    requesting test are added to the sanitizer configuration, with
    underscores in the argument names replaced by dashes.

    The returned function takes an optional country code plus the
    address parts as keyword arguments and returns the sorted list of
    ``(kind, name)`` tuples of the processed address.
    """
    country_info.setup_country_config(def_config)

    step_config = {'step': 'clean-postcodes'}
    for marker in request.node.iter_markers(name="sanitizer_params"):
        for key, value in marker.kwargs.items():
            step_config[key.replace('_', '-')] = value

    def _run(country=None, **kwargs):
        place = {'address': kwargs}
        if country is not None:
            place['country_code'] = country

        # The sanitizer is built on each call with the collected configuration.
        sanitizer = PlaceSanitizer([step_config], def_config)
        _, address = sanitizer.process_names(PlaceInfo(place))

        return sorted((item.kind, item.name) for item in address)

    return _run
34
35
@pytest.mark.parametrize("country", (None, 'ae'))
def test_postcode_no_country(sanitize, country):
    """For no country and for country 'ae', the postcode is expected
    back with the kind 'unofficial_postcode'."""
    result = sanitize(country=country, postcode='23231')

    assert result == [('unofficial_postcode', '23231')]
39
40
@pytest.mark.parametrize("country", (None, 'ae'))
@pytest.mark.sanitizer_params(convert_to_address=False)
def test_postcode_no_country_drop(sanitize, country):
    """With 'convert-to-address' switched off, nothing is expected back
    for no country and for country 'ae'."""
    remaining = sanitize(country=country, postcode='23231')

    assert remaining == []
45
46
@pytest.mark.parametrize("postcode", ('12345', '  12345  ', 'de 12345',
                                      'DE12345', 'DE 12345', 'DE-12345'))
def test_postcode_pass_good_format(sanitize, postcode):
    """All listed spellings are expected to come back as '12345' for
    country 'de'."""
    expected = [('postcode', '12345')]

    assert sanitize(country='de', postcode=postcode) == expected
51
52
@pytest.mark.parametrize("postcode", ('123456', '', '   ', '.....',
                                      'DE  12345', 'DEF12345', 'CH 12345'))
@pytest.mark.sanitizer_params(convert_to_address=False)
def test_postcode_drop_bad_format(sanitize, postcode):
    """With 'convert-to-address' switched off, none of the listed values
    is expected to survive for country 'de'."""
    remaining = sanitize(country='de', postcode=postcode)

    assert remaining == []
58
59
@pytest.mark.parametrize("postcode", ('1234', '9435', '99000'))
def test_postcode_cyprus_pass(sanitize, postcode):
    """The listed postcodes are expected back unchanged for country 'cy'."""
    result = sanitize(country='cy', postcode=postcode)

    assert result == [('postcode', postcode)]
63
64
@pytest.mark.parametrize("postcode", ('91234', '99a45', '567'))
@pytest.mark.sanitizer_params(convert_to_address=False)
def test_postcode_cyprus_fail(sanitize, postcode):
    """With 'convert-to-address' switched off, the listed values are
    expected to be dropped for country 'cy'."""
    remaining = sanitize(country='cy', postcode=postcode)

    assert remaining == []
69
70
@pytest.mark.parametrize("postcode", ('123456', 'A33F2G7'))
def test_postcode_kazakhstan_pass(sanitize, postcode):
    """The listed postcodes are expected back unchanged for country 'kz'."""
    result = sanitize(country='kz', postcode=postcode)

    assert result == [('postcode', postcode)]
74
75
@pytest.mark.parametrize("postcode", ('V34T6Y923456', '99345'))
@pytest.mark.sanitizer_params(convert_to_address=False)
def test_postcode_kazakhstan_fail(sanitize, postcode):
    """With 'convert-to-address' switched off, the listed values are
    expected to be dropped for country 'kz'."""
    remaining = sanitize(country='kz', postcode=postcode)

    assert remaining == []
80
81
@pytest.mark.parametrize("postcode", ('675 34', '67534', 'SE-675 34', 'SE67534'))
def test_postcode_sweden_pass(sanitize, postcode):
    """All listed spellings are expected to come back as '675 34' for
    country 'se'."""
    expected = [('postcode', '675 34')]

    assert sanitize(country='se', postcode=postcode) == expected
85
86
@pytest.mark.parametrize("postcode", ('67 345', '671123'))
@pytest.mark.sanitizer_params(convert_to_address=False)
def test_postcode_sweden_fail(sanitize, postcode):
    """With 'convert-to-address' switched off, the listed values are
    expected to be dropped for country 'se'."""
    remaining = sanitize(country='se', postcode=postcode)

    assert remaining == []
91
92
@pytest.mark.parametrize("postcode", ('AD123', '123', 'AD 123', 'AD-123'))
def test_postcode_andorra_pass(sanitize, postcode):
    """All listed spellings are expected to come back as 'AD123' for
    country 'ad'."""
    expected = [('postcode', 'AD123')]

    assert sanitize(country='ad', postcode=postcode) == expected
96
97
@pytest.mark.parametrize("postcode", ('AD1234', 'AD AD123', 'XX123'))
@pytest.mark.sanitizer_params(convert_to_address=False)
def test_postcode_andorra_fail(sanitize, postcode):
    """With 'convert-to-address' switched off, the listed values are
    expected to be dropped for country 'ad'."""
    remaining = sanitize(country='ad', postcode=postcode)

    assert remaining == []
102
103
@pytest.mark.parametrize("postcode", ('AI-2640', '2640', 'AI 2640'))
def test_postcode_anguilla_pass(sanitize, postcode):
    """All listed spellings are expected to come back as 'AI-2640' for
    country 'ai'."""
    expected = [('postcode', 'AI-2640')]

    assert sanitize(country='ai', postcode=postcode) == expected
107
108
@pytest.mark.parametrize("postcode", ('AI-2000', 'AI US-2640', 'AI AI-2640'))
@pytest.mark.sanitizer_params(convert_to_address=False)
def test_postcode_anguilla_fail(sanitize, postcode):
    """With 'convert-to-address' switched off, the listed values are
    expected to be dropped for country 'ai'."""
    remaining = sanitize(country='ai', postcode=postcode)

    assert remaining == []
113
114
@pytest.mark.parametrize("postcode", ('BN1111', 'BN 1111', 'BN BN1111', 'BN BN 1111'))
def test_postcode_brunei_pass(sanitize, postcode):
    """All listed spellings are expected to come back as 'BN1111' for
    country 'bn'."""
    expected = [('postcode', 'BN1111')]

    assert sanitize(country='bn', postcode=postcode) == expected
118
119
@pytest.mark.parametrize("postcode", ('BN-1111', 'BNN1111'))
@pytest.mark.sanitizer_params(convert_to_address=False)
def test_postcode_brunei_fail(sanitize, postcode):
    """With 'convert-to-address' switched off, the listed values are
    expected to be dropped for country 'bn'."""
    remaining = sanitize(country='bn', postcode=postcode)

    assert remaining == []
124
125
@pytest.mark.parametrize("postcode", ('IM1 1AA', 'IM11AA', 'IM IM11AA'))
def test_postcode_isle_of_man_pass(sanitize, postcode):
    """All listed spellings are expected to come back as 'IM1 1AA' for
    country 'im'."""
    expected = [('postcode', 'IM1 1AA')]

    assert sanitize(country='im', postcode=postcode) == expected
129
130
@pytest.mark.parametrize("postcode", ('IZ1 1AA', 'IM1 AA'))
@pytest.mark.sanitizer_params(convert_to_address=False)
def test_postcode_isle_of_man_fail(sanitize, postcode):
    """With 'convert-to-address' switched off, the listed values are
    expected to be dropped for country 'im'."""
    remaining = sanitize(country='im', postcode=postcode)

    assert remaining == []
135
136
@pytest.mark.parametrize("postcode", ('JE5 0LA', 'JE50LA', 'JE JE50LA', 'je JE5 0LA'))
def test_postcode_jersey_pass(sanitize, postcode):
    """All listed spellings are expected to come back as 'JE5 0LA' for
    country 'je'."""
    expected = [('postcode', 'JE5 0LA')]

    assert sanitize(country='je', postcode=postcode) == expected
140
141
@pytest.mark.parametrize("postcode", ('gb JE5 0LA', 'IM50LA', 'IM5 012'))
@pytest.mark.sanitizer_params(convert_to_address=False)
def test_postcode_jersey_fail(sanitize, postcode):
    """With 'convert-to-address' switched off, the listed values are
    expected to be dropped for country 'je'."""
    remaining = sanitize(country='je', postcode=postcode)

    assert remaining == []
146
147
@pytest.mark.parametrize("postcode", ('KY1-1234', '1-1234', 'KY 1-1234'))
def test_postcode_cayman_islands_pass(sanitize, postcode):
    """All listed spellings are expected to come back as 'KY1-1234' for
    country 'ky'."""
    expected = [('postcode', 'KY1-1234')]

    assert sanitize(country='ky', postcode=postcode) == expected
151
152
@pytest.mark.parametrize("postcode", ('KY-1234', 'KZ1-1234', 'KY1 1234', 'KY1-123', 'KY KY1-1234'))
@pytest.mark.sanitizer_params(convert_to_address=False)
def test_postcode_cayman_islands_fail(sanitize, postcode):
    """With 'convert-to-address' switched off, the listed values are
    expected to be dropped for country 'ky'."""
    remaining = sanitize(country='ky', postcode=postcode)

    assert remaining == []
157
158
@pytest.mark.parametrize("postcode", ('LC11 222', '11 222', '11222', 'LC 11 222'))
def test_postcode_saint_lucia_pass(sanitize, postcode):
    """All listed spellings are expected to come back as 'LC11 222' for
    country 'lc'."""
    expected = [('postcode', 'LC11 222')]

    assert sanitize(country='lc', postcode=postcode) == expected
162
163
@pytest.mark.parametrize("postcode", ('11 2222', 'LC LC11 222'))
@pytest.mark.sanitizer_params(convert_to_address=False)
def test_postcode_saint_lucia_fail(sanitize, postcode):
    """With 'convert-to-address' switched off, the listed values are
    expected to be dropped for country 'lc'."""
    remaining = sanitize(country='lc', postcode=postcode)

    assert remaining == []
168
169
@pytest.mark.parametrize("postcode", ('LV-1111', '1111', 'LV 1111', 'LV1111',))
def test_postcode_latvia_pass(sanitize, postcode):
    """All listed spellings are expected to come back as 'LV-1111' for
    country 'lv'."""
    expected = [('postcode', 'LV-1111')]

    assert sanitize(country='lv', postcode=postcode) == expected
173
174
@pytest.mark.parametrize("postcode", ('111', '11111', 'LV LV-1111'))
@pytest.mark.sanitizer_params(convert_to_address=False)
def test_postcode_latvia_fail(sanitize, postcode):
    """With 'convert-to-address' switched off, the listed values are
    expected to be dropped for country 'lv'."""
    remaining = sanitize(country='lv', postcode=postcode)

    assert remaining == []
179
180
@pytest.mark.parametrize("postcode", ('MD-1111', '1111', 'MD 1111', 'MD1111'))
def test_postcode_moldova_pass(sanitize, postcode):
    """All listed spellings are expected to come back as 'MD-1111' for
    country 'md'."""
    expected = [('postcode', 'MD-1111')]

    assert sanitize(country='md', postcode=postcode) == expected
184
185
@pytest.mark.parametrize("postcode", ("MD MD-1111", "MD MD1111", "MD MD 1111"))
@pytest.mark.sanitizer_params(convert_to_address=False)
def test_postcode_moldova_fail(sanitize, postcode):
    """With 'convert-to-address' switched off, the listed values are
    expected to be dropped for country 'md'."""
    remaining = sanitize(country='md', postcode=postcode)

    assert remaining == []
190
191
@pytest.mark.parametrize("postcode", ('VLT 1117', 'GDJ 1234', 'BZN 2222'))
def test_postcode_malta_pass(sanitize, postcode):
    """The listed postcodes are expected back unchanged for country 'mt'."""
    result = sanitize(country='mt', postcode=postcode)

    assert result == [('postcode', postcode)]
195
196
@pytest.mark.parametrize("postcode", ('MTF 1111', 'MT MTF 1111', 'MTF1111', 'MT MTF1111'))
def test_postcode_malta_mtarfa_pass(sanitize, postcode):
    """All listed spellings are expected to come back as 'MTF 1111' for
    country 'mt'."""
    expected = [('postcode', 'MTF 1111')]

    assert sanitize(country='mt', postcode=postcode) == expected
200
201
@pytest.mark.parametrize("postcode", ('1111', 'MTMT 1111'))
@pytest.mark.sanitizer_params(convert_to_address=False)
def test_postcode_malta_fail(sanitize, postcode):
    """With 'convert-to-address' switched off, the listed values are
    expected to be dropped for country 'mt'."""
    remaining = sanitize(country='mt', postcode=postcode)

    assert remaining == []
206
207
@pytest.mark.parametrize("postcode", ('VC1111', '1111', 'VC-1111', 'VC 1111'))
def test_postcode_saint_vincent_pass(sanitize, postcode):
    """All listed spellings are expected to come back as 'VC1111' for
    country 'vc'."""
    expected = [('postcode', 'VC1111')]

    assert sanitize(country='vc', postcode=postcode) == expected
211
212
@pytest.mark.parametrize("postcode", ('VC11', 'VC VC1111'))
@pytest.mark.sanitizer_params(convert_to_address=False)
def test_postcode_saint_vincent_fail(sanitize, postcode):
    """With 'convert-to-address' switched off, the listed values are
    expected to be dropped for country 'vc'."""
    remaining = sanitize(country='vc', postcode=postcode)

    assert remaining == []
217
218
@pytest.mark.parametrize("postcode", ('VG1111', '1111', 'VG 1111', 'VG-1111'))
def test_postcode_virgin_islands_pass(sanitize, postcode):
    """All listed spellings are expected to come back as 'VG1111' for
    country 'vg'."""
    expected = [('postcode', 'VG1111')]

    assert sanitize(country='vg', postcode=postcode) == expected
222
223
@pytest.mark.parametrize("postcode", ('111', '11111', 'VG VG1111'))
@pytest.mark.sanitizer_params(convert_to_address=False)
def test_postcode_virgin_islands_fail(sanitize, postcode):
    """With 'convert-to-address' switched off, the listed values are
    expected to be dropped for country 'vg'."""
    remaining = sanitize(country='vg', postcode=postcode)

    assert remaining == []
228
229
@pytest.mark.parametrize("postcode", ('AB1', '123-456-7890', '1 as 44'))
@pytest.mark.sanitizer_params(default_pattern='[A-Z0-9- ]{3,12}')
def test_postcode_default_pattern_pass(sanitize, postcode):
    """With a 'default-pattern' configured, the listed values are
    expected back in upper case for country 'an'."""
    expected = [('postcode', postcode.upper())]

    assert sanitize(country='an', postcode=postcode) == expected
234
235
@pytest.mark.parametrize("postcode", ('C', '12', 'ABC123DEF 456', '1234,5678', '11223;11224'))
@pytest.mark.sanitizer_params(convert_to_address=False, default_pattern='[A-Z0-9- ]{3,12}')
def test_postcode_default_pattern_fail(sanitize, postcode):
    """With a 'default-pattern' configured and 'convert-to-address'
    switched off, the listed values are expected to be dropped for
    country 'an'."""
    remaining = sanitize(country='an', postcode=postcode)

    assert remaining == []