git.openstreetmap.org Git - nominatim.git/blob - test/python/api/search/test_postcode_parser.py
make query upper-case when parsing postcodes
[nominatim.git] / test / python / api / search / test_postcode_parser.py
1
2 # SPDX-License-Identifier: GPL-3.0-or-later
3 #
4 # This file is part of Nominatim. (https://nominatim.org)
5 #
6 # Copyright (C) 2025 by the Nominatim developer community.
7 # For a full list of authors see the git log.
8 """
9 Test for parsing of postcodes in queries.
10 """
11 import re
12 from itertools import zip_longest
13
14 import pytest
15
16 from nominatim_api.search.postcode_parser import PostcodeParser
17 from nominatim_api.search.query import QueryStruct, PHRASE_ANY, PHRASE_POSTCODE, PHRASE_STREET
18
19
@pytest.fixture
def pc_config(project_env):
    """
    Write a small 'country_settings.yaml' with a handful of postcode
    patterns into the project directory and hand back the project
    environment.
    """
    # Raw string: the backslashes in the 'output' entries (e.g. \1 \2) must
    # reach the YAML file unchanged.
    settings = r"""
ab:
  postcode:
    pattern: "ddddd ll"
ba:
  postcode:
    pattern: "ddddd"
de:
  postcode:
    pattern: "ddddd"
gr:
  postcode:
    pattern: "(ddd) ?(dd)"
    output: \1 \2
in:
  postcode:
    pattern: "(ddd) ?(ddd)"
    output: \1\2
mc:
  postcode:
    pattern: "980dd"
mz:
  postcode:
    pattern: "(dddd)(?:-dd)?"
bn:
  postcode:
    pattern: "(ll) ?(dddd)"
    output: \1\2
ky:
  postcode:
    pattern: "(d)-(dddd)"
    output: KY\1-\2

gb:
  postcode:
    pattern: "(l?ld[A-Z0-9]?) ?(dll)"
    output: \1 \2

    """
    (project_env.project_dir / 'country_settings.yaml').write_text(settings)

    return project_env
64
65
def mk_query(inp):
    """
    Build a QueryStruct from a plain string.

    The input is split at spaces, commas, colons, apostrophes and hyphens.
    Each word becomes one PHRASE_ANY node whose break type is the separator
    that follows it; the final word gets the break type '>'.
    """
    # The capture group keeps the separators, so the split result alternates
    # word, separator, word, ... and always ends with a word.
    pieces = re.split(r"([ ,:'-])", inp)
    words = pieces[0::2]
    breaks = pieces[1::2]

    query = QueryStruct([])
    # There is always one more word than separators; the missing last
    # separator is filled in with '>'.
    for term, btype in zip_longest(words, breaks, fillvalue='>'):
        query.add_node(btype, PHRASE_ANY, 0.1, term, term)

    return query
74
75
@pytest.mark.parametrize('query,pos', [
    ('45325 Berlin', 0),
    ('45325:Berlin', 0),
    ('45325,Berlin', 0),
    ('Berlin 45325', 1),
    ('Berlin,45325', 1),
    ('Berlin:45325', 1),
    ('Hansastr,45325 Berlin', 1),
    ('Hansastr 45325 Berlin', 1)])
def test_simple_postcode(pc_config, query, pos):
    """
    A five-digit term separated by space, comma or colon is reported at its
    node position, both as '45325' and in the spaced form '453 25'.
    """
    expected = {(pos, pos + 1, code) for code in ('45325', '453 25')}

    found = PostcodeParser(pc_config).parse(mk_query(query))

    assert found == expected
90
91
@pytest.mark.parametrize('query', ['EC1R 3HF', 'ec1r 3hf'])
def test_postcode_matching_case_insensitive(pc_config, query):
    """
    Upper- and lower-case spellings of the same postcode both yield the
    upper-case result.
    """
    found = PostcodeParser(pc_config).parse(mk_query(query))

    assert found == {(0, 2, 'EC1R 3HF')}
97
98
def test_contained_postcode(pc_config):
    """
    When a short postcode is the prefix of a longer one, the short and the
    long match are both reported.
    """
    expected = {
        (0, 1, '12345'),
        (0, 1, '123 45'),
        (0, 2, '12345 DX'),
    }

    found = PostcodeParser(pc_config).parse(mk_query('12345 dx'))

    assert found == expected
104
105
@pytest.mark.parametrize('query,frm,to', [
    ('345987', 0, 1),
    ('345 987', 0, 2),
    ('Aina 345 987', 1, 3),
    ('Aina 23 345 987 ff', 2, 4)])
def test_postcode_with_space(pc_config, query, frm, to):
    """
    A six-digit postcode is found with or without a space in the middle and
    is always reported in its joined form, spanning the nodes it covers.
    """
    found = PostcodeParser(pc_config).parse(mk_query(query))

    assert found == {(frm, to, '345987')}
115
116
def test_overlapping_postcode(pc_config):
    """
    Two postcode candidates that share a term are both reported.
    """
    found = PostcodeParser(pc_config).parse(mk_query('123 456 78'))

    assert found == {(0, 2, '123456'), (1, 3, '456 78')}
121
122
# NOTE: every entry needs its own trailing comma. Adjacent string literals
# are silently concatenated by Python, which previously merged
# '45325Berlin' and '345-987' into the single case '45325Berlin345-987'.
@pytest.mark.parametrize('query', ['45325-Berlin', "45325'Berlin",
                                   'Berlin-45325', "Berlin'45325", '45325Berlin',
                                   '345-987', "345'987", '345,987', '345:987'])
def test_not_a_postcode(pc_config, query):
    """
    Queries where the postcode-like digits are joined to other text, or are
    split by a separator the patterns do not allow, must yield no postcode.
    """
    parser = PostcodeParser(pc_config)

    assert not parser.parse(mk_query(query))
130
131
@pytest.mark.parametrize('query', ['ba 12233', 'ba-12233'])
def test_postcode_with_country_prefix(pc_config, query):
    """
    A postcode preceded by a country code and a space or hyphen is reported
    without the prefix, spanning both terms.
    """
    found = PostcodeParser(pc_config).parse(mk_query(query))

    assert (0, 2, '12233') in found
137
138
def test_postcode_with_joined_country_prefix(pc_config):
    """
    A country code written directly in front of the postcode is stripped
    from the reported postcode.
    """
    found = PostcodeParser(pc_config).parse(mk_query('ba12233'))

    assert found == {(0, 1, '12233')}
143
144
def test_postcode_with_non_matching_country_prefix(pc_config):
    """
    No postcode is reported when the digits do not fit the pattern of the
    country given as prefix.
    """
    found = PostcodeParser(pc_config).parse(mk_query('ky12233'))

    assert not found
149
150
def test_postcode_inside_postcode_phrase(pc_config):
    """
    In a query made of a street phrase followed by a postcode phrase, only
    the matching term inside the postcode phrase is reported.
    """
    parser = PostcodeParser(pc_config)

    # (break type, phrase type, term) for each node, in query order.
    node_specs = (
        (',', PHRASE_STREET, '12345'),
        (',', PHRASE_POSTCODE, 'xz'),
        ('>', PHRASE_POSTCODE, '4444'),
    )

    query = QueryStruct([])
    query.nodes[-1].ptype = PHRASE_STREET
    for btype, ptype, term in node_specs:
        query.add_node(btype, ptype, 0.1, term, term)

    assert parser.parse(query) == {(2, 3, '4444')}
161
162
def test_partial_postcode_in_postcode_phrase(pc_config):
    """
    A postcode phrase is not matched when only part of it looks like a
    postcode.
    """
    parser = PostcodeParser(pc_config)

    query = QueryStruct([])
    query.nodes[-1].ptype = PHRASE_POSTCODE
    # Two terms in the same postcode phrase: (break type, term).
    for btype, term in ((' ', '2224'), ('>', '12345')):
        query.add_node(btype, PHRASE_POSTCODE, 0.1, term, term)

    found = parser.parse(query)

    assert not found