git.openstreetmap.org Git - nominatim.git/blob - test/python/tokenizer/token_analysis/test_analysis_postcodes.py
handle postcodes properly on word table updates
[nominatim.git] / test / python / tokenizer / token_analysis / test_analysis_postcodes.py
1 # SPDX-License-Identifier: GPL-2.0-only
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Tests for special postcode analysis and variant generation.
9 """
10 import pytest
11
12 from icu import Transliterator
13
14 import nominatim.tokenizer.token_analysis.postcodes as module
15 from nominatim.errors import UsageError
16
# ICU transliterator rules used for the "test_norm" transliterator and passed
# to module.configure() in this file. Reading the rules in order: decompose
# (NFD), replace '🜳' with a space, drop nonspacing marks and format (Cf)
# characters, lowercase, collapse runs of punctuation/space into a single
# space, then recompose (NFC).
DEFAULT_NORMALIZATION = """ :: NFD ();
                            '🜳' > ' ';
                            [[:Nonspacing Mark:] [:Cf:]] >;
                            :: lower ();
                            [[:Punctuation:][:Space:]]+ > ' ';
                            :: NFC ();
                        """
24
# ICU transliterator rules used for the "test_trans" transliterator in this
# file: convert to Latin script and replace '🜵' with a space.
DEFAULT_TRANSLITERATION = """ ::  Latin ();
                              '🜵' > ' ';
                          """
28
@pytest.fixture
def analyser():
    """ Provide a postcode token analysis object built from the default
        normalization and transliteration rules of this test module.
    """
    config = module.configure({'analyzer': 'postcodes'}, DEFAULT_NORMALIZATION)

    normalizer = Transliterator.createFromRules("test_norm",
                                                DEFAULT_NORMALIZATION)
    transliterator = Transliterator.createFromRules("test_trans",
                                                    DEFAULT_TRANSLITERATION)

    return module.create(normalizer, transliterator, config)
38
39
def get_normalized_variants(proc, name):
    """ Run `name` through the default normalization rules, strip the
        surrounding whitespace and return the ASCII variants that `proc`
        produces for the result.
    """
    normalizer = Transliterator.createFromRules("test_norm",
                                                DEFAULT_NORMALIZATION)
    normalized_name = normalizer.transliterate(name).strip()

    return proc.get_variants_ascii(normalized_name)
43
44
@pytest.mark.parametrize('name,norm', [
    ('12', '12'),
    ('A 34 ', 'A 34'),
    ('34-av', '34-AV'),
])
def test_normalize(analyser, name, norm):
    """ Check that normalize() returns the expected canonical postcode
        for each parametrized input.
    """
    normalized = analyser.normalize(name)

    assert normalized == norm
50
51
@pytest.mark.parametrize('postcode,variants', [
    ('12345', {'12345'}),
    ('AB-998', {'ab 998', 'ab998'}),
    ('23 FGH D3', {'23 fgh d3', '23fgh d3', '23 fghd3', '23fghd3'}),
])
def test_get_variants_ascii(analyser, postcode, variants):
    """ Check that get_variants_ascii() yields exactly the expected set of
        variants and does not return any variant more than once.
    """
    generated = analyser.get_variants_ascii(postcode)
    distinct = set(generated)

    # No duplicates: the set must be as large as the raw result.
    assert len(generated) == len(distinct)
    assert distinct == variants