From: Sarah Hoffmann
Date: Tue, 31 May 2022 18:26:05 +0000 (+0200)
Subject: Merge pull request #2732 from lonvia/fix-ordering-address-parts
X-Git-Tag: v4.1.0~33
X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/8a0e3e2f3d9bce87725a6e08dcc90a072a17995c?hp=bd0e157b9155a999b6c96bbd834319fa67b298d3

Merge pull request #2732 from lonvia/fix-ordering-address-parts

Fix order when searching for addr:* components
---

diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml
index a08a995f..4ce14f93 100644
--- a/.github/workflows/ci-tests.yml
+++ b/.github/workflows/ci-tests.yml
@@ -81,13 +81,16 @@ jobs:
                   ubuntu: ${{ matrix.ubuntu }}
 
             - name: Install test prerequsites
-              run: sudo apt-get install -y -qq pylint python3-pytest python3-behave
+              run: sudo apt-get install -y -qq python3-pytest python3-behave
               if: matrix.ubuntu == 20
 
             - name: Install test prerequsites
-              run: pip3 install pylint==2.6.0 pytest behave==1.2.6
+              run: pip3 install pytest behave==1.2.6
               if: matrix.ubuntu == 18
 
+            - name: Install latest pylint
+              run: pip3 install pylint
+
             - name: PHP linting
               run: phpcs --report-width=120 .
               working-directory: Nominatim
diff --git a/docs/develop/Development-Environment.md b/docs/develop/Development-Environment.md
index eba87c09..3cda610e 100644
--- a/docs/develop/Development-Environment.md
+++ b/docs/develop/Development-Environment.md
@@ -32,7 +32,7 @@ It has the following additional requirements:
 * [behave test framework](https://behave.readthedocs.io) >= 1.2.6
 * [phpunit](https://phpunit.de) (9.5 is known to work)
 * [PHP CodeSniffer](https://github.com/squizlabs/PHP_CodeSniffer)
-* [Pylint](https://pylint.org/) (2.6.0 is used for the CI)
+* [Pylint](https://pylint.org/) (CI always runs the latest version from pip)
 * [pytest](https://pytest.org)
 
 The documentation is built with mkdocs:
diff --git a/nominatim/clicmd/setup.py b/nominatim/clicmd/setup.py
index 7b5f3797..f0ec358b 100644
--- a/nominatim/clicmd/setup.py
+++ b/nominatim/clicmd/setup.py
@@ -128,7 +128,7 @@ class SetupAll:
                                                        drop=args.no_updates)
                 LOG.warning('Create search index for default country names.')
                 country_info.create_country_names(conn, tokenizer,
-                                                  args.config.LANGUAGES)
+                                                  args.config.get_str_list('LANGUAGES'))
             if args.no_updates:
                 freeze.drop_update_tables(conn)
         tokenizer.finalize_import(args.config)
diff --git a/nominatim/config.py b/nominatim/config.py
index ef261079..700af328 100644
--- a/nominatim/config.py
+++ b/nominatim/config.py
@@ -99,6 +99,17 @@ class Configuration:
             raise UsageError("Configuration error.") from exp
 
 
+    def get_str_list(self, name):
+        """ Return the given configuration parameter as a list of strings.
+            The values are assumed to be given as a comma-sparated list and
+            will be stripped before returning them. On empty values None
+            is returned.
+        """
+        raw = self.__getattr__(name)
+
+        return [v.strip() for v in raw.split(',')] if raw else None
+
+
     def get_path(self, name):
         """ Return the given configuration parameter as a Path.
            If a relative path is configured, then the function converts this
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index bf5544ed..4678af66 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -482,7 +482,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                 if not item.suffix:
                     token_info.add_place(self._compute_partial_tokens(item.name))
             elif not item.kind.startswith('_') and not item.suffix and \
-                 item.kind not in ('country', 'full'):
+                 item.kind not in ('country', 'full', 'inclusion'):
                 token_info.add_address_term(item.kind,
                                             self._compute_partial_tokens(item.name))
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py
index 7b78b22a..a292b180 100644
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -475,7 +475,8 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
                     token_info.add_street(self.conn, value)
                 elif key == 'place':
                     token_info.add_place(self.conn, value)
-                elif not key.startswith('_') and key not in ('country', 'full'):
+                elif not key.startswith('_') \
+                     and key not in ('country', 'full', 'inclusion'):
                     addr_terms.append((key, value))
 
         if hnrs:
diff --git a/nominatim/tools/country_info.py b/nominatim/tools/country_info.py
index ed04c2d5..0ad00171 100644
--- a/nominatim/tools/country_info.py
+++ b/nominatim/tools/country_info.py
@@ -131,9 +131,6 @@ def create_country_names(conn, tokenizer, languages=None):
         empty then only name translations for the given languages are added
         to the index.
     """
-    if languages:
-        languages = languages.split(',')
-
     def _include_key(key):
         return ':' not in key or not languages or \
                key[key.index(':') + 1:] in languages
diff --git a/nominatim/tools/special_phrases/sp_csv_loader.py b/nominatim/tools/special_phrases/sp_csv_loader.py
index 55a9d8d0..0bd93c00 100644
--- a/nominatim/tools/special_phrases/sp_csv_loader.py
+++ b/nominatim/tools/special_phrases/sp_csv_loader.py
@@ -11,43 +11,31 @@
 """
 import csv
 import os
-from collections.abc import Iterator
 
 from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
 from nominatim.errors import UsageError
 
-class SPCsvLoader(Iterator):
+class SPCsvLoader:
     """
         Handles loading of special phrases from external csv file.
     """
     def __init__(self, csv_path):
         super().__init__()
         self.csv_path = csv_path
-        self.has_been_read = False
 
-    def __next__(self):
-        if self.has_been_read:
-            raise StopIteration()
-        self.has_been_read = True
-        self.check_csv_validity()
-        return self.parse_csv()
-
-    def parse_csv(self):
-        """
-            Open and parse the given csv file.
+    def generate_phrases(self):
+        """ Open and parse the given csv file.
             Create the corresponding SpecialPhrases.
         """
-        phrases = set()
+        self._check_csv_validity()
         with open(self.csv_path, encoding='utf-8') as fd:
             reader = csv.DictReader(fd, delimiter=',')
             for row in reader:
-                phrases.add(
-                    SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator'])
-                )
-        return phrases
+                yield SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator'])
+
 
-    def check_csv_validity(self):
+    def _check_csv_validity(self):
         """
             Check that the csv file has the right extension.
""" diff --git a/nominatim/tools/special_phrases/sp_importer.py b/nominatim/tools/special_phrases/sp_importer.py index 8137142b..31bbc355 100644 --- a/nominatim/tools/special_phrases/sp_importer.py +++ b/nominatim/tools/special_phrases/sp_importer.py @@ -62,11 +62,10 @@ class SPImporter(): # Store pairs of class/type for further processing class_type_pairs = set() - for loaded_phrases in self.sp_loader: - for phrase in loaded_phrases: - result = self._process_phrase(phrase) - if result: - class_type_pairs.add(result) + for phrase in self.sp_loader.generate_phrases(): + result = self._process_phrase(phrase) + if result: + class_type_pairs.add(result) self._create_place_classtype_table_and_indexes(class_type_pairs) if should_replace: diff --git a/nominatim/tools/special_phrases/sp_wiki_loader.py b/nominatim/tools/special_phrases/sp_wiki_loader.py index 2f698092..ca4758ac 100644 --- a/nominatim/tools/special_phrases/sp_wiki_loader.py +++ b/nominatim/tools/special_phrases/sp_wiki_loader.py @@ -9,46 +9,56 @@ """ import re import logging -from collections.abc import Iterator from nominatim.tools.special_phrases.special_phrase import SpecialPhrase from nominatim.tools.exec_utils import get_url LOG = logging.getLogger() -class SPWikiLoader(Iterator): + +def _get_wiki_content(lang): + """ + Request and return the wiki page's content + corresponding to special phrases for a given lang. + Requested URL Example : + https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN + """ + url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' \ + + lang.upper() + return get_url(url) + + +class SPWikiLoader: """ Handles loading of special phrases from the wiki. """ - def __init__(self, config, languages=None): + def __init__(self, config): super().__init__() self.config = config # Compile the regex here to increase performances. self.occurence_pattern = re.compile( r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])' ) - self.languages = self._load_languages() if not languages else list(languages) - - def __next__(self): - if not self.languages: - raise StopIteration + # Hack around a bug where building=yes was imported with quotes into the wiki + self.type_fix_pattern = re.compile(r'\"|"') + self._load_languages() - lang = self.languages.pop(0) - loaded_xml = self._get_wiki_content(lang) - LOG.warning('Importing phrases for lang: %s...', lang) - return self.parse_xml(loaded_xml) - def parse_xml(self, xml): - """ - Parses XML content and extracts special phrases from it. - Return a list of SpecialPhrase. + def generate_phrases(self): + """ Download the wiki pages for the configured languages + and extract the phrases from the page. """ - # One match will be of format [label, class, type, operator, plural] - matches = self.occurence_pattern.findall(xml) - returned_phrases = set() - for match in matches: - returned_phrases.add( - SpecialPhrase(match[0], match[1], match[2], match[3]) - ) - return returned_phrases + for lang in self.languages: + LOG.warning('Importing phrases for lang: %s...', lang) + loaded_xml = _get_wiki_content(lang) + + # One match will be of format [label, class, type, operator, plural] + matches = self.occurence_pattern.findall(loaded_xml) + + for match in matches: + yield SpecialPhrase(match[0], + match[1], + self.type_fix_pattern.sub('', match[2]), + match[3]) + def _load_languages(self): """ @@ -56,21 +66,11 @@ class SPWikiLoader(Iterator): or default if there is no languages configured. 
            The system will extract special phrases only from all specified languages.
        """
-        default_languages = [
+        if self.config.LANGUAGES:
+            self.languages = self.config.get_str_list('LANGUAGES')
+        else:
+            self.languages = [
                 'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es', 'et', 'eu',
                 'fa', 'fi', 'fr', 'gl', 'hr', 'hu', 'ia', 'is', 'it', 'ja',
                 'mk', 'nl', 'no', 'pl', 'ps', 'pt', 'ru', 'sk', 'sl', 'sv',
                 'uk', 'vi']
-        return self.config.LANGUAGES.split(',') if self.config.LANGUAGES else default_languages
-
-    @staticmethod
-    def _get_wiki_content(lang):
-        """
-            Request and return the wiki page's content
-            corresponding to special phrases for a given lang.
-            Requested URL Example :
-                https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
-        """
-        url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' \
-              + lang.upper()
-        return get_url(url)
diff --git a/nominatim/tools/special_phrases/special_phrase.py b/nominatim/tools/special_phrases/special_phrase.py
index dc7f69fe..16935ccf 100644
--- a/nominatim/tools/special_phrases/special_phrase.py
+++ b/nominatim/tools/special_phrases/special_phrase.py
@@ -10,9 +10,7 @@
     This class is a model used to transfer a special phrase through
     the process of load and importation.
 """
-import re
-
-class SpecialPhrase():
+class SpecialPhrase:
     """
         Model representing a special phrase.
     """
@@ -20,7 +18,19 @@ class SpecialPhrase():
         self.p_label = p_label.strip()
         self.p_class = p_class.strip()
         # Hack around a bug where building=yes was imported with quotes into the wiki
-        self.p_type = re.sub(r'\"|&quot;', '', p_type.strip())
+        self.p_type = p_type.strip()
         # Needed if some operator in the wiki are not written in english
         p_operator = p_operator.strip().lower()
         self.p_operator = '-' if p_operator not in ('near', 'in') else p_operator
+
+    def __eq__(self, other):
+        if not isinstance(other, SpecialPhrase):
+            return False
+
+        return self.p_label == other.p_label \
+               and self.p_class == other.p_class \
+               and self.p_type == other.p_type \
+               and self.p_operator == other.p_operator
+
+    def __hash__(self):
+        return hash((self.p_label, self.p_class, self.p_type, self.p_operator))
diff --git a/test/python/config/test_config.py b/test/python/config/test_config.py
index 9f9ca880..a9cbb48d 100644
--- a/test/python/config/test_config.py
+++ b/test/python/config/test_config.py
@@ -173,6 +173,23 @@ def test_get_int_empty(make_config):
         config.get_int('DATABASE_MODULE_PATH')
 
 
+@pytest.mark.parametrize("value,outlist", [('sd', ['sd']),
+                                           ('dd,rr', ['dd', 'rr']),
+                                           (' a , b ', ['a', 'b'])])
+def test_get_str_list_success(make_config, monkeypatch, value, outlist):
+    config = make_config()
+
+    monkeypatch.setenv('NOMINATIM_MYLIST', value)
+
+    assert config.get_str_list('MYLIST') == outlist
+
+
+def test_get_str_list_empty(make_config):
+    config = make_config()
+
+    assert config.get_str_list('LANGUAGES') is None
+
+
 def test_get_path_empty(make_config):
     config = make_config()
 
diff --git a/test/python/tools/test_country_info.py b/test/python/tools/test_country_info.py
index 3c20b3e0..3f00d54e 100644
--- a/test/python/tools/test_country_info.py
+++ b/test/python/tools/test_country_info.py
@@ -49,7 +49,7 @@ def test_setup_country_tables(src_dir, temp_db_with_extensions, dsn, temp_db_cur
     assert temp_db_cursor.table_rows('country_osm_grid') > 100
 
 
-@pytest.mark.parametrize("languages", (None, ' fr,en'))
+@pytest.mark.parametrize("languages", (None, ['fr', 'en']))
 def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cursor,
                              table_factory, tokenizer_mock, languages, loaded_country):
diff --git a/test/python/tools/test_import_special_phrases.py b/test/python/tools/test_import_special_phrases.py
index 41017694..0dcf549c 100644
--- a/test/python/tools/test_import_special_phrases.py
+++ b/test/python/tools/test_import_special_phrases.py
@@ -18,16 +18,12 @@ from nominatim.errors import UsageError
 
 from cursor import CursorForTesting
 
 @pytest.fixture
-def testfile_dir(src_dir):
-    return src_dir / 'test' / 'testfiles'
-
-
-@pytest.fixture
-def sp_importer(temp_db_conn, def_config):
+def sp_importer(temp_db_conn, def_config, monkeypatch):
     """
         Return an instance of SPImporter.
     """
-    loader = SPWikiLoader(def_config, ['en'])
+    monkeypatch.setenv('NOMINATIM_LANGUAGES', 'en')
+    loader = SPWikiLoader(def_config)
     return SPImporter(def_config, temp_db_conn, loader)
 
 
@@ -186,8 +182,8 @@ def test_import_phrases(monkeypatch, temp_db_conn, def_config, sp_importer,
     table_factory('place_classtype_amenity_animal_shelter')
     table_factory('place_classtype_wrongclass_wrongtype')
 
-    monkeypatch.setattr('nominatim.tools.special_phrases.sp_wiki_loader.SPWikiLoader._get_wiki_content',
-                        lambda self, lang: xml_wiki_content)
+    monkeypatch.setattr('nominatim.tools.special_phrases.sp_wiki_loader._get_wiki_content',
+                        lambda lang: xml_wiki_content)
 
     tokenizer = tokenizer_mock()
     sp_importer.import_phrases(tokenizer, should_replace)
diff --git a/test/python/tools/test_sp_csv_loader.py b/test/python/tools/test_sp_csv_loader.py
index 6f5670f0..49d5a853 100644
--- a/test/python/tools/test_sp_csv_loader.py
+++ b/test/python/tools/test_sp_csv_loader.py
@@ -11,56 +11,39 @@ import pytest
 
 from nominatim.errors import UsageError
 from nominatim.tools.special_phrases.sp_csv_loader import SPCsvLoader
+from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
 
-def test_parse_csv(sp_csv_loader):
+@pytest.fixture
+def sp_csv_loader(src_dir):
     """
-        Test method parse_csv()
-        Should return the right SpecialPhrase objects.
+        Return an instance of SPCsvLoader.
     """
-    phrases = sp_csv_loader.parse_csv()
-    assert check_phrases_content(phrases)
+    csv_path = (src_dir / 'test' / 'testdata' / 'sp_csv_test.csv').resolve()
+    loader = SPCsvLoader(csv_path)
+    return loader
+
 
-def test_next(sp_csv_loader):
+def test_generate_phrases(sp_csv_loader):
     """
-        Test objects returned from the next() method.
-        It should return all SpecialPhrases objects of
-        the sp_csv_test.csv special phrases.
+        Test method parse_csv()
+        Should return the right SpecialPhrase objects.
     """
-    phrases = next(sp_csv_loader)
-    assert check_phrases_content(phrases)
+    phrases = list(sp_csv_loader.generate_phrases())
+
+    assert len(phrases) == 42
+    assert len(set(phrases)) == 41
+
+    assert SpecialPhrase('Billboard', 'advertising', 'billboard', '-') in phrases
+    assert SpecialPhrase('Zip Lines', 'aerialway', 'zip_line', '-') in phrases
 
-def test_check_csv_validity(sp_csv_loader):
+
+def test_invalid_cvs_file():
     """
         Test method check_csv_validity()
         It should raise an exception when file with a
         different exception than .csv is given.
     """
-    sp_csv_loader.csv_path = 'test.csv'
-    sp_csv_loader.check_csv_validity()
-    sp_csv_loader.csv_path = 'test.wrong'
-    with pytest.raises(UsageError):
-        assert sp_csv_loader.check_csv_validity()
-
-def check_phrases_content(phrases):
-    """
-        Asserts that the given phrases list contains
-        the right phrases of the sp_csv_test.csv special phrases.
- """ - return len(phrases) > 1 \ - and any(p.p_label == 'Billboard' - and p.p_class == 'advertising' - and p.p_type == 'billboard' - and p.p_operator == '-' for p in phrases) \ - and any(p.p_label == 'Zip Lines' - and p.p_class == 'aerialway' - and p.p_type == 'zip_line' - and p.p_operator == '-' for p in phrases) + loader = SPCsvLoader('test.wrong') -@pytest.fixture -def sp_csv_loader(src_dir): - """ - Return an instance of SPCsvLoader. - """ - csv_path = (src_dir / 'test' / 'testdata' / 'sp_csv_test.csv').resolve() - loader = SPCsvLoader(csv_path) - return loader + with pytest.raises(UsageError, match='not a csv file'): + next(loader.generate_phrases()) diff --git a/test/python/tools/test_sp_wiki_loader.py b/test/python/tools/test_sp_wiki_loader.py index bfe93c57..2f47734e 100644 --- a/test/python/tools/test_sp_wiki_loader.py +++ b/test/python/tools/test_sp_wiki_loader.py @@ -10,49 +10,32 @@ import pytest from nominatim.tools.special_phrases.sp_wiki_loader import SPWikiLoader -@pytest.fixture -def xml_wiki_content(src_dir): - """ - return the content of the static xml test file. - """ - xml_test_content = src_dir / 'test' / 'testdata' / 'special_phrases_test_content.txt' - return xml_test_content.read_text() - @pytest.fixture -def sp_wiki_loader(monkeypatch, def_config, xml_wiki_content): +def sp_wiki_loader(src_dir, monkeypatch, def_config): """ Return an instance of SPWikiLoader. """ - loader = SPWikiLoader(def_config, ['en']) - monkeypatch.setattr('nominatim.tools.special_phrases.sp_wiki_loader.SPWikiLoader._get_wiki_content', - lambda self, lang: xml_wiki_content) - return loader + monkeypatch.setenv('NOMINATIM_LANGUAGES', 'en') + loader = SPWikiLoader(def_config) + def _mock_wiki_content(lang): + xml_test_content = src_dir / 'test' / 'testdata' / 'special_phrases_test_content.txt' + return xml_test_content.read_text() -def test_parse_xml(sp_wiki_loader, xml_wiki_content): - """ - Test method parse_xml() - Should return the right SpecialPhrase objects. - """ - phrases = sp_wiki_loader.parse_xml(xml_wiki_content) - check_phrases_content(phrases) + monkeypatch.setattr('nominatim.tools.special_phrases.sp_wiki_loader._get_wiki_content', + _mock_wiki_content) + return loader -def test_next(sp_wiki_loader): +def test_generate_phrases(sp_wiki_loader): """ Test objects returned from the next() method. It should return all SpecialPhrases objects of the 'en' special phrases. """ - phrases = next(sp_wiki_loader) - check_phrases_content(phrases) + phrases = list(sp_wiki_loader.generate_phrases()) -def check_phrases_content(phrases): - """ - Asserts that the given phrases list contains - the right phrases of the 'en' special phrases. 
- """ assert set((p.p_label, p.p_class, p.p_type, p.p_operator) for p in phrases) ==\ {('Zip Line', 'aerialway', 'zip_line', '-'), ('Zip Lines', 'aerialway', 'zip_line', '-'), diff --git a/test/testdata/sp_csv_test.csv b/test/testdata/sp_csv_test.csv index 3dab967b..147526de 100644 --- a/test/testdata/sp_csv_test.csv +++ b/test/testdata/sp_csv_test.csv @@ -18,6 +18,7 @@ Zipline near,aerialway,zip_line,near,N Ziplines near,aerialway,zip_line,near,Y Zipwire,aerialway,zip_line,-,N Zipwires,aerialway,zip_line,-,Y +Zipwires,aerialway,zip_line,name,Y Zipwire in,aerialway,zip_line,in,N Zipwires in,aerialway,zip_line,in,Y Zipwire near,aerialway,zip_line,near,N diff --git a/test/testdata/special_phrases_test_content.txt b/test/testdata/special_phrases_test_content.txt index e790ca58..e5f340b9 100644 --- a/test/testdata/special_phrases_test_content.txt +++ b/test/testdata/special_phrases_test_content.txt @@ -70,7 +70,7 @@ wikitext text/x-wiki -== en == {| class="wikitable sortable" |- ! Word / Phrase !! Key !! Value !! Operator !! Plural |- | Zip Line || aerialway || zip_line || - || N |- | Zip Lines || aerialway || zip_line || - || Y |- | Zip Line in || aerialway || zip_line || in || N |- | Zip Lines in || aerialway || zip_line || in || Y |- | Zip Line near || aerialway || zip_line || near || N |- | Animal shelter || amenity || animal_shelter || - || N |- | Animal shelters || amenity || animal_shelter || - || Y |- | Animal shelter in || amenity || animal_shelter || in || N |- | Animal shelters in || amenity || animal_shelter || in || Y |- | Animal shelter near || amenity || animal_shelter || near|| N |- | Animal shelters near || amenity || animal_shelter || NEAR|| Y |- | Drinking Water near || amenity || drinking_water || near || N |- | Water || amenity || drinking_water || - || N |- | Water in || amenity || drinking_water || In || N |- | Water near || amenity || drinking_water || near || N |- | Embassy || amenity || embassy || - || N |- | Embassys || amenity || embassy || - || Y |- | Embassies || amenity || embassy || - || Y |- |Coworkings near |amenity |coworking_space |near |Y |} [[Category:Word list]] +== en == {| class="wikitable sortable" |- ! Word / Phrase !! Key !! Value !! Operator !! Plural |- | Zip Line || aerialway || zip_line || - || N |- | Zip Lines || aerialway || zip_line || - || Y |- | Zip Line in || aerialway || zip_line || in || N |- | Zip Lines in || aerialway || zip_line || in || Y |- | Zip Line near || aerialway || zip_line || near || N |- | Animal shelter || amenity || animal_shelter || - || N |- | Animal shelters || amenity || animal_shelter || - || Y |- | Animal shelter in || amenity || animal_shelter || in || N |- | Animal shelters in || amenity || animal_shelter || in || Y |- | Animal shelter near || amenity || animal_shelter || near|| N |- | Animal shelters near || amenity || animal_shelter || NEAR|| Y |- | Drinking Water near || amenity || drinking_water || near || N |- | Water || amenity || drinking_water || - || N |- | Water in || amenity || drinking_water || In || N |- | Water near || amenity || drinking_water || near || N |- | Embassy || amenity || embassy || - || N |- | Embassys || amenity || "embassy" || - || Y |- | Embassies || amenity || embassy || - || Y |- |Coworkings near |amenity |coworking_space |near |Y |} [[Category:Word list]] cst5x7tt58izti1pxzgljf27tx8qjcj