X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/bc8b2d4ae0dbaef64448ddcb530de9626da9d82d..674185a0651a4f6719e7fe71e00cca147783a1f4:/nominatim/tools/special_phrases/sp_wiki_loader.py diff --git a/nominatim/tools/special_phrases/sp_wiki_loader.py b/nominatim/tools/special_phrases/sp_wiki_loader.py index 1ad9de7e..ca4758ac 100644 --- a/nominatim/tools/special_phrases/sp_wiki_loader.py +++ b/nominatim/tools/special_phrases/sp_wiki_loader.py @@ -1,48 +1,64 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. """ Module containing the SPWikiLoader class. """ import re import logging -from collections.abc import Iterator from nominatim.tools.special_phrases.special_phrase import SpecialPhrase from nominatim.tools.exec_utils import get_url LOG = logging.getLogger() -class SPWikiLoader(Iterator): + +def _get_wiki_content(lang): + """ + Request and return the wiki page's content + corresponding to special phrases for a given lang. + Requested URL Example : + https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN + """ + url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' \ + + lang.upper() + return get_url(url) + + +class SPWikiLoader: """ Handles loading of special phrases from the wiki. """ - def __init__(self, config, languages=None): + def __init__(self, config): super().__init__() self.config = config # Compile the regex here to increase performances. self.occurence_pattern = re.compile( r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])' ) - self.languages = self._load_languages() if not languages else list(languages) - - def __next__(self): - if not self.languages: - raise StopIteration + # Hack around a bug where building=yes was imported with quotes into the wiki + self.type_fix_pattern = re.compile(r'\"|"') + self._load_languages() - lang = self.languages.pop(0) - loaded_xml = self._get_wiki_content(lang) - LOG.warning('Importing phrases for lang: %s...', lang) - return self.parse_xml(loaded_xml) - def parse_xml(self, xml): - """ - Parses XML content and extracts special phrases from it. - Return a list of SpecialPhrase. + def generate_phrases(self): + """ Download the wiki pages for the configured languages + and extract the phrases from the page. """ - # One match will be of format [label, class, type, operator, plural] - matches = self.occurence_pattern.findall(xml) - returned_phrases = set() - for match in matches: - returned_phrases.add( - SpecialPhrase(match[0], match[1], match[2], match[3]) - ) - return returned_phrases + for lang in self.languages: + LOG.warning('Importing phrases for lang: %s...', lang) + loaded_xml = _get_wiki_content(lang) + + # One match will be of format [label, class, type, operator, plural] + matches = self.occurence_pattern.findall(loaded_xml) + + for match in matches: + yield SpecialPhrase(match[0], + match[1], + self.type_fix_pattern.sub('', match[2]), + match[3]) + def _load_languages(self): """ @@ -50,21 +66,11 @@ class SPWikiLoader(Iterator): or default if there is no languages configured. The system will extract special phrases only from all specified languages. """ - default_languages = [ + if self.config.LANGUAGES: + self.languages = self.config.get_str_list('LANGUAGES') + else: + self.languages = [ 'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es', 'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu', 'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl', 'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi'] - return self.config.LANGUAGES.split(',') if self.config.LANGUAGES else default_languages - - @staticmethod - def _get_wiki_content(lang): - """ - Request and return the wiki page's content - corresponding to special phrases for a given lang. - Requested URL Example : - https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN - """ - url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' \ - + lang.upper() - return get_url(url)