X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/00959fac57d6c1b35d009e68ad857989ae71de6c..b0067a0345c2ab7793614925d6150141ef5f18f0:/nominatim/tools/special_phrases/sp_wiki_loader.py?ds=sidebyside diff --git a/nominatim/tools/special_phrases/sp_wiki_loader.py b/nominatim/tools/special_phrases/sp_wiki_loader.py index 11e59b45..cbeaeabf 100644 --- a/nominatim/tools/special_phrases/sp_wiki_loader.py +++ b/nominatim/tools/special_phrases/sp_wiki_loader.py @@ -1,71 +1,68 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. """ Module containing the SPWikiLoader class. """ -import logging +from typing import Iterable import re +import logging + +from nominatim.config import Configuration from nominatim.tools.special_phrases.special_phrase import SpecialPhrase -from nominatim.tools.special_phrases.sp_loader import SPLoader from nominatim.tools.exec_utils import get_url LOG = logging.getLogger() -class SPWikiLoader(SPLoader): + +def _get_wiki_content(lang: str) -> str: + """ + Request and return the wiki page's content + corresponding to special phrases for a given lang. + Requested URL Example : + https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN + """ + url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' \ + + lang.upper() + return get_url(url) + + +class SPWikiLoader: """ Handles loading of special phrases from the wiki. """ - def __init__(self, config, languages=None): - if languages is not None and not isinstance(languages, list): - raise TypeError('The \'languages\' parameter should be of type list.') - super().__init__() + def __init__(self, config: Configuration) -> None: self.config = config - #Compile the regex here to increase performances. + # Compile the regex here to increase performances. self.occurence_pattern = re.compile( r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])' ) - self.languages = self._load_languages() if not languages else languages + # Hack around a bug where building=yes was imported with quotes into the wiki + self.type_fix_pattern = re.compile(r'\"|"') - def __next__(self): - if not self.languages: - raise StopIteration + self.languages = self.config.get_str_list('LANGUAGES') or \ + ['af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es', + 'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu', + 'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl', + 'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi', + 'lv', 'tr'] - lang = self.languages.pop(0) - loaded_xml = SPWikiLoader._get_wiki_content(lang) - LOG.warning('Importing phrases for lang: %s...', lang) - return self.parse_xml(loaded_xml) - def parse_xml(self, xml): - """ - Parses XML content and extracts special phrases from it. - Return a list of SpecialPhrase. + def generate_phrases(self) -> Iterable[SpecialPhrase]: + """ Download the wiki pages for the configured languages + and extract the phrases from the page. """ - #One match will be of format [label, class, type, operator, plural] - matches = self.occurence_pattern.findall(xml) - returned_phrases = set() - for match in matches: - returned_phrases.add( - SpecialPhrase(match[0], match[1], match[2], match[3]) - ) - return returned_phrases + for lang in self.languages: + LOG.warning('Importing phrases for lang: %s...', lang) + loaded_xml = _get_wiki_content(lang) - def _load_languages(self): - """ - Get list of all languages from env config file - or default if there is no languages configured. - The system will extract special phrases only from all specified languages. - """ - default_languages = [ - 'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es', - 'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu', - 'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl', - 'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi'] - return self.config.LANGUAGES.split(',') if self.config.LANGUAGES else default_languages + # One match will be of format [label, class, type, operator, plural] + matches = self.occurence_pattern.findall(loaded_xml) - @staticmethod - def _get_wiki_content(lang): - """ - Request and return the wiki page's content - corresponding to special phrases for a given lang. - Requested URL Example : - https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN - """ - url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' + lang.upper() # pylint: disable=line-too-long - return get_url(url) + for match in matches: + yield SpecialPhrase(match[0], + match[1], + self.type_fix_pattern.sub('', match[2]), + match[3])