reintroduce cutoffs when searching for very frequent words

[nominatim.git] / nominatim / tools / special_phrases / sp_wiki_loader.py
diff --git a/nominatim/tools/special_phrases/sp_wiki_loader.py b/nominatim/tools/special_phrases/sp_wiki_loader.py

index 6093fa45c8fc3250217acb0db59d7885b901b2f2..cbeaeabfdfae752b11825b33c544e4d6476f3b8f 100644 (file)
--- a/nominatim/tools/special_phrases/sp_wiki_loader.py
+++ b/nominatim/tools/special_phrases/sp_wiki_loader.py
@@ -7,14 +7,17 @@
  """
      Module containing the SPWikiLoader class.
  """
  """
      Module containing the SPWikiLoader class.
  """
+from typing import Iterable
  import re
  import logging
  import re
  import logging
+
+from nominatim.config import Configuration
  from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
  from nominatim.tools.exec_utils import get_url
  
  LOG = logging.getLogger()
  
  from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
  from nominatim.tools.exec_utils import get_url
  
  LOG = logging.getLogger()
  
-def _get_wiki_content(lang):
+def _get_wiki_content(lang: str) -> str:
      """
          Request and return the wiki page's content
          corresponding to special phrases for a given lang.
      """
          Request and return the wiki page's content
          corresponding to special phrases for a given lang.
@@ -30,17 +33,24 @@ class SPWikiLoader:
      """
          Handles loading of special phrases from the wiki.
      """
      """
          Handles loading of special phrases from the wiki.
      """
-    def __init__(self, config):
-        super().__init__()
+    def __init__(self, config: Configuration) -> None:
          self.config = config
          # Compile the regex once here to improve performance.
          self.occurence_pattern = re.compile(
              r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
          )
          self.config = config
          # Compile the regex once here to improve performance.
          self.occurence_pattern = re.compile(
              r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
          )
-        self._load_languages()
+        # Hack around a bug where building=yes was imported with quotes into the wiki
+        self.type_fix_pattern = re.compile(r'\"|&quot;')
+
+        self.languages = self.config.get_str_list('LANGUAGES') or \
+                         ['af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
+                          'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
+                          'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
+                          'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi',
+                          'lv', 'tr']
  
  
  
  
-    def generate_phrases(self):
+    def generate_phrases(self) -> Iterable[SpecialPhrase]:
          """ Download the wiki pages for the configured languages
              and extract the phrases from the page.
          """
          """ Download the wiki pages for the configured languages
              and extract the phrases from the page.
          """
@@ -52,20 +62,7 @@ class SPWikiLoader:
              matches = self.occurence_pattern.findall(loaded_xml)
  
              for match in matches:
              matches = self.occurence_pattern.findall(loaded_xml)
  
              for match in matches:
-                yield SpecialPhrase(match[0], match[1], match[2], match[3])
-
-
-    def _load_languages(self):
-        """
-            Get list of all languages from env config file
-            or default if there is no languages configured.
-            The system will extract special phrases only from all specified languages.
-        """
-        if self.config.LANGUAGES:
-            self.languages = self.config.get_str_list('LANGUAGES')
-        else:
-            self.languages = [
-            'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
-            'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
-            'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
-            'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']
+                yield SpecialPhrase(match[0],
+                                    match[1],
+                                    self.type_fix_pattern.sub('', match[2]),
+                                    match[3])