From 9963261d8d572f7a0d88ef27f5d938f085c603ba Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sun, 17 Jul 2022 10:46:59 +0200 Subject: [PATCH] add type annotations to special phrase importer --- nominatim/tokenizer/base.py | 5 +- nominatim/tokenizer/icu_tokenizer.py | 5 +- nominatim/tokenizer/legacy_tokenizer.py | 5 +- .../special_phrases/importer_statistics.py | 15 +++-- .../tools/special_phrases/sp_csv_loader.py | 8 +-- .../tools/special_phrases/sp_importer.py | 55 +++++++++++++------ .../tools/special_phrases/sp_wiki_loader.py | 33 ++++------- .../tools/special_phrases/special_phrase.py | 9 +-- .../tools/test_import_special_phrases.py | 2 +- 9 files changed, 77 insertions(+), 60 deletions(-) diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py index afcb0864..1c1ca9f7 100644 --- a/nominatim/tokenizer/base.py +++ b/nominatim/tokenizer/base.py @@ -9,7 +9,7 @@ Abstract class defintions for tokenizers. These base classes are here mainly for documentation purposes. """ from abc import ABC, abstractmethod -from typing import List, Tuple, Dict, Any, Optional +from typing import List, Tuple, Dict, Any, Optional, Iterable from pathlib import Path from typing_extensions import Protocol @@ -81,7 +81,8 @@ class AbstractAnalyzer(ABC): @abstractmethod - def update_special_phrases(self, phrases: List[Tuple[str, str, str, str]], + def update_special_phrases(self, + phrases: Iterable[Tuple[str, str, str, str]], should_replace: bool) -> None: """ Update the tokenizer's special phrase tokens from the given list of special phrases. diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 1e3eab98..31eaaf29 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -8,7 +8,8 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using libICU instead of the PostgreSQL module. """ -from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, Dict, Set, Iterable +from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \ + Dict, Set, Iterable import itertools import json import logging @@ -374,7 +375,7 @@ class ICUNameAnalyzer(AbstractAnalyzer): - def update_special_phrases(self, phrases: Sequence[Tuple[str, str, str, str]], + def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]], should_replace: bool) -> None: """ Replace the search index for special phrases with the new phrases. If `should_replace` is True, then the previous set of will be diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index 848d6191..f52eaada 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -7,7 +7,8 @@ """ Tokenizer implementing normalisation as used before Nominatim 4. """ -from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, cast, Dict, Set +from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, \ + cast, Dict, Set, Iterable from collections import OrderedDict import logging from pathlib import Path @@ -392,7 +393,7 @@ class LegacyNameAnalyzer(AbstractAnalyzer): - def update_special_phrases(self, phrases: Sequence[Tuple[str, str, str, str]], + def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]], should_replace: bool) -> None: """ Replace the search index for special phrases with the new phrases. 
""" diff --git a/nominatim/tools/special_phrases/importer_statistics.py b/nominatim/tools/special_phrases/importer_statistics.py index b1a9c438..0bb118c8 100644 --- a/nominatim/tools/special_phrases/importer_statistics.py +++ b/nominatim/tools/special_phrases/importer_statistics.py @@ -12,15 +12,14 @@ import logging LOG = logging.getLogger() class SpecialPhrasesImporterStatistics(): - # pylint: disable-msg=too-many-instance-attributes """ Class handling statistics of the import process of special phrases. """ - def __init__(self): + def __init__(self) -> None: self._intialize_values() - def _intialize_values(self): + def _intialize_values(self) -> None: """ Set all counts for the global import to 0. @@ -30,32 +29,32 @@ class SpecialPhrasesImporterStatistics(): self.tables_ignored = 0 self.invalids = 0 - def notify_one_phrase_invalid(self): + def notify_one_phrase_invalid(self) -> None: """ Add +1 to the count of invalid entries fetched from the wiki. """ self.invalids += 1 - def notify_one_table_created(self): + def notify_one_table_created(self) -> None: """ Add +1 to the count of created tables. """ self.tables_created += 1 - def notify_one_table_deleted(self): + def notify_one_table_deleted(self) -> None: """ Add +1 to the count of deleted tables. """ self.tables_deleted += 1 - def notify_one_table_ignored(self): + def notify_one_table_ignored(self) -> None: """ Add +1 to the count of ignored tables. """ self.tables_ignored += 1 - def notify_import_done(self): + def notify_import_done(self) -> None: """ Print stats for the whole import process and reset all values. diff --git a/nominatim/tools/special_phrases/sp_csv_loader.py b/nominatim/tools/special_phrases/sp_csv_loader.py index 0bd93c00..400f9fa9 100644 --- a/nominatim/tools/special_phrases/sp_csv_loader.py +++ b/nominatim/tools/special_phrases/sp_csv_loader.py @@ -9,6 +9,7 @@ The class allows to load phrases from a csv file. """ +from typing import Iterable import csv import os from nominatim.tools.special_phrases.special_phrase import SpecialPhrase @@ -18,12 +19,11 @@ class SPCsvLoader: """ Handles loading of special phrases from external csv file. """ - def __init__(self, csv_path): - super().__init__() + def __init__(self, csv_path: str) -> None: self.csv_path = csv_path - def generate_phrases(self): + def generate_phrases(self) -> Iterable[SpecialPhrase]: """ Open and parse the given csv file. Create the corresponding SpecialPhrases. """ @@ -35,7 +35,7 @@ class SPCsvLoader: yield SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator']) - def _check_csv_validity(self): + def _check_csv_validity(self) -> None: """ Check that the csv file has the right extension. """ diff --git a/nominatim/tools/special_phrases/sp_importer.py b/nominatim/tools/special_phrases/sp_importer.py index 805f8937..6ca6a1e1 100644 --- a/nominatim/tools/special_phrases/sp_importer.py +++ b/nominatim/tools/special_phrases/sp_importer.py @@ -13,19 +13,37 @@ The phrases already present in the database which are not valids anymore are removed. 
""" +from typing import Iterable, Tuple, Mapping, Sequence, Optional, Set import logging import re +from typing_extensions import Protocol + from psycopg2.sql import Identifier, SQL + +from nominatim.config import Configuration +from nominatim.db.connection import Connection from nominatim.tools.special_phrases.importer_statistics import SpecialPhrasesImporterStatistics +from nominatim.tools.special_phrases.special_phrase import SpecialPhrase +from nominatim.tokenizer.base import AbstractTokenizer LOG = logging.getLogger() -def _classtype_table(phrase_class, phrase_type): +def _classtype_table(phrase_class: str, phrase_type: str) -> str: """ Return the name of the table for the given class and type. """ return f'place_classtype_{phrase_class}_{phrase_type}' + +class SpecialPhraseLoader(Protocol): + """ Protocol for classes implementing a loader for special phrases. + """ + + def generate_phrases(self) -> Iterable[SpecialPhrase]: + """ Generates all special phrase terms this loader can produce. + """ + + class SPImporter(): # pylint: disable-msg=too-many-instance-attributes """ @@ -33,21 +51,22 @@ class SPImporter(): Take a sp loader which load the phrases from an external source. """ - def __init__(self, config, db_connection, sp_loader): + def __init__(self, config: Configuration, conn: Connection, + sp_loader: SpecialPhraseLoader) -> None: self.config = config - self.db_connection = db_connection + self.db_connection = conn self.sp_loader = sp_loader self.statistics_handler = SpecialPhrasesImporterStatistics() self.black_list, self.white_list = self._load_white_and_black_lists() self.sanity_check_pattern = re.compile(r'^\w+$') # This set will contain all existing phrases to be added. # It contains tuples with the following format: (lable, class, type, operator) - self.word_phrases = set() + self.word_phrases: Set[Tuple[str, str, str, str]] = set() # This set will contain all existing place_classtype tables which doesn't match any # special phrases class/type on the wiki. - self.table_phrases_to_delete = set() + self.table_phrases_to_delete: Set[str] = set() - def import_phrases(self, tokenizer, should_replace): + def import_phrases(self, tokenizer: AbstractTokenizer, should_replace: bool) -> None: """ Iterate through all SpecialPhrases extracted from the loader and import them into the database. @@ -67,7 +86,7 @@ class SPImporter(): if result: class_type_pairs.add(result) - self._create_place_classtype_table_and_indexes(class_type_pairs) + self._create_classtype_table_and_indexes(class_type_pairs) if should_replace: self._remove_non_existent_tables_from_db() self.db_connection.commit() @@ -79,7 +98,7 @@ class SPImporter(): self.statistics_handler.notify_import_done() - def _fetch_existing_place_classtype_tables(self): + def _fetch_existing_place_classtype_tables(self) -> None: """ Fetch existing place_classtype tables. Fill the table_phrases_to_delete set of the class. @@ -95,7 +114,8 @@ class SPImporter(): for row in db_cursor: self.table_phrases_to_delete.add(row[0]) - def _load_white_and_black_lists(self): + def _load_white_and_black_lists(self) \ + -> Tuple[Mapping[str, Sequence[str]], Mapping[str, Sequence[str]]]: """ Load white and black lists from phrases-settings.json. """ @@ -103,7 +123,7 @@ class SPImporter(): return settings['blackList'], settings['whiteList'] - def _check_sanity(self, phrase): + def _check_sanity(self, phrase: SpecialPhrase) -> bool: """ Check sanity of given inputs in case somebody added garbage in the wiki. 
If a bad class/type is detected the system will exit with an error. @@ -117,7 +137,7 @@ class SPImporter(): return False return True - def _process_phrase(self, phrase): + def _process_phrase(self, phrase: SpecialPhrase) -> Optional[Tuple[str, str]]: """ Processes the given phrase by checking black and white list and sanity. @@ -145,7 +165,8 @@ class SPImporter(): return (phrase.p_class, phrase.p_type) - def _create_place_classtype_table_and_indexes(self, class_type_pairs): + def _create_classtype_table_and_indexes(self, + class_type_pairs: Iterable[Tuple[str, str]]) -> None: """ Create table place_classtype for each given pair. Also create indexes on place_id and centroid. @@ -188,7 +209,8 @@ class SPImporter(): db_cursor.execute("DROP INDEX idx_placex_classtype") - def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type): + def _create_place_classtype_table(self, sql_tablespace: str, + phrase_class: str, phrase_type: str) -> None: """ Create table place_classtype of the given phrase_class/phrase_type if doesn't exit. @@ -204,7 +226,8 @@ class SPImporter(): (phrase_class, phrase_type)) - def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type): + def _create_place_classtype_indexes(self, sql_tablespace: str, + phrase_class: str, phrase_type: str) -> None: """ Create indexes on centroid and place_id for the place_classtype table. """ @@ -227,7 +250,7 @@ class SPImporter(): SQL(sql_tablespace))) - def _grant_access_to_webuser(self, phrase_class, phrase_type): + def _grant_access_to_webuser(self, phrase_class: str, phrase_type: str) -> None: """ Grant access on read to the table place_classtype for the webuser. """ @@ -237,7 +260,7 @@ class SPImporter(): .format(Identifier(table_name), Identifier(self.config.DATABASE_WEBUSER))) - def _remove_non_existent_tables_from_db(self): + def _remove_non_existent_tables_from_db(self) -> None: """ Remove special phrases which doesn't exist on the wiki anymore. Delete the place_classtype tables. diff --git a/nominatim/tools/special_phrases/sp_wiki_loader.py b/nominatim/tools/special_phrases/sp_wiki_loader.py index ca4758ac..e71c2ec0 100644 --- a/nominatim/tools/special_phrases/sp_wiki_loader.py +++ b/nominatim/tools/special_phrases/sp_wiki_loader.py @@ -7,14 +7,17 @@ """ Module containing the SPWikiLoader class. """ +from typing import Iterable import re import logging + +from nominatim.config import Configuration from nominatim.tools.special_phrases.special_phrase import SpecialPhrase from nominatim.tools.exec_utils import get_url LOG = logging.getLogger() -def _get_wiki_content(lang): +def _get_wiki_content(lang: str) -> str: """ Request and return the wiki page's content corresponding to special phrases for a given lang. @@ -30,8 +33,7 @@ class SPWikiLoader: """ Handles loading of special phrases from the wiki. """ - def __init__(self, config): - super().__init__() + def __init__(self, config: Configuration) -> None: self.config = config # Compile the regex here to increase performances. 
self.occurence_pattern = re.compile( @@ -39,10 +41,15 @@ class SPWikiLoader: ) # Hack around a bug where building=yes was imported with quotes into the wiki self.type_fix_pattern = re.compile(r'\"|"') - self._load_languages() + + self.languages = self.config.get_str_list('LANGUAGES') or \ + ['af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es', + 'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu', + 'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl', + 'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi'] - def generate_phrases(self): + def generate_phrases(self) -> Iterable[SpecialPhrase]: """ Download the wiki pages for the configured languages and extract the phrases from the page. """ @@ -58,19 +65,3 @@ class SPWikiLoader: match[1], self.type_fix_pattern.sub('', match[2]), match[3]) - - - def _load_languages(self): - """ - Get list of all languages from env config file - or default if there is no languages configured. - The system will extract special phrases only from all specified languages. - """ - if self.config.LANGUAGES: - self.languages = self.config.get_str_list('LANGUAGES') - else: - self.languages = [ - 'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es', - 'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu', - 'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl', - 'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi'] diff --git a/nominatim/tools/special_phrases/special_phrase.py b/nominatim/tools/special_phrases/special_phrase.py index 16935ccf..40f6a9e4 100644 --- a/nominatim/tools/special_phrases/special_phrase.py +++ b/nominatim/tools/special_phrases/special_phrase.py @@ -10,20 +10,21 @@ This class is a model used to transfer a special phrase through the process of load and importation. """ +from typing import Any + class SpecialPhrase: """ Model representing a special phrase. """ - def __init__(self, p_label, p_class, p_type, p_operator): + def __init__(self, p_label: str, p_class: str, p_type: str, p_operator: str) -> None: self.p_label = p_label.strip() self.p_class = p_class.strip() - # Hack around a bug where building=yes was imported with quotes into the wiki self.p_type = p_type.strip() # Needed if some operator in the wiki are not written in english p_operator = p_operator.strip().lower() self.p_operator = '-' if p_operator not in ('near', 'in') else p_operator - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: if not isinstance(other, SpecialPhrase): return False @@ -32,5 +33,5 @@ class SpecialPhrase: and self.p_type == other.p_type \ and self.p_operator == other.p_operator - def __hash__(self): + def __hash__(self) -> int: return hash((self.p_label, self.p_class, self.p_type, self.p_operator)) diff --git a/test/python/tools/test_import_special_phrases.py b/test/python/tools/test_import_special_phrases.py index 0dcf549c..75a6a066 100644 --- a/test/python/tools/test_import_special_phrases.py +++ b/test/python/tools/test_import_special_phrases.py @@ -128,7 +128,7 @@ def test_create_place_classtype_table_and_indexes( """ pairs = set([('class1', 'type1'), ('class2', 'type2')]) - sp_importer._create_place_classtype_table_and_indexes(pairs) + sp_importer._create_classtype_table_and_indexes(pairs) for pair in pairs: assert check_table_exist(temp_db_conn, pair[0], pair[1]) -- 2.39.5
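
A minimal sketch of how the SpecialPhraseLoader protocol added in sp_importer.py above can be satisfied. Because the protocol is structural, any object exposing a matching generate_phrases() method is accepted by SPImporter without inheriting from a base class. The class name StaticPhraseLoader and the usage lines are hypothetical; only the protocol method and the SpecialPhrase/SPImporter signatures come from the patch:

    from typing import Iterable

    from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
    from nominatim.tools.special_phrases.sp_importer import SPImporter


    class StaticPhraseLoader:
        """ Hypothetical loader that yields a fixed set of phrases,
            satisfying the SpecialPhraseLoader protocol structurally.
        """

        def __init__(self, phrases: Iterable[SpecialPhrase]) -> None:
            self._phrases = list(phrases)

        def generate_phrases(self) -> Iterable[SpecialPhrase]:
            yield from self._phrases


    # Usage sketch, assuming an existing Configuration, database Connection
    # and tokenizer obtained elsewhere:
    # loader = StaticPhraseLoader([SpecialPhrase('Bakery', 'shop', 'bakery', '-')])
    # SPImporter(config, conn, loader).import_phrases(tokenizer, should_replace=True)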