"""
    Module containing the class handling the import
    of the special phrases.

    Phrases are analyzed and imported into the database.

    The phrases already present in the database which are not
    valid anymore are removed.
"""
import logging
import re

from psycopg2.sql import Identifier, Literal, SQL

from nominatim.tools.special_phrases.importer_statistics import SpecialPhrasesImporterStatistics
# Module-wide logger; getLogger() without a name returns the root logger.
LOG = logging.getLogger()
def _classtype_table(phrase_class, phrase_type):
    """ Return the name of the table for the given class and type.

        The name is 'place_classtype_<phrase_class>_<phrase_type>'.
    """
    return f'place_classtype_{phrase_class}_{phrase_type}'
# pylint: disable-msg=too-many-instance-attributes
"""
    Class handling the process of importing special phrases into the database.

    Takes a special-phrase loader which loads the phrases from an external source.
"""
def __init__(self, config, db_connection, sp_loader) -> None:
    """
        Set up the importer.

        config must provide load_sub_configuration() as well as the
        TABLESPACE_AUX_DATA and DATABASE_WEBUSER settings, db_connection
        is the database connection used for all table/index operations
        and sp_loader is an iterable yielding collections of phrases.
    """
    # Must be assigned before the black/white lists are loaded below,
    # because _load_white_and_black_lists() reads self.config.
    self.config = config
    self.db_connection = db_connection
    self.sp_loader = sp_loader
    self.statistics_handler = SpecialPhrasesImporterStatistics()
    self.black_list, self.white_list = self._load_white_and_black_lists()
    self.sanity_check_pattern = re.compile(r'^\w+$')
    # This set will contain all existing phrases to be added.
    # It contains tuples with the following format: (label, class, type, operator)
    self.word_phrases = set()
    # This set will contain all existing place_classtype tables which don't match any
    # special phrases class/type on the wiki.
    self.table_phrases_to_delete = set()
def import_phrases(self, tokenizer, should_replace):
    """
        Iterate through all SpecialPhrases extracted from the
        loader and import them into the database.

        If should_replace is set to True only the loaded phrases
        will be kept into the database. All other phrases already
        in the database will be removed.

        tokenizer must provide name_analyzer(), a context manager whose
        result offers update_special_phrases().
    """
    LOG.warning('Special phrases importation starting')
    self._fetch_existing_place_classtype_tables()

    # Store pairs of class/type for further processing
    class_type_pairs = set()

    for loaded_phrases in self.sp_loader:
        for phrase in loaded_phrases:
            result = self._process_phrase(phrase)
            # Rejected phrases yield no pair and must not create a table.
            if result:
                class_type_pairs.add(result)

    self._create_place_classtype_table_and_indexes(class_type_pairs)
    # Tables of phrases which are no longer loaded are only dropped
    # when the caller asked for a full replacement.
    if should_replace:
        self._remove_non_existent_tables_from_db()
    self.db_connection.commit()

    with tokenizer.name_analyzer() as analyzer:
        analyzer.update_special_phrases(self.word_phrases, should_replace)

    LOG.warning('Import done.')
    self.statistics_handler.notify_import_done()
def _fetch_existing_place_classtype_tables(self):
    """
        Fetch existing place_classtype tables.
        Fill the table_phrases_to_delete set of the class.

        Every table found is a deletion candidate at first; tables which
        still match a loaded class/type are taken out of the set again
        when the tables are created.
    """
    query = """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema='public'
        AND table_name like 'place_classtype_%';
    """
    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute(SQL(query))
        for row in db_cursor:
            # First (and only) selected column is the table name.
            self.table_phrases_to_delete.add(row[0])
def _load_white_and_black_lists(self):
    """
        Load white and black lists from phrase-settings.json.

        Return a tuple (black list, white list) taken from the
        'blackList' and 'whiteList' entries of the sub-configuration.
    """
    settings = self.config.load_sub_configuration('phrase-settings.json')

    return settings['blackList'], settings['whiteList']
def _check_sanity(self, phrase):
    """
        Check sanity of given inputs in case somebody added garbage in the wiki.

        Return True when both the class and the type of the phrase match
        the sanity pattern. Otherwise a warning is logged and False is
        returned, so that the caller can skip the phrase.
    """
    pattern = self.sanity_check_pattern

    if pattern.match(phrase.p_class) and pattern.match(phrase.p_type):
        return True

    LOG.warning("Bad class/type: %s=%s. It will not be imported",
                phrase.p_class, phrase.p_type)
    return False
def _process_phrase(self, phrase):
    """
        Processes the given phrase by checking black and white list
        and sanity.
        Return the class/type pair corresponding to the phrase, or
        None when the phrase is rejected.

        Accepted phrases are also recorded in word_phrases as
        (label, class, type, operator) tuples.
    """
    # blacklisting: disallow certain class/type combinations
    if phrase.p_class in self.black_list \
       and phrase.p_type in self.black_list[phrase.p_class]:
        return None

    # whitelisting: if class is in whitelist, allow only tags in the list
    if phrase.p_class in self.white_list \
       and phrase.p_type not in self.white_list[phrase.p_class]:
        return None

    # sanity check, in case somebody added garbage in the wiki
    if not self._check_sanity(phrase):
        self.statistics_handler.notify_one_phrase_invalid()
        return None

    self.word_phrases.add((phrase.p_label, phrase.p_class,
                           phrase.p_type, phrase.p_operator))

    return (phrase.p_class, phrase.p_type)
def _create_place_classtype_table_and_indexes(self, class_type_pairs):
    """
        Create table place_classtype for each given pair.
        Also create indexes on place_id and centroid.

        class_type_pairs is an iterable of (class, type) tuples. Tables
        which already exist are kept and removed from the set of tables
        to delete.
    """
    LOG.warning('Create tables and indexes...')

    sql_tablespace = self.config.TABLESPACE_AUX_DATA
    if sql_tablespace:
        sql_tablespace = ' TABLESPACE ' + sql_tablespace

    # Temporary index on placex for the duration of the table creation;
    # it is dropped again at the end of this method.
    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")

    for phrase_class, phrase_type in class_type_pairs:
        table_name = _classtype_table(phrase_class, phrase_type)

        if table_name in self.table_phrases_to_delete:
            self.statistics_handler.notify_one_table_ignored()
            # Remove this table from the ones to delete as it matches a
            # class/type still existing on the special phrases of the wiki.
            self.table_phrases_to_delete.remove(table_name)
            # So don't need to create the table and indexes.
            continue

        # Table creation
        self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)

        # Indexes creation
        self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)

        # Grant access on read to the web user.
        self._grant_access_to_webuser(phrase_class, phrase_type)

        self.statistics_handler.notify_one_table_created()

    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute("DROP INDEX idx_placex_classtype")
def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type):
    """
        Create table place_classtype of the given phrase_class/phrase_type
        if it doesn't exist.

        sql_tablespace is either an empty value or a ready-made
        ' TABLESPACE <name>' clause which is inserted verbatim.
    """
    table_name = _classtype_table(phrase_class, phrase_type)
    # The tablespace clause is composed as raw SQL, table name and
    # class/type values are quoted by psycopg2.
    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute(SQL("""
                CREATE TABLE IF NOT EXISTS {} {}
                AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex
                WHERE class = {} AND type = {}""")
                          .format(Identifier(table_name), SQL(sql_tablespace or ''),
                                  Literal(phrase_class), Literal(phrase_type)))
def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type):
    """
        Create indexes on centroid and place_id for the place_classtype table.

        An index is only created when the connection reports that it
        does not exist yet. sql_tablespace is either an empty value or a
        ready-made ' TABLESPACE <name>' clause which is inserted verbatim.
    """
    index_prefix = f'idx_place_classtype_{phrase_class}_{phrase_type}_'
    base_table = _classtype_table(phrase_class, phrase_type)
    tablespace = SQL(sql_tablespace or '')

    # Index on centroid
    if not self.db_connection.index_exists(index_prefix + 'centroid'):
        with self.db_connection.cursor() as db_cursor:
            # No query parameters are passed: the statement has no placeholders.
            db_cursor.execute(SQL("CREATE INDEX {} ON {} USING GIST (centroid) {}")
                              .format(Identifier(index_prefix + 'centroid'),
                                      Identifier(base_table), tablespace))

    # Index on place_id
    if not self.db_connection.index_exists(index_prefix + 'place_id'):
        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute(SQL("CREATE INDEX {} ON {} USING btree(place_id) {}")
                              .format(Identifier(index_prefix + 'place_id'),
                                      Identifier(base_table), tablespace))
def _grant_access_to_webuser(self, phrase_class, phrase_type):
    """
        Grant access on read to the table place_classtype for the webuser.

        The user name is taken from the DATABASE_WEBUSER setting.
    """
    table_name = _classtype_table(phrase_class, phrase_type)
    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
                          .format(Identifier(table_name),
                                  Identifier(self.config.DATABASE_WEBUSER)))
def _remove_non_existent_tables_from_db(self):
    """
        Remove special phrases which don't exist on the wiki anymore.
        Delete the place_classtype tables.

        Drops every table still listed in table_phrases_to_delete.
    """
    LOG.warning('Cleaning database...')

    # Delete place_classtype tables corresponding to class/type which
    # are not on the wiki anymore.
    with self.db_connection.cursor() as db_cursor:
        for table in self.table_phrases_to_delete:
            self.statistics_handler.notify_one_table_deleted()
            db_cursor.drop_table(table)