4 psql --quiet wikiprocessingdb
8 ./mysql2pgsql.perl /dev/stdin /dev/stdout
13 wget --quiet --no-clobber --tries 3
16 # languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias)
18 readarray -t LANGUAGES < languages.txt
22 echo "====================================================================="
23 echo "Download wikidata dump tables"
24 echo "====================================================================="
26 # 114M wikidatawiki-latest-geo_tags.sql.gz
27 # 1.7G wikidatawiki-latest-page.sql.gz
28 # 1.2G wikidatawiki-latest-wb_items_per_site.sql.gz
29 download https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-geo_tags.sql.gz
30 download https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-page.sql.gz
31 download https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-wb_items_per_site.sql.gz
36 echo "====================================================================="
37 echo "Import wikidata dump tables"
38 echo "====================================================================="
40 echo "Importing wikidatawiki-latest-geo_tags"
41 gzip -dc wikidatawiki-latest-geo_tags.sql.gz | mysql2pgsqlcmd | psqlcmd
43 echo "Importing wikidatawiki-latest-page"
44 gzip -dc wikidatawiki-latest-page.sql.gz | mysql2pgsqlcmd | psqlcmd
46 echo "Importing wikidatawiki-latest-wb_items_per_site"
47 gzip -dc wikidatawiki-latest-wb_items_per_site.sql.gz | mysql2pgsqlcmd | psqlcmd
54 echo "====================================================================="
55 echo "Get wikidata places from wikidata query API"
56 echo "====================================================================="
58 echo "Number of place types:"
59 wc -l wikidata_place_types.txt
62 echo "Querying for place type $F..."
63 wget --quiet "https://query.wikidata.org/bigdata/namespace/wdq/sparql?format=json&query=SELECT ?item WHERE{?item wdt:P31*/wdt:P279*wd:$F;}" -O $F.json
64 jq -r '.results | .[] | .[] | [.item.value] | @csv' $F.json >> $F.txt
65 awk -v qid=$F '{print $0 ","qid}' $F.txt | sed -e 's!"http://www.wikidata.org/entity/!!' | sed 's/"//g' >> $F.csv
66 cat $F.csv >> wikidata_place_dump.csv
67 rm $F.json $F.txt $F.csv
68 done < wikidata_place_types.txt
73 echo "====================================================================="
74 echo "Import wikidata places"
75 echo "====================================================================="
77 echo "CREATE TABLE wikidata_place_dump (
82 echo "COPY wikidata_place_dump (item, instance_of)
83 FROM '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_dump.csv'
88 echo "CREATE TABLE wikidata_place_type_levels (
93 echo "COPY wikidata_place_type_levels (place_type, level)
94 FROM '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_type_levels.csv'
103 echo "====================================================================="
104 echo "Create derived tables"
105 echo "====================================================================="
107 echo "CREATE TABLE geo_earth_primary AS
112 WHERE gt_globe = 'earth'
114 AND NOT( gt_lat < -90
122 echo "CREATE TABLE geo_earth_wikidata AS
123 SELECT DISTINCT geo_earth_primary.gt_page_id,
124 geo_earth_primary.gt_lat,
125 geo_earth_primary.gt_lon,
128 FROM geo_earth_primary
130 ON (geo_earth_primary.gt_page_id = page.page_id)
131 ORDER BY geo_earth_primary.gt_page_id
134 echo "ALTER TABLE wikidata_place_dump
135 ADD COLUMN ont_level integer,
136 ADD COLUMN lat numeric(11,8),
137 ADD COLUMN lon numeric(11,8)
140 echo "UPDATE wikidata_place_dump
141 SET ont_level = wikidata_place_type_levels.level
142 FROM wikidata_place_type_levels
143 WHERE wikidata_place_dump.instance_of = wikidata_place_type_levels.place_type
146 echo "CREATE TABLE wikidata_places
148 SELECT DISTINCT ON (item) item,
150 MAX(ont_level) AS ont_level,
153 FROM wikidata_place_dump
162 echo "UPDATE wikidata_places
163 SET lat = geo_earth_wikidata.gt_lat,
164 lon = geo_earth_wikidata.gt_lon
165 FROM geo_earth_wikidata
166 WHERE wikidata_places.item = geo_earth_wikidata.page_title
172 echo "====================================================================="
173 echo "Process language pages"
174 echo "====================================================================="
177 echo "CREATE TABLE wikidata_pages (
186 for i in "${LANGUAGES[@]}"
188 echo "CREATE TABLE wikidata_${i}_pages AS
189 SELECT wikidata_places.item,
190 wikidata_places.instance_of,
193 wb_items_per_site.ips_site_page
195 LEFT JOIN wb_items_per_site
196 ON (CAST (( LTRIM(wikidata_places.item, 'Q')) AS INTEGER) = wb_items_per_site.ips_item_id)
197 WHERE ips_site_id = '${i}wiki'
198 AND LEFT(wikidata_places.item,1) = 'Q'
199 ORDER BY wikidata_places.item
202 echo "ALTER TABLE wikidata_${i}_pages
203 ADD COLUMN language text
206 echo "UPDATE wikidata_${i}_pages
207 SET language = '${i}'
210 echo "INSERT INTO wikidata_pages
217 FROM wikidata_${i}_pages
221 echo "ALTER TABLE wikidata_pages
222 ADD COLUMN wp_page_title text
224 echo "UPDATE wikidata_pages
225 SET wp_page_title = REPLACE(ips_site_page, ' ', '_')
227 echo "ALTER TABLE wikidata_pages
228 DROP COLUMN ips_site_page
234 echo "====================================================================="
235 echo "Add wikidata to wikipedia_article table"
236 echo "====================================================================="
238 echo "UPDATE wikipedia_article
239 SET lat = wikidata_pages.lat,
240 lon = wikidata_pages.lon,
241 wd_page_title = wikidata_pages.item,
242 instance_of = wikidata_pages.instance_of
244 WHERE wikipedia_article.language = wikidata_pages.language
245 AND wikipedia_article.title = wikidata_pages.wp_page_title
248 echo "CREATE TABLE wikipedia_article_slim
250 SELECT * FROM wikipedia_article
251 WHERE wikidata_id IS NOT NULL
254 echo "ALTER TABLE wikipedia_article
255 RENAME TO wikipedia_article_full
258 echo "ALTER TABLE wikipedia_article_slim
259 RENAME TO wikipedia_article
265 echo "====================================================================="
266 echo "Dropping intermediate tables"
267 echo "====================================================================="
269 echo "DROP TABLE wikidata_place_dump;" | psqlcmd
270 echo "DROP TABLE geo_earth_primary;" | psqlcmd
271 for i in "${LANGUAGES[@]}"
273 echo "DROP TABLE wikidata_${i}_pages;" | psqlcmd