X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/ff1c78fef5d64b1b0b7eba96332caffb46bc3e21..22da6c541d432bfff74f0c64cd806bd1dd446f7e:/data-sources/wikipedia-wikidata/import_wikidata.sh diff --git a/data-sources/wikipedia-wikidata/import_wikidata.sh b/data-sources/wikipedia-wikidata/import_wikidata.sh index 46546163..6939214f 100755 --- a/data-sources/wikipedia-wikidata/import_wikidata.sh +++ b/data-sources/wikipedia-wikidata/import_wikidata.sh @@ -1,37 +1,66 @@ #!/bin/bash psqlcmd() { - psql wikiprocessingdb + psql --quiet wikiprocessingdb } mysql2pgsqlcmd() { ./mysql2pgsql.perl /dev/stdin /dev/stdout } +download() { + echo "Downloading $1" + wget --quiet --no-clobber --tries 3 "$1" +} + +# languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias) +# requires Bash 4.0 +readarray -t LANGUAGES < languages.txt + + -# list the languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias) +echo "=====================================================================" +echo "Download wikidata dump tables" +echo "=====================================================================" -language=( "ar" "bg" "ca" "cs" "da" "de" "en" "es" "eo" "eu" "fa" "fr" "ko" "hi" "hr" "id" "it" "he" "lt" "hu" "ms" "nl" "ja" "no" "pl" "pt" "kk" "ro" "ru" "sk" "sl" "sr" "fi" "sv" "tr" "uk" "vi" "vo" "war" "zh" ) +# 114M wikidatawiki-latest-geo_tags.sql.gz +# 1.7G wikidatawiki-latest-page.sql.gz +# 1.2G wikidatawiki-latest-wb_items_per_site.sql.gz +download https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-geo_tags.sql.gz +download https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-page.sql.gz +download https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-wb_items_per_site.sql.gz -# get a few wikidata dump tables -wget https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-geo_tags.sql.gz -wget https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-page.sql.gz -wget https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-wb_items_per_site.sql.gz +echo "=====================================================================" +echo "Import wikidata dump tables" +echo "=====================================================================" -# import wikidata tables +echo "Importing wikidatawiki-latest-geo_tags" +gzip -dc wikidatawiki-latest-geo_tags.sql.gz | mysql2pgsqlcmd | psqlcmd -gzip -dc wikidatawiki-latest-geo_tags.sql.gz | mysql2pgsqlcmd | psqlcmd -gzip -dc wikidatawiki-latest-page.sql.gz | mysql2pgsqlcmd | psqlcmd +echo "Importing wikidatawiki-latest-page" +gzip -dc wikidatawiki-latest-page.sql.gz | mysql2pgsqlcmd | psqlcmd + +echo "Importing wikidatawiki-latest-wb_items_per_site" gzip -dc wikidatawiki-latest-wb_items_per_site.sql.gz | mysql2pgsqlcmd | psqlcmd -# get wikidata places from wikidata query API + + + + +echo "=====================================================================" +echo "Get wikidata places from wikidata query API" +echo "=====================================================================" + +echo "Number of place types:" +wc -l wikidata_place_types.txt while read F ; do - wget "https://query.wikidata.org/bigdata/namespace/wdq/sparql?format=json&query=SELECT ?item WHERE{?item wdt:P31*/wdt:P279*wd:$F;}" -O $F.json + echo "Querying for place type $F..." + wget --quiet "https://query.wikidata.org/bigdata/namespace/wdq/sparql?format=json&query=SELECT ?item WHERE{?item wdt:P31*/wdt:P279*wd:$F;}" -O $F.json jq -r '.results | .[] | .[] | [.item.value] | @csv' $F.json >> $F.txt awk -v qid=$F '{print $0 ","qid}' $F.txt | sed -e 's!"http://www.wikidata.org/entity/!!' | sed 's/"//g' >> $F.csv cat $F.csv >> wikidata_place_dump.csv @@ -39,57 +68,207 @@ while read F ; do done < wikidata_place_types.txt -# import wikidata places - -echo "CREATE TABLE wikidata_place_dump (item text, instance_of text);" | psqlcmd -echo "COPY wikidata_place_dump (item, instance_of) FROM '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_dump.csv' DELIMITER ',' CSV;" | psqlcmd - -echo "CREATE TABLE wikidata_place_type_levels (place_type text, level integer);" | psqlcmd -echo "COPY wikidata_place_type_levels (place_type, level) FROM '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_type_levels.csv' DELIMITER ',' CSV HEADER;" | psqlcmd -# create derived tables +echo "=====================================================================" +echo "Import wikidata places" +echo "=====================================================================" + +echo "CREATE TABLE wikidata_place_dump ( + item text, + instance_of text + );" | psqlcmd + +echo "COPY wikidata_place_dump (item, instance_of) + FROM '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_dump.csv' + DELIMITER ',' + CSV + ;" | psqlcmd + +echo "CREATE TABLE wikidata_place_type_levels ( + place_type text, + level integer + );" | psqlcmd + +echo "COPY wikidata_place_type_levels (place_type, level) + FROM '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_type_levels.csv' + DELIMITER ',' + CSV + HEADER + ;" | psqlcmd + + + + +echo "=====================================================================" +echo "Create derived tables" +echo "=====================================================================" + +echo "CREATE TABLE geo_earth_primary AS + SELECT gt_page_id, + gt_lat, + gt_lon + FROM geo_tags + WHERE gt_globe = 'earth' + AND gt_primary = 1 + AND NOT( gt_lat < -90 + OR gt_lat > 90 + OR gt_lon < -180 + OR gt_lon > 180 + OR gt_lat=0 + OR gt_lon=0) + ;" | psqlcmd + +echo "CREATE TABLE geo_earth_wikidata AS + SELECT DISTINCT geo_earth_primary.gt_page_id, + geo_earth_primary.gt_lat, + geo_earth_primary.gt_lon, + page.page_title, + page.page_namespace + FROM geo_earth_primary + LEFT OUTER JOIN page + ON (geo_earth_primary.gt_page_id = page.page_id) + ORDER BY geo_earth_primary.gt_page_id + ;" | psqlcmd + +echo "ALTER TABLE wikidata_place_dump + ADD COLUMN ont_level integer, + ADD COLUMN lat numeric(11,8), + ADD COLUMN lon numeric(11,8) + ;" | psqlcmd + +echo "UPDATE wikidata_place_dump + SET ont_level = wikidata_place_type_levels.level + FROM wikidata_place_type_levels + WHERE wikidata_place_dump.instance_of = wikidata_place_type_levels.place_type + ;" | psqlcmd + +echo "CREATE TABLE wikidata_places + AS + SELECT DISTINCT ON (item) item, + instance_of, + MAX(ont_level) AS ont_level, + lat, + lon + FROM wikidata_place_dump + GROUP BY item, + instance_of, + ont_level, + lat, + lon + ORDER BY item + ;" | psqlcmd + +echo "UPDATE wikidata_places + SET lat = geo_earth_wikidata.gt_lat, + lon = geo_earth_wikidata.gt_lon + FROM geo_earth_wikidata + WHERE wikidata_places.item = geo_earth_wikidata.page_title + ;" | psqlcmd + + + + +echo "=====================================================================" +echo "Process language pages" +echo "=====================================================================" + + +echo "CREATE TABLE wikidata_pages ( + item text, + instance_of text, + lat numeric(11,8), + lon numeric(11,8), + ips_site_page text, + language text + );" | psqlcmd + +for i in "${LANGUAGES[@]}" +do + echo "CREATE TABLE wikidata_${i}_pages AS + SELECT wikidata_places.item, + wikidata_places.instance_of, + wikidata_places.lat, + wikidata_places.lon, + wb_items_per_site.ips_site_page + FROM wikidata_places + LEFT JOIN wb_items_per_site + ON (CAST (( LTRIM(wikidata_places.item, 'Q')) AS INTEGER) = wb_items_per_site.ips_item_id) + WHERE ips_site_id = '${i}wiki' + AND LEFT(wikidata_places.item,1) = 'Q' + ORDER BY wikidata_places.item + ;" | psqlcmd + + echo "ALTER TABLE wikidata_${i}_pages + ADD COLUMN language text + ;" | psqlcmd + + echo "UPDATE wikidata_${i}_pages + SET language = '${i}' + ;" | psqlcmd + + echo "INSERT INTO wikidata_pages + SELECT item, + instance_of, + lat, + lon, + ips_site_page, + language + FROM wikidata_${i}_pages + ;" | psqlcmd +done -echo "CREATE TABLE geo_earth_primary AS SELECT gt_page_id, gt_lat, gt_lon FROM geo_tags WHERE gt_globe = 'earth' AND gt_primary = 1 AND NOT( gt_lat < -90 OR gt_lat > 90 OR gt_lon < -180 OR gt_lon > 180 OR gt_lat=0 OR gt_lon=0) ;" | psqlcmd -echo "CREATE TABLE geo_earth_wikidata AS SELECT DISTINCT geo_earth_primary.gt_page_id, geo_earth_primary.gt_lat, geo_earth_primary.gt_lon, page.page_title, page.page_namespace FROM geo_earth_primary LEFT OUTER JOIN page ON (geo_earth_primary.gt_page_id = page.page_id) ORDER BY geo_earth_primary.gt_page_id;" | psqlcmd +echo "ALTER TABLE wikidata_pages + ADD COLUMN wp_page_title text + ;" | psqlcmd +echo "UPDATE wikidata_pages + SET wp_page_title = REPLACE(ips_site_page, ' ', '_') + ;" | psqlcmd +echo "ALTER TABLE wikidata_pages + DROP COLUMN ips_site_page + ;" | psqlcmd -echo "ALTER TABLE wikidata_place_dump ADD COLUMN ont_level integer, ADD COLUMN lat numeric(11,8), ADD COLUMN lon numeric(11,8);" | psqlcmd -echo "UPDATE wikidata_place_dump SET ont_level = wikidata_place_type_levels.level FROM wikidata_place_type_levels WHERE wikidata_place_dump.instance_of = wikidata_place_type_levels.place_type;" | psqlcmd -echo "CREATE TABLE wikidata_places AS SELECT DISTINCT ON (item) item, instance_of, MAX(ont_level) AS ont_level, lat, lon FROM wikidata_place_dump GROUP BY item, instance_of, ont_level, lat, lon ORDER BY item;" | psqlcmd -echo "UPDATE wikidata_places SET lat = geo_earth_wikidata.gt_lat, lon = geo_earth_wikidata.gt_lon FROM geo_earth_wikidata WHERE wikidata_places.item = geo_earth_wikidata.page_title" | psqlcmd -# process language pages +echo "=====================================================================" +echo "Add wikidata to wikipedia_article table" +echo "=====================================================================" -echo "CREATE TABLE wikidata_pages (item text, instance_of text, lat numeric(11,8), lon numeric(11,8), ips_site_page text, language text );" | psqlcmd +echo "UPDATE wikipedia_article + SET lat = wikidata_pages.lat, + lon = wikidata_pages.lon, + wd_page_title = wikidata_pages.item, + instance_of = wikidata_pages.instance_of + FROM wikidata_pages + WHERE wikipedia_article.language = wikidata_pages.language + AND wikipedia_article.title = wikidata_pages.wp_page_title + ;" | psqlcmd -for i in "${language[@]}" -do - echo "CREATE TABLE wikidata_${i}_pages as select wikidata_places.item, wikidata_places.instance_of, wikidata_places.lat, wikidata_places.lon, wb_items_per_site.ips_site_page FROM wikidata_places LEFT JOIN wb_items_per_site ON (CAST (( LTRIM(wikidata_places.item, 'Q')) AS INTEGER) = wb_items_per_site.ips_item_id) WHERE ips_site_id = '${i}wiki' AND LEFT(wikidata_places.item,1) = 'Q' order by wikidata_places.item;" | psqlcmd - echo "ALTER TABLE wikidata_${i}_pages ADD COLUMN language text;" | psqlcmd - echo "UPDATE wikidata_${i}_pages SET language = '${i}';" | psqlcmd - echo "INSERT INTO wikidata_pages SELECT item, instance_of, lat, lon, ips_site_page, language FROM wikidata_${i}_pages;" | psqlcmd -done +echo "CREATE TABLE wikipedia_article_slim + AS + SELECT * FROM wikipedia_article + WHERE wikidata_id IS NOT NULL + ;" | psqlcmd -echo "ALTER TABLE wikidata_pages ADD COLUMN wp_page_title text;" | psqlcmd -echo "UPDATE wikidata_pages SET wp_page_title = REPLACE(ips_site_page, ' ', '_');" | psqlcmd -echo "ALTER TABLE wikidata_pages DROP COLUMN ips_site_page;" | psqlcmd +echo "ALTER TABLE wikipedia_article + RENAME TO wikipedia_article_full + ;" | psqlcmd +echo "ALTER TABLE wikipedia_article_slim + RENAME TO wikipedia_article + ;" | psqlcmd -# add wikidata to wikipedia_article table -echo "UPDATE wikipedia_article SET lat = wikidata_pages.lat, lon = wikidata_pages.lon, wd_page_title = wikidata_pages.item, instance_of = wikidata_pages.instance_of FROM wikidata_pages WHERE wikipedia_article.language = wikidata_pages.language AND wikipedia_article.title = wikidata_pages.wp_page_title;" | psqlcmd -echo "CREATE TABLE wikipedia_article_slim AS SELECT * FROM wikipedia_article WHERE wikidata_id IS NOT NULL;" | psqlcmd -echo "ALTER TABLE wikipedia_article RENAME TO wikipedia_article_full;" | psqlcmd -echo "ALTER TABLE wikipedia_article_slim RENAME TO wikipedia_article;" | psqlcmd -# clean up intermediate tables +echo "=====================================================================" +echo "Dropping intermediate tables" +echo "=====================================================================" echo "DROP TABLE wikidata_place_dump;" | psqlcmd echo "DROP TABLE geo_earth_primary;" | psqlcmd -for i in "${language[@]}" +for i in "${LANGUAGES[@]}" do echo "DROP TABLE wikidata_${i}_pages;" | psqlcmd done