From: marc tobias Date: Wed, 18 Dec 2019 18:18:39 +0000 (+0100) Subject: refactor import_wiki* script for readability X-Git-Tag: v3.5.0~112^2 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/9587fc99098969ae8cb1ad7fb159db1b06f621ff?ds=sidebyside refactor import_wiki* script for readability --- diff --git a/data-sources/wikipedia-wikidata/import_wikidata.sh b/data-sources/wikipedia-wikidata/import_wikidata.sh index 46546163..a89dd319 100755 --- a/data-sources/wikipedia-wikidata/import_wikidata.sh +++ b/data-sources/wikipedia-wikidata/import_wikidata.sh @@ -1,37 +1,66 @@ #!/bin/bash psqlcmd() { - psql wikiprocessingdb + psql --quiet wikiprocessingdb } mysql2pgsqlcmd() { ./mysql2pgsql.perl /dev/stdin /dev/stdout } +download() { + echo "Downloading $1" + wget --quiet --no-clobber --tries 3 +} + +# languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias) +# requires Bash 4.0 +readarray -t LANGUAGES < languages.txt + + -# list the languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias) +echo "=====================================================================" +echo "Download wikidata dump tables" +echo "=====================================================================" -language=( "ar" "bg" "ca" "cs" "da" "de" "en" "es" "eo" "eu" "fa" "fr" "ko" "hi" "hr" "id" "it" "he" "lt" "hu" "ms" "nl" "ja" "no" "pl" "pt" "kk" "ro" "ru" "sk" "sl" "sr" "fi" "sv" "tr" "uk" "vi" "vo" "war" "zh" ) +# 114M wikidatawiki-latest-geo_tags.sql.gz +# 1.7G wikidatawiki-latest-page.sql.gz +# 1.2G wikidatawiki-latest-wb_items_per_site.sql.gz +download https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-geo_tags.sql.gz +download https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-page.sql.gz +download https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-wb_items_per_site.sql.gz -# get a few wikidata dump tables -wget https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-geo_tags.sql.gz -wget https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-page.sql.gz -wget https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-wb_items_per_site.sql.gz +echo "=====================================================================" +echo "Import wikidata dump tables" +echo "=====================================================================" -# import wikidata tables +echo "Importing wikidatawiki-latest-geo_tags" +gzip -dc wikidatawiki-latest-geo_tags.sql.gz | mysql2pgsqlcmd | psqlcmd -gzip -dc wikidatawiki-latest-geo_tags.sql.gz | mysql2pgsqlcmd | psqlcmd -gzip -dc wikidatawiki-latest-page.sql.gz | mysql2pgsqlcmd | psqlcmd +echo "Importing wikidatawiki-latest-page" +gzip -dc wikidatawiki-latest-page.sql.gz | mysql2pgsqlcmd | psqlcmd + +echo "Importing wikidatawiki-latest-wb_items_per_site" gzip -dc wikidatawiki-latest-wb_items_per_site.sql.gz | mysql2pgsqlcmd | psqlcmd -# get wikidata places from wikidata query API + + + + +echo "=====================================================================" +echo "Get wikidata places from wikidata query API" +echo "=====================================================================" + +echo "Number of place types:" +wc -l wikidata_place_types.txt while read F ; do - wget "https://query.wikidata.org/bigdata/namespace/wdq/sparql?format=json&query=SELECT ?item WHERE{?item wdt:P31*/wdt:P279*wd:$F;}" -O $F.json + echo "Querying for place type $F..." + wget --quiet "https://query.wikidata.org/bigdata/namespace/wdq/sparql?format=json&query=SELECT ?item WHERE{?item wdt:P31*/wdt:P279*wd:$F;}" -O $F.json jq -r '.results | .[] | .[] | [.item.value] | @csv' $F.json >> $F.txt awk -v qid=$F '{print $0 ","qid}' $F.txt | sed -e 's!"http://www.wikidata.org/entity/!!' | sed 's/"//g' >> $F.csv cat $F.csv >> wikidata_place_dump.csv @@ -39,57 +68,207 @@ while read F ; do done < wikidata_place_types.txt -# import wikidata places - -echo "CREATE TABLE wikidata_place_dump (item text, instance_of text);" | psqlcmd -echo "COPY wikidata_place_dump (item, instance_of) FROM '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_dump.csv' DELIMITER ',' CSV;" | psqlcmd - -echo "CREATE TABLE wikidata_place_type_levels (place_type text, level integer);" | psqlcmd -echo "COPY wikidata_place_type_levels (place_type, level) FROM '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_type_levels.csv' DELIMITER ',' CSV HEADER;" | psqlcmd -# create derived tables +echo "=====================================================================" +echo "Import wikidata places" +echo "=====================================================================" + +echo "CREATE TABLE wikidata_place_dump ( + item text, + instance_of text + );" | psqlcmd + +echo "COPY wikidata_place_dump (item, instance_of) + FROM '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_dump.csv' + DELIMITER ',' + CSV + ;" | psqlcmd + +echo "CREATE TABLE wikidata_place_type_levels ( + place_type text, + level integer + );" | psqlcmd + +echo "COPY wikidata_place_type_levels (place_type, level) + FROM '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_type_levels.csv' + DELIMITER ',' + CSV + HEADER + ;" | psqlcmd + + + + +echo "=====================================================================" +echo "Create derived tables" +echo "=====================================================================" + +echo "CREATE TABLE geo_earth_primary AS + SELECT gt_page_id, + gt_lat, + gt_lon + FROM geo_tags + WHERE gt_globe = 'earth' + AND gt_primary = 1 + AND NOT( gt_lat < -90 + OR gt_lat > 90 + OR gt_lon < -180 + OR gt_lon > 180 + OR gt_lat=0 + OR gt_lon=0) + ;" | psqlcmd + +echo "CREATE TABLE geo_earth_wikidata AS + SELECT DISTINCT geo_earth_primary.gt_page_id, + geo_earth_primary.gt_lat, + geo_earth_primary.gt_lon, + page.page_title, + page.page_namespace + FROM geo_earth_primary + LEFT OUTER JOIN page + ON (geo_earth_primary.gt_page_id = page.page_id) + ORDER BY geo_earth_primary.gt_page_id + ;" | psqlcmd + +echo "ALTER TABLE wikidata_place_dump + ADD COLUMN ont_level integer, + ADD COLUMN lat numeric(11,8), + ADD COLUMN lon numeric(11,8) + ;" | psqlcmd + +echo "UPDATE wikidata_place_dump + SET ont_level = wikidata_place_type_levels.level + FROM wikidata_place_type_levels + WHERE wikidata_place_dump.instance_of = wikidata_place_type_levels.place_type + ;" | psqlcmd + +echo "CREATE TABLE wikidata_places + AS + SELECT DISTINCT ON (item) item, + instance_of, + MAX(ont_level) AS ont_level, + lat, + lon + FROM wikidata_place_dump + GROUP BY item, + instance_of, + ont_level, + lat, + lon + ORDER BY item + ;" | psqlcmd + +echo "UPDATE wikidata_places + SET lat = geo_earth_wikidata.gt_lat, + lon = geo_earth_wikidata.gt_lon + FROM geo_earth_wikidata + WHERE wikidata_places.item = geo_earth_wikidata.page_title + ;" | psqlcmd + + + + +echo "=====================================================================" +echo "Process language pages" +echo "=====================================================================" + + +echo "CREATE TABLE wikidata_pages ( + item text, + instance_of text, + lat numeric(11,8), + lon numeric(11,8), + ips_site_page text, + language text + );" | psqlcmd + +for i in "${LANGUAGES[@]}" +do + echo "CREATE TABLE wikidata_${i}_pages AS + SELECT wikidata_places.item, + wikidata_places.instance_of, + wikidata_places.lat, + wikidata_places.lon, + wb_items_per_site.ips_site_page + FROM wikidata_places + LEFT JOIN wb_items_per_site + ON (CAST (( LTRIM(wikidata_places.item, 'Q')) AS INTEGER) = wb_items_per_site.ips_item_id) + WHERE ips_site_id = '${i}wiki' + AND LEFT(wikidata_places.item,1) = 'Q' + ORDER BY wikidata_places.item + ;" | psqlcmd + + echo "ALTER TABLE wikidata_${i}_pages + ADD COLUMN language text + ;" | psqlcmd + + echo "UPDATE wikidata_${i}_pages + SET language = '${i}' + ;" | psqlcmd + + echo "INSERT INTO wikidata_pages + SELECT item, + instance_of, + lat, + lon, + ips_site_page, + language + FROM wikidata_${i}_pages + ;" | psqlcmd +done -echo "CREATE TABLE geo_earth_primary AS SELECT gt_page_id, gt_lat, gt_lon FROM geo_tags WHERE gt_globe = 'earth' AND gt_primary = 1 AND NOT( gt_lat < -90 OR gt_lat > 90 OR gt_lon < -180 OR gt_lon > 180 OR gt_lat=0 OR gt_lon=0) ;" | psqlcmd -echo "CREATE TABLE geo_earth_wikidata AS SELECT DISTINCT geo_earth_primary.gt_page_id, geo_earth_primary.gt_lat, geo_earth_primary.gt_lon, page.page_title, page.page_namespace FROM geo_earth_primary LEFT OUTER JOIN page ON (geo_earth_primary.gt_page_id = page.page_id) ORDER BY geo_earth_primary.gt_page_id;" | psqlcmd +echo "ALTER TABLE wikidata_pages + ADD COLUMN wp_page_title text + ;" | psqlcmd +echo "UPDATE wikidata_pages + SET wp_page_title = REPLACE(ips_site_page, ' ', '_') + ;" | psqlcmd +echo "ALTER TABLE wikidata_pages + DROP COLUMN ips_site_page + ;" | psqlcmd -echo "ALTER TABLE wikidata_place_dump ADD COLUMN ont_level integer, ADD COLUMN lat numeric(11,8), ADD COLUMN lon numeric(11,8);" | psqlcmd -echo "UPDATE wikidata_place_dump SET ont_level = wikidata_place_type_levels.level FROM wikidata_place_type_levels WHERE wikidata_place_dump.instance_of = wikidata_place_type_levels.place_type;" | psqlcmd -echo "CREATE TABLE wikidata_places AS SELECT DISTINCT ON (item) item, instance_of, MAX(ont_level) AS ont_level, lat, lon FROM wikidata_place_dump GROUP BY item, instance_of, ont_level, lat, lon ORDER BY item;" | psqlcmd -echo "UPDATE wikidata_places SET lat = geo_earth_wikidata.gt_lat, lon = geo_earth_wikidata.gt_lon FROM geo_earth_wikidata WHERE wikidata_places.item = geo_earth_wikidata.page_title" | psqlcmd -# process language pages +echo "=====================================================================" +echo "Add wikidata to wikipedia_article table" +echo "=====================================================================" -echo "CREATE TABLE wikidata_pages (item text, instance_of text, lat numeric(11,8), lon numeric(11,8), ips_site_page text, language text );" | psqlcmd +echo "UPDATE wikipedia_article + SET lat = wikidata_pages.lat, + lon = wikidata_pages.lon, + wd_page_title = wikidata_pages.item, + instance_of = wikidata_pages.instance_of + FROM wikidata_pages + WHERE wikipedia_article.language = wikidata_pages.language + AND wikipedia_article.title = wikidata_pages.wp_page_title + ;" | psqlcmd -for i in "${language[@]}" -do - echo "CREATE TABLE wikidata_${i}_pages as select wikidata_places.item, wikidata_places.instance_of, wikidata_places.lat, wikidata_places.lon, wb_items_per_site.ips_site_page FROM wikidata_places LEFT JOIN wb_items_per_site ON (CAST (( LTRIM(wikidata_places.item, 'Q')) AS INTEGER) = wb_items_per_site.ips_item_id) WHERE ips_site_id = '${i}wiki' AND LEFT(wikidata_places.item,1) = 'Q' order by wikidata_places.item;" | psqlcmd - echo "ALTER TABLE wikidata_${i}_pages ADD COLUMN language text;" | psqlcmd - echo "UPDATE wikidata_${i}_pages SET language = '${i}';" | psqlcmd - echo "INSERT INTO wikidata_pages SELECT item, instance_of, lat, lon, ips_site_page, language FROM wikidata_${i}_pages;" | psqlcmd -done +echo "CREATE TABLE wikipedia_article_slim + AS + SELECT * FROM wikipedia_article + WHERE wikidata_id IS NOT NULL + ;" | psqlcmd -echo "ALTER TABLE wikidata_pages ADD COLUMN wp_page_title text;" | psqlcmd -echo "UPDATE wikidata_pages SET wp_page_title = REPLACE(ips_site_page, ' ', '_');" | psqlcmd -echo "ALTER TABLE wikidata_pages DROP COLUMN ips_site_page;" | psqlcmd +echo "ALTER TABLE wikipedia_article + RENAME TO wikipedia_article_full + ;" | psqlcmd +echo "ALTER TABLE wikipedia_article_slim + RENAME TO wikipedia_article + ;" | psqlcmd -# add wikidata to wikipedia_article table -echo "UPDATE wikipedia_article SET lat = wikidata_pages.lat, lon = wikidata_pages.lon, wd_page_title = wikidata_pages.item, instance_of = wikidata_pages.instance_of FROM wikidata_pages WHERE wikipedia_article.language = wikidata_pages.language AND wikipedia_article.title = wikidata_pages.wp_page_title;" | psqlcmd -echo "CREATE TABLE wikipedia_article_slim AS SELECT * FROM wikipedia_article WHERE wikidata_id IS NOT NULL;" | psqlcmd -echo "ALTER TABLE wikipedia_article RENAME TO wikipedia_article_full;" | psqlcmd -echo "ALTER TABLE wikipedia_article_slim RENAME TO wikipedia_article;" | psqlcmd -# clean up intermediate tables +echo "=====================================================================" +echo "Dropping intermediate tables" +echo "=====================================================================" echo "DROP TABLE wikidata_place_dump;" | psqlcmd echo "DROP TABLE geo_earth_primary;" | psqlcmd -for i in "${language[@]}" +for i in "${LANGUAGES[@]}" do echo "DROP TABLE wikidata_${i}_pages;" | psqlcmd done diff --git a/data-sources/wikipedia-wikidata/import_wikipedia.sh b/data-sources/wikipedia-wikidata/import_wikipedia.sh index 9cc9b1df..19553e18 100755 --- a/data-sources/wikipedia-wikidata/import_wikipedia.sh +++ b/data-sources/wikipedia-wikidata/import_wikipedia.sh @@ -1,81 +1,213 @@ #!/bin/bash psqlcmd() { - psql wikiprocessingdb + psql --quiet wikiprocessingdb } mysql2pgsqlcmd() { ./mysql2pgsql.perl /dev/stdin /dev/stdout } +download() { + echo "Downloading $1" + wget --quiet --no-clobber --tries 3 +} + + +# languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias) +# requires Bash 4.0 +readarray -t LANGUAGES < languages.txt + + + +echo "=====================================================================" +echo "Create wikipedia calculation tables" +echo "=====================================================================" + +echo "CREATE TABLE linkcounts ( + language text, + title text, + count integer, + sumcount integer, + lat double precision, + lon double precision + );" | psqlcmd -# list the languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias) +echo "CREATE TABLE wikipedia_article ( + language text NOT NULL, + title text NOT NULL, + langcount integer, + othercount integer, + totalcount integer, + lat double precision, + lon double precision, + importance double precision, + title_en text, + osm_type character(1), + osm_id bigint + );" | psqlcmd -language=( "ar" "bg" "ca" "cs" "da" "de" "en" "es" "eo" "eu" "fa" "fr" "ko" "hi" "hr" "id" "it" "he" "lt" "hu" "ms" "nl" "ja" "no" "pl" "pt" "kk" "ro" "ru" "sk" "sl" "sr" "fi" "sv" "tr" "uk" "vi" "vo" "war" "zh" ) +echo "CREATE TABLE wikipedia_redirect ( + language text, + from_title text, + to_title text + );" | psqlcmd -# create wikipedia calculation tables -echo "CREATE TABLE linkcounts (language text, title text, count integer, sumcount integer, lat double precision, lon double precision);" | psqlcmd -echo "CREATE TABLE wikipedia_article (language text NOT NULL, title text NOT NULL, langcount integer, othercount integer, totalcount integer, lat double precision, lon double precision, importance double precision, title_en text, osm_type character(1), osm_id bigint );" | psqlcmd -echo "CREATE TABLE wikipedia_redirect (language text, from_title text, to_title text );" | psqlcmd -# download individual wikipedia language tables +echo "=====================================================================" +echo "Download individual wikipedia language tables" +echo "=====================================================================" -for i in "${language[@]}" + +for i in "${LANGUAGES[@]}" do - wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-page.sql.gz - wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-pagelinks.sql.gz - wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-langlinks.sql.gz - wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-redirect.sql.gz + echo "Language: $i" + + # english is the largest + # 1.7G enwiki-latest-page.sql.gz + # 6.2G enwiki-latest-pagelinks.sql.gz + # 355M enwiki-latest-langlinks.sql.gz + # 128M enwiki-latest-redirect.sql.gz + + # example of smaller languge turkish + # 53M trwiki-latest-page.sql.gz + # 176M trwiki-latest-pagelinks.sql.gz + # 106M trwiki-latest-langlinks.sql.gz + # 3.2M trwiki-latest-redirect.sql.gz + + download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-page.sql.gz + download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-pagelinks.sql.gz + download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-langlinks.sql.gz + download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-redirect.sql.gz done -# import individual wikipedia language tables -for i in "${language[@]}" + + +echo "=====================================================================" +echo "Import individual wikipedia language tables" +echo "=====================================================================" + +for i in "${LANGUAGES[@]}" do + echo "Language: $i" + + echo "Importing ${i}wiki-latest-pagelinks" gzip -dc ${i}wiki-latest-pagelinks.sql.gz | sed "s/\`pagelinks\`/\`${i}pagelinks\`/g" | mysql2pgsqlcmd | psqlcmd - gzip -dc ${i}wiki-latest-page.sql.gz | sed "s/\`page\`/\`${i}page\`/g" | mysql2pgsqlcmd | psqlcmd + + echo "Importing ${i}wiki-latest-page" + gzip -dc ${i}wiki-latest-page.sql.gz | sed "s/\`page\`/\`${i}page\`/g" | mysql2pgsqlcmd | psqlcmd + + echo "Importing ${i}wiki-latest-langlinks" gzip -dc ${i}wiki-latest-langlinks.sql.gz | sed "s/\`langlinks\`/\`${i}langlinks\`/g" | mysql2pgsqlcmd | psqlcmd - gzip -dc ${i}wiki-latest-redirect.sql.gz | sed "s/\`redirect\`/\`${i}redirect\`/g" | mysql2pgsqlcmd | psqlcmd + + echo "Importing ${i}wiki-latest-redirect" + gzip -dc ${i}wiki-latest-redirect.sql.gz | sed "s/\`redirect\`/\`${i}redirect\`/g" | mysql2pgsqlcmd | psqlcmd done -# process language tables and associated pagelink counts -for i in "${language[@]}" -do - echo "create table ${i}pagelinkcount as select pl_title as title,count(*) as count from ${i}pagelinks where pl_namespace = 0 group by pl_title;" | psqlcmd - echo "insert into linkcounts select '${i}',pl_title,count(*) from ${i}pagelinks where pl_namespace = 0 group by pl_title;" | psqlcmd - echo "insert into wikipedia_redirect select '${i}',page_title,rd_title from ${i}redirect join ${i}page on (rd_from = page_id) where page_namespace = 0 and rd_namespace = 0;" | psqlcmd - echo "alter table ${i}pagelinkcount add column othercount integer;" | psqlcmd - echo "update ${i}pagelinkcount set othercount = 0;" | psqlcmd -done -for i in "${language[@]}" + +echo "=====================================================================" +echo "Process language tables and associated pagelink counts" +echo "=====================================================================" + + +for i in "${LANGUAGES[@]}" do - for j in "${language[@]}" + echo "Language: $i" + + echo "CREATE TABLE ${i}pagelinkcount + AS + SELECT pl_title AS title, + COUNT(*) AS count + FROM ${i}pagelinks + WHERE pl_namespace = 0 + GROUP BY pl_title + ;" | psqlcmd + + echo "INSERT INTO linkcounts + SELECT '${i}', + pl_title, + COUNT(*) + FROM ${i}pagelinks + WHERE pl_namespace = 0 + GROUP BY pl_title + ;" | psqlcmd + + echo "INSERT INTO wikipedia_redirect + SELECT '${i}', + page_title, + rd_title + FROM ${i}redirect + JOIN ${i}page ON (rd_from = page_id) + WHERE page_namespace = 0 + AND rd_namespace = 0 + ;" | psqlcmd + + echo "ALTER TABLE ${i}pagelinkcount + ADD COLUMN othercount integer + ;" | psqlcmd + + echo "UPDATE ${i}pagelinkcount + SET othercount = 0 + ;" | psqlcmd + + for j in "${LANGUAGES[@]}" do - echo "update ${i}pagelinkcount set othercount = ${i}pagelinkcount.othercount + x.count from (select page_title as title,count from ${i}langlinks join ${i}page on (ll_from = page_id) join ${j}pagelinkcount on (ll_lang = '${j}' and ll_title = title)) as x where x.title = ${i}pagelinkcount.title;" | psqlcmd + echo "UPDATE ${i}pagelinkcount + SET othercount = ${i}pagelinkcount.othercount + x.count + FROM ( + SELECT page_title AS title, + count + FROM ${i}langlinks + JOIN ${i}page ON (ll_from = page_id) + JOIN ${j}pagelinkcount ON (ll_lang = '${j}' AND ll_title = title) + ) AS x + WHERE x.title = ${i}pagelinkcount.title + ;" | psqlcmd done - echo "insert into wikipedia_article select '${i}', title, count, othercount, count+othercount from ${i}pagelinkcount;" | psqlcmd + + echo "INSERT INTO wikipedia_article + SELECT '${i}', + title, + count, + othercount, + count + othercount + FROM ${i}pagelinkcount + ;" | psqlcmd done -# calculate importance score for each wikipedia page -echo "update wikipedia_article set importance = log(totalcount)/log((select max(totalcount) from wikipedia_article))" | psqlcmd -# clean up intermediate tables to conserve space +echo "=====================================================================" +echo "Calculate importance score for each wikipedia page" +echo "=====================================================================" + +echo "UPDATE wikipedia_article + SET importance = LOG(totalcount)/LOG((SELECT MAX(totalcount) FROM wikipedia_article)) + ;" | psqlcmd + + + + + +echo "=====================================================================" +echo "Clean up intermediate tables to conserve space" +echo "=====================================================================" -for i in "${language[@]}" +for i in "${LANGUAGES[@]}" do - echo "DROP TABLE ${i}pagelinks;" | psqlcmd - echo "DROP TABLE ${i}page;" | psqlcmd - echo "DROP TABLE ${i}langlinks;" | psqlcmd - echo "DROP TABLE ${i}redirect;" | psqlcmd + echo "DROP TABLE ${i}pagelinks;" | psqlcmd + echo "DROP TABLE ${i}page;" | psqlcmd + echo "DROP TABLE ${i}langlinks;" | psqlcmd + echo "DROP TABLE ${i}redirect;" | psqlcmd echo "DROP TABLE ${i}pagelinkcount;" | psqlcmd done diff --git a/data-sources/wikipedia-wikidata/languages.txt b/data-sources/wikipedia-wikidata/languages.txt new file mode 100644 index 00000000..bef5d0e9 --- /dev/null +++ b/data-sources/wikipedia-wikidata/languages.txt @@ -0,0 +1,39 @@ +ar +bg +ca +cs +da +de +en +es +eo +eu +fa +fr +ko +hi +hr +id +it +he +lt +hu +ms +nl +ja +no +pl +pt +kk +ro +ru +sk +sl +sr +fi +sv +tr +uk +vi +war +zh \ No newline at end of file