X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/2be70b2c365ac5cdb15f0800463d19e72999cd92..9587fc99098969ae8cb1ad7fb159db1b06f621ff:/data-sources/wikipedia-wikidata/import_wikipedia.sh diff --git a/data-sources/wikipedia-wikidata/import_wikipedia.sh b/data-sources/wikipedia-wikidata/import_wikipedia.sh index 9cc9b1df..19553e18 100755 --- a/data-sources/wikipedia-wikidata/import_wikipedia.sh +++ b/data-sources/wikipedia-wikidata/import_wikipedia.sh @@ -1,81 +1,213 @@ #!/bin/bash psqlcmd() { - psql wikiprocessingdb + psql --quiet wikiprocessingdb } mysql2pgsqlcmd() { ./mysql2pgsql.perl /dev/stdin /dev/stdout } +download() { + echo "Downloading $1" + wget --quiet --no-clobber --tries 3 +} + + +# languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias) +# requires Bash 4.0 +readarray -t LANGUAGES < languages.txt + + + +echo "=====================================================================" +echo "Create wikipedia calculation tables" +echo "=====================================================================" + +echo "CREATE TABLE linkcounts ( + language text, + title text, + count integer, + sumcount integer, + lat double precision, + lon double precision + );" | psqlcmd -# list the languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias) +echo "CREATE TABLE wikipedia_article ( + language text NOT NULL, + title text NOT NULL, + langcount integer, + othercount integer, + totalcount integer, + lat double precision, + lon double precision, + importance double precision, + title_en text, + osm_type character(1), + osm_id bigint + );" | psqlcmd -language=( "ar" "bg" "ca" "cs" "da" "de" "en" "es" "eo" "eu" "fa" "fr" "ko" "hi" "hr" "id" "it" "he" "lt" "hu" "ms" "nl" "ja" "no" "pl" "pt" "kk" "ro" "ru" "sk" "sl" "sr" "fi" "sv" "tr" "uk" "vi" "vo" "war" "zh" ) +echo "CREATE TABLE wikipedia_redirect ( + language text, + from_title text, + to_title text + );" | psqlcmd -# create wikipedia calculation tables -echo "CREATE TABLE linkcounts (language text, title text, count integer, sumcount integer, lat double precision, lon double precision);" | psqlcmd -echo "CREATE TABLE wikipedia_article (language text NOT NULL, title text NOT NULL, langcount integer, othercount integer, totalcount integer, lat double precision, lon double precision, importance double precision, title_en text, osm_type character(1), osm_id bigint );" | psqlcmd -echo "CREATE TABLE wikipedia_redirect (language text, from_title text, to_title text );" | psqlcmd -# download individual wikipedia language tables +echo "=====================================================================" +echo "Download individual wikipedia language tables" +echo "=====================================================================" -for i in "${language[@]}" + +for i in "${LANGUAGES[@]}" do - wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-page.sql.gz - wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-pagelinks.sql.gz - wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-langlinks.sql.gz - wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-redirect.sql.gz + echo "Language: $i" + + # english is the largest + # 1.7G enwiki-latest-page.sql.gz + # 6.2G enwiki-latest-pagelinks.sql.gz + # 355M enwiki-latest-langlinks.sql.gz + # 128M enwiki-latest-redirect.sql.gz + + # example of smaller languge turkish + # 53M trwiki-latest-page.sql.gz + # 176M trwiki-latest-pagelinks.sql.gz + # 106M trwiki-latest-langlinks.sql.gz + # 3.2M trwiki-latest-redirect.sql.gz + + download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-page.sql.gz + download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-pagelinks.sql.gz + download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-langlinks.sql.gz + download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-redirect.sql.gz done -# import individual wikipedia language tables -for i in "${language[@]}" + + +echo "=====================================================================" +echo "Import individual wikipedia language tables" +echo "=====================================================================" + +for i in "${LANGUAGES[@]}" do + echo "Language: $i" + + echo "Importing ${i}wiki-latest-pagelinks" gzip -dc ${i}wiki-latest-pagelinks.sql.gz | sed "s/\`pagelinks\`/\`${i}pagelinks\`/g" | mysql2pgsqlcmd | psqlcmd - gzip -dc ${i}wiki-latest-page.sql.gz | sed "s/\`page\`/\`${i}page\`/g" | mysql2pgsqlcmd | psqlcmd + + echo "Importing ${i}wiki-latest-page" + gzip -dc ${i}wiki-latest-page.sql.gz | sed "s/\`page\`/\`${i}page\`/g" | mysql2pgsqlcmd | psqlcmd + + echo "Importing ${i}wiki-latest-langlinks" gzip -dc ${i}wiki-latest-langlinks.sql.gz | sed "s/\`langlinks\`/\`${i}langlinks\`/g" | mysql2pgsqlcmd | psqlcmd - gzip -dc ${i}wiki-latest-redirect.sql.gz | sed "s/\`redirect\`/\`${i}redirect\`/g" | mysql2pgsqlcmd | psqlcmd + + echo "Importing ${i}wiki-latest-redirect" + gzip -dc ${i}wiki-latest-redirect.sql.gz | sed "s/\`redirect\`/\`${i}redirect\`/g" | mysql2pgsqlcmd | psqlcmd done -# process language tables and associated pagelink counts -for i in "${language[@]}" -do - echo "create table ${i}pagelinkcount as select pl_title as title,count(*) as count from ${i}pagelinks where pl_namespace = 0 group by pl_title;" | psqlcmd - echo "insert into linkcounts select '${i}',pl_title,count(*) from ${i}pagelinks where pl_namespace = 0 group by pl_title;" | psqlcmd - echo "insert into wikipedia_redirect select '${i}',page_title,rd_title from ${i}redirect join ${i}page on (rd_from = page_id) where page_namespace = 0 and rd_namespace = 0;" | psqlcmd - echo "alter table ${i}pagelinkcount add column othercount integer;" | psqlcmd - echo "update ${i}pagelinkcount set othercount = 0;" | psqlcmd -done -for i in "${language[@]}" + +echo "=====================================================================" +echo "Process language tables and associated pagelink counts" +echo "=====================================================================" + + +for i in "${LANGUAGES[@]}" do - for j in "${language[@]}" + echo "Language: $i" + + echo "CREATE TABLE ${i}pagelinkcount + AS + SELECT pl_title AS title, + COUNT(*) AS count + FROM ${i}pagelinks + WHERE pl_namespace = 0 + GROUP BY pl_title + ;" | psqlcmd + + echo "INSERT INTO linkcounts + SELECT '${i}', + pl_title, + COUNT(*) + FROM ${i}pagelinks + WHERE pl_namespace = 0 + GROUP BY pl_title + ;" | psqlcmd + + echo "INSERT INTO wikipedia_redirect + SELECT '${i}', + page_title, + rd_title + FROM ${i}redirect + JOIN ${i}page ON (rd_from = page_id) + WHERE page_namespace = 0 + AND rd_namespace = 0 + ;" | psqlcmd + + echo "ALTER TABLE ${i}pagelinkcount + ADD COLUMN othercount integer + ;" | psqlcmd + + echo "UPDATE ${i}pagelinkcount + SET othercount = 0 + ;" | psqlcmd + + for j in "${LANGUAGES[@]}" do - echo "update ${i}pagelinkcount set othercount = ${i}pagelinkcount.othercount + x.count from (select page_title as title,count from ${i}langlinks join ${i}page on (ll_from = page_id) join ${j}pagelinkcount on (ll_lang = '${j}' and ll_title = title)) as x where x.title = ${i}pagelinkcount.title;" | psqlcmd + echo "UPDATE ${i}pagelinkcount + SET othercount = ${i}pagelinkcount.othercount + x.count + FROM ( + SELECT page_title AS title, + count + FROM ${i}langlinks + JOIN ${i}page ON (ll_from = page_id) + JOIN ${j}pagelinkcount ON (ll_lang = '${j}' AND ll_title = title) + ) AS x + WHERE x.title = ${i}pagelinkcount.title + ;" | psqlcmd done - echo "insert into wikipedia_article select '${i}', title, count, othercount, count+othercount from ${i}pagelinkcount;" | psqlcmd + + echo "INSERT INTO wikipedia_article + SELECT '${i}', + title, + count, + othercount, + count + othercount + FROM ${i}pagelinkcount + ;" | psqlcmd done -# calculate importance score for each wikipedia page -echo "update wikipedia_article set importance = log(totalcount)/log((select max(totalcount) from wikipedia_article))" | psqlcmd -# clean up intermediate tables to conserve space +echo "=====================================================================" +echo "Calculate importance score for each wikipedia page" +echo "=====================================================================" + +echo "UPDATE wikipedia_article + SET importance = LOG(totalcount)/LOG((SELECT MAX(totalcount) FROM wikipedia_article)) + ;" | psqlcmd + + + + + +echo "=====================================================================" +echo "Clean up intermediate tables to conserve space" +echo "=====================================================================" -for i in "${language[@]}" +for i in "${LANGUAGES[@]}" do - echo "DROP TABLE ${i}pagelinks;" | psqlcmd - echo "DROP TABLE ${i}page;" | psqlcmd - echo "DROP TABLE ${i}langlinks;" | psqlcmd - echo "DROP TABLE ${i}redirect;" | psqlcmd + echo "DROP TABLE ${i}pagelinks;" | psqlcmd + echo "DROP TABLE ${i}page;" | psqlcmd + echo "DROP TABLE ${i}langlinks;" | psqlcmd + echo "DROP TABLE ${i}redirect;" | psqlcmd echo "DROP TABLE ${i}pagelinkcount;" | psqlcmd done