4 psql --quiet wikiprocessingdb
8 ./mysql2pgsql.perl /dev/stdin /dev/stdout
13 wget --quiet --no-clobber --tries 3
17 # languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias)
19 readarray -t LANGUAGES < languages.txt
23 echo "====================================================================="
24 echo "Create wikipedia calculation tables"
25 echo "====================================================================="
27 echo "CREATE TABLE linkcounts (
36 echo "CREATE TABLE wikipedia_article (
37 language text NOT NULL,
44 importance double precision,
46 osm_type character(1),
50 echo "CREATE TABLE wikipedia_redirect (
60 echo "====================================================================="
61 echo "Download individual wikipedia language tables"
62 echo "====================================================================="
65 for i in "${LANGUAGES[@]}"
69 # english is the largest
70 # 1.7G enwiki-latest-page.sql.gz
71 # 6.2G enwiki-latest-pagelinks.sql.gz
72 # 355M enwiki-latest-langlinks.sql.gz
73 # 128M enwiki-latest-redirect.sql.gz
75 # example of smaller languge turkish
76 # 53M trwiki-latest-page.sql.gz
77 # 176M trwiki-latest-pagelinks.sql.gz
78 # 106M trwiki-latest-langlinks.sql.gz
79 # 3.2M trwiki-latest-redirect.sql.gz
81 download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-page.sql.gz
82 download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-pagelinks.sql.gz
83 download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-langlinks.sql.gz
84 download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-redirect.sql.gz
91 echo "====================================================================="
92 echo "Import individual wikipedia language tables"
93 echo "====================================================================="
95 for i in "${LANGUAGES[@]}"
99 echo "Importing ${i}wiki-latest-pagelinks"
100 gzip -dc ${i}wiki-latest-pagelinks.sql.gz | sed "s/\`pagelinks\`/\`${i}pagelinks\`/g" | mysql2pgsqlcmd | psqlcmd
102 echo "Importing ${i}wiki-latest-page"
103 gzip -dc ${i}wiki-latest-page.sql.gz | sed "s/\`page\`/\`${i}page\`/g" | mysql2pgsqlcmd | psqlcmd
105 echo "Importing ${i}wiki-latest-langlinks"
106 gzip -dc ${i}wiki-latest-langlinks.sql.gz | sed "s/\`langlinks\`/\`${i}langlinks\`/g" | mysql2pgsqlcmd | psqlcmd
108 echo "Importing ${i}wiki-latest-redirect"
109 gzip -dc ${i}wiki-latest-redirect.sql.gz | sed "s/\`redirect\`/\`${i}redirect\`/g" | mysql2pgsqlcmd | psqlcmd
116 echo "====================================================================="
117 echo "Process language tables and associated pagelink counts"
118 echo "====================================================================="
121 for i in "${LANGUAGES[@]}"
125 echo "CREATE TABLE ${i}pagelinkcount
127 SELECT pl_title AS title,
130 WHERE pl_namespace = 0
134 echo "INSERT INTO linkcounts
139 WHERE pl_namespace = 0
143 echo "INSERT INTO wikipedia_redirect
148 JOIN ${i}page ON (rd_from = page_id)
149 WHERE page_namespace = 0
153 echo "ALTER TABLE ${i}pagelinkcount
154 ADD COLUMN othercount integer
157 echo "UPDATE ${i}pagelinkcount
161 for j in "${LANGUAGES[@]}"
163 echo "UPDATE ${i}pagelinkcount
164 SET othercount = ${i}pagelinkcount.othercount + x.count
166 SELECT page_title AS title,
169 JOIN ${i}page ON (ll_from = page_id)
170 JOIN ${j}pagelinkcount ON (ll_lang = '${j}' AND ll_title = title)
172 WHERE x.title = ${i}pagelinkcount.title
176 echo "INSERT INTO wikipedia_article
182 FROM ${i}pagelinkcount
190 echo "====================================================================="
191 echo "Calculate importance score for each wikipedia page"
192 echo "====================================================================="
194 echo "UPDATE wikipedia_article
195 SET importance = LOG(totalcount)/LOG((SELECT MAX(totalcount) FROM wikipedia_article))
202 echo "====================================================================="
203 echo "Clean up intermediate tables to conserve space"
204 echo "====================================================================="
206 for i in "${LANGUAGES[@]}"
208 echo "DROP TABLE ${i}pagelinks;" | psqlcmd
209 echo "DROP TABLE ${i}page;" | psqlcmd
210 echo "DROP TABLE ${i}langlinks;" | psqlcmd
211 echo "DROP TABLE ${i}redirect;" | psqlcmd
212 echo "DROP TABLE ${i}pagelinkcount;" | psqlcmd