4 psql --quiet wikiprocessingdb |& \
5 grep -v 'does not exist, skipping' |& \
6 grep -v 'violates check constraint' |& \
7 grep -vi 'Failing row contains'
11 ./mysql2pgsql.perl --nodrop /dev/stdin /dev/stdout
16 wget --quiet --no-clobber --tries=3 "$1"
20 # languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias)
22 readarray -t LANGUAGES < languages.txt
26 echo "====================================================================="
27 echo "Create wikipedia calculation tables"
28 echo "====================================================================="
30 echo "CREATE TABLE linkcounts (
39 echo "CREATE TABLE wikipedia_article (
40 language text NOT NULL,
47 importance double precision,
49 osm_type character(1),
53 echo "CREATE TABLE wikipedia_redirect (
63 echo "====================================================================="
64 echo "Download individual wikipedia language tables"
65 echo "====================================================================="
68 for i in "${LANGUAGES[@]}"
72 # english is the largest
73 # 1.7G enwiki-latest-page.sql.gz
74 # 6.2G enwiki-latest-pagelinks.sql.gz
75 # 355M enwiki-latest-langlinks.sql.gz
76 # 128M enwiki-latest-redirect.sql.gz
78 # example of smaller languge turkish
79 # 53M trwiki-latest-page.sql.gz
80 # 176M trwiki-latest-pagelinks.sql.gz
81 # 106M trwiki-latest-langlinks.sql.gz
82 # 3.2M trwiki-latest-redirect.sql.gz
84 download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-page.sql.gz
85 download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-pagelinks.sql.gz
86 download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-langlinks.sql.gz
87 download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-redirect.sql.gz
94 echo "====================================================================="
95 echo "Import individual wikipedia language tables"
96 echo "====================================================================="
98 for i in "${LANGUAGES[@]}"
102 # We pre-create the table schema. This allows us to
103 # 1. Skip index creation. Most queries we do are full table scans
104 # 2. Add constrain to only import namespace=0 (wikipedia articles)
105 # Both cuts down data size considerably (50%+)
107 echo "Importing ${i}wiki-latest-pagelinks"
109 echo "DROP TABLE IF EXISTS ${i}pagelinks;" | psqlcmd
110 echo "CREATE TABLE ${i}pagelinks (
111 pl_from int NOT NULL DEFAULT '0',
112 pl_namespace int NOT NULL DEFAULT '0',
113 pl_title text NOT NULL DEFAULT '',
114 pl_from_namespace int NOT NULL DEFAULT '0'
118 gzip -dc ${i}wiki-latest-pagelinks.sql.gz | \
119 sed "s/\`pagelinks\`/\`${i}pagelinks\`/g" | \
121 grep -v '^CREATE INDEX ' | \
127 echo "Importing ${i}wiki-latest-page"
129 # autoincrement serial8 4byte
130 echo "DROP TABLE IF EXISTS ${i}page;" | psqlcmd
131 echo "CREATE TABLE ${i}page (
132 page_id int NOT NULL,
133 page_namespace int NOT NULL DEFAULT '0',
134 page_title text NOT NULL DEFAULT '',
135 page_restrictions text NOT NULL,
136 page_is_redirect smallint NOT NULL DEFAULT '0',
137 page_is_new smallint NOT NULL DEFAULT '0',
138 page_random double precision NOT NULL DEFAULT '0',
139 page_touched text NOT NULL DEFAULT '',
140 page_links_updated text DEFAULT NULL,
141 page_latest int NOT NULL DEFAULT '0',
142 page_len int NOT NULL DEFAULT '0',
143 page_content_model text DEFAULT NULL,
144 page_lang text DEFAULT NULL
148 gzip -dc ${i}wiki-latest-page.sql.gz | \
149 sed "s/\`page\`/\`${i}page\`/g" | \
151 grep -v '^CREATE INDEX ' | \
157 echo "Importing ${i}wiki-latest-langlinks"
159 echo "DROP TABLE IF EXISTS ${i}langlinks;" | psqlcmd
160 echo "CREATE TABLE ${i}langlinks (
161 ll_from int NOT NULL DEFAULT '0',
162 ll_lang text NOT NULL DEFAULT '',
163 ll_title text NOT NULL DEFAULT ''
167 gzip -dc ${i}wiki-latest-langlinks.sql.gz | \
168 sed "s/\`langlinks\`/\`${i}langlinks\`/g" | \
170 grep -v '^CREATE INDEX ' | \
177 echo "Importing ${i}wiki-latest-redirect"
179 echo "DROP TABLE IF EXISTS ${i}redirect;" | psqlcmd
180 echo "CREATE TABLE ${i}redirect (
181 rd_from int NOT NULL DEFAULT '0',
182 rd_namespace int NOT NULL DEFAULT '0',
183 rd_title text NOT NULL DEFAULT '',
184 rd_interwiki text DEFAULT NULL,
185 rd_fragment text DEFAULT NULL
189 gzip -dc ${i}wiki-latest-redirect.sql.gz | \
190 sed "s/\`redirect\`/\`${i}redirect\`/g" | \
192 grep -v '^CREATE INDEX ' | \
200 echo "====================================================================="
201 echo "Process language tables and associated pagelink counts"
202 echo "====================================================================="
205 for i in "${LANGUAGES[@]}"
209 echo "CREATE TABLE ${i}pagelinkcount
211 SELECT pl_title AS title,
213 0::bigint as othercount
215 WHERE pl_namespace = 0
219 echo "INSERT INTO linkcounts
224 WHERE pl_namespace = 0
228 echo "INSERT INTO wikipedia_redirect
233 JOIN ${i}page ON (rd_from = page_id)
234 WHERE page_namespace = 0
241 for i in "${LANGUAGES[@]}"
243 for j in "${LANGUAGES[@]}"
245 echo "UPDATE ${i}pagelinkcount
246 SET othercount = ${i}pagelinkcount.othercount + x.count
248 SELECT page_title AS title,
251 JOIN ${i}page ON (ll_from = page_id)
252 JOIN ${j}pagelinkcount ON (ll_lang = '${j}' AND ll_title = title)
254 WHERE x.title = ${i}pagelinkcount.title
258 echo "INSERT INTO wikipedia_article
264 FROM ${i}pagelinkcount
272 echo "====================================================================="
273 echo "Calculate importance score for each wikipedia page"
274 echo "====================================================================="
276 echo "UPDATE wikipedia_article
277 SET importance = LOG(totalcount)/LOG((SELECT MAX(totalcount) FROM wikipedia_article))
284 echo "====================================================================="
285 echo "Clean up intermediate tables to conserve space"
286 echo "====================================================================="
288 for i in "${LANGUAGES[@]}"
290 echo "DROP TABLE ${i}pagelinks;" | psqlcmd
291 echo "DROP TABLE ${i}page;" | psqlcmd
292 echo "DROP TABLE ${i}langlinks;" | psqlcmd
293 echo "DROP TABLE ${i}redirect;" | psqlcmd
294 echo "DROP TABLE ${i}pagelinkcount;" | psqlcmd