From 437a7ce50e0722e726a5e7e105736978cddd5167 Mon Sep 17 00:00:00 2001 From: Brian Quinion Date: Fri, 6 Sep 2013 10:52:55 +0100 Subject: [PATCH] download and merge wikipedia access logs --- wikidata/pull_wikipedia_logs.php | 95 ++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 wikidata/pull_wikipedia_logs.php diff --git a/wikidata/pull_wikipedia_logs.php b/wikidata/pull_wikipedia_logs.php new file mode 100644 index 00000000..8dfabbb1 --- /dev/null +++ b/wikidata/pull_wikipedia_logs.php @@ -0,0 +1,95 @@ + hour.txt'); + + $hPrevTotals = @fopen("totals.txt", "r"); + $hDayTotals = @fopen("hour.txt", "r"); + $hNewTotals = @fopen("newtotals.txt", "w"); + + $sPrevKey = $sDayKey = true; + $sPrevLine = true; + $sDayLine = true; + + do + { + if ($sPrevKey === $sDayKey) + { + if ($sPrevLine !== true) fputs($hNewTotals, "$sPrevKey ".($iPrevValue+$iDayValue)."\n"); + $sPrevLine = true; + $sDayLine = true; + } + else if ($sDayKey !== false && ($sPrevKey > $sDayKey || $sPrevKey === false)) + { + fputs($hNewTotals, "$sDayKey ".($iDayValue)."\n"); + $sDayLine = true; + } + else if ($sPrevKey !== false && ($sDayKey > $sPrevKey || $sDayKey === false)) + { + fputs($hNewTotals, "$sPrevKey ".($iPrevValue)."\n"); + $sPrevLine = true; + } + + if ($sPrevLine === true) + { + $sPrevLine = $hPrevTotals?fgets($hPrevTotals, 4096):false; + if ($sPrevLine !== false) + { + $aPrevLine = explode(' ', $sPrevLine); + $sPrevKey = $aPrevLine[0].' '.$aPrevLine[1]; + $iPrevValue = (int)$aPrevLine[2]; + } + else + { + $sPrevKey = false; + $iPrevValue = 0; + } + } + + if ($sDayLine === true) + { + $sDayLine = $hDayTotals?fgets($hDayTotals, 4096):false; + if ($sDayLine !== false) + { + preg_match('#^([a-z]{2}) ([^ :]+) ([0-9]+) [0-9]+$#', $sDayLine, $aMatch); + $sDayKey = $aMatch[1].' '.$aMatch[2]; + $iDayValue = (int)$aMatch[3]; + } + else + { + $sDayKey = false; + $iDayValue = 0; + } + } + + } while ($sPrevLine !== false || $sDayLine !== false); + + @fclose($hPrevTotals); + @fclose($hDayTotals); + @fclose($hNewTotals); + + @unlink("totals.txt"); + rename("newtotals.txt", "totals.txt"); + } + } + +// Notes: +/* + gzip -dc $FILE.gz | grep -e "^en [^ :]\+ [0-9]\+" | sed "s#\(^[a-z]\{2\}\) \([^ :]\+\) \([0-9]\+\) [0-9]\+#update wikipedia_article set hit_count = coalesce(hit_count,0) + \3 where language = '\1' and title = catch_decode_url_part('\2');#g" | /opt/mapquest/stdbase-dev$ + cat totals.txt | sed "s#\(^[a-z]\{2\}\) \([^ ]\+\) \([0-9]\+\)\$#update entity_link set hits = s,0) + \3 where target = '\1wiki' and value = catch_decode_url_part('\2');#g" + cat totals.txt | sed "s#\(^[a-z]\{2\}\) \([^ ]\+\) \([0-9]\+\)\$#update entity_link set hits = coalesce(hits,0) + \3 where target = '\1wiki' and value = catch_decode_url_part('\2');#g" +*/ -- 2.39.5