From 624df40f43cc4131b823fbef218fd0c08d6400b2 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Fri, 12 Dec 2014 00:01:30 +0100
Subject: [PATCH] script for scraper blocking using apache log files

---
 utils/cron_ipanalyse.py | 319 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 319 insertions(+)
 create mode 100755 utils/cron_ipanalyse.py

diff --git a/utils/cron_ipanalyse.py b/utils/cron_ipanalyse.py
new file mode 100755
index 00000000..40afa2b8
--- /dev/null
+++ b/utils/cron_ipanalyse.py
@@ -0,0 +1,319 @@
+#!/usr/bin/python3
+#
+# Search apache logs for high-bandwidth users and create a list of suspicious IPs.
+# There are three states: bulk, block, ban. The first are bulk requesters
+# that need throttling, the second bulk requesters that have overdone it
+# and the last manually banned IPs.
+#
+
+import re
+import os
+import sys
+from datetime import datetime, timedelta
+from collections import defaultdict
+
+#
+# DEFAULT SETTINGS
+#
+# Copy into settings/ip_blocks.conf and adapt as required.
+#
+BASEDIR = os.path.normpath(os.path.join(os.path.realpath(__file__), '../..'))  # parent of this script's directory
+BLOCKEDFILE= BASEDIR + '/settings/ip_blocks.map'
+LOGFILE= BASEDIR + '/log/restricted_ip.log'
+
+# space-separated list of IPs that are never banned
+WHITELIST = ''
+# space-separated list of IPs manually blocked
+BLACKLIST = ''
+# user-agents that should be blocked from bulk mode
+# (matched with startswith)
+UA_BLOCKLIST = ()
+
+# time before an automatically blocked IP is allowed back
+BLOCKCOOLOFF_PERIOD=timedelta(hours=1)
+# quiet time before an IP is released from the bulk pool
+BULKCOOLOFF_PERIOD=timedelta(minutes=15)
+
+BULKLONG_LIMIT=8000
+BULKSHORT_LIMIT=2000
+BLOCK_UPPER=19000
+BLOCK_LOWER=4000
+BLOCK_LOADFAC=380
+BULK_LOADFAC=160
+BULK_LOWER=1500
+MAX_BULK_IPS=85
+
+#
+# END OF DEFAULT SETTINGS
+#
+
+try:
+    with open(BASEDIR + "/settings/ip_blocks.conf") as f:
+        code = compile(f.read(), BASEDIR + "/settings/ip_blocks.conf", 'exec')
+        exec(code)  # runs in this module's namespace, so the file overrides the defaults above
+except IOError:
+    pass  # no readable local configuration: keep the defaults
+
+BLOCK_LIMIT = BLOCK_LOWER
+
+
+format_pat= re.compile('(?P[(\d\.)]+) - - \[(?P