3 # Search apache logs for high-bandwidth users and create a list of suspicious IPs.
4 # There are three states: bulk, block, ban. The first are bulk requesters
5 # that need throttling, the second bulk requesters that have overdone it
6 # and the last manually banned IPs.
12 from datetime import datetime, timedelta
13 from collections import defaultdict
18 # Copy into settings/ip_blocks.conf and adapt as required.
20 BASEDIR = os.path.normpath(os.path.join(os.path.realpath(__file__), '../..'))
21 BLOCKEDFILE= BASEDIR + '/settings/ip_blocks.map'
22 LOGFILE= BASEDIR + '/log/restricted_ip.log'
24 # space-separated list of IPs that are never banned
26 # space-separated list of IPs manually blocked
28 # user-agents that should be blocked from bulk mode
29 # (matched with startswith)
32 # time before an automatically blocked IP is allowed back
33 BLOCKCOOLOFF_PERIOD=timedelta(hours=1)
34 # quiet time before an IP is released from the bulk pool
35 BULKCOOLOFF_PERIOD=timedelta(minutes=15)
47 # END OF DEFAULT SETTINGS
51 with open(BASEDIR + "/settings/ip_blocks.conf") as f:
52 code = compile(f.read(), BASEDIR + "/settings/ip_blocks.conf", 'exec')
53 exec(code, global_vars, local_vars)
57 BLOCK_LIMIT = BLOCK_LOWER
# Pattern for one access-log line: client IP, request time, request line,
# status code, response size, referer and user agent.
# Raw strings are used so that the regex escapes (\d, \., \[, \]) are not
# interpreted as (invalid) Python string escapes.
# NOTE: the IP group only admits digits, dots and parentheses, so lines with
# any other kind of client address (e.g. IPv6) do not match.
format_pat = re.compile(r'(?P<ip>[(\d\.)]+) - - \[(?P<time>.*?)\] "(?P<query>.*?)" (?P<return>\d+) (?P<bytes>\d+) "(?P<referer>.*?)" "(?P<ua>.*?)"')
# Reduced pattern that only captures the timestamp of a log line.
time_pat = re.compile(r'[(\d\.)]+ - - \[(?P<time>[^\]]*?)\] ')

# strptime() layout of the timestamp captured by the 'time' group.
logtime_pat = "%d/%b/%Y:%H:%M:%S %z"
# Parse one access-log line into its fields. The line is matched against
# format_pat and the attributes set below are taken from its named groups.
# Raises ValueError("Invalid log line:", logline); the guarding condition is
# not visible in this excerpt (presumably a failed match, TODO confirm).
66 def __init__(self, logline):
67 e = format_pat.match(logline)
69 raise ValueError("Invalid log line:", logline)
# The timestamp is parsed together with its UTC offset (%z) and the offset is
# then dropped without conversion, so self.date is the naive time as logged.
72 self.date = datetime.strptime(e['time'], logtime_pat).replace(tzinfo=None)
# Split the request line on spaces into at most three parts
# (presumably method, path and protocol, TODO confirm).
73 qp = e['query'].split(' ', 2)
# Requests are classified by HTTP method (OPTIONS) and by path prefix
# (/search, /reverse, /details); the branch bodies are not visible here.
79 if qp[0] == 'OPTIONS':
82 if qp[1].startswith('/search'):
84 elif qp[1].startswith('/reverse'):
86 elif qp[1].startswith('/details'):
# The full, unsplit request line is kept as self.query.
90 self.query = e['query']
91 self.retcode = int(e['return'])
# A '-' placeholder for referer or user agent is stored as None.
92 self.referer = e['referer'] if e['referer'] != '-' else None
93 self.ua = e['ua'] if e['ua'] != '-' else None
# Return the timestamp of a raw log line as a naive datetime: the value is
# parsed with logtime_pat and its UTC offset is dropped without conversion.
# Takes no self and is called as LogEntry.get_log_time(...) elsewhere, so it
# is presumably a static method (decorator not visible here, TODO confirm).
# NOTE(review): the complete format_pat is matched although only the 'time'
# group is used; time_pat captures just that group and looks intended for
# this purpose. Confirm before switching.
95 def get_log_time(logline):
96 e = format_pat.match(logline)
100 return datetime.strptime(e['time'], logtime_pat).replace(tzinfo=None)
104 """ An apache log file, unpacked. """
106 def __init__(self, filename):
107 self.fd = open(filename)
108 self.len = os.path.getsize(filename)
# Move the read position to offset `abstime` (passed straight to fd.seek(),
# i.e. a file offset despite the name) and return the timestamp of a line
# read from there, as produced by LogEntry.get_log_time().
113 def seek_next(self, abstime):
114 self.fd.seek(abstime)
116 l = self.fd.readline()
# NOTE(review): readline() returns '' at end of file, never None, so this
# guard always takes the get_log_time() branch.
117 return LogEntry.get_log_time(l) if l is not None else None
# Move the read position near the log entries written at `target` by
# interpolating between file offsets and timestamps instead of scanning.
# Raises RuntimeError when a probed offset cannot be used (the guarding
# conditions are not visible here, presumably a missing timestamp).
# The caller tests the return value for truth; the return statements are not
# visible in this excerpt.
119 def seek_to_date(self, target):
# Timestamp at the start of the file: reference point for the byte rate.
120 date1 = self.seek_next(0)
# First probe at a fixed offset of 2 MiB.
123 curseek = 2*1024*1024
124 curdate = self.seek_next(curseek)
126 raise RuntimeError("Cannot seek to %d" % curseek)
# Iterate until the probed timestamp is within one second of the target.
127 while abs((target - curdate).total_seconds()) > 1.0:
# Average bytes logged per second between file start and the current probe.
# NOTE(review): raises ZeroDivisionError if curdate equals date1.
128 bps = curseek / ((curdate - date1).total_seconds())
# Jump by the remaining time difference converted to bytes; curseek becomes
# a float from here on.
129 curseek += (target - curdate).total_seconds() * bps
# Past the end of the file: step back one second's worth of bytes from the end.
132 elif curseek > self.len:
133 curseek = self.len - bps
134 curdate = self.seek_next(curseek)
# NOTE(review): message reads "Cannot see to", probably meant "Cannot seek to".
136 raise RuntimeError("Cannot see to %d" % curseek)
146 self.whitelist = set(WHITELIST.split()) if WHITELIST else set()
147 self.blacklist = set(BLACKLIST.split()) if BLACKLIST else set()
148 self.prevblocks = set()
149 self.prevbulks = set()
152 fd = open(BLOCKEDFILE)
154 ip, typ = line.strip().split(' ')
155 if ip not in self.blacklist:
162 pass #ignore non-existing file
174 def add_long(self, logentry):
176 if logentry.request is not None:
179 if logentry.ua is None:
182 def add_short(self, logentry):
183 self.short_total += 1
184 if logentry.request is not None:
186 self.add_long(logentry)
# Decide the next state for this IP from its request counters and the
# module-level limits (BLOCK_UPPER, BLOCK_LIMIT, BULKLONG_LIMIT,
# BULKSHORT_LIMIT). was_blocked / was_bulked carry the previous state.
# Return values visible here are None, 'block' and 'uablock'; the caller also
# handles 'emblock' and 'bulk', whose return lines are not in this excerpt.
188 def new_state(self, was_blocked, was_bulked):
190 # deblock only if the IP has been really quiet
191 # (properly catches the ones that simply ignore the HTTP error)
192 return None if self.long_total < 5 else 'block'
# The short-window counter is compared against a third of the long-window limit.
193 if self.long_api > BLOCK_UPPER or self.short_api > BLOCK_UPPER / 3:
194 # client totally overdoing it
197 if self.short_total < 5:
198 # client has stopped, debulk
200 if self.long_api > BLOCK_LIMIT or self.short_api > BLOCK_LIMIT / 3:
201 # client is still hammering us, block
# Bulk thresholds have separate limits for the long and the short window.
205 if self.long_api > BULKLONG_LIMIT or self.short_api > BULKSHORT_LIMIT:
207 return 'uablock' # bad useragent
214 if __name__ == '__main__':
215 if len(sys.argv) < 2:
216 print("Usage: %s logfile startdate" % sys.argv[0])
219 if len(sys.argv) == 2:
220 dt = datetime.now() - BLOCKCOOLOFF_PERIOD
222 dt = datetime.strptime(sys.argv[2], "%Y-%m-%d %H:%M:%S")
224 if os.path.getsize(sys.argv[1]) < 2*1030*1024:
225 sys.exit(0) # not enough data
227 lf = LogFile(sys.argv[1])
228 if not lf.seek_to_date(dt):
233 shortstart = dt + BLOCKCOOLOFF_PERIOD - BULKCOOLOFF_PERIOD
234 notlogged = bl.whitelist | bl.blacklist
236 stats = defaultdict(IPstats)
238 for l in lf.loglines():
239 if l.ip not in notlogged:
240 stats[l.ip].add_long(l)
241 if l.date > shortstart:
245 for l in lf.loglines():
246 if l.ip not in notlogged:
247 stats[l.ip].add_short(l)
248 if l.request is not None and l.retcode == 200:
251 # adapt limits according to CPU and DB load
252 fd = open("/proc/loadavg")
253 cpuload = int(float(fd.readline().split()[2]))
255 dbload = total200 / BULKCOOLOFF_PERIOD.total_seconds()
257 numbulks = len(bl.prevbulks)
258 BLOCK_LIMIT = max(BLOCK_LIMIT, BLOCK_UPPER - BLOCK_LOADFAC * (dbload - 75))
259 BULKLONG_LIMIT = max(BULK_LOWER, BULKLONG_LIMIT - BULK_LOADFAC * (cpuload - 14))
260 if numbulks > MAX_BULK_IPS:
261 BLOCK_LIMIT = max(3600, BLOCK_LOWER - (numbulks - MAX_BULK_IPS)*10)
262 # if the bulk pool is still empty, clients will be faster, avoid having
263 # them blocked in this case
265 BLOCK_LIMIT = 2*BLOCK_UPPER
268 # collecting statistics
275 # write out new state file
276 fd = open(BLOCKEDFILE, 'w')
277 for k,v in stats.items():
278 wasblocked = k in bl.prevblocks
279 wasbulked = k in bl.prevbulks
280 state = v.new_state(wasblocked, wasbulked)
281 if state is not None:
282 if state == 'uablock':
285 elif state == 'emblock':
288 elif state == 'block':
291 elif state == 'bulk':
294 fd.write("%s %s\n" % (k, state))
# Append every manually banned IP to the state file.
# The original wrote `k`, the variable left over from the preceding loop over
# `stats`, instead of the loop variable. That emitted one unrelated IP
# repeatedly and never the banned ones.
for ip in bl.blacklist:
    fd.write("%s ban\n" % ip)
304 # TODO write logs (need to collect some statistics)
305 logstr = datetime.now().strftime('%Y-%m-%d %H:%M') + ' %s %s\n'
306 fd = open(LOGFILE, 'a')
308 fd.write(logstr % ('unblocked:', ', '.join(unblocked)))
310 fd.write(logstr % (' debulked:', ', '.join(debulked)))
312 fd.write(logstr % ('new bulks:', ', '.join(bulked)))
314 fd.write(logstr % ('dir.block:', ', '.join(emblocked)))
316 fd.write(logstr % (' ua block:', ', '.join(uablocked)))
318 fd.write(logstr % ('new block:', ', '.join(blocked)))