# Search apache logs for high-bandwidth users and create a list of suspicious IPs.
4 # There are three states: bulk, block, ban. The first are bulk requesters
# that need throttling, the second bulk requesters that have overdone it
6 # and the last manually banned IPs.
13 from datetime import datetime, timedelta
14 from collections import defaultdict
# Copy into settings/ip_blocks.conf and adapt as required.
21 BASEDIR = os.path.normpath(os.path.join(os.path.realpath(__file__), '../..'))
22 BLOCKEDFILE= BASEDIR + '/settings/ip_blocks.map'
23 LOGFILE= BASEDIR + '/log/restricted_ip.log'
25 # space-separated list of IPs that are never banned
27 # space-separated list of IPs manually blocked
29 # user-agents that should be blocked from bulk mode
30 # (matched with startswith)
# time before an automatically blocked IP is allowed back
34 BLOCKCOOLOFF_DELTA=timedelta(hours=1)
35 # quiet time before an IP is released from the bulk pool
36 BULKCOOLOFF_DELTA=timedelta(minutes=15)
37 # time to check if new accesses appear despite being blocked
38 BLOCKCHECK_DELTA=timedelta(minutes=1)
50 # END OF DEFAULT SETTINGS
# Load site-specific overrides: settings/ip_blocks.conf is compiled as
# Python code and may redefine any of the default constants above.
# NOTE(review): the exec() of the compiled code is not visible in this
# excerpt — confirm it follows the compile().
with open(BASEDIR + "/settings/ip_blocks.conf") as f:
    code = compile(f.read(), BASEDIR + "/settings/ip_blocks.conf", 'exec')

# Effective block limit; starts at the lower bound and is adjusted later
# according to the measured server load.
BLOCK_LIMIT = BLOCK_LOWER
# Timestamp part of an Apache combined-log entry, e.g.
# "01/Jul/2020:00:01:02 +0000". The numeric timezone offset is matched but
# not captured; timestamps are handled as naive datetimes elsewhere.
time_regex = r'(?P<t_day>\d\d)/(?P<t_month>[A-Za-z]+)/(?P<t_year>\d\d\d\d):(?P<t_hour>\d\d):(?P<t_min>\d\d):(?P<t_sec>\d\d) [+-]\d\d\d\d'

# Full combined-log line: IP, timestamp, request, status, size, referer, UA.
format_pat = re.compile(r'(?P<ip>[a-f\d\.:]+) - - \[' + time_regex + r'\] "(?P<query>.*?)" (?P<return>\d+) (?P<bytes>\d+) "(?P<referer>.*?)" "(?P<ua>.*?)"')
# Cheap prefix-only pattern used when just the timestamp is needed.
# Fix: the closing bracket was written as '\]' in a NON-raw string, which is
# an invalid escape sequence (SyntaxWarning on modern Python) — use a raw
# string so the backslash is intentional.
time_pat = re.compile(r'[a-f\d:\.]+ - - \[' + time_regex + r'\] ')
# strftime/strptime format matching the Apache log timestamp.
logtime_pat = "%d/%b/%Y:%H:%M:%S %z"

# English month abbreviation -> month number (1..12), used to build
# datetimes without locale-dependent strptime.
MONTHS = {abbrev: number for number, abbrev in enumerate(
    ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'), start=1)}
    def __init__(self, logline):
        """Parse one Apache combined-log line into its components.

        Raises ValueError when the line does not match format_pat.
        NOTE(review): several source lines of this method are missing from
        this excerpt (the failed-match guard and the bodies of the request
        classification chain); comments cover only what is visible.
        """
        e = format_pat.match(logline)
        # (the `if e is None:` guard around this raise is not visible here)
        raise ValueError("Invalid log line:", logline)
        # Timezone offset is dropped: a naive datetime is built from the
        # captured groups, with MONTHS translating the abbreviation.
        self.date = datetime(int(e['t_year']), MONTHS[e['t_month']], int(e['t_day']),
                             int(e['t_hour']), int(e['t_min']), int(e['t_sec']))
        # "METHOD path protocol" — split into at most three parts.
        qp = e['query'].split(' ', 2)
        # Classify the request by URL path; the branch bodies (presumably
        # setting self.request) are missing from this excerpt — confirm.
        if qp[0] == 'OPTIONS':
        elif '/search' in qp[1]:
        elif '/reverse' in qp[1]:
        elif '/details' in qp[1]:
        elif '/lookup' in qp[1]:
        self.query = e['query']
        self.retcode = int(e['return'])
        # '-' placeholders in the log become None.
        self.referer = e['referer'] if e['referer'] != '-' else None
        self.ua = e['ua'] if e['ua'] != '-' else None
    def get_log_time(logline):
        """Return the naive datetime of a log line, parsing only its prefix.

        Called as LogEntry.get_log_time(line) — presumably a @staticmethod
        (decorator not visible in this excerpt). NOTE(review): the guard for
        a failed match is also missing here; it presumably returns None,
        which seek_next() relies on — confirm.
        """
        e = format_pat.match(logline)
        # strptime would honour the timezone offset; kept disabled so the
        # result stays naive, consistent with LogEntry.date:
        #return datetime.strptime(e['time'], logtime_pat).replace(tzinfo=None)
        return datetime(int(e['t_year']), MONTHS[e['t_month']], int(e['t_day']),
                        int(e['t_hour']), int(e['t_min']), int(e['t_sec']))
118 """ An apache log file, unpacked. """
120 def __init__(self, filename):
121 self.fd = open(filename)
122 self.len = os.path.getsize(filename)
    def seek_next(self, abstime):
        """Seek to byte offset *abstime* and return the timestamp of the
        next complete log line, or None at end of file.

        NOTE(review): a source line is missing after the seek — presumably a
        readline() that discards the partial line landed in — confirm.
        NOTE(review): readline() returns '' at EOF, never None, so the
        `l is not None` guard below may never trigger; verify how
        get_log_time handles an empty string.
        """
        self.fd.seek(abstime)
        l = self.fd.readline()
        return LogEntry.get_log_time(l) if l is not None else None
    def seek_to_date(self, target):
        """Position the file just before the first entry at/after *target*.

        Interpolation search over byte offsets: timestamps at probe points
        are mapped linearly to offsets via bps (bytes of log per second).
        Returns False when the whole file is newer than *target* (visible
        early `if fromdate > target:` branch).
        NOTE(review): large parts of this method are missing from this
        excerpt; only the visible skeleton is annotated below.
        """
        # start position for binary search
        fromdate = self.seek_next(0)
        if fromdate > target:
        # end position for binary search
        # walk backwards from EOF until a parseable timestamp is found
        while -toseek < self.len:
            todate = self.seek_next(self.len + toseek)
            if todate is not None:
        if todate is None or todate < target:
        toseek = self.len + toseek
        # estimated bytes of log data per second in the current window
        bps = (toseek - fromseek) / (todate - fromdate).total_seconds()
        # linear interpolation toward the target timestamp
        newseek = fromseek + int((target - fromdate).total_seconds() * bps)
        newdate = self.seek_next(newseek)
        # remaining error in seconds between probe and target
        error = abs((target - newdate).total_seconds())
        oldfromseek = fromseek
        fromseek = toseek - error * bps
        # never move the lower bound backwards
        if fromseek <= oldfromseek:
            fromseek = oldfromseek
            fromdate = self.seek_next(fromseek)
        fromdate = self.seek_next(fromseek)
        if fromdate < target:
        fromseek -= error * bps
        # tighten the upper bound symmetrically
        toseek = fromseek + error * bps
        if toseek > oldtoseek:
            todate = self.seek_next(toseek)
        todate = self.seek_next(toseek)
        toseek += error * bps
        # close enough: fall back to linear scan below this window size
        if toseek - fromseek < 500:
            # (tail of LogFile.loglines(); the rest of the generator is
            # missing from this excerpt — malformed lines presumably raise
            # ValueError in LogEntry and are skipped here)
            pass # ignore invalid lines
        # (start of this __init__ — its def line and class header — is
        # missing from this excerpt)
        # IPs that are never blocked / permanently banned, from the
        # space-separated config strings.
        self.whitelist = set(WHITELIST.split()) if WHITELIST else set()
        self.blacklist = set(BLACKLIST.split()) if BLACKLIST else set()
        # state carried over from the previous run of the script
        self.prevblocks = set()
        self.prevbulks = set()
        fd = open(BLOCKEDFILE)
        # each line of the state file: "<ip> <state>" (block/bulk/ban)
        ip, typ = line.strip().split(' ')
        if ip not in self.blacklist:
        self.prevblocks.add(ip)
        self.prevbulks.add(ip)
        # best-effort read: a missing state file is fine on first run
        pass #ignore non-existing file
    def add_long(self, logentry):
        """Account one request in the long (full scan window) counters.

        NOTE(review): the counter-increment statements are missing from
        this excerpt; only the visible conditions are shown.
        """
        if logentry.request is not None:
        if logentry.ua is None:
    def add_short(self, logentry):
        """Account one request in the short (recent) window and forward it
        to the long counters as well."""
        self.short_total += 1
        # NOTE(review): a line is missing inside this branch (presumably
        # the short API counter increment) — confirm.
        if logentry.request is not None:
        self.add_long(logentry)
247 def add_block(self, logentry):
248 self.block_total += 1
250 def ignores_warnings(self):
251 return self.block_total > 5
    def new_state(self, was_blocked, was_bulked):
        """Decide the next state for this IP.

        Returns None (no change / release) or a state string; the writer
        below handles 'uablock', 'emblock', 'block' and 'bulk'.
        NOTE(review): several branches of this method are missing from this
        excerpt; only the visible conditions are annotated.
        """
        # deblock only if the IP has been really quiet
        # (properly catches the ones that simply ignore the HTTP error)
        return None if self.long_total < 20 else 'block'
        if self.long_api > BLOCK_UPPER or self.short_api > BLOCK_UPPER / 3:
            # client totally overdoing it
        if self.short_total < 20:
            # client has stopped, debulk
        if self.long_api > BLOCK_LIMIT or self.short_api > BLOCK_LIMIT / 3:
            # client is still hammering us, block
        if self.long_api > BULKLONG_LIMIT or self.short_api > BULKSHORT_LIMIT:
            # return 'uablock' # bad useragent
if __name__ == '__main__':
    # Command line: logfile [startdate]; default start is one block
    # cool-off period before now.
    if len(sys.argv) < 2:
        print("Usage: %s logfile startdate" % sys.argv[0])
    if len(sys.argv) == 2:
        dt = datetime.now() - BLOCKCOOLOFF_DELTA
    dt = datetime.strptime(sys.argv[2], "%Y-%m-%d %H:%M:%S")
    # Skip tiny log files — not enough traffic to justify blocking anyone.
    # NOTE(review): 2*1030*1024 — 1030 looks like a typo for 1024; confirm.
    if os.path.getsize(sys.argv[1]) < 2*1030*1024:
        sys.exit(0) # not enough data
    lf = LogFile(sys.argv[1])
    if not lf.seek_to_date(dt):
    # Window boundaries, anchored at the end of the scanned period
    # (dt + BLOCKCOOLOFF_DELTA is approximately "now").
    shortstart = dt + BLOCKCOOLOFF_DELTA - BULKCOOLOFF_DELTA
    blockstart = dt + BLOCKCOOLOFF_DELTA - BLOCKCHECK_DELTA
    # IPs that are never counted: whitelisted or already banned
    notlogged = bl.whitelist | bl.blacklist
    # per-IP request statistics, created on first access
    stats = defaultdict(IPstats)

    # First pass: everything since dt counts toward the long window.
    for l in lf.loglines():
        if l.ip not in notlogged:
            stats[l.ip].add_long(l)
        if l.date > shortstart:
    # Second pass over the short window. NOTE(review): the re-seek between
    # the two passes is missing from this excerpt — confirm.
    for l in lf.loglines():
        if l.ip not in notlogged:
            stats[l.ip].add_short(l)
        if l.request is not None and l.retcode == 200:
        # count responses the client received while already blocked
        if l.date > blockstart and l.retcode in (403, 429):
            stats[l.ip].add_block(l)
    # adapt limits according to CPU and DB load
    fd = open("/proc/loadavg")
    # third field of /proc/loadavg = 15-minute load average
    cpuload = int(float(fd.readline().split()[2]))
    # check the number of excess connections to apache
324 dbcons = int(subprocess.check_output("netstat -s | grep 'connections established' | sed 's:^\s*::;s: .*::'", shell=True))
    # number of php-fpm worker processes currently running
    fpms = int(subprocess.check_output('ps -Af | grep php-fpm | wc -l', shell=True))
    # DB connections in excess of one per php-fpm worker
    dbload = max(0, dbcons - fpms)

    numbulks = len(bl.prevbulks)
    # adjust effective limits from the measured load
    # NOTE(review): lines are missing around here — confirm the intended
    # direction of the load scaling.
    BLOCK_LIMIT = max(BLOCK_LIMIT, BLOCK_UPPER - BLOCK_LOADFAC * dbload)
    BULKLONG_LIMIT = max(BULK_LOWER, BULKLONG_LIMIT - BULK_LOADFAC * cpuload)
    # with a crowded bulk pool, lower the block threshold (floor 3600)
    if numbulks > MAX_BULK_IPS:
        BLOCK_LIMIT = max(3600, BLOCK_LOWER - (numbulks - MAX_BULK_IPS)*10)
    # if the bulk pool is still empty, clients will be faster, avoid having
    # them blocked in this case
    BLOCK_LIMIT = BLOCK_UPPER
    # collecting statistics
    # NOTE(review): initialisation of the unblocked/debulked/bulked/
    # emblocked/uablocked/blocked lists is missing from this excerpt.

    # write out new state file
    fd = open(BLOCKEDFILE, 'w')
    for k,v in stats.items():
        wasblocked = k in bl.prevblocks
        wasbulked = k in bl.prevbulks
        state = v.new_state(wasblocked, wasbulked)
        if state is not None:
            # (branch bodies recording the transitions are missing here)
            if state == 'uablock':
            elif state == 'emblock':
            elif state == 'block':
            elif state == 'bulk':
            fd.write("%s %s\n" % (k, state))
    # manually banned IPs are always written back unchanged
    for i in bl.blacklist:
        fd.write("%s ban\n" % i)

    # TODO write logs (need to collect some statistics)
    logstr = datetime.now().strftime('%d/%b/%Y:%H:%M:%S') + ' %s %s\n'
    fd = open(LOGFILE, 'a')
    fd.write(logstr % ('unblocked:', ', '.join(unblocked)))
    fd.write(logstr % (' debulked:', ', '.join(debulked)))
    fd.write(logstr % ('new bulks:', ', '.join(bulked)))
    fd.write(logstr % ('dir.block:', ', '.join(emblocked)))
    fd.write(logstr % (' ua block:', ', '.join(uablocked)))
    fd.write(logstr % ('new block:', ', '.join(blocked)))
    # IPv4 clients (':' filters out IPv6) that kept requesting while blocked
    for k,v in stats.items():
        if v.ignores_warnings() and k not in notlogged and ':' not in k:
            fd.write(logstr % ('Warning ignored:', k))