import re
import os
import sys
+import subprocess
from datetime import datetime, timedelta
from collections import defaultdict
BLOCKCOOLOFF_DELTA=timedelta(hours=1)
# quiet time before an IP is released from the bulk pool
BULKCOOLOFF_DELTA=timedelta(minutes=15)
+# length of the window in which rejected requests (HTTP 403/429) are counted to spot clients that keep querying despite being blocked
+BLOCKCHECK_DELTA=timedelta(minutes=1)
BULKLONG_LIMIT=8000
BULKSHORT_LIMIT=2000
if qp[0] == 'OPTIONS':
self.request = None
else:
- if '/search' in qp[1]:
+ if '/?' in qp[1]:
+ self.request = 'S'
+ elif '/search' in qp[1]:
self.request = 'S'
elif '/reverse' in qp[1]:
self.request = 'R'
elif '/details' in qp[1]:
self.request = 'D'
+ elif '/lookup' in qp[1]:
+ self.request = 'L'
else:
self.request = None
self.query = e['query']
self.short_api = 0
self.long_total = 0
self.long_api = 0
+ self.block_total = 0
self.bad_ua = False
def add_long(self, logentry):
self.short_api += 1
self.add_long(logentry)
+ def add_block(self, logentry):  # record one more rejected request for this IP; logentry itself is not inspected
+ self.block_total += 1  # the caller only passes entries with retcode 403/429 newer than blockstart
+
+ def ignores_warnings(self):  # True when the IP kept sending requests although they were being rejected
+ return self.block_total > 5  # more than 5 rejected requests counted via add_block()
+
def new_state(self, was_blocked, was_bulked):  # pick the next state for this IP; the visible branches return 'block', 'bulk' or None
if was_blocked:
# deblock only if the IP has been really quiet
# (properly catches the ones that simply ignore the HTTP error)
- return None if self.long_total < 5 else 'block'
+ return None if self.long_total < 20 else 'block'  # quiet threshold on long_total raised from 5 to 20
if self.long_api > BLOCK_UPPER or self.short_api > BLOCK_UPPER / 3:  # short-window limit is a third of the long-window limit
# client totally overdoing it
return 'block'
if was_bulked:
- if self.short_total < 5:
+ if self.short_total < 20:  # quiet threshold on short_total raised from 5 to 20
# client has stopped, debulk
return None
if self.long_api > BLOCK_LIMIT or self.short_api > BLOCK_LIMIT / 3:  # NOTE(review): indentation is lost in this view, so the body of this check cannot be confirmed
return 'bulk'
if self.long_api > BULKLONG_LIMIT or self.short_api > BULKSHORT_LIMIT:  # not bulked yet, but over the bulk thresholds
- if self.bad_ua:
- return 'uablock' # bad useragent
+ #if self.bad_ua:
+ # return 'uablock' # bad useragent
return 'bulk'  # with the 'uablock' branch commented out, bad_ua no longer affects the result here
return None
bl = BlockList()
shortstart = dt + BLOCKCOOLOFF_DELTA - BULKCOOLOFF_DELTA
+ blockstart = dt + BLOCKCOOLOFF_DELTA - BLOCKCHECK_DELTA
notlogged = bl.whitelist | bl.blacklist
stats = defaultdict(IPstats)
stats[l.ip].add_short(l)
if l.request is not None and l.retcode == 200:
total200 += 1
+ if l.date > blockstart and l.retcode in (403, 429):
+ stats[l.ip].add_block(l)
# adapt limits according to CPU and DB load
fd = open("/proc/loadavg")
# if the bulk pool is still nearly empty (numbulks < 10), clients will be
# faster; avoid having them blocked in this case
if numbulks < 10:
- BLOCK_LIMIT = 2*BLOCK_UPPER
+ BLOCK_UPPER *= 2
+ BLOCK_LIMIT = BLOCK_UPPER
# collecting statistics
fd.close()
# TODO write logs (need to collect some statistics)
- logstr = datetime.now().strftime('%Y-%m-%d %H:%M') + ' %s %s\n'
+ logstr = datetime.now().strftime('%d/%b/%Y:%H:%M:%S') + ' %s %s\n'
fd = open(LOGFILE, 'a')
if unblocked:
fd.write(logstr % ('unblocked:', ', '.join(unblocked)))
fd.write(logstr % (' ua block:', ', '.join(uablocked)))
if blocked:
fd.write(logstr % ('new block:', ', '.join(blocked)))
+ for k,v in stats.items():
+ if v.ignores_warnings() and k not in notlogged and ':' not in k:
+ fd.write(logstr % ('Warning ignored:', k))
fd.close()