]> git.openstreetmap.org Git - nominatim.git/blob - utils/cron_ipanalyse.py
Merge remote-tracking branch 'upstream/master'
[nominatim.git] / utils / cron_ipanalyse.py
1 #!/usr/bin/python3
2 #
# Search apache logs for high-bandwidth users and create a list of suspicious IPs.
# There are three states: bulk, block, ban. The first are bulk requesters
# that need throttling, the second bulk requesters that have overdone it
# and the last manually banned IPs.
7 #
8
9 import re
10 import os
11 import sys
12 from datetime import datetime, timedelta
13 from collections import defaultdict
14
15 #
16 # DEFAULT SETTINGS
17 #
18 # Copy into settings/ip_blcoks.conf and adapt as required.
19 #
20 BASEDIR = os.path.normpath(os.path.join(os.path.realpath(__file__), '../..'))
21 BLOCKEDFILE= BASEDIR + '/settings/ip_blocks.map'
22 LOGFILE= BASEDIR + '/log/restricted_ip.log'
23
24 # space-separated list of IPs that are never banned
25 WHITELIST = ''
26 # space-separated list of IPs manually blocked
27 BLACKLIST = ''
28 # user-agents that should be blocked from bulk mode
29 # (matched with startswith)
30 UA_BLOCKLIST = ()
31
32 # time before a automatically blocked IP is allowed back
33 BLOCKCOOLOFF_DELTA=timedelta(hours=1)
34 # quiet time before an IP is released from the bulk pool
35 BULKCOOLOFF_DELTA=timedelta(minutes=15)
36
37 BULKLONG_LIMIT=8000
38 BULKSHORT_LIMIT=2000
39 BLOCK_UPPER=19000
40 BLOCK_LOWER=4000
41 BLOCK_LOADFAC=380
42 BULK_LOADFAC=160
43 BULK_LOWER=1500
44 MAX_BULK_IPS=85
45
46 #
47 # END OF DEFAULT SETTINGS
48 #
49
50 try:
51     with open(BASEDIR + "/settings/ip_blocks.conf") as f:
52         code = compile(f.read(), BASEDIR + "/settings/ip_blocks.conf", 'exec')
53         exec(code)
54 except IOError:
55     pass
56
57 BLOCK_LIMIT = BLOCK_LOWER
58
59 time_regex = r'(?P<t_day>\d\d)/(?P<t_month>[A-Za-z]+)/(?P<t_year>\d\d\d\d):(?P<t_hour>\d\d):(?P<t_min>\d\d):(?P<t_sec>\d\d) [+-]\d\d\d\d'
60
61 format_pat= re.compile(r'(?P<ip>[a-f\d\.:]+) - - \['+ time_regex + r'] "(?P<query>.*?)" (?P<return>\d+) (?P<bytes>\d+) "(?P<referer>.*?)" "(?P<ua>.*?)"')
62 time_pat= re.compile(r'[a-f\d:\.]+ - - \[' + time_regex + '\] ')
63
64 logtime_pat = "%d/%b/%Y:%H:%M:%S %z"
65
66 MONTHS = { 'Jan' : 1, 'Feb' : 2, 'Mar' : 3, 'Apr' : 4, 'May' : 5, 'Jun' : 6,
67            'Jul' : 7, 'Aug' : 8, 'Sep' : 9, 'Oct' : 10, 'Nov' : 11, 'Dec' : 12 }
68
69 class LogEntry:
70     def __init__(self, logline):
71         e = format_pat.match(logline)
72         if e is None:
73             raise ValueError("Invalid log line:", logline)
74         e = e.groupdict()
75         self.ip = e['ip']
76         self.date = datetime(int(e['t_year']), MONTHS[e['t_month']], int(e['t_day']),
77                              int(e['t_hour']), int(e['t_min']), int(e['t_sec']))
78         qp = e['query'].split(' ', 2) 
79         if len(qp) < 2:
80             self.request = None
81             self.query = None
82         else:
83             self.query = qp[1]
84             if qp[0] == 'OPTIONS':
85                 self.request = None
86             else:
87                 if '/search' in qp[1]:
88                     self.request = 'S'
89                 elif '/reverse' in qp[1]:
90                     self.request = 'R'
91                 elif '/details' in qp[1]:
92                     self.request = 'D'
93                 else:
94                     self.request = None
95         self.query = e['query']
96         self.retcode = int(e['return'])
97         self.referer = e['referer'] if e['referer'] != '-' else None
98         self.ua = e['ua'] if e['ua'] != '-' else None
99
100     def get_log_time(logline):
101         e = format_pat.match(logline)
102         if e is None:
103             return None
104         e = e.groupdict()
105         #return datetime.strptime(e['time'], logtime_pat).replace(tzinfo=None)
106         return datetime(int(e['t_year']), MONTHS[e['t_month']], int(e['t_day']),
107                              int(e['t_hour']), int(e['t_min']), int(e['t_sec']))
108
109
110 class LogFile:
111     """ An apache log file, unpacked. """
112
113     def __init__(self, filename):
114         self.fd = open(filename)
115         self.len = os.path.getsize(filename)
116
117     def __del__(self):
118         self.fd.close()
119
120     def seek_next(self, abstime):
121         self.fd.seek(abstime)
122         self.fd.readline()
123         l = self.fd.readline()
124         return LogEntry.get_log_time(l) if l is not None else None
125
126     def seek_to_date(self, target):
127         # start position for binary search
128         fromseek = 0
129         fromdate = self.seek_next(0)
130         if fromdate > target:
131             return True
132         # end position for binary search
133         toseek = -100
134         while -toseek < self.len:
135             todate = self.seek_next(self.len + toseek)
136             if todate is not None:
137                 break
138             toseek -= 100
139         if todate is None or todate < target:
140             return False
141         toseek = self.len + toseek
142
143
144         while True:
145             bps = (toseek - fromseek) / (todate - fromdate).total_seconds()
146             newseek = fromseek + int((target - fromdate).total_seconds() * bps)
147             newdate = self.seek_next(newseek)
148             if newdate is None:
149                 return False;
150             error = abs((target - newdate).total_seconds())
151             if error < 1:
152                 return True
153             if newdate > target:
154                 toseek = newseek
155                 todate = newdate
156                 oldfromseek = fromseek
157                 fromseek = toseek - error * bps
158                 while True:
159                     if fromseek <= oldfromseek:
160                         fromseek = oldfromseek
161                         fromdate = self.seek_next(fromseek)
162                         break
163                     fromdate = self.seek_next(fromseek)
164                     if fromdate < target:
165                         break;
166                     bps *=2
167                     fromseek -= error * bps
168             else:
169                 fromseek = newseek
170                 fromdate = newdate
171                 oldtoseek = toseek
172                 toseek = fromseek + error * bps
173                 while True:
174                     if toseek > oldtoseek:
175                         toseek = oldtoseek
176                         todate = self.seek_next(toseek)
177                         break
178                     todate = self.seek_next(toseek)
179                     if todate > target:
180                         break
181                     bps *=2
182                     toseek += error * bps
183             if toseek - fromseek < 500:
184                 return True
185
186
187     def loglines(self):
188         for l in self.fd:
189             try:
190                 yield LogEntry(l)
191             except ValueError:
192                 pass # ignore invalid lines
193
194 class BlockList:
195
196     def __init__(self):
197         self.whitelist = set(WHITELIST.split()) if WHITELIST else set()
198         self.blacklist = set(BLACKLIST.split()) if BLACKLIST else set()
199         self.prevblocks = set()
200         self.prevbulks = set()
201
202         try:
203             fd = open(BLOCKEDFILE)
204             for line in fd:
205                 ip, typ = line.strip().split(' ')
206                 if ip not in self.blacklist:
207                     if typ == 'block':
208                         self.prevblocks.add(ip)
209                     elif typ == 'bulk':
210                         self.prevbulks.add(ip)
211             fd.close()
212         except IOError:
213             pass #ignore non-existing file
214
215
216 class IPstats:
217
218     def __init__(self):
219         self.short_total = 0
220         self.short_api = 0
221         self.long_total = 0
222         self.long_api = 0
223         self.bad_ua = False
224
225     def add_long(self, logentry):
226         self.long_total += 1
227         if logentry.request is not None:
228             self.long_api += 1
229         if not self.bad_ua:
230             if logentry.ua is None:
231                 self.bad_ua = True
232
233     def add_short(self, logentry):
234         self.short_total += 1
235         if logentry.request is not None:
236             self.short_api += 1
237         self.add_long(logentry)
238
239     def new_state(self, was_blocked, was_bulked):
240         if was_blocked:
241             # deblock only if the IP has been really quiet
242             # (properly catches the ones that simply ignore the HTTP error)
243             return None if self.long_total < 5 else 'block'
244         if self.long_api > BLOCK_UPPER or self.short_api > BLOCK_UPPER / 3:
245                 # client totally overdoing it
246                 return 'block'
247         if was_bulked:
248             if self.short_total < 5:
249                 # client has stopped, debulk
250                 return None
251             if self.long_api > BLOCK_LIMIT or self.short_api > BLOCK_LIMIT / 3:
252                 # client is still hammering us, block
253                 return 'emblock'
254             return 'bulk'
255
256         if self.long_api > BULKLONG_LIMIT or self.short_api > BULKSHORT_LIMIT:
257             if self.bad_ua:
258                 return 'uablock' # bad useragent
259             return 'bulk'
260
261         return None
262
263
264
265 if __name__ == '__main__':
266     if len(sys.argv) < 2:
267         print("Usage: %s logfile startdate" % sys.argv[0])
268         sys.exit(-1)
269
270     if len(sys.argv) == 2:
271         dt = datetime.now() - BLOCKCOOLOFF_DELTA
272     else:
273         dt = datetime.strptime(sys.argv[2], "%Y-%m-%d %H:%M:%S")
274
275     if os.path.getsize(sys.argv[1]) < 2*1030*1024:
276         sys.exit(0) # not enough data
277
278     lf = LogFile(sys.argv[1])
279     if not lf.seek_to_date(dt):
280         sys.exit(0)
281
282     bl = BlockList()
283
284     shortstart = dt + BLOCKCOOLOFF_DELTA - BULKCOOLOFF_DELTA
285     notlogged = bl.whitelist | bl.blacklist
286
287     stats = defaultdict(IPstats)
288
289     for l in lf.loglines():
290         if l.ip not in notlogged:
291             stats[l.ip].add_long(l)
292         if l.date > shortstart:
293             break
294
295     total200 = 0
296     for l in lf.loglines():
297         if l.ip not in notlogged:
298             stats[l.ip].add_short(l)
299         if l.request is not None and l.retcode == 200:
300             total200 += 1
301
302     # adapt limits according to CPU and DB load
303     fd = open("/proc/loadavg")
304     cpuload = int(float(fd.readline().split()[2]))
305     fd.close()
306     dbload = total200 / BULKCOOLOFF_DELTA.total_seconds()
307
308     numbulks = len(bl.prevbulks)
309     BLOCK_LIMIT = max(BLOCK_LIMIT, BLOCK_UPPER - BLOCK_LOADFAC * (dbload - 75))
310     BULKLONG_LIMIT = max(BULK_LOWER, BULKLONG_LIMIT - BULK_LOADFAC * (cpuload - 14))
311     if numbulks > MAX_BULK_IPS:
312         BLOCK_LIMIT = max(3600, BLOCK_LOWER - (numbulks - MAX_BULK_IPS)*10)
313     # if the bulk pool is still empty, clients will be faster, avoid having
314     # them blocked in this case
315     if numbulks < 10:
316         BLOCK_LIMIT = 2*BLOCK_UPPER
317
318
319     # collecting statistics
320     unblocked = []
321     debulked = []
322     bulked = []
323     blocked = []
324     uablocked = []
325     emblocked = []
326     # write out new state file
327     fd = open(BLOCKEDFILE, 'w')
328     for k,v in stats.items():
329         wasblocked = k in bl.prevblocks
330         wasbulked = k in bl.prevbulks
331         state = v.new_state(wasblocked, wasbulked)
332         if state is not None:
333             if state == 'uablock':
334                 uablocked.append(k)
335                 state = 'block'
336             elif state == 'emblock':
337                 emblocked.append(k)
338                 state = 'block'
339             elif state == 'block':
340                 if not wasblocked:
341                     blocked.append(k)
342             elif state == 'bulk':
343                 if not wasbulked:
344                     bulked.append(k)
345             fd.write("%s %s\n" % (k, state))
346         else:
347             if wasblocked:
348                 unblocked.append(k)
349             elif wasbulked:
350                 debulked.append(k)
351     for i in bl.blacklist:
352         fd.write("%s ban\n" % k)
353     fd.close()
354
355     # TODO write logs (need to collect some statistics)
356     logstr = datetime.now().strftime('%Y-%m-%d %H:%M') + ' %s %s\n'
357     fd = open(LOGFILE, 'a')
358     if unblocked:
359         fd.write(logstr % ('unblocked:', ', '.join(unblocked)))
360     if debulked:
361         fd.write(logstr % (' debulked:', ', '.join(debulked)))
362     if bulked:
363         fd.write(logstr % ('new bulks:', ', '.join(bulked)))
364     if emblocked:
365         fd.write(logstr % ('dir.block:', ', '.join(emblocked)))
366     if uablocked:
367         fd.write(logstr % (' ua block:', ', '.join(uablocked)))
368     if blocked:
369         fd.write(logstr % ('new block:', ', '.join(blocked)))
370     fd.close()