+#!/usr/bin/python
+#
+# Search logs for high-bandwidth users and create a list of suspicious IPs.
+# There are three states: bulk, block, ban. The first are bulk requesters
+# that need throttling, the second bulk requesters that have overdone it
+# and the last manually banned IPs.
+#
+# The list can then be used in apache using rewrite rules to
+# direct bulk users to smaller thread pools or block them. A
+# typical apache config that uses php-fpm pools would look
+# like this:
+#
+# Alias /nominatim-www/ "/var/www/nominatim/"
+# Alias /nominatim-bulk/ "/var/www/nominatim/"
+# <Directory "/var/www/nominatim/">
+# Options MultiViews FollowSymLinks
+# AddType text/html .php
+# </Directory>
+#
+# <Location /nominatim-www>
+# AddHandler fcgi:/var/run/php5-fpm-www.sock .php
+# </Location>
+# <Location /nominatim-bulk>
+# AddHandler fcgi:/var/run/php5-fpm-bulk.sock .php
+# </Location>
+#
+# Redirect 509 /nominatim-block/
+# ErrorDocument 509 "Bandwidth limit exceeded."
+# Redirect 403 /nominatim-ban/
+# ErrorDocument 403 "Access blocked."
+#
+# RewriteEngine On
+# RewriteMap bulklist txt:/home/wherever/ip-block.map
+# RewriteRule ^/(.*) /nominatim-${bulklist:%{REMOTE_ADDR}|www}/$1 [PT]
+#
+
+import os
+import psycopg2
+import datetime
+
+BASEDIR = os.path.normpath(os.path.join(os.path.realpath(__file__), '../..'))
+
+#
+# DEFAULT SETTINGS
+#
+# Copy into settings/ip_blcoks.conf and adapt as required.
+#
+BLOCKEDFILE= BASEDIR + '/settings/ip_blocks.map'
+LOGFILE= BASEDIR + '/log/restricted_ip.log'
+
+# space-separated list of IPs that are never banned
+WHITELIST = ''
+# space-separated list of IPs manually blocked
+BLACKLIST = ''
+
+# time before a automatically blocked IP is allowed back
+BLOCKCOOLOFF_PERIOD='1 hour'
+# quiet time before an IP is released from the bulk pool
+BULKCOOLOFF_PERIOD='15 min'
+
+BULKLONG_LIMIT=8000
+BULKSHORT_LIMIT=2000
+BLOCK_UPPER=19000
+BLOCK_LOADFAC=300
+
+#
+# END OF DEFAULT SETTINGS
+#
+
+try:
+ execfile(os.path.expanduser(BASEDIR + "/settings/ip_blocks.conf"))
+except IOError:
+ pass
+
+# determine current load
+fd = open("/proc/loadavg")
+avgload = int(float(fd.readline().split()[1]))
+fd.close()
+
+BLOCK_LIMIT = BLOCK_UPPER - BLOCK_LOADFAC * avgload
+
+# read the previous blocklist
+WHITELIST = set(WHITELIST.split()) if WHITELIST else set()
+prevblocks = []
+prevbulks = []
+BLACKLIST = set(BLACKLIST.split()) if BLACKLIST else set()
+newblocks = set()
+newbulks = set()
+
+try:
+ fd = open(BLOCKEDFILE)
+ for line in fd:
+ ip, typ = line.strip().split(' ')
+ if ip not in BLACKLIST:
+ if typ == 'block':
+ prevblocks.append(ip)
+ elif typ == 'bulk':
+ prevbulks.append(ip)
+ fd.close()
+except IOError:
+ pass #ignore non-existing file
+
+conn = psycopg2.connect('dbname=nominatim')
+cur = conn.cursor()
+
+# get the new block candidates
+cur.execute("""
+ SELECT ipaddress, max(count) FROM
+ ((SELECT * FROM
+ (SELECT ipaddress, sum(CASE WHEN type = 'search' THEN 2 ELSE 1 END) as count FROM new_query_log
+ WHERE starttime > now() - interval '1 hour' GROUP BY ipaddress) as i
+ WHERE count > %s)
+ UNION
+ (SELECT ipaddress, count * 4 FROM
+ (SELECT ipaddress, sum(CASE WHEN type = 'search' THEN 2 ELSE 1 END) as count FROM new_query_log
+ WHERE starttime > now() - interval '10 min' GROUP BY ipaddress) as i
+ WHERE count > %s)) as o
+ GROUP BY ipaddress
+""", (BULKLONG_LIMIT, BULKSHORT_LIMIT))
+
+bulkips = {}
+emergencyblocks = []
+
+for c in cur:
+ if c[0] not in WHITELIST and c[0] not in BLACKLIST:
+ if c[1] > BLOCK_UPPER and c[0] not in prevbulks:
+ newblocks.add(c[0])
+ if c[0] not in prevblocks:
+ emergencyblocks.append(c[0])
+ else:
+ bulkips[c[0]] = c[1]
+
+# IPs from the block list that are no longer in the bulk list
+deblockcandidates = set()
+# IPs from the bulk list that are no longer in the bulk list
+debulkcandidates = set()
+# new IPs to go into the block list
+newlyblocked = []
+
+
+for ip in prevblocks:
+ if ip in bulkips:
+ newblocks.add(ip)
+ del bulkips[ip]
+ else:
+ deblockcandidates.add(ip)
+
+for ip in prevbulks:
+ if ip in bulkips:
+ if bulkips[ip] > BLOCK_LIMIT:
+ newblocks.add(ip)
+ newlyblocked.append(ip)
+ else:
+ newbulks.add(ip)
+ del bulkips[ip]
+ else:
+ debulkcandidates.add(ip)
+
+# cross-check deblock candidates
+if deblockcandidates:
+ cur.execute("""
+ SELECT DISTINCT ipaddress FROM new_query_log
+ WHERE ipaddress IN ('%s') AND starttime > now() - interval '%s'
+ """ % ("','".join(deblockcandidates), BLOCKCOOLOFF_PERIOD))
+
+ for c in cur:
+ newblocks.add(c[0])
+ deblockcandidates.remove(c[0])
+# deblocked IPs go back to the bulk pool to catch the ones that simply
+# ignored the HTTP error and just continue to hammer the API.
+# Those that behave and stopped will be debulked a minute later.
+for ip in deblockcandidates:
+ newbulks.add(ip)
+
+# cross-check debulk candidates
+if debulkcandidates:
+ cur.execute("""
+ SELECT DISTINCT ipaddress FROM new_query_log
+ WHERE ipaddress IN ('%s') AND starttime > now() - interval '%s'
+ """ % ("','".join(debulkcandidates), BULKCOOLOFF_PERIOD))
+
+ for c in cur:
+ newbulks.add(c[0])
+ debulkcandidates.remove(c[0])
+
+for ip in bulkips.iterkeys():
+ newbulks.add(ip)
+
+# write out the new list
+fd = open(BLOCKEDFILE, 'w')
+for ip in newblocks:
+ fd.write(ip + " block\n")
+for ip in newbulks:
+ fd.write(ip + " bulk\n")
+for ip in BLACKLIST:
+ fd.write(ip + " ban\n")
+fd.close()
+
+# write out the log
+logstr = datetime.datetime.now().strftime('%Y-%m-%d %H:%M') + ' %s %s\n'
+fd = open(LOGFILE, 'a')
+if deblockcandidates:
+ fd.write(logstr % ('unblocked:', ', '.join(deblockcandidates)))
+if debulkcandidates:
+ fd.write(logstr % (' debulked:', ', '.join(debulkcandidates)))
+if bulkips:
+ fd.write(logstr % ('new bulks:', ', '.join(bulkips.keys())))
+if emergencyblocks:
+ fd.write(logstr % ('dir.block:', ', '.join(emergencyblocks)))
+if newlyblocked:
+ fd.write(logstr % ('new block:', ', '.join(newlyblocked)))
+fd.close()