From: Sarah Hoffmann Date: Wed, 12 Dec 2012 18:38:48 +0000 (+0100) Subject: introducing extra pools for bulk users X-Git-Tag: deploy~661 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/932f5a0927971752d0183fb0c64dec898d3c17b1?ds=sidebyside;hp=-c introducing extra pools for bulk users --- 932f5a0927971752d0183fb0c64dec898d3c17b1 diff --git a/lib/init-website.php b/lib/init-website.php index ef8237fb..ce046eaa 100644 --- a/lib/init-website.php +++ b/lib/init-website.php @@ -2,50 +2,3 @@ require_once('init.php'); header('Content-type: text/html; charset=utf-8'); - - // check blocks in place for external servers - if (strpos($_SERVER["REMOTE_ADDR"],'193.63.75.') !== 0 && - strpos(CONST_WhitelistedIPs, ','.$_SERVER["REMOTE_ADDR"].',') === false) - { - - $aBucketKeys = array(); - - if (isset($_SERVER["HTTP_REFERER"])) $aBucketKeys[] = str_replace('www.','',strtolower(parse_url($_SERVER["HTTP_REFERER"], PHP_URL_HOST))); - if (isset($_SERVER["REMOTE_ADDR"])) $aBucketKeys[] = $_SERVER["REMOTE_ADDR"]; - if (isset($_GET["email"])) $aBucketKeys[] = $_GET["email"]; - - $fBucketVal = doBucket($aBucketKeys, - (defined('CONST_ConnectionBucket_PageType')?constant('CONST_ConnectionBucket_Cost_'.CONST_ConnectionBucket_PageType):1) + user_busy_cost(), - CONST_ConnectionBucket_LeakRate, CONST_ConnectionBucket_BlockLimit); - - if ($fBucketVal > CONST_ConnectionBucket_WaitLimit && $fBucketVal < CONST_ConnectionBucket_BlockLimit) - { - $m = getBucketMemcache(); - $iCurrentSleeping = $m->increment('sleepCounter'); - if (false === $iCurrentSleeping) - { - $m->add('sleepCounter', 0); - $iCurrentSleeping = $m->increment('sleepCounter'); - } - if ($iCurrentSleeping >= CONST_ConnectionBucket_MaxSleeping || isBucketSleeping($aBucketKeys)) - { - // Too many threads sleeping already. This becomes a hard block. 
- $fBucketVal = doBucket($aBucketKeys, CONST_ConnectionBucket_BlockLimit, CONST_ConnectionBucket_LeakRate, CONST_ConnectionBucket_BlockLimit); - } - else - { - setBucketSleeping($aBucketKeys, true); - sleep(($fBucketVal - CONST_ConnectionBucket_WaitLimit)/CONST_ConnectionBucket_LeakRate); - $fBucketVal = doBucket($aBucketKeys, CONST_ConnectionBucket_LeakRate, CONST_ConnectionBucket_LeakRate, CONST_ConnectionBucket_BlockLimit); - setBucketSleeping($aBucketKeys, false); - } - $m->decrement('sleepCounter'); - } - - if (strpos(CONST_BlockedIPs, ','.$_SERVER["REMOTE_ADDR"].',') !== false || $fBucketVal >= CONST_ConnectionBucket_BlockLimit) - { - echo "Your IP has been blocked. \n"; - echo "Please create a nominatim trac ticket (http://trac.openstreetmap.org/newticket?component=nominatim) to request this to be removed. \n"; - echo "Information on the Nominatim usage policy can be found here: http://wiki.openstreetmap.org/wiki/Nominatim#Usage_Policy \n"; - exit; - } diff --git a/utils/cron_banip.py b/utils/cron_banip.py new file mode 100755 index 00000000..4cc50fc5 --- /dev/null +++ b/utils/cron_banip.py @@ -0,0 +1,212 @@ +#!/usr/bin/python +# +# Search logs for high-bandwidth users and create a list of suspicious IPs. +# There are three states: bulk, block, ban. The first are bulk requesters +# that need throttling, the second bulk requesters that have overdone it +# and the last manually banned IPs. +# +# The list can then be used in apache using rewrite rules to +# direct bulk users to smaller thread pools or block them. A +# typical apache config that uses php-fpm pools would look +# like this: +# +# Alias /nominatim-www/ "/var/www/nominatim/" +# Alias /nominatim-bulk/ "/var/www/nominatim/" +# +# Options MultiViews FollowSymLinks +# AddType text/html .php +# +# +# +# AddHandler fcgi:/var/run/php5-fpm-www.sock .php +# +# +# AddHandler fcgi:/var/run/php5-fpm-bulk.sock .php +# +# +# Redirect 509 /nominatim-block/ +# ErrorDocument 509 "Bandwidth limit exceeded."
+# Redirect 403 /nominatim-ban/ +# ErrorDocument 403 "Access blocked." +# +# RewriteEngine On +# RewriteMap bulklist txt:/home/wherever/ip-block.map +# RewriteRule ^/(.*) /nominatim-${bulklist:%{REMOTE_ADDR}|www}/$1 [PT] +# + +import os +import psycopg2 +import datetime + +BASEDIR = os.path.normpath(os.path.join(os.path.realpath(__file__), '../..')) + +# +# DEFAULT SETTINGS +# +# Copy into settings/ip_blocks.conf and adapt as required. +# +BLOCKEDFILE= BASEDIR + '/settings/ip_blocks.map' +LOGFILE= BASEDIR + '/log/restricted_ip.log' + +# space-separated list of IPs that are never banned +WHITELIST = '' +# space-separated list of IPs manually blocked +BLACKLIST = '' + +# time before an automatically blocked IP is allowed back +BLOCKCOOLOFF_PERIOD='1 hour' +# quiet time before an IP is released from the bulk pool +BULKCOOLOFF_PERIOD='15 min' + +BULKLONG_LIMIT=8000 +BULKSHORT_LIMIT=2000 +BLOCK_UPPER=19000 +BLOCK_LOADFAC=300 + +# +# END OF DEFAULT SETTINGS +# + +try: + execfile(os.path.expanduser(BASEDIR + "/settings/ip_blocks.conf")) +except IOError: + pass + +# determine current load +fd = open("/proc/loadavg") +avgload = int(float(fd.readline().split()[1])) +fd.close() + +BLOCK_LIMIT = BLOCK_UPPER - BLOCK_LOADFAC * avgload + +# read the previous blocklist +WHITELIST = set(WHITELIST.split()) if WHITELIST else set() +prevblocks = [] +prevbulks = [] +BLACKLIST = set(BLACKLIST.split()) if BLACKLIST else set() +newblocks = set() +newbulks = set() + +try: + fd = open(BLOCKEDFILE) + for line in fd: + ip, typ = line.strip().split(' ') + if ip not in BLACKLIST: + if typ == 'block': + prevblocks.append(ip) + elif typ == 'bulk': + prevbulks.append(ip) + fd.close() +except IOError: + pass #ignore non-existing file + +conn = psycopg2.connect('dbname=nominatim') +cur = conn.cursor() + +# get the new block candidates +cur.execute(""" + SELECT ipaddress, max(count) FROM + ((SELECT * FROM + (SELECT ipaddress, sum(CASE WHEN type = 'search' THEN 2 ELSE 1 END) as count FROM new_query_log
+ WHERE starttime > now() - interval '1 hour' GROUP BY ipaddress) as i + WHERE count > %s) + UNION + (SELECT ipaddress, count * 4 FROM + (SELECT ipaddress, sum(CASE WHEN type = 'search' THEN 2 ELSE 1 END) as count FROM new_query_log + WHERE starttime > now() - interval '10 min' GROUP BY ipaddress) as i + WHERE count > %s)) as o + GROUP BY ipaddress +""", (BULKLONG_LIMIT, BULKSHORT_LIMIT)) + +bulkips = {} +emergencyblocks = [] + +for c in cur: + if c[0] not in WHITELIST and c[0] not in BLACKLIST: + if c[1] > BLOCK_UPPER and c[0] not in prevbulks: + newblocks.add(c[0]) + if c[0] not in prevblocks: + emergencyblocks.append(c[0]) + else: + bulkips[c[0]] = c[1] + +# IPs from the previous block list that are no longer among the current bulk candidates +deblockcandidates = set() +# IPs from the previous bulk list that are no longer among the current bulk candidates +debulkcandidates = set() +# new IPs to go into the block list +newlyblocked = [] + + +for ip in prevblocks: + if ip in bulkips: + newblocks.add(ip) + del bulkips[ip] + else: + deblockcandidates.add(ip) + +for ip in prevbulks: + if ip in bulkips: + if bulkips[ip] > BLOCK_LIMIT: + newblocks.add(ip) + newlyblocked.append(ip) + else: + newbulks.add(ip) + del bulkips[ip] + else: + debulkcandidates.add(ip) + +# cross-check deblock candidates +if deblockcandidates: + cur.execute(""" + SELECT DISTINCT ipaddress FROM new_query_log + WHERE ipaddress IN ('%s') AND starttime > now() - interval '%s' + """ % ("','".join(deblockcandidates), BLOCKCOOLOFF_PERIOD)) + + for c in cur: + newblocks.add(c[0]) + deblockcandidates.remove(c[0]) +# deblocked IPs go back to the bulk pool to catch the ones that simply +# ignored the HTTP error and just continue to hammer the API. +# Those that behave and have stopped will be debulked a minute later.
+for ip in deblockcandidates: + newbulks.add(ip) + +# cross-check debulk candidates +if debulkcandidates: + cur.execute(""" + SELECT DISTINCT ipaddress FROM new_query_log + WHERE ipaddress IN ('%s') AND starttime > now() - interval '%s' + """ % ("','".join(debulkcandidates), BULKCOOLOFF_PERIOD)) + + for c in cur: + newbulks.add(c[0]) + debulkcandidates.remove(c[0]) + +for ip in bulkips.iterkeys(): + newbulks.add(ip) + +# write out the new list +fd = open(BLOCKEDFILE, 'w') +for ip in newblocks: + fd.write(ip + " block\n") +for ip in newbulks: + fd.write(ip + " bulk\n") +for ip in BLACKLIST: + fd.write(ip + " ban\n") +fd.close() + +# write out the log +logstr = datetime.datetime.now().strftime('%Y-%m-%d %H:%M') + ' %s %s\n' +fd = open(LOGFILE, 'a') +if deblockcandidates: + fd.write(logstr % ('unblocked:', ', '.join(deblockcandidates))) +if debulkcandidates: + fd.write(logstr % (' debulked:', ', '.join(debulkcandidates))) +if bulkips: + fd.write(logstr % ('new bulks:', ', '.join(bulkips.keys()))) +if emergencyblocks: + fd.write(logstr % ('dir.block:', ', '.join(emergencyblocks))) +if newlyblocked: + fd.write(logstr % ('new block:', ', '.join(newlyblocked))) +fd.close() diff --git a/utils/cron_banip.sh b/utils/cron_banip.sh deleted file mode 100755 index 2e0c8dfb..00000000 --- a/utils/cron_banip.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash -# -# Create or update the list of temporarily banned IPs. -# - -BASEDIR="$( cd "$( dirname "$0" )" && cd .. && pwd )" -if [ "x$BASEDIR" == "x" ]; then - echo "Could not determine base dir." - exit -1 -fi - -BLOCKEDFILE=$BASEDIR/settings/ip_blocks -LOGFILE=$BASEDIR/log/ip_blocks.log - -LONG_PERIOD='1 hour' -SHORT_PERIOD='10 min' -COOLOFF_PERIOD='1 hour' - -REVLONG_LIMIT=20000 -REVSHORT_LIMIT=6000 -SRCHLONG_LIMIT=4000 -SRCHSHORT_LIMIT='10 min' - -PSQLCMD='psql -qtA -d nominatim' - -curload=`cat /proc/loadavg | sed 's:[. 
].*::'` -if [ "$curload" -gt "15" ]; then - REVSHORT_LIMIT=$((REVSHORT_LIMIT/2)) -fi - -# Blocking candidates -$PSQLCMD > $BLOCKEDFILE.newblocks << ENDOFQUERY -SELECT ipaddress FROM -((SELECT ipaddress FROM - (SELECT ipaddress, count(*) FROM new_query_log - WHERE type = 'reverse' AND starttime > now() - interval '$LONG_PERIOD' - GROUP BY ipaddress) - as v - WHERE count > $REVLONG_LIMIT) -UNION -(SELECT ipaddress FROM - (SELECT ipaddress, count(*) FROM new_query_log - WHERE type = 'reverse' AND starttime > now() - interval '$SHORT_PERIOD' - GROUP BY ipaddress) - as v - WHERE count > $REVSHORT_LIMIT) -UNION -(SELECT ipaddress FROM - (SELECT ipaddress, count(*) FROM new_query_log - WHERE type = 'search' AND starttime > now() - interval '$LONG_PERIOD' - GROUP BY ipaddress) - as v - WHERE count > $SRCHLONG_LIMIT) -UNION -(SELECT ipaddress FROM - (SELECT ipaddress, sum(endtime-starttime) as dur FROM new_query_log - WHERE type = 'search' AND starttime > now() - interval '$SHORT_PERIOD' - GROUP BY ipaddress) - as v - WHERE dur > '$SRCHSHORT_LIMIT') -) as q ORDER BY ipaddress; -ENDOFQUERY - -no_newblocks=`comm $BLOCKEDFILE.newblocks $BLOCKEDFILE -23 | wc -l` - -if [ "x$no_newblocks" != "x0" ]; then - date +"%x %X Newly blocked IPs: `comm $BLOCKEDFILE.newblocks $BLOCKEDFILE -23 | tr '\n' ' '`" >> $LOGFILE -fi - - -# Deblockable candidates -blocked=`tr '\n' ',' < $BLOCKEDFILE | sed "s:[[:space:]]::g;s:,$::;s:,:'),(':g"` - -if [ "x$blocked" == "x" ]; then - mv $BLOCKEDFILE.newblocks $BLOCKEDFILE -else - $PSQLCMD > $BLOCKEDFILE.newlifted << ENDOFQUERY - SELECT column1 FROM ( - VALUES ('$blocked') - EXCEPT - (SELECT DISTINCT ipaddress FROM new_query_log - WHERE starttime > now() - interval '$COOLOFF_PERIOD') - ) as q ORDER BY column1; -ENDOFQUERY - - no_lifted=`cat $BLOCKEDFILE.newlifted | wc -w` - - if [ "x$no_lifted" != "x0" ]; then - date +"%x %X Bans lifted: `tr '\n' ' ' < $BLOCKEDFILE.newlifted`" >> $LOGFILE - fi - - # Write out new blocks - cat $BLOCKEDFILE.newblocks 
$BLOCKEDFILE | sort -u | comm - $BLOCKEDFILE.newlifted -23 > $BLOCKEDFILE.new - mv $BLOCKEDFILE.new $BLOCKEDFILE - - rm $BLOCKEDFILE.newblocks $BLOCKEDFILE.newlifted -fi diff --git a/website/403.html b/website/403.html new file mode 100644 index 00000000..c5fd71f1 --- /dev/null +++ b/website/403.html @@ -0,0 +1,14 @@ + + +Access blocked + + +

Access blocked

+ +

You have been blocked because you have been overusing OSM's geocoding service. +Please be aware that OSM's resources are limited and shared between many users. +To have this block lifted, contact the Nominatim system administrator.

+ +

For more information, consult the usage policy for the OSM Nominatim server. + + diff --git a/website/509.html b/website/509.html new file mode 100644 index 00000000..047d9025 --- /dev/null +++ b/website/509.html @@ -0,0 +1,13 @@ + + +Bandwidth limit exceeded + + +

Bandwidth limit exceeded

+ +

You have been temporarily blocked because you have been overusing OSM's geocoding service. +Please adapt your scripts to reduce the number of requests and try again later.

+ +

For more information, consult the usage policy for the OSM Nominatim server. + +