]> git.openstreetmap.org Git - nominatim.git/commitdiff
introducing extra pools for bulk users
author Sarah Hoffmann <lonvia@denofr.de>
Wed, 12 Dec 2012 18:38:48 +0000 (19:38 +0100)
committer Sarah Hoffmann <lonvia@denofr.de>
Wed, 12 Dec 2012 18:38:48 +0000 (19:38 +0100)
lib/init-website.php
utils/cron_banip.py [new file with mode: 0755]
utils/cron_banip.sh [deleted file]
website/403.html [new file with mode: 0644]
website/509.html [new file with mode: 0644]

index ef8237fbfa7d4fbc6d968fc243d3978ff18c169b..ce046eaa6b752ef8d9bb41e4ef1e8025b5fc000a 100644 (file)
@@ -2,50 +2,3 @@
     require_once('init.php');
 
     header('Content-type: text/html; charset=utf-8');
     require_once('init.php');
 
     header('Content-type: text/html; charset=utf-8');
-
-    // check blocks in place for external servers
-    if (strpos($_SERVER["REMOTE_ADDR"],'193.63.75.') !== 0 &&
-        strpos(CONST_WhitelistedIPs, ','.$_SERVER["REMOTE_ADDR"].',') === false)
-    {
-
-       $aBucketKeys = array();
-
-       if (isset($_SERVER["HTTP_REFERER"])) $aBucketKeys[] = str_replace('www.','',strtolower(parse_url($_SERVER["HTTP_REFERER"], PHP_URL_HOST)));
-       if (isset($_SERVER["REMOTE_ADDR"])) $aBucketKeys[] = $_SERVER["REMOTE_ADDR"];
-       if (isset($_GET["email"])) $aBucketKeys[] = $_GET["email"];
-
-       $fBucketVal = doBucket($aBucketKeys, 
-                       (defined('CONST_ConnectionBucket_PageType')?constant('CONST_ConnectionBucket_Cost_'.CONST_ConnectionBucket_PageType):1) + user_busy_cost(),
-                       CONST_ConnectionBucket_LeakRate, CONST_ConnectionBucket_BlockLimit);
-
-       if ($fBucketVal > CONST_ConnectionBucket_WaitLimit && $fBucketVal < CONST_ConnectionBucket_BlockLimit)
-       {
-               $m = getBucketMemcache();
-               $iCurrentSleeping = $m->increment('sleepCounter');
-               if (false === $iCurrentSleeping)
-               {
-                       $m->add('sleepCounter', 0);
-                       $iCurrentSleeping = $m->increment('sleepCounter');
-               }
-               if ($iCurrentSleeping >= CONST_ConnectionBucket_MaxSleeping || isBucketSleeping($aBucketKeys))
-               {
-                       // Too many threads sleeping already.  This becomes a hard block.
-                       $fBucketVal = doBucket($aBucketKeys, CONST_ConnectionBucket_BlockLimit, CONST_ConnectionBucket_LeakRate, CONST_ConnectionBucket_BlockLimit);
-               }
-               else
-               {
-                       setBucketSleeping($aBucketKeys, true);
-                       sleep(($fBucketVal - CONST_ConnectionBucket_WaitLimit)/CONST_ConnectionBucket_LeakRate);
-                       $fBucketVal = doBucket($aBucketKeys, CONST_ConnectionBucket_LeakRate, CONST_ConnectionBucket_LeakRate, CONST_ConnectionBucket_BlockLimit);
-                       setBucketSleeping($aBucketKeys, false);
-               }
-               $m->decrement('sleepCounter');
-       }
-
-       if (strpos(CONST_BlockedIPs, ','.$_SERVER["REMOTE_ADDR"].',') !== false || $fBucketVal >= CONST_ConnectionBucket_BlockLimit)
-       {
-               echo "Your IP has been blocked. \n";
-               echo "Please create a nominatim trac ticket (http://trac.openstreetmap.org/newticket?component=nominatim) to request this to be removed. \n";
-               echo "Information on the Nominatim usage policy can be found here: http://wiki.openstreetmap.org/wiki/Nominatim#Usage_Policy \n";
-               exit;
-       }
diff --git a/utils/cron_banip.py b/utils/cron_banip.py
new file mode 100755 (executable)
index 0000000..4cc50fc
--- /dev/null
@@ -0,0 +1,212 @@
+#!/usr/bin/python
+#
+# Search logs for high-bandwidth users and create a list of suspicious IPs.
+# There are three states: bulk, block, ban. The first are bulk requesters
+# that need throttling, the second bulk requesters that have overdone it
+# and the last manually banned IPs.
+#
+# The list can then be used in apache using rewrite rules to
+# direct bulk users to smaller thread pools or block them. A
+# typical apache config that uses php-fpm pools would look
+# like this:
+#
+#    Alias /nominatim-www/ "/var/www/nominatim/"
+#    Alias /nominatim-bulk/ "/var/www/nominatim/"
+#    <Directory "/var/www/nominatim/">
+#        Options MultiViews FollowSymLinks
+#        AddType text/html   .php
+#    </Directory>
+#
+#    <Location /nominatim-www>
+#        AddHandler fcgi:/var/run/php5-fpm-www.sock .php
+#    </Location>
+#    <Location /nominatim-bulk>
+#        AddHandler fcgi:/var/run/php5-fpm-bulk.sock .php
+#    </Location>
+#
+#    Redirect 509 /nominatim-block/
+#    ErrorDocument 509 "Bandwidth limit exceeded."
+#    Redirect 403 /nominatim-ban/
+#    ErrorDocument 403 "Access blocked."
+#
+#    RewriteEngine On
+#    RewriteMap bulklist txt:/home/wherever/ip-block.map
+#    RewriteRule ^/(.*) /nominatim-${bulklist:%{REMOTE_ADDR}|www}/$1 [PT]
+#
+
+import os
+import psycopg2
+import datetime
+
+BASEDIR = os.path.normpath(os.path.join(os.path.realpath(__file__), '../..'))
+
+#
+# DEFAULT SETTINGS
+#
+# Copy into settings/ip_blocks.conf and adapt as required.
+#
+BLOCKEDFILE= BASEDIR + '/settings/ip_blocks.map'
+LOGFILE= BASEDIR + '/log/restricted_ip.log'
+
+# space-separated list of IPs that are never banned
+WHITELIST = ''
+# space-separated list of IPs manually blocked
+BLACKLIST = ''
+
+# time before an automatically blocked IP is allowed back
+BLOCKCOOLOFF_PERIOD='1 hour'
+# quiet time before an IP is released from the bulk pool
+BULKCOOLOFF_PERIOD='15 min'
+
+BULKLONG_LIMIT=8000
+BULKSHORT_LIMIT=2000
+BLOCK_UPPER=19000
+BLOCK_LOADFAC=300
+
+#
+# END OF DEFAULT SETTINGS
+#
+
+try:
+    execfile(os.path.expanduser(BASEDIR + "/settings/ip_blocks.conf"))
+except IOError:
+    pass
+
+# determine current load
+fd = open("/proc/loadavg")
+avgload = int(float(fd.readline().split()[1]))
+fd.close()
+
+BLOCK_LIMIT = BLOCK_UPPER - BLOCK_LOADFAC * avgload
+
+# read the previous blocklist
+WHITELIST = set(WHITELIST.split()) if WHITELIST else set()
+prevblocks = []
+prevbulks = []
+BLACKLIST = set(BLACKLIST.split()) if BLACKLIST else set()
+newblocks = set()
+newbulks = set()
+
+try:
+    fd = open(BLOCKEDFILE)
+    for line in fd:
+        ip, typ = line.strip().split(' ')
+        if ip not in BLACKLIST:
+            if typ == 'block':
+                prevblocks.append(ip)
+            elif typ == 'bulk':
+                prevbulks.append(ip)
+    fd.close()
+except IOError:
+    pass #ignore non-existing file
+
+conn = psycopg2.connect('dbname=nominatim')
+cur = conn.cursor()
+
+# get the new block candidates
+cur.execute("""
+  SELECT ipaddress, max(count) FROM
+   ((SELECT * FROM
+     (SELECT ipaddress, sum(CASE WHEN type = 'search' THEN 2 ELSE 1 END) as count FROM new_query_log 
+      WHERE starttime > now() - interval '1 hour' GROUP BY ipaddress) as i
+   WHERE count > %s)
+   UNION
+   (SELECT ipaddress, count * 4 FROM
+     (SELECT ipaddress, sum(CASE WHEN type = 'search' THEN 2 ELSE 1 END) as count FROM new_query_log 
+      WHERE starttime > now() - interval '10 min' GROUP BY ipaddress) as i
+   WHERE count > %s)) as o
+  GROUP BY ipaddress
+""", (BULKLONG_LIMIT, BULKSHORT_LIMIT))
+
+bulkips = {}
+emergencyblocks = []
+
+for c in cur:
+    if c[0] not in WHITELIST and c[0] not in BLACKLIST:
+        if c[1] > BLOCK_UPPER and c[0] not in prevbulks:
+            newblocks.add(c[0])
+            if c[0] not in prevblocks:
+                emergencyblocks.append(c[0])
+        else:
+            bulkips[c[0]] = c[1]
+
+# IPs from the block list that are no longer in the bulk list
+deblockcandidates = set()
+# IPs from the previous bulk list that are no longer among the bulk candidates
+debulkcandidates = set()
+# new IPs to go into the block list
+newlyblocked = []
+
+
+for ip in prevblocks:
+    if ip in bulkips:
+        newblocks.add(ip)
+        del bulkips[ip]
+    else:
+        deblockcandidates.add(ip)    
+        
+for ip in prevbulks:
+    if ip in bulkips:
+        if bulkips[ip] > BLOCK_LIMIT:
+            newblocks.add(ip)
+            newlyblocked.append(ip)
+        else:
+            newbulks.add(ip)
+        del bulkips[ip]
+    else:
+        debulkcandidates.add(ip)
+
+# cross-check deblock candidates
+if deblockcandidates:
+    cur.execute("""
+        SELECT DISTINCT ipaddress FROM new_query_log
+        WHERE ipaddress IN ('%s') AND starttime > now() - interval '%s'
+        """ % ("','".join(deblockcandidates), BLOCKCOOLOFF_PERIOD))
+
+    for c in cur:
+        newblocks.add(c[0])
+        deblockcandidates.remove(c[0])
+# deblocked IPs go back to the bulk pool to catch the ones that simply
+# ignored the HTTP error and just continue to hammer the API.
+# Those that behave and stopped will be debulked a minute later.
+for ip in deblockcandidates:
+    newbulks.add(ip)
+
+# cross-check debulk candidates
+if debulkcandidates:
+    cur.execute("""
+        SELECT DISTINCT ipaddress FROM new_query_log
+        WHERE ipaddress IN ('%s') AND starttime > now() - interval '%s'
+        """ % ("','".join(debulkcandidates), BULKCOOLOFF_PERIOD))
+
+    for c in cur:
+        newbulks.add(c[0])
+        debulkcandidates.remove(c[0])
+
+for ip in bulkips.iterkeys():
+    newbulks.add(ip)
+
+# write out the new list
+fd = open(BLOCKEDFILE, 'w')
+for ip in newblocks:
+    fd.write(ip + " block\n")
+for ip in newbulks:
+    fd.write(ip + " bulk\n")
+for ip in BLACKLIST:
+    fd.write(ip + " ban\n")
+fd.close()
+
+# write out the log
+logstr = datetime.datetime.now().strftime('%Y-%m-%d %H:%M') + ' %s %s\n'
+fd = open(LOGFILE, 'a')
+if deblockcandidates:
+    fd.write(logstr % ('unblocked:', ', '.join(deblockcandidates)))
+if debulkcandidates:
+    fd.write(logstr % (' debulked:', ', '.join(debulkcandidates)))
+if bulkips:
+    fd.write(logstr % ('new bulks:', ', '.join(bulkips.keys())))
+if emergencyblocks:
+    fd.write(logstr % ('dir.block:', ', '.join(emergencyblocks)))
+if newlyblocked:
+    fd.write(logstr % ('new block:', ', '.join(newlyblocked)))
+fd.close()
diff --git a/utils/cron_banip.sh b/utils/cron_banip.sh
deleted file mode 100755 (executable)
index 2e0c8df..0000000
+++ /dev/null
@@ -1,97 +0,0 @@
-#!/bin/bash
-#
-# Create or update the list of temporarily banned IPs.
-#
-
-BASEDIR="$( cd "$( dirname "$0" )" && cd .. && pwd )"
-if [ "x$BASEDIR" == "x" ]; then
-    echo "Could not determine base dir."
-    exit -1
-fi
-
-BLOCKEDFILE=$BASEDIR/settings/ip_blocks
-LOGFILE=$BASEDIR/log/ip_blocks.log
-
-LONG_PERIOD='1 hour'
-SHORT_PERIOD='10 min'
-COOLOFF_PERIOD='1 hour'
-
-REVLONG_LIMIT=20000
-REVSHORT_LIMIT=6000
-SRCHLONG_LIMIT=4000
-SRCHSHORT_LIMIT='10 min'
-
-PSQLCMD='psql -qtA -d nominatim'
-
-curload=`cat /proc/loadavg | sed 's:[. ].*::'`
-if [ "$curload" -gt "15" ]; then
-  REVSHORT_LIMIT=$((REVSHORT_LIMIT/2))
-fi
-
-# Blocking candidates
-$PSQLCMD > $BLOCKEDFILE.newblocks << ENDOFQUERY
-SELECT ipaddress FROM
-((SELECT ipaddress FROM
-  (SELECT ipaddress, count(*) FROM new_query_log
-   WHERE type = 'reverse' AND starttime > now() - interval '$LONG_PERIOD'
-   GROUP BY ipaddress)
-  as v
-  WHERE count > $REVLONG_LIMIT) 
-UNION
-(SELECT ipaddress FROM
-  (SELECT ipaddress, count(*) FROM new_query_log
-   WHERE type = 'reverse' AND starttime > now() - interval '$SHORT_PERIOD'
-   GROUP BY ipaddress)
-  as v
-  WHERE count > $REVSHORT_LIMIT) 
-UNION
-(SELECT ipaddress FROM
-  (SELECT ipaddress, count(*) FROM new_query_log
-   WHERE type = 'search' AND starttime > now() - interval '$LONG_PERIOD'
-   GROUP BY ipaddress)
-  as v
-  WHERE count > $SRCHLONG_LIMIT) 
-UNION
-(SELECT ipaddress FROM
-  (SELECT ipaddress, sum(endtime-starttime) as dur FROM new_query_log
-   WHERE type = 'search' AND starttime > now() - interval '$SHORT_PERIOD'
-   GROUP BY ipaddress)
-  as v
-  WHERE dur > '$SRCHSHORT_LIMIT')
-) as q ORDER BY ipaddress;
-ENDOFQUERY
-
-no_newblocks=`comm $BLOCKEDFILE.newblocks $BLOCKEDFILE -23 | wc -l`
-
-if [ "x$no_newblocks" != "x0" ]; then
-    date +"%x %X Newly blocked IPs: `comm $BLOCKEDFILE.newblocks $BLOCKEDFILE -23 | tr '\n' ' '`" >> $LOGFILE
-fi
-
-
-# Deblockable candidates
-blocked=`tr '\n' ',' < $BLOCKEDFILE | sed "s:[[:space:]]::g;s:,$::;s:,:'),(':g"`
-
-if [ "x$blocked" == "x" ]; then
-  mv $BLOCKEDFILE.newblocks $BLOCKEDFILE 
-else
-    $PSQLCMD > $BLOCKEDFILE.newlifted << ENDOFQUERY
-    SELECT column1 FROM (
-    VALUES ('$blocked')
-    EXCEPT
-    (SELECT DISTINCT ipaddress FROM new_query_log
-     WHERE starttime > now() - interval '$COOLOFF_PERIOD')
-    ) as q ORDER BY column1;
-ENDOFQUERY
-
-    no_lifted=`cat $BLOCKEDFILE.newlifted | wc -w`
-
-    if [ "x$no_lifted" != "x0" ]; then
-        date +"%x %X Bans lifted: `tr '\n' ' ' < $BLOCKEDFILE.newlifted`" >> $LOGFILE
-    fi
-
-    # Write out new blocks
-    cat $BLOCKEDFILE.newblocks $BLOCKEDFILE | sort -u | comm - $BLOCKEDFILE.newlifted -23 > $BLOCKEDFILE.new
-    mv $BLOCKEDFILE.new $BLOCKEDFILE
-
-    rm $BLOCKEDFILE.newblocks $BLOCKEDFILE.newlifted
-fi
diff --git a/website/403.html b/website/403.html
new file mode 100644 (file)
index 0000000..c5fd71f
--- /dev/null
@@ -0,0 +1,14 @@
+<html>
+<head>
+<title>Access blocked</title>
+</head>
+<body>
+<h1>Access blocked</h1>
+
+<p>You have been blocked because you have been overusing OSM's geocoding service.
+Please be aware that OSM's resources are limited and shared between many users.
+To have this block lifted, contact the <a href="http://wiki.openstreetmap.org/wiki/System_Administrators">Nominatim system administrator</a>.</p>
+
+<p>For more information, consult the <a href="http://wiki.openstreetmap.org/wiki/Nominatim_usage_policy">usage policy</a> for the OSM Nominatim server.</p>
+</body>
+</html>
diff --git a/website/509.html b/website/509.html
new file mode 100644 (file)
index 0000000..047d902
--- /dev/null
@@ -0,0 +1,13 @@
+<html>
+<head>
+<title>Bandwidth limit exceeded</title>
+</head>
+<body>
+<h1>Bandwidth limit exceeded</h1>
+
+<p>You have been temporarily blocked because you have been overusing OSM's geocoding service.
+Please adapt your scripts to reduce the number of requests and try again later.</p>
+
+<p>For more information, consult the <a href="http://wiki.openstreetmap.org/wiki/Nominatim_usage_policy">usage policy</a> for the OSM Nominatim server.</p>
+</body>
+</html>