Merge remote-tracking branch 'upstream/master'

[nominatim.git] / utils / cron_ipanalyse.py
diff --git a/utils/cron_ipanalyse.py b/utils/cron_ipanalyse.py

index 262090a4710c362e44342b1c973fc9796be1e3b9..05b0b7f9e18382e05c33ba6a0c67d69bbc8050d0 100755 (executable)
--- a/utils/cron_ipanalyse.py
+++ b/utils/cron_ipanalyse.py
@@ -9,6 +9,7 @@
  import re
  import os
  import sys
+import subprocess
  from datetime import datetime, timedelta
  from collections import defaultdict
  
@@ -58,8 +59,8 @@ BLOCK_LIMIT = BLOCK_LOWER
  
  time_regex = r'(?P<t_day>\d\d)/(?P<t_month>[A-Za-z]+)/(?P<t_year>\d\d\d\d):(?P<t_hour>\d\d):(?P<t_min>\d\d):(?P<t_sec>\d\d) [+-]\d\d\d\d'
  
-format_pat= re.compile(r'(?P<ip>[(\d\.)]+) - - \['+ time_regex + r'] "(?P<query>.*?)" (?P<return>\d+) (?P<bytes>\d+) "(?P<referer>.*?)" "(?P<ua>.*?)"')
-time_pat= re.compile(r'[(\d\.)]+ - - \[' + time_regex + '\] ')
+format_pat= re.compile(r'(?P<ip>[a-f\d\.:]+) - - \['+ time_regex + r'] "(?P<query>.*?)" (?P<return>\d+) (?P<bytes>\d+) "(?P<referer>.*?)" "(?P<ua>.*?)"')
+time_pat= re.compile(r'[a-f\d:\.]+ - - \[' + time_regex + '\] ')
  
  logtime_pat = "%d/%b/%Y:%H:%M:%S %z"
  
@@ -84,11 +85,11 @@ class LogEntry:
              if qp[0] == 'OPTIONS':
                  self.request = None
              else:
-                if qp[1].startswith('/search'):
+                if '/search' in qp[1]:
                      self.request = 'S'
-                elif qp[1].startswith('/reverse'):
+                elif '/reverse' in qp[1]:
                      self.request = 'R'
-                elif qp[1].startswith('/details'):
+                elif '/details' in qp[1]:
                      self.request = 'D'
                  else:
                      self.request = None
@@ -186,7 +187,10 @@ class LogFile:
  
      def loglines(self):
          for l in self.fd:
-            yield LogEntry(l)
+            try:
+                yield LogEntry(l)
+            except ValueError:
+                pass # ignore invalid lines
  
  class BlockList:
  
@@ -237,12 +241,12 @@ class IPstats:
          if was_blocked:
              # deblock only if the IP has been really quiet
              # (properly catches the ones that simply ignore the HTTP error)
-            return None if self.long_total < 5 else 'block'
+            return None if self.long_total < 20 else 'block'
          if self.long_api > BLOCK_UPPER or self.short_api > BLOCK_UPPER / 3:
                  # client totally overdoing it
                  return 'block'
          if was_bulked:
-            if self.short_total < 5:
+            if self.short_total < 20:
                  # client has stopped, debulk
                  return None
              if self.long_api > BLOCK_LIMIT or self.short_api > BLOCK_LIMIT / 3:
@@ -251,8 +255,8 @@ class IPstats:
              return 'bulk'
  
          if self.long_api > BULKLONG_LIMIT or self.short_api > BULKSHORT_LIMIT:
-            if self.bad_ua:
-                return 'uablock' # bad useragent
+            #if self.bad_ua:
+            #    return 'uablock' # bad useragent
              return 'bulk'
  
          return None
@@ -300,17 +304,21 @@ if __name__ == '__main__':
      fd = open("/proc/loadavg")
      cpuload = int(float(fd.readline().split()[2]))
      fd.close()
-    dbload = total200 / BULKCOOLOFF_DELTA.total_seconds()
+    # check the number of excess connections to apache
+    dbcons = int(subprocess.check_output("netstat -s | grep 'connections established' | sed 's:^\s*::;s: .*::'", shell=True))
+    fpms = int(subprocess.check_output('ps -Af | grep php-fpm | wc -l', shell=True))
+    dbload = max(0, dbcons - fpms)
  
      numbulks = len(bl.prevbulks)
-    BLOCK_LIMIT = max(BLOCK_LIMIT, BLOCK_UPPER - BLOCK_LOADFAC * (dbload - 75))
-    BULKLONG_LIMIT = max(BULK_LOWER, BULKLONG_LIMIT - BULK_LOADFAC * (cpuload - 14))
+    BLOCK_LIMIT = max(BLOCK_LIMIT, BLOCK_UPPER - BLOCK_LOADFAC * dbload)
+    BULKLONG_LIMIT = max(BULK_LOWER, BULKLONG_LIMIT - BULK_LOADFAC * cpuload)
      if numbulks > MAX_BULK_IPS:
          BLOCK_LIMIT = max(3600, BLOCK_LOWER - (numbulks - MAX_BULK_IPS)*10)
      # if the bulk pool is still empty, clients will be faster, avoid having
      # them blocked in this case
      if numbulks < 10:
-        BLOCK_LIMIT = 2*BLOCK_UPPER
+        BLOCK_UPPER *= 2
+        BLOCK_LIMIT = BLOCK_UPPER
  
  
      # collecting statistics
@@ -346,7 +354,7 @@ if __name__ == '__main__':
              elif wasbulked:
                  debulked.append(k)
      for i in bl.blacklist:
-        fd.write("%s ban\n" % k)
+        fd.write("%s ban\n" % i)
      fd.close()
  
      # TODO write logs (need to collect some statistics)