Merge remote-tracking branch 'upstream/master'

[nominatim.git] / utils / cron_ipanalyse.py
diff --git a/utils/cron_ipanalyse.py b/utils/cron_ipanalyse.py

index 2d0738af0eade18aa19f4b4b61618e59d87f561b..05b0b7f9e18382e05c33ba6a0c67d69bbc8050d0 100755 (executable)
--- a/utils/cron_ipanalyse.py
+++ b/utils/cron_ipanalyse.py
@@ -9,6 +9,7 @@
  import re
  import os
  import sys
+import subprocess
  from datetime import datetime, timedelta
  from collections import defaultdict
  
@@ -58,8 +59,8 @@ BLOCK_LIMIT = BLOCK_LOWER
  
  time_regex = r'(?P<t_day>\d\d)/(?P<t_month>[A-Za-z]+)/(?P<t_year>\d\d\d\d):(?P<t_hour>\d\d):(?P<t_min>\d\d):(?P<t_sec>\d\d) [+-]\d\d\d\d'
  
-format_pat= re.compile(r'(?P<ip>[(\d\.)]+) - - \['+ time_regex + r'] "(?P<query>.*?)" (?P<return>\d+) (?P<bytes>\d+) "(?P<referer>.*?)" "(?P<ua>.*?)"')
-time_pat= re.compile(r'[(\d\.)]+ - - \[' + time_regex + '\] ')
+format_pat= re.compile(r'(?P<ip>[a-f\d\.:]+) - - \['+ time_regex + r'] "(?P<query>.*?)" (?P<return>\d+) (?P<bytes>\d+) "(?P<referer>.*?)" "(?P<ua>.*?)"')
+time_pat= re.compile(r'[a-f\d:\.]+ - - \[' + time_regex + '\] ')
  
  logtime_pat = "%d/%b/%Y:%H:%M:%S %z"
  
@@ -84,11 +85,11 @@ class LogEntry:
              if qp[0] == 'OPTIONS':
                  self.request = None
              else:
-                if qp[1].startswith('/search'):
+                if '/search' in qp[1]:
                      self.request = 'S'
-                elif qp[1].startswith('/reverse'):
+                elif '/reverse' in qp[1]:
                      self.request = 'R'
-                elif qp[1].startswith('/details'):
+                elif '/details' in qp[1]:
                      self.request = 'D'
                  else:
                      self.request = None
@@ -124,28 +125,72 @@ class LogFile:
          return LogEntry.get_log_time(l) if l is not None else None
  
      def seek_to_date(self, target):
-        date1 = self.seek_next(0)
-        if date1 > target:
+        # start position for binary search
+        fromseek = 0
+        fromdate = self.seek_next(0)
+        if fromdate > target:
+            return True
+        # end position for binary search
+        toseek = -100
+        while -toseek < self.len:
+            todate = self.seek_next(self.len + toseek)
+            if todate is not None:
+                break
+            toseek -= 100
+        if todate is None or todate < target:
              return False
-        curseek = 2*1024*1024
-        curdate = self.seek_next(curseek)
-        if curdate is None:
-            raise RuntimeError("Cannot seek to %d" % curseek)
-        while target < curdate or (target - curdate).total_seconds() > 1:
-            bps = curseek / ((curdate - date1).total_seconds())
-            curseek += (target - curdate).total_seconds() * bps
-            if curseek < 0:
-                curseek = 0
-            elif curseek > self.len:
-                curseek = self.len - bps
-            curdate = self.seek_next(curseek)
-            if curdate is None:
-                raise RuntimeError("Cannot see to %d" % curseek)
-        return True
+        toseek = self.len + toseek
+
+
+        while True:
+            bps = (toseek - fromseek) / (todate - fromdate).total_seconds()
+            newseek = fromseek + int((target - fromdate).total_seconds() * bps)
+            newdate = self.seek_next(newseek)
+            if newdate is None:
+                return False;
+            error = abs((target - newdate).total_seconds())
+            if error < 1:
+                return True
+            if newdate > target:
+                toseek = newseek
+                todate = newdate
+                oldfromseek = fromseek
+                fromseek = toseek - error * bps
+                while True:
+                    if fromseek <= oldfromseek:
+                        fromseek = oldfromseek
+                        fromdate = self.seek_next(fromseek)
+                        break
+                    fromdate = self.seek_next(fromseek)
+                    if fromdate < target:
+                        break;
+                    bps *=2
+                    fromseek -= error * bps
+            else:
+                fromseek = newseek
+                fromdate = newdate
+                oldtoseek = toseek
+                toseek = fromseek + error * bps
+                while True:
+                    if toseek > oldtoseek:
+                        toseek = oldtoseek
+                        todate = self.seek_next(toseek)
+                        break
+                    todate = self.seek_next(toseek)
+                    if todate > target:
+                        break
+                    bps *=2
+                    toseek += error * bps
+            if toseek - fromseek < 500:
+                return True
+
  
      def loglines(self):
          for l in self.fd:
-            yield LogEntry(l)
+            try:
+                yield LogEntry(l)
+            except ValueError:
+                pass # ignore invalid lines
  
  class BlockList:
  
@@ -196,12 +241,12 @@ class IPstats:
          if was_blocked:
              # deblock only if the IP has been really quiet
              # (properly catches the ones that simply ignore the HTTP error)
-            return None if self.long_total < 5 else 'block'
+            return None if self.long_total < 20 else 'block'
          if self.long_api > BLOCK_UPPER or self.short_api > BLOCK_UPPER / 3:
                  # client totally overdoing it
                  return 'block'
          if was_bulked:
-            if self.short_total < 5:
+            if self.short_total < 20:
                  # client has stopped, debulk
                  return None
              if self.long_api > BLOCK_LIMIT or self.short_api > BLOCK_LIMIT / 3:
@@ -210,8 +255,8 @@ class IPstats:
              return 'bulk'
  
          if self.long_api > BULKLONG_LIMIT or self.short_api > BULKSHORT_LIMIT:
-            if self.bad_ua:
-                return 'uablock' # bad useragent
+            #if self.bad_ua:
+            #    return 'uablock' # bad useragent
              return 'bulk'
  
          return None
@@ -259,17 +304,21 @@ if __name__ == '__main__':
      fd = open("/proc/loadavg")
      cpuload = int(float(fd.readline().split()[2]))
      fd.close()
-    dbload = total200 / BULKCOOLOFF_DELTA.total_seconds()
+    # check the number of excess connections to apache
+    dbcons = int(subprocess.check_output("netstat -s | grep 'connections established' | sed 's:^\s*::;s: .*::'", shell=True))
+    fpms = int(subprocess.check_output('ps -Af | grep php-fpm | wc -l', shell=True))
+    dbload = max(0, dbcons - fpms)
  
      numbulks = len(bl.prevbulks)
-    BLOCK_LIMIT = max(BLOCK_LIMIT, BLOCK_UPPER - BLOCK_LOADFAC * (dbload - 75))
-    BULKLONG_LIMIT = max(BULK_LOWER, BULKLONG_LIMIT - BULK_LOADFAC * (cpuload - 14))
+    BLOCK_LIMIT = max(BLOCK_LIMIT, BLOCK_UPPER - BLOCK_LOADFAC * dbload)
+    BULKLONG_LIMIT = max(BULK_LOWER, BULKLONG_LIMIT - BULK_LOADFAC * cpuload)
      if numbulks > MAX_BULK_IPS:
          BLOCK_LIMIT = max(3600, BLOCK_LOWER - (numbulks - MAX_BULK_IPS)*10)
      # if the bulk pool is still empty, clients will be faster, avoid having
      # them blocked in this case
      if numbulks < 10:
-        BLOCK_LIMIT = 2*BLOCK_UPPER
+        BLOCK_UPPER *= 2
+        BLOCK_LIMIT = BLOCK_UPPER
  
  
      # collecting statistics
@@ -305,7 +354,7 @@ if __name__ == '__main__':
              elif wasbulked:
                  debulked.append(k)
      for i in bl.blacklist:
-        fd.write("%s ban\n" % k)
+        fd.write("%s ban\n" % i)
      fd.close()
  
      # TODO write logs (need to collect some statistics)