From: Sarah Hoffmann
Date: Wed, 19 Mar 2025 15:01:23 +0000 (+0100)
Subject: Merge remote-tracking branch 'upstream/master'
X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/be63329a69762a3d13487b060f3cbc7b815bdb48?hp=e4295dba10bdb05045e35c772b3d8ca3cb042fd1

Merge remote-tracking branch 'upstream/master'
---

diff --git a/lib-sql/functions/address_lookup.sql b/lib-sql/functions/address_lookup.sql
index 6d7a7bd5..b59b7656 100644
--- a/lib-sql/functions/address_lookup.sql
+++ b/lib-sql/functions/address_lookup.sql
@@ -232,7 +232,7 @@ BEGIN
   FOR location IN
     SELECT placex.place_id, osm_type, osm_id, name, class, type,
            coalesce(extratags->'linked_place', extratags->'place') as place_type,
-           admin_level, fromarea, isaddress,
+           admin_level, fromarea, isaddress and linked_place_id is NULL as isaddress,
            CASE WHEN rank_address = 11 THEN 5 ELSE rank_address END as rank_address,
            distance, country_code, postcode
       FROM place_addressline join placex on (address_place_id = placex.place_id)
diff --git a/munin/nominatim_query_speed_querylog b/munin/nominatim_query_speed_querylog
new file mode 100755
index 00000000..f35793fe
--- /dev/null
+++ b/munin/nominatim_query_speed_querylog
@@ -0,0 +1,163 @@
+#!/usr/bin/python3
+#
+# Plugin to monitor the types of requests made to the API
+#
+# Uses the query log.
+#
+# Parameters:
+#
+# config (required)
+# autoconf (optional - used by munin-config)
+#
+
+import re
+import os
+import sys
+from datetime import datetime, timedelta
+
+CONFIG="""graph_title Total Nominatim response time
+graph_vlabel Time to response
+graph_category Nominatim
+graph_period minute
+graph_args --base 1000
+
+avgs.label Average search time
+avgs.draw LINE
+avgs.type GAUGE
+avgs.min 0
+avgs.info Moving 5 minute average time to perform search
+
+avgr.label Average reverse time
+avgr.draw LINE
+avgr.type GAUGE
+avgr.min 0
+avgr.info Moving 5 minute average time to perform reverse
+
+max.label Slowest time to response (1/100)
+max.draw LINE
+max.type GAUGE
+max.min 0
+max.info Slowest query in last 5 minutes (unit: 100s)"""
+
+ENTRY_REGEX = re.compile(r'\[[^]]+\] (?P<dur>[0-9.]+) (?P<numres>\d+) (?P<type>[a-z]+) ')
+TIME_REGEX = re.compile(r'\[(?P<t_year>\d\d\d\d)-(?P<t_month>\d\d)-(?P<t_day>\d\d) (?P<t_hour>\d\d):(?P<t_min>\d\d):(?P<t_sec>\d\d)[0-9.]*\] ')
+
+
+class LogFile:
+    """ A query log file, unpacked.
""" + + def __init__(self, filename): + self.fd = open(filename, encoding='utf-8', errors='replace') + self.len = os.path.getsize(filename) + + def __del__(self): + self.fd.close() + + def seek_next(self, abstime): + self.fd.seek(abstime) + self.fd.readline() + l = self.fd.readline() + e = TIME_REGEX.match(l) + if e is None: + return None + e = e.groupdict() + return datetime(int(e['t_year']), int(e['t_month']), int(e['t_day']), + int(e['t_hour']), int(e['t_min']), int(e['t_sec'])) + + def seek_to_date(self, target): + # start position for binary search + fromseek = 0 + fromdate = self.seek_next(0) + if fromdate > target: + return True + # end position for binary search + toseek = -100 + while -toseek < self.len: + todate = self.seek_next(self.len + toseek) + if todate is not None: + break + toseek -= 100 + if todate is None or todate < target: + return False + toseek = self.len + toseek + + + while True: + bps = (toseek - fromseek) / (todate - fromdate).total_seconds() + newseek = fromseek + int((target - fromdate).total_seconds() * bps) + newdate = self.seek_next(newseek) + if newdate is None: + return False; + error = abs((target - newdate).total_seconds()) + if error < 1: + return True + if newdate > target: + toseek = newseek + todate = newdate + oldfromseek = fromseek + fromseek = toseek - error * bps + while True: + if fromseek <= oldfromseek: + fromseek = oldfromseek + fromdate = self.seek_next(fromseek) + break + fromdate = self.seek_next(fromseek) + if fromdate < target: + break; + bps *=2 + fromseek -= error * bps + else: + fromseek = newseek + fromdate = newdate + oldtoseek = toseek + toseek = fromseek + error * bps + while True: + if toseek > oldtoseek: + toseek = oldtoseek + todate = self.seek_next(toseek) + break + todate = self.seek_next(toseek) + if todate > target: + break + bps *=2 + toseek += error * bps + if toseek - fromseek < 500: + return True + + + def loglines(self): + for l in self.fd: + e = ENTRY_REGEX.match(l) + if e is not None: + yield e.groupdict() + + +if __name__ == '__main__': + + if len(sys.argv) > 1 and sys.argv[1] == 'config': + print(CONFIG) + sys.exit(0) + + sumrev = 0 + numrev = 0 + sumsearch = 0 + numsearch = 0 + maxres = 0 + if 'NOMINATIM_QUERYLOG' in os.environ: + lf = LogFile(os.environ['NOMINATIM_QUERYLOG']) + if lf.seek_to_date(datetime.now() - timedelta(minutes=5)): + for l in lf.loglines(): + dur = float(l['dur']) + if l['type'] == 'reverse': + numrev += 1 + sumrev += dur + elif l['type'] == 'search': + numsearch += 1 + sumsearch += dur + if dur > maxres: + maxres = dur + + + print('avgs.value', 0 if numsearch == 0 else sumsearch/numsearch) + print('avgr.value', 0 if numrev == 0 else sumrev/numrev) + print('max.value', maxres/100.0) diff --git a/munin/nominatim_requests_querylog b/munin/nominatim_requests_querylog new file mode 100755 index 00000000..8a103cfc --- /dev/null +++ b/munin/nominatim_requests_querylog @@ -0,0 +1,163 @@ +#!/usr/bin/python3 +# +# Plugin to monitor the types of requsts made to the API +# +# Uses the query log. 
+# +# Parameters: +# +# config (required) +# autoconf (optional - used by munin-config) +# + +import re +import os +import sys +from datetime import datetime, timedelta + +CONFIG="""graph_title Requests by API call +graph_args --base 1000 -l 0 +graph_vlabel requests per minute +graph_category nominatim +z1.label reverse +z1.draw AREA +z1.type GAUGE +z2.label search (successful) +z2.draw STACK +z2.type GAUGE +z3.label search (no result) +z3.draw STACK +z3.type GAUGE +z4.label lookup +z4.draw STACK +z4.type GAUGE +z4.label details +z4.draw STACK +z4.type GAUGE""" + +ENTRY_REGEX = re.compile(r'\[[^]]+\] (?P[0-9.]+) (?P\d+) (?P[a-z]+) ') +TIME_REGEX = re.compile(r'\[(?P\d\d\d\d)-(?P\d\d)-(?P\d\d) (?P\d\d):(?P\d\d):(?P\d\d)[0-9.]*\] ') + + +class LogFile: + """ A query log file, unpacked. """ + + def __init__(self, filename): + self.fd = open(filename, encoding='utf-8', errors='replace') + self.len = os.path.getsize(filename) + + def __del__(self): + self.fd.close() + + def seek_next(self, abstime): + self.fd.seek(abstime) + self.fd.readline() + l = self.fd.readline() + e = TIME_REGEX.match(l) + if e is None: + return None + e = e.groupdict() + return datetime(int(e['t_year']), int(e['t_month']), int(e['t_day']), + int(e['t_hour']), int(e['t_min']), int(e['t_sec'])) + + def seek_to_date(self, target): + # start position for binary search + fromseek = 0 + fromdate = self.seek_next(0) + if fromdate > target: + return True + # end position for binary search + toseek = -100 + while -toseek < self.len: + todate = self.seek_next(self.len + toseek) + if todate is not None: + break + toseek -= 100 + if todate is None or todate < target: + return False + toseek = self.len + toseek + + + while True: + bps = (toseek - fromseek) / (todate - fromdate).total_seconds() + newseek = fromseek + int((target - fromdate).total_seconds() * bps) + newdate = self.seek_next(newseek) + if newdate is None: + return False; + error = abs((target - newdate).total_seconds()) + if error < 1: + return True + if newdate > target: + toseek = newseek + todate = newdate + oldfromseek = fromseek + fromseek = toseek - error * bps + while True: + if fromseek <= oldfromseek: + fromseek = oldfromseek + fromdate = self.seek_next(fromseek) + break + fromdate = self.seek_next(fromseek) + if fromdate < target: + break; + bps *=2 + fromseek -= error * bps + else: + fromseek = newseek + fromdate = newdate + oldtoseek = toseek + toseek = fromseek + error * bps + while True: + if toseek > oldtoseek: + toseek = oldtoseek + todate = self.seek_next(toseek) + break + todate = self.seek_next(toseek) + if todate > target: + break + bps *=2 + toseek += error * bps + if toseek - fromseek < 500: + return True + + + def loglines(self): + for l in self.fd: + e = ENTRY_REGEX.match(l) + if e is not None: + yield e.groupdict() + + +if __name__ == '__main__': + + if len(sys.argv) > 1 and sys.argv[1] == 'config': + print(CONFIG) + sys.exit(0) + + reverse = 0 + searchy = 0 + searchn = 0 + lookup = 0 + details = 0 + if 'NOMINATIM_QUERYLOG' in os.environ: + lf = LogFile(os.environ['NOMINATIM_QUERYLOG']) + if lf.seek_to_date(datetime.now() - timedelta(minutes=5)): + for l in lf.loglines(): + if l['type'] == 'reverse': + reverse += 1 + elif l['type'] == 'search': + if l['numres'] == '0': + searchn += 1 + else: + searchy += 1 + elif l['type'] == 'place': + lookup +=1 + else: + details += 1 + + + print('z1.value', reverse/5) + print('z2.value', searchy/5) + print('z3.value', searchn/5) + print('z4.value', lookup/5) + print('z4.value', details/5) diff --git 
a/munin/nominatim_throttled_ips b/munin/nominatim_throttled_ips
new file mode 100755
index 00000000..a56ff31d
--- /dev/null
+++ b/munin/nominatim_throttled_ips
@@ -0,0 +1,28 @@
+#!/bin/sh
+#
+# Plugin to monitor the number of IPs in special pools
+#
+# Parameters:
+#
+# config (required)
+# autoconf (optional - used by munin-config)
+#
+
+if [ "$1" = "config" ]; then
+
+    echo 'graph_title Restricted IPs'
+    echo 'graph_args -l 0'
+    echo 'graph_vlabel number of IPs'
+    echo 'graph_category nominatim'
+    echo 'bulk.label bulk'
+    echo 'bulk.draw AREA'
+    echo 'bulk.type GAUGE'
+    echo 'block.label blocked'
+    echo 'block.draw STACK'
+    echo 'block.type GAUGE'
+    exit 0
+fi
+
+BASEDIR="$(dirname "$(readlink -f "$0")")"
+
+cut -f 2 -d ' ' $BASEDIR/../../bin/settings/ip_blocks.map | sort | uniq -c | sed 's:[[:space:]]*\([0-9]\+\) \(.*\):\2.value \1:'
diff --git a/packaging/nominatim-api/pyproject.toml b/packaging/nominatim-api/pyproject.toml
index ca86f8a7..78bf9d58 100644
--- a/packaging/nominatim-api/pyproject.toml
+++ b/packaging/nominatim-api/pyproject.toml
@@ -1,5 +1,6 @@
 [project]
 name = "nominatim-api"
+version = "5.0.0.post6"
 description = "A tool for building a database of OpenStreetMap for geocoding and for searching the database. Search library."
 readme = "README.md"
 requires-python = ">=3.7"
@@ -15,13 +16,11 @@ classifiers = [
     "Operating System :: OS Independent",
 ]
 dependencies = [
-    "python-dotenv",
-    "pyYAML>=5.1",
-    "SQLAlchemy>=1.4.31",
-    "psycopg",
-    "PyICU"
+    "SQLAlchemy==2.0.38",
+    "falcon==4.0.2",
+    "uvicorn==0.34.0",
+    "gunicorn==23.0.0"
 ]
-dynamic = ["version"]
 
 [project.urls]
 Homepage = "https://nominatim.org"
diff --git a/packaging/nominatim-db/pyproject.toml b/packaging/nominatim-db/pyproject.toml
index 3c99fd2a..e05e58c5 100644
--- a/packaging/nominatim-db/pyproject.toml
+++ b/packaging/nominatim-db/pyproject.toml
@@ -1,5 +1,6 @@
 [project]
 name = "nominatim-db"
+version = "5.0.0.post6"
 description = "A tool for building a database of OpenStreetMap for geocoding and for searching the database. Database backend."
 readme = "README.md"
 requires-python = ">=3.7"
@@ -15,14 +16,14 @@ classifiers = [
     "Operating System :: OS Independent",
 ]
 dependencies = [
-    "psycopg",
-    "python-dotenv",
-    "jinja2",
-    "pyYAML>=5.1",
-    "psutil",
-    "PyICU"
+    "psycopg[binary]==3.2.5",
+    "python-dotenv==1.0.1",
+    "jinja2==3.1.5",
+    "pyYAML==6.0.2",
+    "psutil==7.0.0",
+    "PyICU==2.14",
+    "osmium==4.0.2",
 ]
-dynamic = ["version"]
 
 [project.urls]
 Homepage = "https://nominatim.org"
diff --git a/src/nominatim_api/search/db_search_builder.py b/src/nominatim_api/search/db_search_builder.py
index ddfddaa6..c63803d2 100644
--- a/src/nominatim_api/search/db_search_builder.py
+++ b/src/nominatim_api/search/db_search_builder.py
@@ -214,13 +214,13 @@ class SearchBuilder:
                 yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens)
             return
 
-        addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 30000
+        addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 50000
 
         # Partial term to frequent. Try looking up by rare full names first.
         name_fulls = self.query.get_tokens(name, qmod.TOKEN_WORD)
         if name_fulls:
             fulls_count = sum(t.count for t in name_fulls)
-            if fulls_count < 50000 or addr_count < 30000:
+            if fulls_count < 80000 or addr_count < 50000:
                 yield penalty, fulls_count / (2**len(addr_tokens)), \
                     self.get_full_name_ranking(name_fulls, addr_partials,
                                                fulls_count > 30000 / max(1, len(addr_tokens)))
@@ -268,12 +268,7 @@ class SearchBuilder:
         # This might yield wrong results, nothing we can do about that.
         if use_lookup:
             addr_restrict_tokens = []
-            addr_lookup_tokens = []
-            for t in addr_partials:
-                if t.addr_count > 20000:
-                    addr_restrict_tokens.append(t.token)
-                else:
-                    addr_lookup_tokens.append(t.token)
+            addr_lookup_tokens = [t.token for t in addr_partials]
         else:
             addr_restrict_tokens = [t.token for t in addr_partials]
             addr_lookup_tokens = []
diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py
index 1bd0030d..cc5b6cf0 100644
--- a/src/nominatim_api/search/icu_tokenizer.py
+++ b/src/nominatim_api/search/icu_tokenizer.py
@@ -166,6 +166,12 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
         log().section('Analyze query (using ICU tokenizer)')
         for func in self.preprocessors:
             phrases = func(phrases)
+
+        if len(phrases) == 1 \
+           and phrases[0].text.count(' ') > 3 \
+           and max(len(s) for s in phrases[0].text.split()) < 3:
+            normalized = []
+
         query = qmod.QueryStruct(phrases)
         log().var_dump('Normalized query', query.source)
diff --git a/src/nominatim_db/tokenizer/icu_tokenizer.py b/src/nominatim_db/tokenizer/icu_tokenizer.py
index 3da1171f..858cb64c 100644
--- a/src/nominatim_db/tokenizer/icu_tokenizer.py
+++ b/src/nominatim_db/tokenizer/icu_tokenizer.py
@@ -139,6 +139,7 @@ class ICUTokenizer(AbstractTokenizer):
                                END) as info
                         FROM word LEFT JOIN word_frequencies wf
                              ON word.word_id = wf.id
+                        ORDER BY word_id
                       """)
 
         drop_tables(conn, 'word_frequencies')
diff --git a/utils/cron_ipanalyse.py b/utils/cron_ipanalyse.py
new file mode 100755
index 00000000..97bad8da
--- /dev/null
+++ b/utils/cron_ipanalyse.py
@@ -0,0 +1,402 @@
+#!/usr/bin/python3
+#
+# Search apache logs for high-bandwidth users and create a list of suspicious IPs.
+# There are three states: bulk, block, ban. The first are bulk requesters
+# that need throttling, the second bulk requesters that have overdone it
+# and the last manually banned IPs.
+#
+
+import re
+import os
+import sys
+import subprocess
+from datetime import datetime, timedelta
+from collections import defaultdict
+
+#
+# DEFAULT SETTINGS
+#
+# Copy into settings/ip_blocks.conf and adapt as required.
+# +BASEDIR = os.path.normpath(os.path.join(os.path.realpath(__file__), '../..')) +BLOCKEDFILE= BASEDIR + '/settings/ip_blocks.map' +LOGFILE= BASEDIR + '/log/restricted_ip.log' + +# space-separated list of IPs that are never banned +WHITELIST = '' +# space-separated list of IPs manually blocked +BLACKLIST = '' +# user-agents that should be blocked from bulk mode +# (matched with startswith) +UA_BLOCKLIST = () + +# time before a automatically blocked IP is allowed back +BLOCKCOOLOFF_DELTA=timedelta(hours=1) +# quiet time before an IP is released from the bulk pool +BULKCOOLOFF_DELTA=timedelta(minutes=15) +# time to check if new accesses appear despite being blocked +BLOCKCHECK_DELTA=timedelta(minutes=1) + +BULKLONG_LIMIT=8000 +BULKSHORT_LIMIT=2000 +BLOCK_UPPER=19000 +BLOCK_LOWER=4000 +BLOCK_LOADFAC=380 +BULK_LOADFAC=160 +BULK_LOWER=1500 +MAX_BULK_IPS=85 + +# +# END OF DEFAULT SETTINGS +# + +try: + with open(BASEDIR + "/settings/ip_blocks.conf") as f: + code = compile(f.read(), BASEDIR + "/settings/ip_blocks.conf", 'exec') + exec(code) +except IOError: + pass + +BLOCK_LIMIT = BLOCK_LOWER + +time_regex = r'(?P\d\d)/(?P[A-Za-z]+)/(?P\d\d\d\d):(?P\d\d):(?P\d\d):(?P\d\d) [+-]\d\d\d\d' + +format_pat= re.compile(r'(?P[a-f\d\.:]+) - - \['+ time_regex + r'] "(?P.*?)" (?P\d+) (?P\d+) "(?P.*?)" "(?P.*?)"') +time_pat= re.compile(r'[a-f\d:\.]+ - - \[' + time_regex + '\] ') + +logtime_pat = "%d/%b/%Y:%H:%M:%S %z" + +MONTHS = { 'Jan' : 1, 'Feb' : 2, 'Mar' : 3, 'Apr' : 4, 'May' : 5, 'Jun' : 6, + 'Jul' : 7, 'Aug' : 8, 'Sep' : 9, 'Oct' : 10, 'Nov' : 11, 'Dec' : 12 } + +class LogEntry: + def __init__(self, logline): + e = format_pat.match(logline) + if e is None: + raise ValueError("Invalid log line:", logline) + e = e.groupdict() + self.ip = e['ip'] + self.date = datetime(int(e['t_year']), MONTHS[e['t_month']], int(e['t_day']), + int(e['t_hour']), int(e['t_min']), int(e['t_sec'])) + qp = e['query'].split(' ', 2) + if len(qp) < 2: + self.request = None + self.query = None + else: + self.query = qp[1] + if qp[0] == 'OPTIONS': + self.request = None + else: + if '/?' in qp[1]: + self.request = 'S' + elif '/search' in qp[1]: + self.request = 'S' + elif '/reverse' in qp[1]: + self.request = 'R' + elif '/details' in qp[1]: + self.request = 'D' + elif '/lookup' in qp[1]: + self.request = 'L' + else: + self.request = None + self.query = e['query'] + self.retcode = int(e['return']) + self.referer = e['referer'] if e['referer'] != '-' else None + self.ua = e['ua'] if e['ua'] != '-' else None + + def get_log_time(logline): + e = format_pat.match(logline) + if e is None: + return None + e = e.groupdict() + #return datetime.strptime(e['time'], logtime_pat).replace(tzinfo=None) + return datetime(int(e['t_year']), MONTHS[e['t_month']], int(e['t_day']), + int(e['t_hour']), int(e['t_min']), int(e['t_sec'])) + + +class LogFile: + """ An apache log file, unpacked. 
""" + + def __init__(self, filename): + self.fd = open(filename) + self.len = os.path.getsize(filename) + + def __del__(self): + self.fd.close() + + def seek_next(self, abstime): + self.fd.seek(abstime) + self.fd.readline() + l = self.fd.readline() + return LogEntry.get_log_time(l) if l is not None else None + + def seek_to_date(self, target): + # start position for binary search + fromseek = 0 + fromdate = self.seek_next(0) + if fromdate > target: + return True + # end position for binary search + toseek = -100 + while -toseek < self.len: + todate = self.seek_next(self.len + toseek) + if todate is not None: + break + toseek -= 100 + if todate is None or todate < target: + return False + toseek = self.len + toseek + + + while True: + bps = (toseek - fromseek) / (todate - fromdate).total_seconds() + newseek = fromseek + int((target - fromdate).total_seconds() * bps) + newdate = self.seek_next(newseek) + if newdate is None: + return False; + error = abs((target - newdate).total_seconds()) + if error < 1: + return True + if newdate > target: + toseek = newseek + todate = newdate + oldfromseek = fromseek + fromseek = toseek - error * bps + while True: + if fromseek <= oldfromseek: + fromseek = oldfromseek + fromdate = self.seek_next(fromseek) + break + fromdate = self.seek_next(fromseek) + if fromdate < target: + break; + bps *=2 + fromseek -= error * bps + else: + fromseek = newseek + fromdate = newdate + oldtoseek = toseek + toseek = fromseek + error * bps + while True: + if toseek > oldtoseek: + toseek = oldtoseek + todate = self.seek_next(toseek) + break + todate = self.seek_next(toseek) + if todate > target: + break + bps *=2 + toseek += error * bps + if toseek - fromseek < 500: + return True + + + def loglines(self): + for l in self.fd: + try: + yield LogEntry(l) + except ValueError: + pass # ignore invalid lines + +class BlockList: + + def __init__(self): + self.whitelist = set(WHITELIST.split()) if WHITELIST else set() + self.blacklist = set(BLACKLIST.split()) if BLACKLIST else set() + self.prevblocks = set() + self.prevbulks = set() + + try: + fd = open(BLOCKEDFILE) + for line in fd: + ip, typ = line.strip().split(' ') + if ip not in self.blacklist: + if typ == 'block': + self.prevblocks.add(ip) + elif typ == 'bulk': + self.prevbulks.add(ip) + fd.close() + except IOError: + pass #ignore non-existing file + + +class IPstats: + + def __init__(self): + self.redirected = 0 + self.short_total = 0 + self.short_api = 0 + self.long_total = 0 + self.long_api = 0 + self.block_total = 0 + self.bad_ua = False + + def add_long(self, logentry): + self.long_total += 1 + if logentry.retcode == 301: + return + if logentry.request is not None: + self.long_api += 1 + if not self.bad_ua: + if logentry.ua is None: + self.bad_ua = True + + def add_short(self, logentry): + self.short_total += 1 + if logentry.retcode == 301: + self.redirected += 1 + return + if logentry.request is not None: + self.short_api += 1 + self.add_long(logentry) + + def add_block(self, logentry): + self.block_total += 1 + + def ignores_warnings(self, wasblocked): + return self.block_total > 5 or (wasblocked and self.redirected > 5) + + def new_state(self, was_blocked, was_bulked): + if was_blocked: + # deblock only if the IP has been really quiet + # (properly catches the ones that simply ignore the HTTP error) + return None if self.long_total < 20 else 'block' + if self.long_api > BLOCK_UPPER \ + or self.short_api > BLOCK_UPPER / 3 \ + or (self.redirected > 100 and self.short_total == self.redirected): + # client totally overdoing 
it + return 'block' + if was_bulked: + if self.short_total < 20: + # client has stopped, debulk + return None + if self.long_api > BLOCK_LIMIT or self.short_api > BLOCK_LIMIT / 3: + # client is still hammering us, block + return 'emblock' + return 'bulk' + + if self.long_api > BULKLONG_LIMIT or self.short_api > BULKSHORT_LIMIT: + #if self.bad_ua: + # return 'uablock' # bad useragent + return 'bulk' + + return None + + + +if __name__ == '__main__': + if len(sys.argv) < 2: + print("Usage: %s logfile startdate" % sys.argv[0]) + sys.exit(-1) + + if len(sys.argv) == 2: + dt = datetime.now() - BLOCKCOOLOFF_DELTA + else: + dt = datetime.strptime(sys.argv[2], "%Y-%m-%d %H:%M:%S") + + if os.path.getsize(sys.argv[1]) < 2*1030*1024: + sys.exit(0) # not enough data + + lf = LogFile(sys.argv[1]) + if not lf.seek_to_date(dt): + sys.exit(0) + + bl = BlockList() + + shortstart = dt + BLOCKCOOLOFF_DELTA - BULKCOOLOFF_DELTA + blockstart = dt + BLOCKCOOLOFF_DELTA - BLOCKCHECK_DELTA + notlogged = bl.whitelist | bl.blacklist + + stats = defaultdict(IPstats) + + for l in lf.loglines(): + if l.ip not in notlogged: + stats[l.ip].add_long(l) + if l.date > shortstart: + break + + total200 = 0 + for l in lf.loglines(): + if l.ip not in notlogged: + stats[l.ip].add_short(l) + if l.request is not None and l.retcode == 200: + total200 += 1 + if l.date > blockstart and l.retcode in (403, 429): + stats[l.ip].add_block(l) + + # adapt limits according to CPU and DB load + fd = open("/proc/loadavg") + cpuload = int(float(fd.readline().split()[2])) + fd.close() + # check the number of excess connections to apache + dbcons = int(subprocess.check_output("netstat -s | grep 'connections established' | sed 's:^\s*::;s: .*::'", shell=True)) + fpms = int(subprocess.check_output('ps -Af | grep php-fpm | wc -l', shell=True)) + dbload = max(0, dbcons - fpms) + + numbulks = len(bl.prevbulks) + BLOCK_LIMIT = max(BLOCK_LIMIT, BLOCK_UPPER - BLOCK_LOADFAC * dbload) + BULKLONG_LIMIT = max(BULK_LOWER, BULKLONG_LIMIT - BULK_LOADFAC * cpuload) + if numbulks > MAX_BULK_IPS: + BLOCK_LIMIT = max(3600, BLOCK_LOWER - (numbulks - MAX_BULK_IPS)*10) + # if the bulk pool is still empty, clients will be faster, avoid having + # them blocked in this case + if numbulks < 10: + BLOCK_UPPER *= 2 + BLOCK_LIMIT = BLOCK_UPPER + + + # collecting statistics + unblocked = [] + debulked = [] + bulked = [] + blocked = [] + uablocked = [] + emblocked = [] + # write out new state file + fd = open(BLOCKEDFILE, 'w') + for k,v in stats.items(): + wasblocked = k in bl.prevblocks + wasbulked = k in bl.prevbulks + state = v.new_state(wasblocked, wasbulked) + if state is not None: + if state == 'uablock': + uablocked.append(k) + state = 'block' + elif state == 'emblock': + emblocked.append(k) + state = 'block' + elif state == 'block': + if not wasblocked: + blocked.append(k) + elif state == 'bulk': + if not wasbulked: + bulked.append(k) + fd.write("%s %s\n" % (k, state)) + else: + if wasblocked: + unblocked.append(k) + elif wasbulked: + debulked.append(k) + for i in bl.blacklist: + fd.write("%s ban\n" % i) + fd.close() + + # TODO write logs (need to collect some statistics) + logstr = datetime.now().strftime('%d/%b/%Y:%H:%M:%S') + ' %s %s\n' + fd = open(LOGFILE, 'a') + if unblocked: + fd.write(logstr % ('unblocked:', ', '.join(unblocked))) + if debulked: + fd.write(logstr % (' debulked:', ', '.join(debulked))) + if bulked: + fd.write(logstr % ('new bulks:', ', '.join(bulked))) + if emblocked: + fd.write(logstr % ('dir.block:', ', '.join(emblocked))) + if uablocked: + 
fd.write(logstr % (' ua block:', ', '.join(uablocked)))
+    if blocked:
+        fd.write(logstr % ('new block:', ', '.join(blocked)))
+    #for k,v in stats.items():
+    #    if v.ignores_warnings(k in bl.prevblocks) and k not in notlogged and ':' not in k:
+    #        fd.write(logstr % ('Warning ignored:', k))
+    fd.close()
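
Note on the log-scanning scripts above: nominatim_query_speed_querylog, nominatim_requests_querylog and cron_ipanalyse.py all locate the start of their evaluation window the same way. Instead of reading the log from the beginning, they bisect the time-ordered file by byte offset (seek_to_date), re-synchronising on the next complete line after every seek. The following is a minimal, self-contained sketch of that idea; it is not part of the patch, the timestamp layout, the parse_time helper and the fallback file name are assumptions for illustration only, and the real plugins additionally scale their jumps by an estimated bytes-per-second rate instead of plain halving.

import os
from datetime import datetime, timedelta

def parse_time(line):
    # Assumed line format: "[YYYY-MM-DD HH:MM:SS...] ..." as in the query log.
    try:
        return datetime.strptime(line[1:20], '%Y-%m-%d %H:%M:%S')
    except ValueError:
        return None

def seek_to_date(fd, target):
    """ Position fd so that the lines read next start at or after target. """
    size = os.fstat(fd.fileno()).st_size
    lo, hi = 0, size
    while hi - lo > 4096:          # bisect until the window is small
        mid = (lo + hi) // 2
        fd.seek(mid)
        fd.readline()              # drop the partial line at the seek point
        stamp = parse_time(fd.readline())
        if stamp is None or stamp >= target:
            hi = mid               # answer lies at or before mid
        else:
            lo = mid               # everything up to here is too old
    fd.seek(lo)
    if lo:
        fd.readline()
    while True:                    # finish with a short linear scan
        pos = fd.tell()
        line = fd.readline()
        if not line:
            return False           # target time is past the end of the log
        stamp = parse_time(line)
        if stamp is not None and stamp >= target:
            fd.seek(pos)
            return True

if __name__ == '__main__':
    # NOMINATIM_QUERYLOG is the variable the munin plugins read; the
    # fallback file name is purely hypothetical.
    logname = os.environ.get('NOMINATIM_QUERYLOG', 'query.log')
    with open(logname, encoding='utf-8', errors='replace') as fd:
        if seek_to_date(fd, datetime.now() - timedelta(minutes=5)):
            print(sum(1 for _ in fd), 'entries in the last five minutes')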