Merge remote-tracking branch 'upstream/master'

author Sarah Hoffmann <lonvia@denofr.de>

Wed, 13 Nov 2024 18:35:54 +0000 (19:35 +0100)

committer Sarah Hoffmann <lonvia@denofr.de>

Wed, 13 Nov 2024 18:35:54 +0000 (19:35 +0100)
author Sarah Hoffmann <lonvia@denofr.de>
Wed, 13 Nov 2024 18:35:54 +0000 (19:35 +0100)
committer Sarah Hoffmann <lonvia@denofr.de>
Wed, 13 Nov 2024 18:35:54 +0000 (19:35 +0100)
diff --git a/CMakeLists.txt b/CMakeLists.txt

index e31362feddda1e67ad906f22df02a2906d8a5d09..e6d59520c10601b8660a855c8f5ad3dc8ca1c4c5 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -93,6 +93,27 @@ if (BUILD_IMPORTER)
                    ${PROJECT_BINARY_DIR}/nominatim)
  endif()
  
+#-----------------------------------------------------------------------------
+# Targets for running a development webserver from the build directory.
+#-----------------------------------------------------------------------------
+
+if (BUILD_API)
+   set(WEBSITEFILES   # static assets mirrored into the build tree
+       403.html
+       509.html
+       crossdomain.xml
+       favicon.ico
+       nominatim.xml
+       robots.txt
+       taginfo.json
+   )
+   # COPYONLY: copy byte-for-byte, no @VAR@/${VAR} substitution (favicon.ico is binary).
+   foreach (webfile ${WEBSITEFILES})
+       configure_file(${PROJECT_SOURCE_DIR}/lib-php/website/${webfile}
+                      ${PROJECT_BINARY_DIR}/website/${webfile} COPYONLY)
+   endforeach()
+endif()
+
  #-----------------------------------------------------------------------------
  # Tests
  #-----------------------------------------------------------------------------
diff --git a/cmake/tool-installed.tmpl b/cmake/tool-installed.tmpl

index 915f1d69cb49a3167e1b767a5d5f1bb20d398028..47c17f416b29a5d508d238569591c9b44edb21a9 100644 (file)
--- a/cmake/tool-installed.tmpl
+++ b/cmake/tool-installed.tmpl
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
+#!/srv/nominatim.openstreetmap.org/venv/bin/python3
  import sys
  import os
  
diff --git a/cmake/tool.tmpl b/cmake/tool.tmpl

index 4053ca35b8ec4745360df03e3d1dcaf46e3bafc2..fff8231c6b5880e09cbb00fd360341a2f1bfa762 100755 (executable)
--- a/cmake/tool.tmpl
+++ b/cmake/tool.tmpl
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
+#!/srv/nominatim.openstreetmap.org/venv/bin/python3
  import sys
  import os
  
diff --git a/lib-php/website/403.html b/lib-php/website/403.html

new file mode 100644 (file)

index 0000000..8d8e323
--- /dev/null
+++ b/lib-php/website/403.html
@@ -0,0 +1,23 @@
+<html>
+<head>
+<title>Access blocked</title>
+</head>
+<body>
+<h1>Access blocked</h1>
+
+<p>You have been blocked because you have violated the
+<a href="https://operations.osmfoundation.org/policies/nominatim/">usage policy</a>
+of OSM's Nominatim geocoding service. Please be aware that OSM's resources are
+limited and shared between many users. The usage policy is there to ensure that
+the service remains usable for everybody.</p>
+
+<p>Please review the terms and make sure that your
+software adheres to the terms. You should in particular verify that you have set a
+<b>custom HTTP referrer or HTTP user agent</b> that identifies your application, and
+that you are not overusing the service with massive bulk requests.</p>
+
+<p>If you feel that this block is unjustified or remains after you have adapted
+your usage, you may contact the Nominatim system administrator at
+nominatim@openstreetmap.org to have this block lifted.</p>
+</body>
+</html>
diff --git a/lib-php/website/509.html b/lib-php/website/509.html

new file mode 100644 (file)

index 0000000..628c53b
--- /dev/null
+++ b/lib-php/website/509.html
@@ -0,0 +1,12 @@
+<html>
+<head>
+<title>Bandwidth limit exceeded</title>
+</head>
+<body>
+<h1>Bandwidth limit exceeded</h1>
+
+<p>You have been temporarily blocked because you have been overusing OSM's geocoding service or because you have not provided sufficient identification of your application. This block will be automatically lifted after a while. Please take the time and adapt your scripts to reduce the number of requests and make sure that you send a valid UserAgent or Referer.</p>
+
+<p>For more information, consult the <a href="https://operations.osmfoundation.org/policies/nominatim/">usage policy</a> for the OSM Nominatim server.</p>
+</body>
+</html>
diff --git a/lib-php/website/crossdomain.xml b/lib-php/website/crossdomain.xml

new file mode 100644 (file)

index 0000000..963a682
--- /dev/null
+++ b/lib-php/website/crossdomain.xml
@@ -0,0 +1,5 @@
+<?xml version="1.0"?>
+           <!DOCTYPE cross-domain-policy SYSTEM "http://www.macromedia.com/xml/dtds/cross-domain-policy.dtd">
+           <cross-domain-policy>
+           <allow-access-from domain="*" />
+           </cross-domain-policy> 
diff --git a/lib-php/website/favicon.ico b/lib-php/website/favicon.ico

new file mode 100644 (file)

index 0000000..0157ea0

Binary files /dev/null and b/lib-php/website/favicon.ico differ
diff --git a/lib-php/website/nominatim.xml b/lib-php/website/nominatim.xml

new file mode 100644 (file)

index 0000000..28684b1
--- /dev/null
+++ b/lib-php/website/nominatim.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/"
+                       xmlns:moz="http://www.mozilla.org/2006/browser/search/">
+       <ShortName>Nominatim</ShortName>
+       <LongName>Nominatim OSM Search</LongName>
+       <Description>Search for a place in OpenStreetMap Nominatim</Description>
+       <InputEncoding>UTF-8</InputEncoding>
+       <OutputEncoding>UTF-8</OutputEncoding>
+       <Url type="text/html" method="get" template="http://nominatim.openstreetmap.org/search/?q={searchTerms}" />
+       <Query role="example" searchTerms="Reigate" />
+       <Developer>Brian Quinion</Developer>
+       <AdultContent>false</AdultContent>
+       <Attribution>Data &amp;copy; OpenStreetMap contributors, Some Rights Reserved. ODbL, http://www.osm.org/copyright.</Attribution>
+</OpenSearchDescription>
+
diff --git a/lib-php/website/robots.txt b/lib-php/website/robots.txt

new file mode 100644 (file)

index 0000000..9624d97
--- /dev/null
+++ b/lib-php/website/robots.txt
@@ -0,0 +1,14 @@
+User-agent: ia_archiver
+Allow: /
+
+User-agent: *
+Disallow: /search.php
+Disallow: /search
+Disallow: /details.php
+Disallow: /details
+Disallow: /reverse.php
+Disallow: /reverse
+Disallow: /hierarchy
+Disallow: /hierarchy.php
+Disallow: /lookup
+Disallow: /lookup.php
diff --git a/lib-php/website/taginfo.json b/lib-php/website/taginfo.json

new file mode 100644 (file)

index 0000000..98f8b97
--- /dev/null
+++ b/lib-php/website/taginfo.json
@@ -0,0 +1,112 @@
+{
+    "data_format": 1,
+    "data_url": "http://nominatim.openstreetmap.org/taginfo.json",
+    "project": {
+        "name": "Nominatim",
+        "description": "OSM search engine.",
+        "project_url": "http://nominatim.openstreetmap.org",
+        "doc_url": "http://wiki.osm.org/wiki/Nominatim",
+        "contact_name": "Sarah Hoffmann",
+        "contact_email": "lonvia@denofr.de"
+    },
+    "tags": [
+      { "key" : "ref", "description": "Searchable name of the place."},
+      { "key" : "int_ref", "description": "Searchable name of the place."},
+      { "key" : "nat_ref", "description": "Searchable name of the place."},
+      { "key" : "reg_ref", "description": "Searchable name of the place."},
+      { "key" : "loc_ref", "description": "Searchable name of the place."},
+      { "key" : "old_ref", "description": "Searchable name of the place."},
+      { "key" : "iata", "description": "Searchable name of the place."},
+      { "key" : "icao", "description": "Searchable name of the place."},
+      { "key" : "pcode", "description": "Searchable name of the place."},
+      { "key" : "name", "description": "Searchable name of the place."},
+      { "key" : "int_name", "description": "Searchable name of the place."},
+      { "key" : "nat_name", "description": "Searchable name of the place."},
+      { "key" : "reg_name", "description": "Searchable name of the place."},
+      { "key" : "loc_name", "description": "Searchable name of the place."},
+      { "key" : "old_name", "description": "Searchable name of the place."},
+      { "key" : "alt_name", "description": "Searchable name of the place."},
+      { "key" : "official_name", "description": "Searchable name of the place."},
+      { "key" : "place_name", "description": "Searchable name of the place."},
+      { "key" : "short_name", "description": "Searchable name of the place."},
+      { "key" : "addr:housename", "description": "Searchable name of the place."},
+      { "key" : "operator", "description": "Searchable name for amenities and shops." },
+      { "key" : "brand", "description": "Searchable name of POI places."},
+      { "key" : "bridge:name", "description" : "Searchable name for bridges."},
+      { "key" : "tunnel:name", "description" : "Searchable name for tunnels."},
+      { "key" : "emergency", "description": "POI in the search database." },
+      { "key" : "tourism", "description": "POI in the search database." },
+      { "key" : "historic", "description": "POI in the search database." },
+      { "key" : "military", "description": "POI in the search database." },
+      { "key" : "natural", "description": "POI in the search database." },
+      { "key" : "man_made", "description": "POI in the search database." },
+      { "key" : "mountain_pass", "description": "POI in the search database." },
+      { "key" : "highway", "description": "POI or street in the search database (not added are: 'no', 'turning_circle', 'traffic_signals', 'mini_roundabout', 'crossing' and traffic signs)." },
+      { "key" : "aerialway", "description": "POI in the search database (unless value is 'no', 'pylon')." },
+      { "key" : "aeroway", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "amenity", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "boundary", "description": "Area in the search database (used to compute addresses of other places)." },
+      { "key" : "bridge", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "craft", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "leisure", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "office", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "railway", "description": "Geographic feature in the search database (unless value is 'no')." },
+      { "key" : "landuse", "description": "Geographic feature in the search database (unless value is 'no')." },
+      { "key" : "shop", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "tunnel", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "waterway", "description": "Geographic feature in the search database (unless value is 'riverbank')."},
+      { "key" : "place", "description": "Settlement on the search database (used to compute addresses of other places)." },
+      { "key" : "postal_code", "description": "Postcode in search database (used to compute postcodes of places around)." },
+      { "key" : "postcode", "description": "Postcode in search database (used to compute postcodes of places around)." },
+      { "key" : "addr:postcode", "description": "Postcode in search database (used to compute postcodes of places around)." },
+      { "key" : "tiger:zip_left", "description": "Postcode in search database (used to compute postcodes of places around)." },
+      { "key" : "tiger:zip_right", "description": "Postcode in search database (used to compute postcodes of places around)." },
+      { "key" : "addr:street", "description": "Used to determine the street of a house or POI. Note that a street with the same name must exist for the tag to be effective."},
+      { "key" : "addr:place", "description": "Used to determine the settlement of a house or POI with a street-less address."},
+      { "key" : "country_code", "description": "Used to determine the country a place is in."},
+      { "key" : "ISO3166-1", "description": "Used to determine the country a place is in."},
+      { "key" : "is_in:country_code", "description": "Used to determine the country a place is in."},
+      { "key" : "is_in:country", "description": "Used to determine the country a place is in."},
+      { "key" : "addr:country", "description": "Used to determine the country a place is in."},
+      { "key" : "addr:country_code", "description": "Used to determine the country a place is in."},
+      { "key" : "addr:housenumber", "description": "House number of the place (no ranges)."},
+      { "key" : "addr:conscriptionnumber", "description": "House number of the place (Eastern European system)."},
+      { "key" : "addr:streetnumber", "description": "House number of the place (Eastern European system)."},
+      { "key" : "addr:interpolation", "description": "Way along which house numbers are interpolated."} ,
+      { "key" : "tiger:county", "description": "Used to determine the address in the US (needs a place with the same name and a county suffix)."},
+      { "key" : "addr:suburb", "description": "Used to determine the address of a place."},
+      { "key" : "addr:city", "description": "Used to determine the address of a place."},
+      { "key" : "addr:state_code", "description": "Used to determine the address of a place."},
+      { "key" : "addr:state", "description": "Used to determine the address of a place."},
+      { "key" : "addr:province", "description": "Used to determine the address of a place."},
+      { "key" : "addr:district", "description": "Used to determine the address of a place."},
+      { "key" : "addr:region", "description": "Used to determine the address of a place."},
+      { "key" : "addr:county", "description": "Used to determine the address of a place."},
+      { "key" : "addr:municipality", "description": "Used to determine the address of a place."},
+      { "key" : "addr:hamlet", "description": "Used to determine the address of a place."},
+      { "key" : "addr:village", "description": "Used to determine the address of a place."},
+      { "key" : "addr:subdistrict", "description": "Used to determine the address of a place."},
+      { "key" : "addr:town", "description": "Used to determine the address of a place."},
+      { "key" : "addr:neighbourhood", "description": "Used to determine the address of a place."},
+      { "key" : "addr:quarter", "description": "Used to determine the address of a place."},
+      { "key" : "addr:parish", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:suburb", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:city", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:state_code", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:state", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:province", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:district", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:region", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:county", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:municipality", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:hamlet", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:village", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:subdistrict", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:town", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:neighbourhood", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:quarter", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:parish", "description": "Used to determine the address of a place."},
+      { "key" : "admin_level", "description": "Determines the hierarchy for administrative boundaries."},
+      { "key" : "wikipedia", "description": "Linking to the right wikipedia article helps to guess the importance of a place, which determines how far up in the search results it appears."}
+   ]
+}
diff --git a/lib-sql/functions/address_lookup.sql b/lib-sql/functions/address_lookup.sql

index 26ce20738d301d4e3b33c43ff1571fd52a4e9a45..cba11dbf3400d7fcc45a74f9c7fb85ed1a2f4c69 100644 (file)
--- a/lib-sql/functions/address_lookup.sql
+++ b/lib-sql/functions/address_lookup.sql
@@ -232,7 +232,7 @@ BEGIN
    FOR location IN
      SELECT placex.place_id, osm_type, osm_id, name, class, type,
             coalesce(extratags->'linked_place', extratags->'place') as place_type,
-           admin_level, fromarea, isaddress,
+           admin_level, fromarea, isaddress and linked_place_id is NULL as isaddress,
             CASE WHEN rank_address = 11 THEN 5 ELSE rank_address END as rank_address,
             distance, country_code, postcode
        FROM place_addressline join placex on (address_place_id = placex.place_id)
diff --git a/munin/nominatim_query_speed_querylog b/munin/nominatim_query_speed_querylog

new file mode 100755 (executable)

index 0000000..f35793f
--- /dev/null
+++ b/munin/nominatim_query_speed_querylog
@@ -0,0 +1,163 @@
+#!/usr/bin/python3
+#
+# Plugin to monitor the response times of requests made to the API
+#
+# Uses the query log.
+#
+# Parameters:
+#
+#       config   (required)
+#       autoconf (optional - used by munin-config)
+#
+
+import re
+import os
+import sys
+from datetime import datetime, timedelta
+
+CONFIG="""graph_title Total Nominatim response time
+graph_vlabel Time to response
+graph_category Nominatim 
+graph_period minute
+graph_args --base 1000
+
+avgs.label Average search time
+avgs.draw LINE
+avgs.type GAUGE
+avgs.min 0
+avgs.info Moving 5 minute average time to perform search
+
+avgr.label Average reverse time
+avgr.draw LINE
+avgr.type GAUGE
+avgr.min 0
+avgr.info Moving 5 minute average time to perform reverse
+
+max.label Slowest time to response (1/100)
+max.draw LINE
+max.type GAUGE
+max.min 0
+max.info Slowest query in last 5 minutes (unit: 100s)"""
+
+ENTRY_REGEX = re.compile(r'\[[^]]+\] (?P<dur>[0-9.]+) (?P<numres>\d+) (?P<type>[a-z]+) ')
+TIME_REGEX = re.compile(r'\[(?P<t_year>\d\d\d\d)-(?P<t_month>\d\d)-(?P<t_day>\d\d) (?P<t_hour>\d\d):(?P<t_min>\d\d):(?P<t_sec>\d\d)[0-9.]*\] ')
+
+
+class LogFile:
+    """ A query log file, unpacked. """
+
+    def __init__(self, filename):
+        self.fd = open(filename, encoding='utf-8', errors='replace')
+        self.len = os.path.getsize(filename)
+
+    def __del__(self):
+        self.fd.close()
+
+    def seek_next(self, abstime):
+        self.fd.seek(abstime)
+        self.fd.readline()
+        l = self.fd.readline()
+        e = TIME_REGEX.match(l)
+        if e is None:
+            return None
+        e = e.groupdict()
+        return datetime(int(e['t_year']), int(e['t_month']), int(e['t_day']),
+                             int(e['t_hour']), int(e['t_min']), int(e['t_sec']))
+
+    def seek_to_date(self, target):
+        """ Move the file position to approximately the entries logged
+            at `target` (a datetime).
+
+            The offset is estimated by interpolating between timestamps
+            read near the start and the end of the file and then narrowing
+            that window step by step.
+
+            Returns True when a position within one second of `target` (or
+            a window smaller than 500 bytes) was reached, or when the first
+            timestamp read is already later than `target`. Returns False
+            when no timestamp could be read or the log ends before `target`.
+        """
+        # start position for binary search
+        fromseek = 0
+        fromdate = self.seek_next(0)
+        if fromdate is None:
+            # The probed line carries no timestamp (e.g. an empty or
+            # single-line file); comparing None with a datetime would raise.
+            return False
+        if fromdate > target:
+            return True
+        # end position for binary search
+        toseek = -100
+        # Stays None when the file is too short for the loop below to run.
+        todate = None
+        while -toseek < self.len:
+            todate = self.seek_next(self.len + toseek)
+            if todate is not None:
+                break
+            toseek -= 100
+        if todate is None or todate < target:
+            return False
+        toseek = self.len + toseek
+
+        while True:
+            # bytes of log written per second within the current window
+            bps = (toseek - fromseek) / (todate - fromdate).total_seconds()
+            newseek = fromseek + int((target - fromdate).total_seconds() * bps)
+            newdate = self.seek_next(newseek)
+            if newdate is None:
+                return False
+            error = abs((target - newdate).total_seconds())
+            if error < 1:
+                return True
+            if newdate > target:
+                # Probe is past the target: it becomes the upper bound. Then
+                # walk the lower bound backwards in growing steps until it is
+                # before the target, never going below the previous bound.
+                toseek = newseek
+                todate = newdate
+                oldfromseek = fromseek
+                fromseek = toseek - error * bps
+                while True:
+                    if fromseek <= oldfromseek:
+                        fromseek = oldfromseek
+                        fromdate = self.seek_next(fromseek)
+                        break
+                    fromdate = self.seek_next(fromseek)
+                    if fromdate < target:
+                        break
+                    bps *=2
+                    fromseek -= error * bps
+            else:
+                # Probe is before the target: it becomes the lower bound. Then
+                # walk the upper bound forwards in growing steps until it is
+                # past the target, never going above the previous bound.
+                fromseek = newseek
+                fromdate = newdate
+                oldtoseek = toseek
+                toseek = fromseek + error * bps
+                while True:
+                    if toseek > oldtoseek:
+                        toseek = oldtoseek
+                        todate = self.seek_next(toseek)
+                        break
+                    todate = self.seek_next(toseek)
+                    if todate > target:
+                        break
+                    bps *=2
+                    toseek += error * bps
+            # Window is small enough; stop refining.
+            if toseek - fromseek < 500:
+                return True
+
+
+    def loglines(self):
+        for l in self.fd:
+            e = ENTRY_REGEX.match(l)
+            if e is not None:
+                yield e.groupdict()
+
+
+if __name__ == '__main__':
+
+    if len(sys.argv) > 1 and sys.argv[1] == 'config':
+        print(CONFIG)
+        sys.exit(0)
+
+    sumrev = 0
+    numrev = 0
+    sumsearch = 0
+    numsearch = 0
+    maxres = 0
+    if 'NOMINATIM_QUERYLOG' in os.environ:
+        lf = LogFile(os.environ['NOMINATIM_QUERYLOG'])
+        if lf.seek_to_date(datetime.now() - timedelta(minutes=5)):
+            for l in lf.loglines():
+                dur = float(l['dur'])
+                if l['type'] == 'reverse':
+                    numrev += 1
+                    sumrev += dur
+                elif  l['type'] == 'search':
+                    numsearch += 1
+                    sumsearch += dur
+                if dur > maxres:
+                    maxres = dur
+
+
+    print('avgs.value', 0 if numsearch == 0 else sumsearch/numsearch)
+    print('avgr.value', 0 if numrev == 0 else sumrev/numrev)
+    print('max.value', maxres/100.0)
diff --git a/munin/nominatim_requests_querylog b/munin/nominatim_requests_querylog

new file mode 100755 (executable)

index 0000000..8a103cf
--- /dev/null
+++ b/munin/nominatim_requests_querylog
@@ -0,0 +1,163 @@
+#!/usr/bin/python3
+#
+# Plugin to monitor the types of requests made to the API
+#
+# Uses the query log.
+#
+# Parameters: 
+#
+#       config   (required)
+#       autoconf (optional - used by munin-config)
+#
+
+import re
+import os
+import sys
+from datetime import datetime, timedelta
+
+# Munin graph definition, printed when the plugin is called with 'config'.
+# Every zN field declared here is reported as 'zN.value' at the end of the
+# script, so each series needs its own field name.
+CONFIG="""graph_title Requests by API call
+graph_args --base 1000 -l 0
+graph_vlabel requests per minute
+graph_category nominatim
+z1.label reverse
+z1.draw AREA
+z1.type GAUGE
+z2.label search (successful)
+z2.draw STACK
+z2.type GAUGE
+z3.label search (no result)
+z3.draw STACK
+z3.type GAUGE
+z4.label lookup
+z4.draw STACK
+z4.type GAUGE
+z5.label details
+z5.draw STACK
+z5.type GAUGE"""
+
+ENTRY_REGEX = re.compile(r'\[[^]]+\] (?P<dur>[0-9.]+) (?P<numres>\d+) (?P<type>[a-z]+) ')
+TIME_REGEX = re.compile(r'\[(?P<t_year>\d\d\d\d)-(?P<t_month>\d\d)-(?P<t_day>\d\d) (?P<t_hour>\d\d):(?P<t_min>\d\d):(?P<t_sec>\d\d)[0-9.]*\] ')
+
+
+class LogFile:
+    """ A query log file, unpacked. """
+
+    def __init__(self, filename):
+        self.fd = open(filename, encoding='utf-8', errors='replace')
+        self.len = os.path.getsize(filename)
+
+    def __del__(self):
+        self.fd.close()
+
+    def seek_next(self, abstime):
+        """ Jump to file offset `abstime`, discard the line found there
+            (it may be cut in half) and return the timestamp of the
+            following line as a datetime, or None when that line does not
+            start with a timestamp.
+        """
+        self.fd.seek(abstime)
+        self.fd.readline()
+        l = self.fd.readline()
+        e = TIME_REGEX.match(l)
+        if e is None:
+            return None
+        e = e.groupdict()
+        return datetime(int(e['t_year']), int(e['t_month']), int(e['t_day']),
+                             int(e['t_hour']), int(e['t_min']), int(e['t_sec']))
+
+    def seek_to_date(self, target):
+        """ Move the file position to approximately the entries logged
+            at `target` (a datetime).
+
+            The offset is estimated by interpolating between timestamps
+            read near the start and the end of the file and then narrowing
+            that window step by step.
+
+            Returns True when a position within one second of `target` (or
+            a window smaller than 500 bytes) was reached, or when the first
+            timestamp read is already later than `target`. Returns False
+            when no timestamp could be read or the log ends before `target`.
+        """
+        # start position for binary search
+        fromseek = 0
+        fromdate = self.seek_next(0)
+        if fromdate is None:
+            # The probed line carries no timestamp (e.g. an empty or
+            # single-line file); comparing None with a datetime would raise.
+            return False
+        if fromdate > target:
+            return True
+        # end position for binary search
+        toseek = -100
+        # Stays None when the file is too short for the loop below to run.
+        todate = None
+        while -toseek < self.len:
+            todate = self.seek_next(self.len + toseek)
+            if todate is not None:
+                break
+            toseek -= 100
+        if todate is None or todate < target:
+            return False
+        toseek = self.len + toseek
+
+        while True:
+            # bytes of log written per second within the current window
+            bps = (toseek - fromseek) / (todate - fromdate).total_seconds()
+            newseek = fromseek + int((target - fromdate).total_seconds() * bps)
+            newdate = self.seek_next(newseek)
+            if newdate is None:
+                return False
+            error = abs((target - newdate).total_seconds())
+            if error < 1:
+                return True
+            if newdate > target:
+                # Probe is past the target: it becomes the upper bound. Then
+                # walk the lower bound backwards in growing steps until it is
+                # before the target, never going below the previous bound.
+                toseek = newseek
+                todate = newdate
+                oldfromseek = fromseek
+                fromseek = toseek - error * bps
+                while True:
+                    if fromseek <= oldfromseek:
+                        fromseek = oldfromseek
+                        fromdate = self.seek_next(fromseek)
+                        break
+                    fromdate = self.seek_next(fromseek)
+                    if fromdate < target:
+                        break
+                    bps *=2
+                    fromseek -= error * bps
+            else:
+                # Probe is before the target: it becomes the lower bound. Then
+                # walk the upper bound forwards in growing steps until it is
+                # past the target, never going above the previous bound.
+                fromseek = newseek
+                fromdate = newdate
+                oldtoseek = toseek
+                toseek = fromseek + error * bps
+                while True:
+                    if toseek > oldtoseek:
+                        toseek = oldtoseek
+                        todate = self.seek_next(toseek)
+                        break
+                    todate = self.seek_next(toseek)
+                    if todate > target:
+                        break
+                    bps *=2
+                    toseek += error * bps
+            # Window is small enough; stop refining.
+            if toseek - fromseek < 500:
+                return True
+
+
+    def loglines(self):
+        """ Yield the named groups of every line from the current file
+            position onwards that matches ENTRY_REGEX.
+        """
+        for l in self.fd:
+            e = ENTRY_REGEX.match(l)
+            if e is not None:
+                yield e.groupdict()
+
+
+if __name__ == '__main__':
+
+    if len(sys.argv) > 1 and sys.argv[1] == 'config':
+        print(CONFIG)
+        sys.exit(0)
+
+    reverse = 0
+    searchy = 0
+    searchn = 0
+    lookup = 0
+    details = 0
+    if 'NOMINATIM_QUERYLOG' in os.environ:
+        lf = LogFile(os.environ['NOMINATIM_QUERYLOG'])
+        if lf.seek_to_date(datetime.now() - timedelta(minutes=5)):
+            for l in lf.loglines():
+                if l['type'] == 'reverse':
+                    reverse += 1
+                elif  l['type'] == 'search':
+                    # split searches by whether they returned any result
+                    if l['numres'] == '0':
+                        searchn += 1
+                    else:
+                        searchy += 1
+                elif  l['type'] == 'place':
+                    lookup +=1
+                else:
+                    # every other request type is counted as 'details'
+                    details += 1
+
+
+    # Counts cover the last 5 minutes; divide to report requests per minute.
+    print('z1.value', reverse/5)
+    print('z2.value', searchy/5)
+    print('z3.value', searchn/5)
+    print('z4.value', lookup/5)
+    print('z5.value', details/5)
diff --git a/munin/nominatim_throttled_ips b/munin/nominatim_throttled_ips

new file mode 100755 (executable)

index 0000000..a56ff31
--- /dev/null
+++ b/munin/nominatim_throttled_ips
@@ -0,0 +1,28 @@
+#!/bin/sh
+#
+# Plugin to monitor the number of IPs in special pools
+#
+# Parameters: 
+#
+#       config   (required)
+#       autoconf (optional - used by munin-config)
+#
+ 
+if [ "$1" = "config" ]; then
+ 
+        echo 'graph_title Restricted IPs' 
+        echo 'graph_args -l 0'
+        echo 'graph_vlabel number of IPs'
+        echo 'graph_category nominatim'
+        echo 'bulk.label bulk'
+        echo 'bulk.draw AREA'
+        echo 'bulk.type GAUGE'
+        echo 'block.label blocked'
+        echo 'block.draw STACK'
+        echo 'block.type GAUGE'
+        exit 0
+fi
+ 
+BASEDIR="$(dirname "$(readlink -f "$0")")"
+# Count lines per value of the 2nd space-separated field and print "<value>.value <count>".
+cut -f 2 -d ' ' "$BASEDIR/../../bin/settings/ip_blocks.map" | sort | uniq -c | sed 's:[[:space:]]*\([0-9]\+\) \(.*\):\2.value \1:'
diff --git a/packaging/nominatim-api/pyproject.toml b/packaging/nominatim-api/pyproject.toml

index ca86f8a7e04be80dfe9d6e13158da6e3c39451dc..42eb93fcfbaf9e166e05a4e287d9155d6c868a65 100644 (file)
--- a/packaging/nominatim-api/pyproject.toml
+++ b/packaging/nominatim-api/pyproject.toml
@@ -1,5 +1,6 @@
  [project]
  name = "nominatim-api"
+version = "4.5.0.post3"
  description = "A tool for building a database of OpenStreetMap for geocoding and for searching the database. Search library."
  readme = "README.md"
  requires-python = ">=3.7"
@@ -15,13 +16,15 @@ classifiers = [
      "Operating System :: OS Independent",
  ]
  dependencies = [
-    "python-dotenv",
-    "pyYAML>=5.1",
-    "SQLAlchemy>=1.4.31",
-    "psycopg",
-    "PyICU"
+    "python-dotenv==1.0.1",
+    "pyYAML==6.0.2",
+    "SQLAlchemy==2.0.36",
+    "psycopg[binary]==3.2.3",
+    "PyICU==2.14",
+    "falcon==4.0.2",
+    "uvicorn==0.32.0",
+    "gunicorn==23.0.0"
  ]
-dynamic = ["version"]
  
  [project.urls]
  Homepage = "https://nominatim.org"
diff --git a/packaging/nominatim-db/pyproject.toml b/packaging/nominatim-db/pyproject.toml

index 841845f036f20e3ad5db19ba9f8c62fcfe44b4c4..782ce921719d5302c35caffce6a2358d877c69b6 100644 (file)
--- a/packaging/nominatim-db/pyproject.toml
+++ b/packaging/nominatim-db/pyproject.toml
@@ -1,5 +1,6 @@
  [project]
  name = "nominatim-db"
+version = "4.5.0.post3"
  description = "A tool for building a database of OpenStreetMap for geocoding and for searching the database. Database backend."
  readme = "README.md"
  requires-python = ">=3.7"
@@ -15,15 +16,15 @@ classifiers = [
      "Operating System :: OS Independent",
  ]
  dependencies = [
-    "psycopg",
-    "python-dotenv",
-    "jinja2",
-    "pyYAML>=5.1",
-    "datrie",
-    "psutil",
-    "PyICU"
+    "psycopg[binary]==3.2.3",
+    "python-dotenv==1.0.1",
+    "jinja2==3.1.4",
+    "pyYAML==6.0.2",
+    "datrie==0.8.2",
+    "psutil==6.1.0",
+    "PyICU==2.14",
+    "osmium==4.0.2",
  ]
-dynamic = ["version"]
  
  [project.urls]
  Homepage = "https://nominatim.org"
diff --git a/settings/flex-base.lua b/settings/flex-base.lua

index 4d960d7267a3cc502c57938a2ebb12057ca95b31..7860737f7e6ef267a9bc8e27094ac1267f64e057 100644 (file)
--- a/settings/flex-base.lua
+++ b/settings/flex-base.lua
@@ -227,7 +227,7 @@ function Place:write_row(k, v, save_extra_mains)
      if self.geometry == nil then
          self.geometry = self.geom_func(self.object)
      end
-    if self.geometry:is_null() then
+    if self.geometry == nil or self.geometry:is_null() then
          return 0
      end
  
@@ -408,6 +408,9 @@ function module.process_way(object)
  
          if geom:is_null() then
              geom = o:as_linestring()
+            if not geom:is_null() and geom:length() > 30 then
+              return nil
+            end
          end
  
          return geom
diff --git a/src/nominatim_api/search/db_search_builder.py b/src/nominatim_api/search/db_search_builder.py

index 632270ef04176f394a10e29d9397141bdeb5a457..1fbb7168bb44a963f31e83bfd99f6f534bcf9be5 100644 (file)
--- a/src/nominatim_api/search/db_search_builder.py
+++ b/src/nominatim_api/search/db_search_builder.py
@@ -215,13 +215,13 @@ class SearchBuilder:
              yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens)
              return
  
-        addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 30000
+        addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 50000
          # Partial term to frequent. Try looking up by rare full names first.
          name_fulls = self.query.get_tokens(name, TokenType.WORD)
          if name_fulls:
              fulls_count = sum(t.count for t in name_fulls)
  
-            if fulls_count < 50000 or addr_count < 30000:
+            if fulls_count < 80000 or addr_count < 50000:
                  yield penalty, fulls_count / (2**len(addr_tokens)), \
                      self.get_full_name_ranking(name_fulls, addr_partials,
                                                 fulls_count > 30000 / max(1, len(addr_tokens)))
@@ -269,12 +269,7 @@ class SearchBuilder:
          # This might yield wrong results, nothing we can do about that.
          if use_lookup:
              addr_restrict_tokens = []
-            addr_lookup_tokens = []
-            for t in addr_partials:
-                if t.addr_count > 20000:
-                    addr_restrict_tokens.append(t.token)
-                else:
-                    addr_lookup_tokens.append(t.token)
+            addr_lookup_tokens = [t.token for t in addr_partials]
          else:
              addr_restrict_tokens = [t.token for t in addr_partials]
              addr_lookup_tokens = []
diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py

index fa14531aed0d6c07cf79c277255324495b1b063d..c18dd8be62ed1190284e9c0751464b5e54091a47 100644 (file)
--- a/src/nominatim_api/search/icu_tokenizer.py
+++ b/src/nominatim_api/search/icu_tokenizer.py
@@ -202,7 +202,12 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
              standardized form search will work with. All information removed
              at this stage is inevitably lost.
          """
-        return cast(str, self.normalizer.transliterate(text))
+        norm = cast(str, self.normalizer.transliterate(text))
+        numspaces = norm.count(' ')
+        if numspaces > 4 and len(norm) <= (numspaces + 1) * 3:
+            return ''
+
+        return norm
  
      def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
          """ Transliterate the phrases and split them into tokens.
diff --git a/src/nominatim_db/tokenizer/icu_tokenizer.py b/src/nominatim_db/tokenizer/icu_tokenizer.py

index 83928644a9c3a9964e26af05c81ef061b8cfeb05..19818adb9d3df610ed04ec39b547cf99d5adc590 100644 (file)
--- a/src/nominatim_db/tokenizer/icu_tokenizer.py
+++ b/src/nominatim_db/tokenizer/icu_tokenizer.py
@@ -182,6 +182,7 @@ class ICUTokenizer(AbstractTokenizer):
                                              END) as info
                                      FROM word LEFT JOIN word_frequencies wf
                                           ON word.word_id = wf.id
+                                    ORDER BY word_id
                                  """)
                      drop_tables(conn, 'word_frequencies')
  
diff --git a/utils/cron_ipanalyse.py b/utils/cron_ipanalyse.py

new file mode 100755 (executable)

index 0000000..97bad8d
--- /dev/null
+++ b/utils/cron_ipanalyse.py
@@ -0,0 +1,402 @@
+#!/usr/bin/python3
+#
+# Search apache logs for high-bandwidth users and create a list of suspicious IPs.
+# There are three states: bulk, block, ban. The first are bulk requesters
+# that need throttling, the second bulk requesters that have overdone it
+# and the last manually banned IPs.
+#
+
+import re
+import os
+import sys
+import subprocess
+from datetime import datetime, timedelta
+from collections import defaultdict
+
+#
+# DEFAULT SETTINGS
+#
+# Copy into settings/ip_blocks.conf and adapt as required.
+#
+BASEDIR = os.path.normpath(os.path.join(os.path.realpath(__file__), '../..'))
+BLOCKEDFILE= BASEDIR + '/settings/ip_blocks.map'
+LOGFILE= BASEDIR + '/log/restricted_ip.log'
+
+# space-separated list of IPs that are never banned
+WHITELIST = ''
+# space-separated list of IPs manually blocked
+BLACKLIST = ''
+# user-agents that should be blocked from bulk mode
+# (matched with startswith)
+UA_BLOCKLIST = ()
+
+# time before an automatically blocked IP is allowed back
+BLOCKCOOLOFF_DELTA=timedelta(hours=1)
+# quiet time before an IP is released from the bulk pool
+BULKCOOLOFF_DELTA=timedelta(minutes=15)
+# time to check if new accesses appear despite being blocked
+BLOCKCHECK_DELTA=timedelta(minutes=1)
+
+BULKLONG_LIMIT=8000
+BULKSHORT_LIMIT=2000
+BLOCK_UPPER=19000
+BLOCK_LOWER=4000
+BLOCK_LOADFAC=380
+BULK_LOADFAC=160
+BULK_LOWER=1500
+MAX_BULK_IPS=85
+
+#
+# END OF DEFAULT SETTINGS
+#
+
+try:
+    with open(BASEDIR + "/settings/ip_blocks.conf") as f:
+        code = compile(f.read(), BASEDIR + "/settings/ip_blocks.conf", 'exec')
+        exec(code)
+except IOError:
+    pass
+
+BLOCK_LIMIT = BLOCK_LOWER
+
+time_regex = r'(?P<t_day>\d\d)/(?P<t_month>[A-Za-z]+)/(?P<t_year>\d\d\d\d):(?P<t_hour>\d\d):(?P<t_min>\d\d):(?P<t_sec>\d\d) [+-]\d\d\d\d'
+
+format_pat= re.compile(r'(?P<ip>[a-f\d\.:]+) - - \['+ time_regex + r'] "(?P<query>.*?)" (?P<return>\d+) (?P<bytes>\d+) "(?P<referer>.*?)" "(?P<ua>.*?)"')
+time_pat= re.compile(r'[a-f\d:\.]+ - - \[' + time_regex + '\] ')
+
+logtime_pat = "%d/%b/%Y:%H:%M:%S %z"
+
+MONTHS = { 'Jan' : 1, 'Feb' : 2, 'Mar' : 3, 'Apr' : 4, 'May' : 5, 'Jun' : 6,
+           'Jul' : 7, 'Aug' : 8, 'Sep' : 9, 'Oct' : 10, 'Nov' : 11, 'Dec' : 12 }
+
+class LogEntry:
+    def __init__(self, logline):
+        e = format_pat.match(logline)
+        if e is None:
+            raise ValueError("Invalid log line:", logline)
+        e = e.groupdict()
+        self.ip = e['ip']
+        self.date = datetime(int(e['t_year']), MONTHS[e['t_month']], int(e['t_day']),
+                             int(e['t_hour']), int(e['t_min']), int(e['t_sec']))
+        qp = e['query'].split(' ', 2) 
+        if len(qp) < 2:
+            self.request = None
+            self.query = None
+        else:
+            self.query = qp[1]
+            if qp[0] == 'OPTIONS':
+                self.request = None
+            else:
+                if '/?' in qp[1]:
+                    self.request = 'S'
+                elif '/search' in qp[1]:
+                    self.request = 'S'
+                elif '/reverse' in qp[1]:
+                    self.request = 'R'
+                elif '/details' in qp[1]:
+                    self.request = 'D'
+                elif '/lookup' in qp[1]:
+                    self.request = 'L'
+                else:
+                    self.request = None
+        self.query = e['query']
+        self.retcode = int(e['return'])
+        self.referer = e['referer'] if e['referer'] != '-' else None
+        self.ua = e['ua'] if e['ua'] != '-' else None
+
+    def get_log_time(logline):
+        e = format_pat.match(logline)
+        if e is None:
+            return None
+        e = e.groupdict()
+        #return datetime.strptime(e['time'], logtime_pat).replace(tzinfo=None)
+        return datetime(int(e['t_year']), MONTHS[e['t_month']], int(e['t_day']),
+                             int(e['t_hour']), int(e['t_min']), int(e['t_sec']))
+
+
+class LogFile:
+    """ An apache log file, unpacked. """
+
+    def __init__(self, filename):
+        self.fd = open(filename)
+        self.len = os.path.getsize(filename)
+
+    def __del__(self):
+        self.fd.close()
+
+    def seek_next(self, abstime):
+        self.fd.seek(abstime)
+        self.fd.readline()
+        l = self.fd.readline()
+        return LogEntry.get_log_time(l) if l is not None else None
+
+    def seek_to_date(self, target):
+        # start position for binary search
+        fromseek = 0
+        fromdate = self.seek_next(0)
+        if fromdate > target:
+            return True
+        # end position for binary search
+        toseek = -100
+        while -toseek < self.len:
+            todate = self.seek_next(self.len + toseek)
+            if todate is not None:
+                break
+            toseek -= 100
+        if todate is None or todate < target:
+            return False
+        toseek = self.len + toseek
+
+
+        while True:
+            bps = (toseek - fromseek) / (todate - fromdate).total_seconds()
+            newseek = fromseek + int((target - fromdate).total_seconds() * bps)
+            newdate = self.seek_next(newseek)
+            if newdate is None:
+                return False;
+            error = abs((target - newdate).total_seconds())
+            if error < 1:
+                return True
+            if newdate > target:
+                toseek = newseek
+                todate = newdate
+                oldfromseek = fromseek
+                fromseek = toseek - error * bps
+                while True:
+                    if fromseek <= oldfromseek:
+                        fromseek = oldfromseek
+                        fromdate = self.seek_next(fromseek)
+                        break
+                    fromdate = self.seek_next(fromseek)
+                    if fromdate < target:
+                        break;
+                    bps *=2
+                    fromseek -= error * bps
+            else:
+                fromseek = newseek
+                fromdate = newdate
+                oldtoseek = toseek
+                toseek = fromseek + error * bps
+                while True:
+                    if toseek > oldtoseek:
+                        toseek = oldtoseek
+                        todate = self.seek_next(toseek)
+                        break
+                    todate = self.seek_next(toseek)
+                    if todate > target:
+                        break
+                    bps *=2
+                    toseek += error * bps
+            if toseek - fromseek < 500:
+                return True
+
+
+    def loglines(self):
+        for l in self.fd:
+            try:
+                yield LogEntry(l)
+            except ValueError:
+                pass # ignore invalid lines
+
+class BlockList:
+
+    def __init__(self):
+        self.whitelist = set(WHITELIST.split()) if WHITELIST else set()
+        self.blacklist = set(BLACKLIST.split()) if BLACKLIST else set()
+        self.prevblocks = set()
+        self.prevbulks = set()
+
+        try:
+            fd = open(BLOCKEDFILE)
+            for line in fd:
+                ip, typ = line.strip().split(' ')
+                if ip not in self.blacklist:
+                    if typ == 'block':
+                        self.prevblocks.add(ip)
+                    elif typ == 'bulk':
+                        self.prevbulks.add(ip)
+            fd.close()
+        except IOError:
+            pass #ignore non-existing file
+
+
+class IPstats:
+
+    def __init__(self):
+        self.redirected = 0
+        self.short_total = 0
+        self.short_api = 0
+        self.long_total = 0
+        self.long_api = 0
+        self.block_total = 0
+        self.bad_ua = False
+
+    def add_long(self, logentry):
+        self.long_total += 1
+        if logentry.retcode == 301:
+            return
+        if logentry.request is not None:
+            self.long_api += 1
+        if not self.bad_ua:
+            if logentry.ua is None:
+                self.bad_ua = True
+
+    def add_short(self, logentry):
+        self.short_total += 1
+        if logentry.retcode == 301:
+            self.redirected += 1
+            return
+        if logentry.request is not None:
+            self.short_api += 1
+        self.add_long(logentry)
+
+    def add_block(self, logentry):
+        self.block_total += 1
+
+    def ignores_warnings(self, wasblocked):
+        return self.block_total > 5 or (wasblocked and self.redirected > 5)
+
+    def new_state(self, was_blocked, was_bulked):
+        if was_blocked:
+            # deblock only if the IP has been really quiet
+            # (properly catches the ones that simply ignore the HTTP error)
+            return None if self.long_total < 20 else 'block'
+        if self.long_api > BLOCK_UPPER \
+            or self.short_api > BLOCK_UPPER / 3 \
+            or (self.redirected > 100 and self.short_total == self.redirected):
+                # client totally overdoing it
+                return 'block'
+        if was_bulked:
+            if self.short_total < 20:
+                # client has stopped, debulk
+                return None
+            if self.long_api > BLOCK_LIMIT or self.short_api > BLOCK_LIMIT / 3:
+                # client is still hammering us, block
+                return 'emblock'
+            return 'bulk'
+
+        if self.long_api > BULKLONG_LIMIT or self.short_api > BULKSHORT_LIMIT:
+            #if self.bad_ua:
+            #    return 'uablock' # bad useragent
+            return 'bulk'
+
+        return None
+
+
+
+if __name__ == '__main__':
+    if len(sys.argv) < 2:
+        print("Usage: %s logfile startdate" % sys.argv[0])
+        sys.exit(-1)
+
+    if len(sys.argv) == 2:
+        dt = datetime.now() - BLOCKCOOLOFF_DELTA
+    else:
+        dt = datetime.strptime(sys.argv[2], "%Y-%m-%d %H:%M:%S")
+
+    if os.path.getsize(sys.argv[1]) < 2*1030*1024:
+        sys.exit(0) # not enough data
+
+    lf = LogFile(sys.argv[1])
+    if not lf.seek_to_date(dt):
+        sys.exit(0)
+
+    bl = BlockList()
+
+    shortstart = dt + BLOCKCOOLOFF_DELTA - BULKCOOLOFF_DELTA
+    blockstart = dt + BLOCKCOOLOFF_DELTA - BLOCKCHECK_DELTA
+    notlogged = bl.whitelist | bl.blacklist
+
+    stats = defaultdict(IPstats)
+
+    for l in lf.loglines():
+        if l.ip not in notlogged:
+            stats[l.ip].add_long(l)
+        if l.date > shortstart:
+            break
+
+    total200 = 0
+    for l in lf.loglines():
+        if l.ip not in notlogged:
+            stats[l.ip].add_short(l)
+        if l.request is not None and l.retcode == 200:
+            total200 += 1
+        if l.date > blockstart and l.retcode in (403, 429):
+            stats[l.ip].add_block(l)
+
+    # adapt limits according to CPU and DB load
+    fd = open("/proc/loadavg")
+    cpuload = int(float(fd.readline().split()[2]))
+    fd.close()
+    # check the number of excess connections to apache
+    dbcons = int(subprocess.check_output("netstat -s | grep 'connections established' | sed 's:^\s*::;s: .*::'", shell=True))
+    fpms = int(subprocess.check_output('ps -Af | grep php-fpm | wc -l', shell=True))
+    dbload = max(0, dbcons - fpms)
+
+    numbulks = len(bl.prevbulks)
+    BLOCK_LIMIT = max(BLOCK_LIMIT, BLOCK_UPPER - BLOCK_LOADFAC * dbload)
+    BULKLONG_LIMIT = max(BULK_LOWER, BULKLONG_LIMIT - BULK_LOADFAC * cpuload)
+    if numbulks > MAX_BULK_IPS:
+        BLOCK_LIMIT = max(3600, BLOCK_LOWER - (numbulks - MAX_BULK_IPS)*10)
+    # if the bulk pool is still empty, clients will be faster, avoid having
+    # them blocked in this case
+    if numbulks < 10:
+        BLOCK_UPPER *= 2
+        BLOCK_LIMIT = BLOCK_UPPER
+
+
+    # collecting statistics
+    unblocked = []
+    debulked = []
+    bulked = []
+    blocked = []
+    uablocked = []
+    emblocked = []
+    # write out new state file
+    fd = open(BLOCKEDFILE, 'w')
+    for k,v in stats.items():
+        wasblocked = k in bl.prevblocks
+        wasbulked = k in bl.prevbulks
+        state = v.new_state(wasblocked, wasbulked)
+        if state is not None:
+            if state == 'uablock':
+                uablocked.append(k)
+                state = 'block'
+            elif state == 'emblock':
+                emblocked.append(k)
+                state = 'block'
+            elif state == 'block':
+                if not wasblocked:
+                    blocked.append(k)
+            elif state == 'bulk':
+                if not wasbulked:
+                    bulked.append(k)
+            fd.write("%s %s\n" % (k, state))
+        else:
+            if wasblocked:
+                unblocked.append(k)
+            elif wasbulked:
+                debulked.append(k)
+    for i in bl.blacklist:
+        fd.write("%s ban\n" % i)
+    fd.close()
+
+    # TODO write logs (need to collect some statistics)
+    logstr = datetime.now().strftime('%d/%b/%Y:%H:%M:%S') + ' %s %s\n'
+    fd = open(LOGFILE, 'a')
+    if unblocked:
+        fd.write(logstr % ('unblocked:', ', '.join(unblocked)))
+    if debulked:
+        fd.write(logstr % (' debulked:', ', '.join(debulked)))
+    if bulked:
+        fd.write(logstr % ('new bulks:', ', '.join(bulked)))
+    if emblocked:
+        fd.write(logstr % ('dir.block:', ', '.join(emblocked)))
+    if uablocked:
+        fd.write(logstr % (' ua block:', ', '.join(uablocked)))
+    if blocked:
+        fd.write(logstr % ('new block:', ', '.join(blocked)))
+    #for k,v in stats.items():
+    #    if v.ignores_warnings(k in bl.prevblocks) and k not in notlogged and ':' not in k:
+    #        fd.write(logstr % ('Warning ignored:', k))
+    fd.close()
diff --git a/website/403.html b/website/403.html

new file mode 100644 (file)

index 0000000..8d8e323
--- /dev/null
+++ b/website/403.html
@@ -0,0 +1,23 @@
+<html>
+<head>
+<title>Access blocked</title>
+</head>
+<body>
+<h1>Access blocked</h1>
+
+<p>You have been blocked because you have violated the
+<a href="https://operations.osmfoundation.org/policies/nominatim/">usage policy</a>
+of OSM's Nominatim geocoding service. Please be aware that OSM's resources are
+limited and shared between many users. The usage policy is there to ensure that
+the service remains usable for everybody.</p>
+
+<p>Please review the terms and make sure that your
+software adheres to the terms. You should in particular verify that you have set a
+<b>custom HTTP referrer or HTTP user agent</b> that identifies your application, and
+that you are not overusing the service with massive bulk requests.</p>
+
+<p>If you feel that this block is unjustified or remains after you have adapted
+your usage, you may contact the Nominatim system administrator at
+nominatim@openstreetmap.org to have this block lifted.</p>
+</body>
+</html>
diff --git a/website/404-old-search-syntax.html b/website/404-old-search-syntax.html

new file mode 100644 (file)

index 0000000..a6ea5b7
--- /dev/null
+++ b/website/404-old-search-syntax.html
@@ -0,0 +1,24 @@
+<html>
+<head>
+<title>File Not Found</title>
+</head>
+<body>
+<h1>File not found: API no longer accessible via this URL</h1>
+
+<p>Using the URL <tt>/search/</tt> and <tt>/reverse/</tt> (with slashes)
+is no longer supported. Please use URLs as given in the documentation.</p>
+
+<p><b>Examples how to change the URL:</b></p>
+
+<p>You use: <tt>https://nominatim.openstreetmap.org/search/?q=Berlin</tt><br/>
+Change to: <tt>https://nominatim.openstreetmap.org/search?q=Berlin</tt>
+</p>
+
+<p>You use: <tt>https://nominatim.openstreetmap.org/search/US/Texas/Huston</tt><br/>
+Change to: <tt>https://nominatim.openstreetmap.org/search?q=Huston, Texas, US</tt>
+</p>
+
+<p>See <a href="https://github.com/osm-search/Nominatim/issues/3134">github issue #3134</a>
+for more details.</p>
+</body>
+</html>
diff --git a/website/509.html b/website/509.html

new file mode 100644 (file)

index 0000000..628c53b
--- /dev/null
+++ b/website/509.html
@@ -0,0 +1,12 @@
+<html>
+<head>
+<title>Bandwidth limit exceeded</title>
+</head>
+<body>
+<h1>Bandwidth limit exceeded</h1>
+
+<p>You have been temporarily blocked because you have been overusing OSM's geocoding service or because you have not provided sufficient identification of your application. This block will be automatically lifted after a while. Please take the time and adapt your scripts to reduce the number of requests and make sure that you send a valid UserAgent or Referer.</p>
+
+<p>For more information, consult the <a href="https://operations.osmfoundation.org/policies/nominatim/">usage policy</a> for the OSM Nominatim server.</p>
+</body>
+</html>
diff --git a/website/crossdomain.xml b/website/crossdomain.xml

new file mode 100644 (file)

index 0000000..963a682
--- /dev/null
+++ b/website/crossdomain.xml
@@ -0,0 +1,5 @@
+<?xml version="1.0"?>
+           <!DOCTYPE cross-domain-policy SYSTEM "http://www.macromedia.com/xml/dtds/cross-domain-policy.dtd">
+           <cross-domain-policy>
+           <allow-access-from domain="*" />
+           </cross-domain-policy> 
diff --git a/website/favicon.ico b/website/favicon.ico

new file mode 100644 (file)

index 0000000..0157ea0

Binary files /dev/null and b/website/favicon.ico differ
diff --git a/website/nominatim.xml b/website/nominatim.xml

new file mode 100644 (file)

index 0000000..28684b1
--- /dev/null
+++ b/website/nominatim.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/"
+                       xmlns:moz="http://www.mozilla.org/2006/browser/search/">
+       <ShortName>Nominatim</ShortName>
+       <LongName>Nominatim OSM Search</LongName>
+       <Description>Search for a place in OpenStreetMap Nominatim</Description>
+       <InputEncoding>UTF-8</InputEncoding>
+       <OutputEncoding>UTF-8</OutputEncoding>
+       <Url type="text/html" method="get" template="http://nominatim.openstreetmap.org/search?q={searchTerms}" />
+       <Query role="example" searchTerms="Reigate" />
+       <Developer>Brian Quinion</Developer>
+       <AdultContent>false</AdultContent>
+       <Attribution>Data &amp;copy; OpenStreetMap contributors, Some Rights Reserved. ODbL, http://www.osm.org/copyright.</Attribution>
+</OpenSearchDescription>
+
diff --git a/website/robots.txt b/website/robots.txt

new file mode 100644 (file)

index 0000000..9624d97
--- /dev/null
+++ b/website/robots.txt
@@ -0,0 +1,14 @@
+User-agent: ia_archiver
+Allow: /
+
+User-agent: *
+Disallow: /search.php
+Disallow: /search
+Disallow: /details.php
+Disallow: /details
+Disallow: /reverse.php
+Disallow: /reverse
+Disallow: /hierarchy
+Disallow: /hierarchy.php
+Disallow: /lookup
+Disallow: /lookup.php
diff --git a/website/taginfo.json b/website/taginfo.json

new file mode 100644 (file)

index 0000000..98f8b97
--- /dev/null
+++ b/website/taginfo.json
@@ -0,0 +1,112 @@
+{
+    "data_format": 1,
+    "data_url": "http://nominatim.openstreetmap.org/taginfo.json",
+    "project": {
+        "name": "Nominatim",
+        "description": "OSM search engine.",
+        "project_url": "http://nominatim.openstreetmap.org",
+        "doc_url": "http://wiki.osm.org/wiki/Nominatim",
+        "contact_name": "Sarah Hoffmann",
+        "contact_email": "lonvia@denofr.de"
+    },
+    "tags": [
+      { "key" : "ref", "description": "Searchable name of the place."},
+      { "key" : "int_ref", "description": "Searchable name of the place."},
+      { "key" : "nat_ref", "description": "Searchable name of the place."},
+      { "key" : "reg_ref", "description": "Searchable name of the place."},
+      { "key" : "loc_ref", "description": "Searchable name of the place."},
+      { "key" : "old_ref", "description": "Searchable name of the place."},
+      { "key" : "iata", "description": "Searchable name of the place."},
+      { "key" : "icao", "description": "Searchable name of the place."},
+      { "key" : "pcode", "description": "Searchable name of the place."},
+      { "key" : "name", "description": "Searchable name of the place."},
+      { "key" : "int_name", "description": "Searchable name of the place."},
+      { "key" : "nat_name", "description": "Searchable name of the place."},
+      { "key" : "reg_name", "description": "Searchable name of the place."},
+      { "key" : "loc_name", "description": "Searchable name of the place."},
+      { "key" : "old_name", "description": "Searchable name of the place."},
+      { "key" : "alt_name", "description": "Searchable name of the place."},
+      { "key" : "official_name", "description": "Searchable name of the place."},
+      { "key" : "place_name", "description": "Searchable name of the place."},
+      { "key" : "short_name", "description": "Searchable name of the place."},
+      { "key" : "addr:housename", "description": "Searchable name of the place."},
+      { "key" : "operator", "description": "Searchable name for amenities and shops." },
+      { "key" : "brand", "description": "Searchable name of POI places."},
+      { "key" : "bridge:name", "description" : "Searchable name for bridges."},
+      { "key" : "tunnel:name", "description" : "Searchable name for tunnels."},
+      { "key" : "emergency", "description": "POI in the search database." },
+      { "key" : "tourism", "description": "POI in the search database." },
+      { "key" : "historic", "description": "POI in the search database." },
+      { "key" : "military", "description": "POI in the search database." },
+      { "key" : "natural", "description": "POI in the search database." },
+      { "key" : "man_made", "description": "POI in the search database." },
+      { "key" : "mountain_pass", "description": "POI in the search database." },
+      { "key" : "highway", "description": "POI or street in the search database (not added are: 'no', 'turning_circle', 'traffic_signals', 'mini_roundabout', 'crossing' and traffic signs)." },
+      { "key" : "aerialway", "description": "POI in the search database (unless value is 'no', 'pylon')." },
+      { "key" : "aeroway", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "amenity", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "boundary", "description": "Area in the search database (used to compute addresses of other places)." },
+      { "key" : "bridge", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "craft", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "leisure", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "office", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "railway", "description": "Geographic feature in the search database (unless value is 'no')." },
+      { "key" : "landuse", "description": "Geographic feature in the search database (unless value is 'no')." },
+      { "key" : "shop", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "tunnel", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "waterway", "description": "Geographic feature in the search database (unless value is 'riverbank')."},
+      { "key" : "place", "description": "Settlement in the search database (used to compute addresses of other places)." },
+      { "key" : "postal_code", "description": "Postcode in the search database (used to compute postcodes of nearby places)." },
+      { "key" : "postcode", "description": "Postcode in the search database (used to compute postcodes of nearby places)." },
+      { "key" : "addr:postcode", "description": "Postcode in the search database (used to compute postcodes of nearby places)." },
+      { "key" : "tiger:zip_left", "description": "Postcode in the search database (used to compute postcodes of nearby places)." },
+      { "key" : "tiger:zip_right", "description": "Postcode in the search database (used to compute postcodes of nearby places)." },
+      { "key" : "addr:street", "description": "Used to determine the street of a house or POI. Note that a street with the same name must exist for the tag to be effective."},
+      { "key" : "addr:place", "description": "Used to determine the settlement of a house or POI with a street-less address."},
+      { "key" : "country_code", "description": "Used to determine the country a place is in."},
+      { "key" : "ISO3166-1", "description": "Used to determine the country a place is in."},
+      { "key" : "is_in:country_code", "description": "Used to determine the country a place is in."},
+      { "key" : "is_in:country", "description": "Used to determine the country a place is in."},
+      { "key" : "addr:country", "description": "Used to determine the country a place is in."},
+      { "key" : "addr:country_code", "description": "Used to determine the country a place is in."},
+      { "key" : "addr:housenumber", "description": "House number of the place (no ranges)."},
+      { "key" : "addr:conscriptionnumber", "description": "House number of the place (Eastern European system)."},
+      { "key" : "addr:streetnumber", "description": "House number of the place (Eastern European system)."},
+      { "key" : "addr:interpolation", "description": "Way along which house numbers are interpolated."} ,
+      { "key" : "tiger:county", "description": "Used to determine the address in the US (needs a place with the same name and a county suffix)."},
+      { "key" : "addr:suburb", "description": "Used to determine the address of a place."},
+      { "key" : "addr:city", "description": "Used to determine the address of a place."},
+      { "key" : "addr:state_code", "description": "Used to determine the address of a place."},
+      { "key" : "addr:state", "description": "Used to determine the address of a place."},
+      { "key" : "addr:province", "description": "Used to determine the address of a place."},
+      { "key" : "addr:district", "description": "Used to determine the address of a place."},
+      { "key" : "addr:region", "description": "Used to determine the address of a place."},
+      { "key" : "addr:county", "description": "Used to determine the address of a place."},
+      { "key" : "addr:municipality", "description": "Used to determine the address of a place."},
+      { "key" : "addr:hamlet", "description": "Used to determine the address of a place."},
+      { "key" : "addr:village", "description": "Used to determine the address of a place."},
+      { "key" : "addr:subdistrict", "description": "Used to determine the address of a place."},
+      { "key" : "addr:town", "description": "Used to determine the address of a place."},
+      { "key" : "addr:neighbourhood", "description": "Used to determine the address of a place."},
+      { "key" : "addr:quarter", "description": "Used to determine the address of a place."},
+      { "key" : "addr:parish", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:suburb", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:city", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:state_code", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:state", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:province", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:district", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:region", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:county", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:municipality", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:hamlet", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:village", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:subdistrict", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:town", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:neighbourhood", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:quarter", "description": "Used to determine the address of a place."},
+      { "key" : "is_in:parish", "description": "Used to determine the address of a place."},
+      { "key" : "admin_level", "description": "Determines the hierarchy for administrative boundaries."},
+      { "key" : "wikipedia", "description": "Linking to the right wikipedia article helps to guess the importance of a place, which determines how far up in the search results it appears."}
+   ]
+}
author	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 13 Nov 2024 18:35:54 +0000 (19:35 +0100)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 13 Nov 2024 18:35:54 +0000 (19:35 +0100)
CMakeLists.txt		patch \| blob \| history
cmake/tool-installed.tmpl		patch \| blob \| history
cmake/tool.tmpl		patch \| blob \| history
lib-php/website/403.html	[new file with mode: 0644]	patch \| blob
lib-php/website/509.html	[new file with mode: 0644]	patch \| blob
lib-php/website/crossdomain.xml	[new file with mode: 0644]	patch \| blob
lib-php/website/favicon.ico	[new file with mode: 0644]	patch \| blob
lib-php/website/nominatim.xml	[new file with mode: 0644]	patch \| blob
lib-php/website/robots.txt	[new file with mode: 0644]	patch \| blob
lib-php/website/taginfo.json	[new file with mode: 0644]	patch \| blob
lib-sql/functions/address_lookup.sql		patch \| blob \| history
munin/nominatim_query_speed_querylog	[new file with mode: 0755]	patch \| blob
munin/nominatim_requests_querylog	[new file with mode: 0755]	patch \| blob
munin/nominatim_throttled_ips	[new file with mode: 0755]	patch \| blob
packaging/nominatim-api/pyproject.toml		patch \| blob \| history
packaging/nominatim-db/pyproject.toml		patch \| blob \| history
settings/flex-base.lua		patch \| blob \| history
src/nominatim_api/search/db_search_builder.py		patch \| blob \| history
src/nominatim_api/search/icu_tokenizer.py		patch \| blob \| history
src/nominatim_db/tokenizer/icu_tokenizer.py		patch \| blob \| history
utils/cron_ipanalyse.py	[new file with mode: 0755]	patch \| blob
website/403.html	[new file with mode: 0644]	patch \| blob
website/404-old-search-syntax.html	[new file with mode: 0644]	patch \| blob
website/509.html	[new file with mode: 0644]	patch \| blob
website/crossdomain.xml	[new file with mode: 0644]	patch \| blob
website/favicon.ico	[new file with mode: 0644]	patch \| blob
website/nominatim.xml	[new file with mode: 0644]	patch \| blob
website/robots.txt	[new file with mode: 0644]	patch \| blob
website/taginfo.json	[new file with mode: 0644]	patch \| blob