Merge remote-tracking branch 'upstream/master'

author Sarah Hoffmann <lonvia@denofr.de>

Tue, 4 Aug 2020 12:32:09 +0000 (14:32 +0200)

committer Sarah Hoffmann <lonvia@denofr.de>

Tue, 4 Aug 2020 12:32:09 +0000 (14:32 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Tue, 4 Aug 2020 12:32:09 +0000 (14:32 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Tue, 4 Aug 2020 12:32:09 +0000 (14:32 +0200)
diff --git a/VAGRANT.md b/VAGRANT.md

index 4c8eb724e5fbad5c227b2aad111562cae22db1e7..b10a5ac779cc9e9fe756f7e375e838ad0a37a827 100644 (file)
--- a/VAGRANT.md
+++ b/VAGRANT.md
@@ -160,9 +160,9 @@ You can configure/download other Vagrant boxes from [https://app.vagrantup.com/b
  
  Let's say you have a Postgres database named `nominatim_it` on server `your-server.com` and port `5432`. The Postgres username is `postgres`. You can edit `settings/local.php` and point Nominatim to it.
  
-    pgsql://postgres@your-server.com:5432/nominatim_it
+    pgsql:host=your-server.com;port=5432;user=postgres;dbname=nominatim_it
      
-No data import necessary or restarting necessary.
+No data import or restarting necessary.
  
  If the Postgres installation is behind a firewall, you can try
  
diff --git a/docs/develop/Setup.md b/docs/develop/Setup.md

index 53a084d242b9730aa7a4a24af92760586d4a7b20..8a73280e89d21fef762608134563ad2507ffcd8f 100644 (file)
--- a/docs/develop/Setup.md
+++ b/docs/develop/Setup.md
@@ -12,6 +12,12 @@ move fast. We try to test against the newest stable PHP release and
  PHPUnit version even though we expect many Nominatim users will install
  older version on their production servers.
  
+ * Python 3 (https://www.python.org/)
+ * behave test framework >= 1.2.5 (https://github.com/behave/behave)
+ * nose (https://nose.readthedocs.org)
+ * pytidylib (http://countergram.com/open-source/pytidylib)
+ * psycopg2 (http://initd.org/psycopg/)
+
  #### Ubuntu 20
  
      sudo apt-get install -y phpunit php-codesniffer php-cgi
diff --git a/nominatim/indexer/__init__.py b/nominatim/indexer/__init__.py

new file mode 100644 (file)

index 0000000..e69de29
diff --git a/nominatim/indexer/progress.py b/nominatim/indexer/progress.py

new file mode 100644 (file)

index 0000000..456d3ea
--- /dev/null
+++ b/nominatim/indexer/progress.py
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim.
+# Copyright (C) 2020 Sarah Hoffmann
+
+import logging
+from datetime import datetime
+
+log = logging.getLogger()
+
+class ProgressLogger(object):
+    """ Tracks and prints progress for the indexing process.
+        `name` is the name of the indexing step being tracked.
+        `total` sets up the total number of items that need processing.
+        `log_interval` denotes the interval in seconds at which progres
+        should be reported.
+    """
+
+    def __init__(self, name, total, log_interval=1):
+        self.name = name
+        self.total_places = total
+        self.done_places = 0
+        self.rank_start_time = datetime.now()
+        self.next_info = 100 if log.isEnabledFor(logging.INFO) else total + 1
+
+    def add(self, num=1):
+        """ Mark `num` places as processed. Print a log message if the
+            logging is at least info and the log interval has past.
+        """
+        self.done_places += num
+
+        if self.done_places >= self.next_info:
+            now = datetime.now()
+            done_time = (now - self.rank_start_time).total_seconds()
+            places_per_sec = self.done_places / done_time
+            eta = (self.total_places - self.done_places)/places_per_sec
+
+            log.info("Done {} in {} @ {:.3f} per second - {} ETA (seconds): {:.2f}"
+                     .format(self.done_places, int(done_time),
+                             places_per_sec, self.name, eta))
+
+            self.next_info += int(places_per_sec)
+
+    def done(self):
+        """ Print final staticstics about the progress.
+        """
+        rank_end_time = datetime.now()
+        diff_seconds = (rank_end_time-self.rank_start_time).total_seconds()
+
+        log.warning("Done {}/{} in {} @ {:.3f} per second - FINISHED {}\n".format(
+                    self.done_places, self.total_places, int(diff_seconds),
+                    self.done_places/diff_seconds, self.name))
diff --git a/nominatim/nominatim.py b/nominatim/nominatim.py

index b29bf343e035e02cdfd62a43e0e0d3a01ebebcd5..e8600ca8dc41bb69cd19491c8a2b5abc01c302d1 100755 (executable)
--- a/nominatim/nominatim.py
+++ b/nominatim/nominatim.py
@@ -32,6 +32,8 @@ import psycopg2
  from psycopg2.extras import wait_select
  import select
  
+from indexer.progress import ProgressLogger
+
  log = logging.getLogger()
  
  def make_connection(options, asynchronous=False):
@@ -55,24 +57,19 @@ class RankRunner(object):
      def name(self):
          return "rank {}".format(self.rank)
  
-    def sql_index_sectors(self):
-        return """SELECT geometry_sector, count(*) FROM placex
+    def sql_count_objects(self):
+        return """SELECT count(*) FROM placex
                    WHERE rank_search = {} and indexed_status > 0
-                  GROUP BY geometry_sector
-                  ORDER BY geometry_sector""".format(self.rank)
+               """.format(self.rank)
  
-    def sql_nosector_places(self):
+    def sql_get_objects(self):
          return """SELECT place_id FROM placex
                    WHERE indexed_status > 0 and rank_search = {}
                    ORDER BY geometry_sector""".format(self.rank)
  
-    def sql_sector_places(self):
-        return """SELECT place_id FROM placex
-                  WHERE indexed_status > 0 and rank_search = {}
-                        and geometry_sector = %s""".format(self.rank)
-
-    def sql_index_place(self):
-        return "UPDATE placex SET indexed_status = 0 WHERE place_id = %s"
+    def sql_index_place(self, ids):
+        return "UPDATE placex SET indexed_status = 0 WHERE place_id IN ({})"\
+               .format(','.join((str(i) for i in ids)))
  
  
  class InterpolationRunner(object):
@@ -83,25 +80,19 @@ class InterpolationRunner(object):
      def name(self):
          return "interpolation lines (location_property_osmline)"
  
-    def sql_index_sectors(self):
-        return """SELECT geometry_sector, count(*) FROM location_property_osmline
-                  WHERE indexed_status > 0
-                  GROUP BY geometry_sector
-                  ORDER BY geometry_sector"""
+    def sql_count_objects(self):
+        return """SELECT count(*) FROM location_property_osmline
+                  WHERE indexed_status > 0"""
  
-    def sql_nosector_places(self):
+    def sql_get_objects(self):
          return """SELECT place_id FROM location_property_osmline
                    WHERE indexed_status > 0
                    ORDER BY geometry_sector"""
  
-    def sql_sector_places(self):
-        return """SELECT place_id FROM location_property_osmline
-                  WHERE indexed_status > 0 and geometry_sector = %s
-                  ORDER BY geometry_sector"""
-
-    def sql_index_place(self):
+    def sql_index_place(self, ids):
          return """UPDATE location_property_osmline
-                  SET indexed_status = 0 WHERE place_id = %s"""
+                  SET indexed_status = 0 WHERE place_id IN ({})"""\
+               .format(','.join((str(i) for i in ids)))
  
  
  class DBConnection(object):
@@ -210,83 +201,48 @@ class Indexer(object):
              self.index(RankRunner(rank))
  
          if self.maxrank == 30:
-            self.index(InterpolationRunner())
+            self.index(InterpolationRunner(), 20)
  
-        self.index(RankRunner(self.maxrank))
+        self.index(RankRunner(self.maxrank), 20)
  
-    def index(self, obj):
+    def index(self, obj, batch=1):
          """ Index a single rank or table. `obj` describes the SQL to use
-            for indexing.
+            for indexing. `batch` describes the number of objects that
+            should be processed with a single SQL statement
          """
          log.warning("Starting {}".format(obj.name()))
  
-        cur = self.conn.cursor(name='main')
-        cur.execute(obj.sql_index_sectors())
+        cur = self.conn.cursor()
+        cur.execute(obj.sql_count_objects())
  
-        total_tuples = 0
-        for r in cur:
-            total_tuples += r[1]
-        log.debug("Total number of rows; {}".format(total_tuples))
+        total_tuples = cur.fetchone()[0]
+        log.debug("Total number of rows: {}".format(total_tuples))
  
-        cur.scroll(0, mode='absolute')
+        cur.close()
  
          next_thread = self.find_free_thread()
-        done_tuples = 0
-        rank_start_time = datetime.now()
-
-        sector_sql = obj.sql_sector_places()
-        index_sql = obj.sql_index_place()
-        min_grouped_tuples = total_tuples - len(self.threads) * 1000
+        progress = ProgressLogger(obj.name(), total_tuples)
  
-        next_info = 100 if log.isEnabledFor(logging.INFO) else total_tuples + 1
+        cur = self.conn.cursor(name='places')
+        cur.execute(obj.sql_get_objects())
  
-        for r in cur:
-            sector = r[0]
-
-            # Should we do the remaining ones together?
-            do_all = done_tuples > min_grouped_tuples
-
-            pcur = self.conn.cursor(name='places')
-
-            if do_all:
-                pcur.execute(obj.sql_nosector_places())
-            else:
-                pcur.execute(sector_sql, (sector, ))
-
-            for place in pcur:
-                place_id = place[0]
-                log.debug("Processing place {}".format(place_id))
-                thread = next(next_thread)
-
-                thread.perform(index_sql, (place_id,))
-                done_tuples += 1
+        while True:
+            places = [p[0] for p in cur.fetchmany(batch)]
+            if len(places) == 0:
+                break
  
-                if done_tuples >= next_info:
-                    now = datetime.now()
-                    done_time = (now - rank_start_time).total_seconds()
-                    tuples_per_sec = done_tuples / done_time
-                    log.info("Done {} in {} @ {:.3f} per second - {} ETA (seconds): {:.2f}"
-                           .format(done_tuples, int(done_time),
-                                   tuples_per_sec, obj.name(),
-                                   (total_tuples - done_tuples)/tuples_per_sec))
-                    next_info += int(tuples_per_sec)
+            log.debug("Processing places: {}".format(places))
+            thread = next(next_thread)
  
-            pcur.close()
-
-            if do_all:
-                break
+            thread.perform(obj.sql_index_place(places))
+            progress.add(len(places))
  
          cur.close()
  
          for t in self.threads:
              t.wait()
  
-        rank_end_time = datetime.now()
-        diff_seconds = (rank_end_time-rank_start_time).total_seconds()
-
-        log.warning("Done {}/{} in {} @ {:.3f} per second - FINISHED {}\n".format(
-                 done_tuples, total_tuples, int(diff_seconds),
-                 done_tuples/diff_seconds, obj.name()))
+        progress.done()
  
      def find_free_thread(self):
          """ Generator that returns the next connection that is free for
diff --git a/sql/partition-functions.src.sql b/sql/partition-functions.src.sql

index 34b4d390750798cb506f5fe11507535c0c20278e..17db9c1681060ab8f1d2fc2f667b8e595990d3d9 100644 (file)
--- a/sql/partition-functions.src.sql
+++ b/sql/partition-functions.src.sql
@@ -10,6 +10,28 @@ CREATE TYPE nearfeaturecentr AS (
    centroid GEOMETRY
  );
  
+        -- feature intersects geoemtry
+        -- for areas and linestrings they must touch at least along a line
+CREATE OR REPLACE FUNCTION is_relevant_geometry(de9im TEXT, geom_type TEXT)
+RETURNS BOOLEAN
+AS $$
+BEGIN
+  IF substring(de9im from 1 for 2) != 'FF' THEN
+    RETURN TRUE;
+  END IF;
+
+  IF geom_type = 'ST_Point' THEN
+    RETURN substring(de9im from 4 for 1) = '0';
+  END IF;
+
+  IF geom_type in ('ST_LineString', 'ST_MultiLineString') THEN
+    RETURN substring(de9im from 4 for 1) = '1';
+  END IF;
+
+  RETURN substring(de9im from 4 for 1) = '2';
+END
+$$ LANGUAGE plpgsql IMMUTABLE;
+
  create or replace function getNearFeatures(in_partition INTEGER, feature GEOMETRY, maxrank INTEGER, isin_tokens INT[]) RETURNS setof nearfeaturecentr AS $$
  DECLARE
    r nearfeaturecentr%rowtype;
@@ -20,7 +42,8 @@ BEGIN
      FOR r IN 
        SELECT place_id, keywords, rank_address, rank_search, min(ST_Distance(feature, centroid)) as distance, isguess, postcode, centroid
        FROM location_area_large_-partition-
-      WHERE ST_Intersects(geometry, feature)
+      WHERE geometry && feature
+        AND is_relevant_geometry(ST_Relate(geometry, feature), ST_GeometryType(feature))
          AND rank_search < maxrank AND rank_address < maxrank
        GROUP BY place_id, keywords, rank_address, rank_search, isguess, postcode, centroid
        ORDER BY rank_address, isin_tokens && keywords desc, isguess asc,
diff --git a/test/README.md b/test/README.md

index 1f7a33ca47f5b1c5d4ca541a4113ec860733da82..c2dee41a9ae73d13e8f298760c82793de4b1b1d1 100644 (file)
--- a/test/README.md
+++ b/test/README.md
@@ -3,17 +3,7 @@ This directory contains functional and unit tests for the Nominatim API.
  Prerequisites
  =============
  
- * Python 3 (https://www.python.org/)
- * behave test framework >= 1.2.5 (https://github.com/behave/behave)
- * nose (https://nose.readthedocs.org)
- * pytidylib (http://countergram.com/open-source/pytidylib)
- * psycopg2 (http://initd.org/psycopg/)
-
-To get the prerequisites on a a fresh Ubuntu LTS 16.04 run:
-
-     [sudo] apt-get install python3-dev python3-pip python3-psycopg2 python3-tidylib phpunit php-cgi
-     pip3 install --user behave nose
-
+See `docs/develop/Setup.md`
  
  Overall structure
  =================
@@ -106,34 +96,47 @@ be documented.
  ### API Tests (`test/bdd/api`)
  
  These tests are meant to test the different API endpoints and their parameters.
-They require a preimported test database, which consists of the import of a
-planet extract. A precompiled PBF with the necessary data can be downloaded from
-https://www.nominatim.org/data/test/nominatim-api-testdata.pbf
+They require a to import several datasets into a test database. You need at
+least 2GB RAM and 10GB discspace.
  
-You need at least 2GB RAM and 10GB discspace.
  
-The polygons defining the extract can be found in the test/testdb
-directory. There is also a reduced set of wikipedia data for this extract,
-which you need to import as well. For Tiger tests the data of South Dakota
-is required. Get the Tiger files `46*`.
+1. Fetch the OSM planet extract (200MB). See end of document how it got created.
  
+    ```
      cd Nominatim/data
+    mkdir testdb
+    cd testdb
+    wget https://www.nominatim.org/data/test/nominatim-api-testdata.pbf
+    ```
+
+2. Fetch `46*` (South Dakota) Tiger data
+
+    ```
+    cd Nominatim/data/testdb
      wget https://nominatim.org/data/tiger2018-nominatim-preprocessed.tar.gz
      tar xvf tiger2018-nominatim-preprocessed.tar.gz --wildcards --no-anchored '46*'
      rm tiger2018-nominatim-preprocessed.tar.gz
+    ```
  
-The official test dataset is derived from the 180924 planet. Newer
-planets are likely to work as well but you may see isolated test
-failures where the data has changed. To recreate the input data
-for the test database run:
+3. Adapt build/settings/local.php settings:
  
-    wget https://ftp5.gwdg.de/pub/misc/openstreetmap/planet.openstreetmap.org/pbf/planet-180924.osm.pbf
-    osmconvert planet-180924.osm.pbf -B=test/testdb/testdb.polys -o=testdb.pbf
+       ```
+    @define('CONST_Database_DSN', 'pgsql:dbname=test_api_nominatim');
+    @define('CONST_Use_US_Tiger_Data', true);
+    @define('CONST_Tiger_Data_Path', CONST_ExtraDataPath.'/testdb');
+    @define('CONST_Wikipedia_Data_Path', CONST_BasePath.'/test/testdb');
+    ```
  
-Before importing make sure to add the following to your local settings:
+4. Import
  
-    @define('CONST_Database_DSN', 'pgsql://@/test_api_nominatim');
-    @define('CONST_Wikipedia_Data_Path', CONST_BasePath.'/test/testdb');
+    ```
+       LOGFILE=/tmp/nominatim-testdb.$$.log
+    dropdb --if-exists test_api_nominatim
+    ./utils/setup.php --all --osm-file ../Nominatim/data/testdb/nominatim-api-testdb.pbf 2>&1 | tee $LOGFILE
+    ./utils/specialphrases.php --wiki-import > specialphrases.sql
+    psql -d test_api_nominatim -f specialphrases.sql 2>&1 | tee -a $LOGFILE
+    ./utils/setup.php --import-tiger-data 2>&1 | tee -a $LOGFILE
+    ```
  
  #### Code Coverage
  
@@ -168,3 +171,16 @@ needs superuser rights for postgres.
  
  These tests check that data is imported correctly into the place table. They
  use the same template database as the Indexing tests, so the same remarks apply.
+
+
+
+How the test database extract was generated
+-------------------------------------------
+The official test dataset was derived from the 180924 planet (note: such
+file no longer exists at https://planet.openstreetmap.org/planet/2018/).
+Newer planets are likely to work as well but you may see isolated test
+failures where the data has changed. To recreate the input data
+for the test database run:
+
+    wget https://ftp5.gwdg.de/pub/misc/openstreetmap/planet.openstreetmap.org/pbf/planet-180924.osm.pbf
+    osmconvert planet-180924.osm.pbf -B=test/testdb/testdb.polys -o=testdb.pbf
diff --git a/test/bdd/api/search/queries.feature b/test/bdd/api/search/queries.feature

index 57ae2c2038383349e8b7404e0017ca1cedd1455a..b521c1714f89a1afc8ef058f29fc851c71ef91b9 100644 (file)
--- a/test/bdd/api/search/queries.feature
+++ b/test/bdd/api/search/queries.feature
@@ -23,7 +23,6 @@ Feature: Search queries
            | type          | value |
            | house_number  | 86 |
            | road          | Schellingstraße |
-          | neighbourhood | Auenviertel |
            | suburb        | Eilbek |
            | postcode      | 22089 |
            | city          | Hamburg |
@@ -38,7 +37,6 @@ Feature: Search queries
            | type          | value |
            | house_number  | 73 |
            | road          | Schellingstraße |
-          | neighbourhood | Auenviertel |
            | suburb        | Eilbek |
            | postcode      | 22089 |
            | city          | Hamburg |
@@ -183,4 +181,4 @@ Feature: Search queries
          When sending json search query "22nd   Street Southwest\v,\fHuron"
          Then results contain
            | class   | type |
-          | highway | residential |
-\ No newline at end of file
+          | highway | residential |
diff --git a/test/bdd/db/import/addressing.feature b/test/bdd/db/import/addressing.feature

index f4297f30a34deda9024edf85e58385fbd5d7454a..ddb7b438be69e1f7db5fddcdbff7c25c2d89d687 100644 (file)
--- a/test/bdd/db/import/addressing.feature
+++ b/test/bdd/db/import/addressing.feature
@@ -21,6 +21,99 @@ Feature: Address computation
              | W1     | W10     | 10                  |
              | W1     | W11     | 10                  |
  
+
+    Scenario: Roads following a boundary should contain both states
+        Given the grid
+            | 1 |   |   | 2 |   | 3 |
+            |   |   | 8 | 7 |   |   |
+            | 4 |   |   | 5 |   | 6 |
+        And the named places
+            | osm | class   | type | geometry |
+            | W1  | highway | road | 2, 7, 8  |
+        And the named places
+            | osm | class    | type           | admin | geometry      |
+            | W10 | boundary | administrative | 5     | (1, 2, 5, 4, 1) |
+            | W11 | boundary | administrative | 5     | (2, 3, 6, 5, 2) |
+        When importing
+        Then place_addressline contains
+            | object | address | cached_rank_address |
+            | W1     | W10     | 10                  |
+            | W1     | W11     | 10                  |
+
+    Scenario: Roads should not contain boundaries they touch in a end point
+        Given the grid
+            | 1 |   |   | 2 |   | 3 |
+            |   | 7 |   | 8 |   |   |
+            | 4 |   |   | 5 |   | 6 |
+        And the named places
+            | osm | class   | type | geometry |
+            | W1  | highway | road | 7, 8     |
+        And the named places
+            | osm | class    | type           | admin | geometry      |
+            | W10 | boundary | administrative | 5     | (1, 2, 8, 5, 4, 1) |
+            | W11 | boundary | administrative | 5     | (2, 3, 6, 5, 8, 2) |
+        When importing
+        Then place_addressline contains
+            | object | address | cached_rank_address |
+            | W1     | W10     | 10                  |
+        Then place_addressline doesn't contain
+            | object | address |
+            | W1     | W11     |
+
+    Scenario: Roads should not contain boundaries they touch in a end point
+        Given the grid
+            | 1 |   |   | 2 |   | 3 |
+            |   | 7 |   | 8 |   |   |
+            | 4 |   | 9 | 5 |   | 6 |
+        And the named places
+            | osm | class   | type | geometry |
+            | W1  | highway | road | 7, 8, 9     |
+        And the named places
+            | osm | class    | type           | admin | geometry      |
+            | W10 | boundary | administrative | 5     | (1, 2, 8, 5, 4, 1) |
+            | W11 | boundary | administrative | 5     | (2, 3, 6, 5, 8, 2) |
+        When importing
+        Then place_addressline contains
+            | object | address | cached_rank_address |
+            | W1     | W10     | 10                  |
+        Then place_addressline doesn't contain
+            | object | address |
+            | W1     | W11     |
+
+    Scenario: Locality points should contain all boundaries they touch
+        Given the grid
+            | 1 |   |   | 2 |   | 3 |
+            |   |   |   | 8 |   |   |
+            | 4 |   |   | 5 |   | 6 |
+        And the named places
+            | osm | class | type     | geometry |
+            | N1  | place | locality | 8        |
+        And the named places
+            | osm | class    | type           | admin | geometry      |
+            | W10 | boundary | administrative | 5     | (1, 2, 8, 5, 4, 1) |
+            | W11 | boundary | administrative | 5     | (2, 3, 6, 5, 8, 2) |
+        When importing
+        Then place_addressline contains
+            | object | address | cached_rank_address |
+            | N1     | W10     | 10                  |
+            | N1     | W11     | 10                  |
+
+    Scenario: Areas should not contain boundaries they touch
+        Given the grid
+            | 1 |   |   | 2 |   | 3 |
+            |   |   |   |   |   |   |
+            | 4 |   |   | 5 |   | 6 |
+        And the named places
+            | osm | class    | type           | geometry      |
+            | W1  | landuse  | industrial     | (1, 2, 5, 4, 1) |
+        And the named places
+            | osm | class    | type           | admin | geometry      |
+            | W10 | boundary | administrative | 5     | (2, 3, 6, 5, 2) |
+        When importing
+        Then place_addressline doesn't contain
+            | object | address |
+            | W1     | W10     |
+
      Scenario: buildings with only addr:postcodes do not appear in the address of a way
          Given the scene admin-areas
          And the named places
author	Sarah Hoffmann <lonvia@denofr.de>
	Tue, 4 Aug 2020 12:32:09 +0000 (14:32 +0200)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Tue, 4 Aug 2020 12:32:09 +0000 (14:32 +0200)
VAGRANT.md		patch \| blob \| history
docs/develop/Setup.md		patch \| blob \| history
nominatim/indexer/__init__.py	[new file with mode: 0644]	patch \| blob
nominatim/indexer/progress.py	[new file with mode: 0644]	patch \| blob
nominatim/nominatim.py		patch \| blob \| history
sql/partition-functions.src.sql		patch \| blob \| history
test/README.md		patch \| blob \| history
test/bdd/api/search/queries.feature		patch \| blob \| history
test/bdd/db/import/addressing.feature		patch \| blob \| history