Let's say you have a Postgres database named `nominatim_it` on server `your-server.com` and port `5432`. The Postgres username is `postgres`. You can edit `settings/local.php` and point Nominatim to it:
- pgsql://postgres@your-server.com:5432/nominatim_it
+ pgsql:host=your-server.com;port=5432;user=postgres;dbname=nominatim_it
-No data import necessary or restarting necessary.
+No data import or restarting necessary.
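To check that the remote database is reachable with those credentials before editing the Nominatim configuration, a quick psycopg2 probe can help (a sketch only; the host, port, user and database name are the example values from above):

```python
# Connectivity probe for the remote Nominatim database (example values).
import psycopg2

conn = psycopg2.connect(host='your-server.com', port=5432,
                        user='postgres', dbname='nominatim_it')
with conn.cursor() as cur:
    cur.execute('SELECT count(*) FROM placex')  # placex is Nominatim's main place table
    print('placex rows:', cur.fetchone()[0])
conn.close()
```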
If the Postgres installation is behind a firewall, you can try
PHPUnit version even though we expect many Nominatim users will install
an older version on their production servers.
+ * Python 3 (https://www.python.org/)
+ * behave test framework >= 1.2.5 (https://github.com/behave/behave)
+ * nose (https://nose.readthedocs.org)
+ * pytidylib (http://countergram.com/open-source/pytidylib)
+ * psycopg2 (http://initd.org/psycopg/)
+
#### Ubuntu 20
sudo apt-get install -y phpunit php-codesniffer php-cgi
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim.
+# Copyright (C) 2020 Sarah Hoffmann
+
+import logging
+from datetime import datetime
+
+log = logging.getLogger()
+
+class ProgressLogger(object):
+ """ Tracks and prints progress for the indexing process.
+ `name` is the name of the indexing step being tracked.
+ `total` is the total number of items that need processing.
+ `log_interval` denotes the interval in seconds at which progress
+ should be reported.
+ """
+
+ def __init__(self, name, total, log_interval=1):
+ self.name = name
+ self.log_interval = log_interval
+ self.total_places = total
+ self.done_places = 0
+ self.rank_start_time = datetime.now()
+ self.next_info = 100 if log.isEnabledFor(logging.INFO) else total + 1
+
+ def add(self, num=1):
+ """ Mark `num` places as processed. Print a log message if the
+ logging is at least info and the log interval has passed.
+ """
+ self.done_places += num
+
+ if self.done_places >= self.next_info:
+ now = datetime.now()
+ done_time = (now - self.rank_start_time).total_seconds()
+ places_per_sec = self.done_places / done_time
+ eta = (self.total_places - self.done_places)/places_per_sec
+
+ log.info("Done {} in {} @ {:.3f} per second - {} ETA (seconds): {:.2f}"
+ .format(self.done_places, int(done_time),
+ places_per_sec, self.name, eta))
+
+ self.next_info += int(places_per_sec) * self.log_interval
+
+ def done(self):
+ """ Print final staticstics about the progress.
+ """
+ rank_end_time = datetime.now()
+ diff_seconds = (rank_end_time-self.rank_start_time).total_seconds()
+
+ log.warning("Done {}/{} in {} @ {:.3f} per second - FINISHED {}\n".format(
+ self.done_places, self.total_places, int(diff_seconds),
+ self.done_places/diff_seconds, self.name))
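A minimal usage sketch for the new class (the step name and counts are made up for illustration):

```python
import logging
import time
from indexer.progress import ProgressLogger

logging.basicConfig(level=logging.INFO)

progress = ProgressLogger('rank 26', total=5000)
for _ in range(250):
    time.sleep(0.01)   # stand-in for indexing a batch of 20 places
    progress.add(20)   # logs throughput and ETA at INFO level
progress.done()        # prints the final summary at WARNING level
```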
from psycopg2.extras import wait_select
import select
+from indexer.progress import ProgressLogger
+
log = logging.getLogger()
def make_connection(options, asynchronous=False):
def name(self):
return "rank {}".format(self.rank)
- def sql_index_sectors(self):
- return """SELECT geometry_sector, count(*) FROM placex
+ def sql_count_objects(self):
+ return """SELECT count(*) FROM placex
WHERE rank_search = {} and indexed_status > 0
- GROUP BY geometry_sector
- ORDER BY geometry_sector""".format(self.rank)
+ """.format(self.rank)
- def sql_nosector_places(self):
+ def sql_get_objects(self):
return """SELECT place_id FROM placex
WHERE indexed_status > 0 and rank_search = {}
ORDER BY geometry_sector""".format(self.rank)
- def sql_sector_places(self):
- return """SELECT place_id FROM placex
- WHERE indexed_status > 0 and rank_search = {}
- and geometry_sector = %s""".format(self.rank)
-
- def sql_index_place(self):
- return "UPDATE placex SET indexed_status = 0 WHERE place_id = %s"
+ def sql_index_place(self, ids):
+ return "UPDATE placex SET indexed_status = 0 WHERE place_id IN ({})"\
+ .format(','.join((str(i) for i in ids)))
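For illustration, the batched statement that replaces the per-place UPDATE is assembled like this (a standalone sketch of the same expression):

```python
# How the batched UPDATE is assembled (same expression as in sql_index_place).
ids = [100, 101, 102]
sql = "UPDATE placex SET indexed_status = 0 WHERE place_id IN ({})"\
      .format(','.join(str(i) for i in ids))
print(sql)
# UPDATE placex SET indexed_status = 0 WHERE place_id IN (100,101,102)
```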
class InterpolationRunner(object):
def name(self):
return "interpolation lines (location_property_osmline)"
- def sql_index_sectors(self):
- return """SELECT geometry_sector, count(*) FROM location_property_osmline
- WHERE indexed_status > 0
- GROUP BY geometry_sector
- ORDER BY geometry_sector"""
+ def sql_count_objects(self):
+ return """SELECT count(*) FROM location_property_osmline
+ WHERE indexed_status > 0"""
- def sql_nosector_places(self):
+ def sql_get_objects(self):
return """SELECT place_id FROM location_property_osmline
WHERE indexed_status > 0
ORDER BY geometry_sector"""
- def sql_sector_places(self):
- return """SELECT place_id FROM location_property_osmline
- WHERE indexed_status > 0 and geometry_sector = %s
- ORDER BY geometry_sector"""
-
- def sql_index_place(self):
+ def sql_index_place(self, ids):
return """UPDATE location_property_osmline
- SET indexed_status = 0 WHERE place_id = %s"""
+ SET indexed_status = 0 WHERE place_id IN ({})"""\
+ .format(','.join((str(i) for i in ids)))
class DBConnection(object):
self.index(RankRunner(rank))
if self.maxrank == 30:
- self.index(InterpolationRunner())
+ self.index(InterpolationRunner(), 20)
- self.index(RankRunner(self.maxrank))
+ self.index(RankRunner(self.maxrank), 20)
- def index(self, obj):
+ def index(self, obj, batch=1):
""" Index a single rank or table. `obj` describes the SQL to use
- for indexing.
+ for indexing. `batch` describes the number of objects that
+ should be processed with a single SQL statement.
"""
log.warning("Starting {}".format(obj.name()))
- cur = self.conn.cursor(name='main')
- cur.execute(obj.sql_index_sectors())
+ cur = self.conn.cursor()
+ cur.execute(obj.sql_count_objects())
- total_tuples = 0
- for r in cur:
- total_tuples += r[1]
- log.debug("Total number of rows; {}".format(total_tuples))
+ total_tuples = cur.fetchone()[0]
+ log.debug("Total number of rows: {}".format(total_tuples))
- cur.scroll(0, mode='absolute')
+ cur.close()
next_thread = self.find_free_thread()
- done_tuples = 0
- rank_start_time = datetime.now()
-
- sector_sql = obj.sql_sector_places()
- index_sql = obj.sql_index_place()
- min_grouped_tuples = total_tuples - len(self.threads) * 1000
+ progress = ProgressLogger(obj.name(), total_tuples)
- next_info = 100 if log.isEnabledFor(logging.INFO) else total_tuples + 1
+ cur = self.conn.cursor(name='places')
+ cur.execute(obj.sql_get_objects())
- for r in cur:
- sector = r[0]
-
- # Should we do the remaining ones together?
- do_all = done_tuples > min_grouped_tuples
-
- pcur = self.conn.cursor(name='places')
-
- if do_all:
- pcur.execute(obj.sql_nosector_places())
- else:
- pcur.execute(sector_sql, (sector, ))
-
- for place in pcur:
- place_id = place[0]
- log.debug("Processing place {}".format(place_id))
- thread = next(next_thread)
-
- thread.perform(index_sql, (place_id,))
- done_tuples += 1
+ while True:
+ places = [p[0] for p in cur.fetchmany(batch)]
+ if len(places) == 0:
+ break
- if done_tuples >= next_info:
- now = datetime.now()
- done_time = (now - rank_start_time).total_seconds()
- tuples_per_sec = done_tuples / done_time
- log.info("Done {} in {} @ {:.3f} per second - {} ETA (seconds): {:.2f}"
- .format(done_tuples, int(done_time),
- tuples_per_sec, obj.name(),
- (total_tuples - done_tuples)/tuples_per_sec))
- next_info += int(tuples_per_sec)
+ log.debug("Processing places: {}".format(places))
+ thread = next(next_thread)
- pcur.close()
-
- if do_all:
- break
+ thread.perform(obj.sql_index_place(places))
+ progress.add(len(places))
cur.close()
for t in self.threads:
t.wait()
- rank_end_time = datetime.now()
- diff_seconds = (rank_end_time-rank_start_time).total_seconds()
-
- log.warning("Done {}/{} in {} @ {:.3f} per second - FINISHED {}\n".format(
- done_tuples, total_tuples, int(diff_seconds),
- done_tuples/diff_seconds, obj.name()))
+ progress.done()
def find_free_thread(self):
""" Generator that returns the next connection that is free for
centroid GEOMETRY
);
+ -- feature intersects geometry
+ -- for areas and linestrings they must touch at least along a line
+CREATE OR REPLACE FUNCTION is_relevant_geometry(de9im TEXT, geom_type TEXT)
+RETURNS BOOLEAN
+AS $$
+BEGIN
+ IF substring(de9im from 1 for 2) != 'FF' THEN
+ RETURN TRUE;
+ END IF;
+
+ IF geom_type = 'ST_Point' THEN
+ RETURN substring(de9im from 4 for 1) = '0';
+ END IF;
+
+ IF geom_type in ('ST_LineString', 'ST_MultiLineString') THEN
+ RETURN substring(de9im from 4 for 1) = '1';
+ END IF;
+
+ RETURN substring(de9im from 4 for 1) = '2';
+END
+$$ LANGUAGE plpgsql IMMUTABLE;
+
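For readers less familiar with DE-9IM, this is the same decision logic transcribed to Python, purely as an illustration of what the intersection-matrix checks mean (not part of the patch):

```python
def is_relevant_geometry(de9im: str, geom_type: str) -> bool:
    """Python mirror of the SQL function above, for illustration only.

    de9im is the 9-character matrix returned by ST_Relate(geometry, feature);
    positions 0..8 hold the intersection dimensions II, IB, IE, BI, BB, BE,
    EI, EB, EE, where the first letter refers to the stored area (geometry)
    and the second to the searched feature.
    """
    if de9im[0:2] != 'FF':
        # The area's interior meets the feature -> always an address candidate.
        return True
    # Otherwise only the area's boundary is involved; position 3 is
    # boundary(area) x interior(feature).
    if geom_type == 'ST_Point':
        return de9im[3] == '0'   # the point lies exactly on the boundary
    if geom_type in ('ST_LineString', 'ST_MultiLineString'):
        return de9im[3] == '1'   # the line runs along the boundary
    # Area features would need a two-dimensional overlap here, so areas
    # that merely touch the boundary are filtered out.
    return de9im[3] == '2'
```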
create or replace function getNearFeatures(in_partition INTEGER, feature GEOMETRY, maxrank INTEGER, isin_tokens INT[]) RETURNS setof nearfeaturecentr AS $$
DECLARE
r nearfeaturecentr%rowtype;
FOR r IN
SELECT place_id, keywords, rank_address, rank_search, min(ST_Distance(feature, centroid)) as distance, isguess, postcode, centroid
FROM location_area_large_-partition-
- WHERE ST_Intersects(geometry, feature)
+ WHERE geometry && feature
+ AND is_relevant_geometry(ST_Relate(geometry, feature), ST_GeometryType(feature))
AND rank_search < maxrank AND rank_address < maxrank
GROUP BY place_id, keywords, rank_address, rank_search, isguess, postcode, centroid
ORDER BY rank_address, isin_tokens && keywords desc, isguess asc,
Prerequisites
=============
- * Python 3 (https://www.python.org/)
- * behave test framework >= 1.2.5 (https://github.com/behave/behave)
- * nose (https://nose.readthedocs.org)
- * pytidylib (http://countergram.com/open-source/pytidylib)
- * psycopg2 (http://initd.org/psycopg/)
-
-To get the prerequisites on a a fresh Ubuntu LTS 16.04 run:
-
- [sudo] apt-get install python3-dev python3-pip python3-psycopg2 python3-tidylib phpunit php-cgi
- pip3 install --user behave nose
-
+See `docs/develop/Setup.md`
Overall structure
=================
### API Tests (`test/bdd/api`)
These tests are meant to test the different API endpoints and their parameters.
-They require a preimported test database, which consists of the import of a
-planet extract. A precompiled PBF with the necessary data can be downloaded from
-https://www.nominatim.org/data/test/nominatim-api-testdata.pbf
+They require you to import several datasets into a test database. You need at
+least 2GB RAM and 10GB of disk space.
-You need at least 2GB RAM and 10GB discspace.
-The polygons defining the extract can be found in the test/testdb
-directory. There is also a reduced set of wikipedia data for this extract,
-which you need to import as well. For Tiger tests the data of South Dakota
-is required. Get the Tiger files `46*`.
+1. Fetch the OSM planet extract (200MB). See the end of this document for how it was created.
+ ```
cd Nominatim/data
+ mkdir testdb
+ cd testdb
+ wget https://www.nominatim.org/data/test/nominatim-api-testdata.pbf
+ ```
+
+2. Fetch the `46*` (South Dakota) Tiger data:
+
+ ```
+ cd Nominatim/data/testdb
wget https://nominatim.org/data/tiger2018-nominatim-preprocessed.tar.gz
tar xvf tiger2018-nominatim-preprocessed.tar.gz --wildcards --no-anchored '46*'
rm tiger2018-nominatim-preprocessed.tar.gz
+ ```
-The official test dataset is derived from the 180924 planet. Newer
-planets are likely to work as well but you may see isolated test
-failures where the data has changed. To recreate the input data
-for the test database run:
+3. Adapt build/settings/local.php settings:
- wget https://ftp5.gwdg.de/pub/misc/openstreetmap/planet.openstreetmap.org/pbf/planet-180924.osm.pbf
- osmconvert planet-180924.osm.pbf -B=test/testdb/testdb.polys -o=testdb.pbf
+ ```
+ @define('CONST_Database_DSN', 'pgsql:dbname=test_api_nominatim');
+ @define('CONST_Use_US_Tiger_Data', true);
+ @define('CONST_Tiger_Data_Path', CONST_ExtraDataPath.'/testdb');
+ @define('CONST_Wikipedia_Data_Path', CONST_BasePath.'/test/testdb');
+ ```
-Before importing make sure to add the following to your local settings:
+4. Import
- @define('CONST_Database_DSN', 'pgsql://@/test_api_nominatim');
- @define('CONST_Wikipedia_Data_Path', CONST_BasePath.'/test/testdb');
+ ```
+ LOGFILE=/tmp/nominatim-testdb.$$.log
+ dropdb --if-exists test_api_nominatim
+ ./utils/setup.php --all --osm-file ../Nominatim/data/testdb/nominatim-api-testdata.pbf 2>&1 | tee $LOGFILE
+ ./utils/specialphrases.php --wiki-import > specialphrases.sql
+ psql -d test_api_nominatim -f specialphrases.sql 2>&1 | tee -a $LOGFILE
+ ./utils/setup.php --import-tiger-data 2>&1 | tee -a $LOGFILE
+ ```
#### Code Coverage
These tests check that data is imported correctly into the place table. They
use the same template database as the Indexing tests, so the same remarks apply.
+
+
+
+How the test database extract was generated
+-------------------------------------------
+The official test dataset was derived from the 180924 planet (note: this
+file is no longer available at https://planet.openstreetmap.org/planet/2018/).
+Newer planets are likely to work as well but you may see isolated test
+failures where the data has changed. To recreate the input data
+for the test database run:
+
+ wget https://ftp5.gwdg.de/pub/misc/openstreetmap/planet.openstreetmap.org/pbf/planet-180924.osm.pbf
+ osmconvert planet-180924.osm.pbf -B=test/testdb/testdb.polys -o=testdb.pbf
| type | value |
| house_number | 86 |
| road | Schellingstraße |
- | neighbourhood | Auenviertel |
| suburb | Eilbek |
| postcode | 22089 |
| city | Hamburg |
| type | value |
| house_number | 73 |
| road | Schellingstraße |
- | neighbourhood | Auenviertel |
| suburb | Eilbek |
| postcode | 22089 |
| city | Hamburg |
When sending json search query "22nd Street Southwest\v,\fHuron"
Then results contain
| class | type |
- | highway | residential |
\ No newline at end of file
+ | highway | residential |
| W1 | W10 | 10 |
| W1 | W11 | 10 |
+
+ Scenario: Roads following a boundary should contain both states
+ Given the grid
+ | 1 | | | 2 | | 3 |
+ | | | 8 | 7 | | |
+ | 4 | | | 5 | | 6 |
+ And the named places
+ | osm | class | type | geometry |
+ | W1 | highway | road | 2, 7, 8 |
+ And the named places
+ | osm | class | type | admin | geometry |
+ | W10 | boundary | administrative | 5 | (1, 2, 5, 4, 1) |
+ | W11 | boundary | administrative | 5 | (2, 3, 6, 5, 2) |
+ When importing
+ Then place_addressline contains
+ | object | address | cached_rank_address |
+ | W1 | W10 | 10 |
+ | W1 | W11 | 10 |
+
+ Scenario: Roads should not contain boundaries they touch in an end point
+ Given the grid
+ | 1 | | | 2 | | 3 |
+ | | 7 | | 8 | | |
+ | 4 | | | 5 | | 6 |
+ And the named places
+ | osm | class | type | geometry |
+ | W1 | highway | road | 7, 8 |
+ And the named places
+ | osm | class | type | admin | geometry |
+ | W10 | boundary | administrative | 5 | (1, 2, 8, 5, 4, 1) |
+ | W11 | boundary | administrative | 5 | (2, 3, 6, 5, 8, 2) |
+ When importing
+ Then place_addressline contains
+ | object | address | cached_rank_address |
+ | W1 | W10 | 10 |
+ Then place_addressline doesn't contain
+ | object | address |
+ | W1 | W11 |
+
+ Scenario: Roads should not contain boundaries they touch in a middle point
+ Given the grid
+ | 1 | | | 2 | | 3 |
+ | | 7 | | 8 | | |
+ | 4 | | 9 | 5 | | 6 |
+ And the named places
+ | osm | class | type | geometry |
+ | W1 | highway | road | 7, 8, 9 |
+ And the named places
+ | osm | class | type | admin | geometry |
+ | W10 | boundary | administrative | 5 | (1, 2, 8, 5, 4, 1) |
+ | W11 | boundary | administrative | 5 | (2, 3, 6, 5, 8, 2) |
+ When importing
+ Then place_addressline contains
+ | object | address | cached_rank_address |
+ | W1 | W10 | 10 |
+ Then place_addressline doesn't contain
+ | object | address |
+ | W1 | W11 |
+
+ Scenario: Locality points should contain all boundaries they touch
+ Given the grid
+ | 1 | | | 2 | | 3 |
+ | | | | 8 | | |
+ | 4 | | | 5 | | 6 |
+ And the named places
+ | osm | class | type | geometry |
+ | N1 | place | locality | 8 |
+ And the named places
+ | osm | class | type | admin | geometry |
+ | W10 | boundary | administrative | 5 | (1, 2, 8, 5, 4, 1) |
+ | W11 | boundary | administrative | 5 | (2, 3, 6, 5, 8, 2) |
+ When importing
+ Then place_addressline contains
+ | object | address | cached_rank_address |
+ | N1 | W10 | 10 |
+ | N1 | W11 | 10 |
+
+ Scenario: Areas should not contain boundaries they touch
+ Given the grid
+ | 1 | | | 2 | | 3 |
+ | | | | | | |
+ | 4 | | | 5 | | 6 |
+ And the named places
+ | osm | class | type | geometry |
+ | W1 | landuse | industrial | (1, 2, 5, 4, 1) |
+ And the named places
+ | osm | class | type | admin | geometry |
+ | W10 | boundary | administrative | 5 | (2, 3, 6, 5, 2) |
+ When importing
+ Then place_addressline doesn't contain
+ | object | address |
+ | W1 | W10 |
+
Scenario: buildings with only addr:postcodes do not appear in the address of a way
Given the scene admin-areas
And the named places