From: Sarah Hoffmann Date: Mon, 26 Sep 2022 08:38:20 +0000 (+0200) Subject: Merge remote-tracking branch 'upstream/master' X-Git-Tag: deploy~93 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/90a2a5605ee856fe6da06c46924558de94b37355?hp=6b55ab5581a3f6541d16fb7544ce8fd57dc2a673 Merge remote-tracking branch 'upstream/master' --- diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 00000000..3ff8cfe3 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,2 @@ +github: lonvia +custom: "https://nominatim.org/funding/" diff --git a/SECURITY.md b/SECURITY.md index 9ac793b3..16dabafa 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -9,10 +9,10 @@ versions. | Version | End of support for security updates | | ------- | ----------------------------------- | +| 4.1.x | 2024-08-05 | | 4.0.x | 2023-11-02 | | 3.7.x | 2023-04-05 | | 3.6.x | 2022-12-12 | -| 3.5.x | 2022-06-05 | ## Reporting a Vulnerability diff --git a/docs/api/Output.md b/docs/api/Output.md index 4f5399f0..9a048ce0 100644 --- a/docs/api/Output.md +++ b/docs/api/Output.md @@ -211,8 +211,8 @@ be more than one. The attributes of that element contain: * `ref` - content of `ref` tag if it exists * `lat`, `lon` - latitude and longitude of the centroid of the object * `boundingbox` - comma-separated list of corner coordinates ([see notes](#boundingbox)) - * `place_rank` - class [search rank](../develop/Ranking#search-rank) - * `address_rank` - place [address rank](../develop/Ranking#address-rank) + * `place_rank` - class [search rank](../customize/Ranking#search-rank) + * `address_rank` - place [address rank](../customize/Ranking#address-rank) * `display_name` - full comma-separated address * `class`, `type` - key and value of the main OSM tag * `importance` - computed importance rank diff --git a/lib-sql/functions/interpolation.sql b/lib-sql/functions/interpolation.sql index 96a105ae..fb822033 100644 --- a/lib-sql/functions/interpolation.sql +++ b/lib-sql/functions/interpolation.sql @@ -52,7 +52,9 @@ BEGIN IF parent_place_id is null THEN FOR location IN SELECT place_id FROM placex - WHERE ST_DWithin(geom, placex.geometry, 0.001) and placex.rank_search = 26 + WHERE ST_DWithin(geom, placex.geometry, 0.001) + and placex.rank_search = 26 + and placex.osm_type = 'W' -- needed for index selection ORDER BY CASE WHEN ST_GeometryType(geom) = 'ST_Line' THEN (ST_distance(placex.geometry, ST_LineInterpolatePoint(geom,0))+ ST_distance(placex.geometry, ST_LineInterpolatePoint(geom,0.5))+ diff --git a/lib-sql/functions/placex_triggers.sql b/lib-sql/functions/placex_triggers.sql index bb34883a..a2276f07 100644 --- a/lib-sql/functions/placex_triggers.sql +++ b/lib-sql/functions/placex_triggers.sql @@ -197,6 +197,7 @@ BEGIN SELECT place_id FROM placex WHERE bbox && geometry AND _ST_Covers(geometry, ST_Centroid(bbox)) AND rank_address between 5 and 25 + AND ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon') ORDER BY rank_address desc LOOP RETURN location.place_id; @@ -212,6 +213,7 @@ BEGIN SELECT place_id FROM placex WHERE bbox && geometry AND _ST_Covers(geometry, ST_Centroid(bbox)) AND rank_address between 5 and 25 + AND ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon') ORDER BY rank_address desc LOOP RETURN location.place_id; @@ -275,7 +277,9 @@ BEGIN -- If extratags has a place tag, look for linked nodes by their place type. -- Area and node still have to have the same name. - IF bnd.extratags ? 'place' and bnd_name is not null THEN + IF bnd.extratags ? 'place' and bnd.extratags->'place' != 'postcode' + and bnd_name is not null + THEN FOR linked_placex IN SELECT * FROM placex WHERE (position(lower(name->'name') in bnd_name) > 0 @@ -284,7 +288,6 @@ BEGIN AND placex.osm_type = 'N' AND (placex.linked_place_id is null or placex.linked_place_id = bnd.place_id) AND placex.rank_search < 26 -- needed to select the right index - AND placex.type != 'postcode' AND ST_Covers(bnd.geometry, placex.geometry) LOOP {% if debug %}RAISE WARNING 'Found type-matching place node %', linked_placex.osm_id;{% endif %} @@ -846,7 +849,8 @@ BEGIN FROM placex WHERE osm_type = 'R' and class = 'boundary' and type = 'administrative' and admin_level < NEW.admin_level and admin_level > 3 - and rank_address > 0 + and rank_address between 1 and 25 -- for index selection + and ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon') -- for index selection and geometry && NEW.centroid and _ST_Covers(geometry, NEW.centroid) ORDER BY admin_level desc LIMIT 1 LOOP @@ -874,8 +878,9 @@ BEGIN FROM placex, LATERAL compute_place_rank(country_code, 'A', class, type, admin_level, False, null) prank - WHERE class = 'place' and rank_address < 24 + WHERE class = 'place' and rank_address between 1 and 23 and prank.address_rank >= NEW.rank_address + and ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon') -- select right index and geometry && NEW.geometry and geometry ~ NEW.geometry -- needed because ST_Relate does not do bbox cover test and ST_Relate(geometry, NEW.geometry, 'T*T***FF*') -- contains but not equal @@ -896,6 +901,8 @@ BEGIN LATERAL compute_place_rank(country_code, 'A', class, type, admin_level, False, null) prank WHERE prank.address_rank < 24 + and rank_address between 1 and 25 -- select right index + and ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon') -- select right index and prank.address_rank >= NEW.rank_address and geometry && NEW.geometry and geometry ~ NEW.geometry -- needed because ST_Relate does not do bbox cover test @@ -916,6 +923,8 @@ BEGIN LATERAL compute_place_rank(country_code, 'A', class, type, admin_level, False, null) prank WHERE osm_type = 'R' + and rank_address between 1 and 25 -- select right index + and ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon') -- select right index and ((class = 'place' and prank.address_rank = NEW.rank_address) or (class = 'boundary' and rank_address = NEW.rank_address)) and geometry && NEW.centroid and _ST_Covers(geometry, NEW.centroid) diff --git a/lib-sql/indices.sql b/lib-sql/indices.sql index 9bbc7527..b1396034 100644 --- a/lib-sql/indices.sql +++ b/lib-sql/indices.sql @@ -10,62 +10,73 @@ CREATE INDEX IF NOT EXISTS idx_place_addressline_address_place_id ON place_addressline USING BTREE (address_place_id) {{db.tablespace.search_index}}; - +--- CREATE INDEX IF NOT EXISTS idx_placex_rank_search ON placex USING BTREE (rank_search) {{db.tablespace.search_index}}; - +--- CREATE INDEX IF NOT EXISTS idx_placex_rank_address ON placex USING BTREE (rank_address) {{db.tablespace.search_index}}; - +--- CREATE INDEX IF NOT EXISTS idx_placex_parent_place_id ON placex USING BTREE (parent_place_id) {{db.tablespace.search_index}} WHERE parent_place_id IS NOT NULL; - +--- +CREATE INDEX IF NOT EXISTS idx_placex_geometry ON placex + USING GIST (geometry) {{db.tablespace.search_index}}; +--- CREATE INDEX IF NOT EXISTS idx_placex_geometry_reverse_lookupPolygon ON placex USING gist (geometry) {{db.tablespace.search_index}} WHERE St_GeometryType(geometry) in ('ST_Polygon', 'ST_MultiPolygon') AND rank_address between 4 and 25 AND type != 'postcode' AND name is not null AND indexed_status = 0 AND linked_place_id is null; - +--- CREATE INDEX IF NOT EXISTS idx_osmline_parent_place_id ON location_property_osmline USING BTREE (parent_place_id) {{db.tablespace.search_index}} WHERE parent_place_id is not null; - +--- CREATE INDEX IF NOT EXISTS idx_osmline_parent_osm_id ON location_property_osmline USING BTREE (osm_id) {{db.tablespace.search_index}}; - +--- CREATE INDEX IF NOT EXISTS idx_postcode_postcode ON location_postcode USING BTREE (postcode) {{db.tablespace.search_index}}; +{% if drop %} +--- + DROP INDEX IF EXISTS idx_placex_geometry_address_area_candidates; + DROP INDEX IF EXISTS idx_placex_geometry_buildings; + DROP INDEX IF EXISTS idx_placex_geometry_lower_rank_ways; + DROP INDEX IF EXISTS idx_placex_wikidata; + DROP INDEX IF EXISTS idx_placex_rank_address_sector; + DROP INDEX IF EXISTS idx_placex_rank_boundaries_sector; +{% else %} -- Indices only needed for updating. - -{% if not drop %} - CREATE INDEX IF NOT EXISTS idx_placex_pendingsector - ON placex USING BTREE (rank_address,geometry_sector) {{db.tablespace.address_index}} - WHERE indexed_status > 0; - +--- CREATE INDEX IF NOT EXISTS idx_location_area_country_place_id ON location_area_country USING BTREE (place_id) {{db.tablespace.address_index}}; - +--- CREATE UNIQUE INDEX IF NOT EXISTS idx_place_osm_unique ON place USING btree(osm_id, osm_type, class, type) {{db.tablespace.address_index}}; {% endif %} -- Indices only needed for search. - {% if 'search_name' in db.tables %} +--- CREATE INDEX IF NOT EXISTS idx_search_name_nameaddress_vector ON search_name USING GIN (nameaddress_vector) WITH (fastupdate = off) {{db.tablespace.search_index}}; +--- CREATE INDEX IF NOT EXISTS idx_search_name_name_vector ON search_name USING GIN (name_vector) WITH (fastupdate = off) {{db.tablespace.search_index}}; +--- CREATE INDEX IF NOT EXISTS idx_search_name_centroid ON search_name USING GIST (centroid) {{db.tablespace.search_index}}; {% if postgres.has_index_non_key_column %} +--- CREATE INDEX IF NOT EXISTS idx_placex_housenumber ON placex USING btree (parent_place_id) INCLUDE (housenumber) {{db.tablespace.search_index}} WHERE housenumber is not null; +--- CREATE INDEX IF NOT EXISTS idx_osmline_parent_osm_id_with_hnr ON location_property_osmline USING btree(parent_place_id) INCLUDE (startnumber, endnumber) {{db.tablespace.search_index}} diff --git a/lib-sql/tables.sql b/lib-sql/tables.sql index 03431d95..7ef74349 100644 --- a/lib-sql/tables.sql +++ b/lib-sql/tables.sql @@ -137,7 +137,9 @@ CREATE TABLE place_addressline ( ) {{db.tablespace.search_data}}; CREATE INDEX idx_place_addressline_place_id on place_addressline USING BTREE (place_id) {{db.tablespace.search_index}}; -drop table if exists placex; +--------- PLACEX - storage for all indexed places ----------------- + +DROP TABLE IF EXISTS placex; CREATE TABLE placex ( place_id BIGINT NOT NULL, parent_place_id BIGINT, @@ -157,20 +159,66 @@ CREATE TABLE placex ( postcode TEXT, centroid GEOMETRY(Geometry, 4326) ) {{db.tablespace.search_data}}; + CREATE UNIQUE INDEX idx_place_id ON placex USING BTREE (place_id) {{db.tablespace.search_index}}; -CREATE INDEX idx_placex_osmid ON placex USING BTREE (osm_type, osm_id) {{db.tablespace.search_index}}; -CREATE INDEX idx_placex_linked_place_id ON placex USING BTREE (linked_place_id) {{db.tablespace.address_index}} WHERE linked_place_id IS NOT NULL; -CREATE INDEX idx_placex_rank_search ON placex USING BTREE (rank_search, geometry_sector) {{db.tablespace.address_index}}; -CREATE INDEX idx_placex_geometry ON placex USING GIST (geometry) {{db.tablespace.search_index}}; +{% for osm_type in ('N', 'W', 'R') %} +CREATE INDEX idx_placex_osmid_{{osm_type | lower}} ON placex + USING BTREE (osm_id) {{db.tablespace.search_index}} + WHERE osm_type = '{{osm_type}}'; +{% endfor %} + +-- Usage: - removing linkage status on update +-- - lookup linked places for /details +CREATE INDEX idx_placex_linked_place_id ON placex + USING BTREE (linked_place_id) {{db.tablespace.address_index}} + WHERE linked_place_id IS NOT NULL; + +-- Usage: - check that admin boundaries do not overtake each other rank-wise +-- - check that place node in a admin boundary with the same address level +-- - boundary is not completely contained in a place area +-- - parenting of large-area or unparentable features +CREATE INDEX idx_placex_geometry_address_area_candidates ON placex + USING gist (geometry) {{db.tablespace.address_index}} + WHERE rank_address between 1 and 25 + and ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon'); + +-- Usage: - POI is within building with housenumber CREATE INDEX idx_placex_geometry_buildings ON placex - USING {{postgres.spgist_geom}} (geometry) {{db.tablespace.search_index}} + USING {{postgres.spgist_geom}} (geometry) {{db.tablespace.address_index}} WHERE address is not null and rank_search = 30 and ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon'); + +-- Usage: - linking of similar named places to boundaries +-- - linking of place nodes with same type to boundaries +-- - lookupPolygon() CREATE INDEX idx_placex_geometry_placenode ON placex - USING {{postgres.spgist_geom}} (geometry) {{db.tablespace.search_index}} + USING {{postgres.spgist_geom}} (geometry) {{db.tablespace.address_index}} WHERE osm_type = 'N' and rank_search < 26 - and class = 'place' and type != 'postcode' and linked_place_id is null; -CREATE INDEX idx_placex_wikidata on placex USING BTREE ((extratags -> 'wikidata')) {{db.tablespace.address_index}} WHERE extratags ? 'wikidata' and class = 'place' and osm_type = 'N' and rank_search < 26; + and class = 'place' and type != 'postcode'; + +-- Usage: - is node part of a way? +-- - find parent of interpolation spatially +CREATE INDEX idx_placex_geometry_lower_rank_ways ON placex + USING {{postgres.spgist_geom}} (geometry) {{db.tablespace.address_index}} + WHERE osm_type = 'W' and rank_search >= 26; + +-- Usage: - linking place nodes by wikidata tag to boundaries +CREATE INDEX idx_placex_wikidata on placex + USING BTREE ((extratags -> 'wikidata')) {{db.tablespace.address_index}} + WHERE extratags ? 'wikidata' and class = 'place' + and osm_type = 'N' and rank_search < 26; + +-- The following two indexes function as a todo list for indexing. + +CREATE INDEX idx_placex_rank_address_sector ON placex + USING BTREE (rank_address, geometry_sector) {{db.tablespace.address_index}} + WHERE indexed_status > 0; + +CREATE INDEX idx_placex_rank_boundaries_sector ON placex + USING BTREE (rank_search, geometry_sector) {{db.tablespace.address_index}} + WHERE class = 'boundary' and type = 'administrative' + and indexed_status > 0; + DROP SEQUENCE IF EXISTS seq_place; CREATE SEQUENCE seq_place start 1; diff --git a/nominatim/clicmd/setup.py b/nominatim/clicmd/setup.py index 6ffa7afb..29724433 100644 --- a/nominatim/clicmd/setup.py +++ b/nominatim/clicmd/setup.py @@ -15,7 +15,7 @@ from pathlib import Path import psutil from nominatim.config import Configuration -from nominatim.db.connection import connect, Connection +from nominatim.db.connection import connect from nominatim.db import status, properties from nominatim.tokenizer.base import AbstractTokenizer from nominatim.version import version_str @@ -72,6 +72,8 @@ class SetupAll: from ..tools import database_import, refresh, postcodes, freeze from ..indexer.indexer import Indexer + num_threads = args.threads or psutil.cpu_count() or 1 + country_info.setup_country_config(args.config) if args.continue_at is None: @@ -109,8 +111,7 @@ class SetupAll: database_import.truncate_data_tables(conn) LOG.warning('Load data into placex table') - database_import.load_data(args.config.get_libpq_dsn(), - args.threads or psutil.cpu_count() or 1) + database_import.load_data(args.config.get_libpq_dsn(), num_threads) LOG.warning("Setting up tokenizer") tokenizer = self._get_tokenizer(args.continue_at, args.config) @@ -121,18 +122,15 @@ class SetupAll: args.project_dir, tokenizer) if args.continue_at is None or args.continue_at in ('load-data', 'indexing'): - if args.continue_at is not None and args.continue_at != 'load-data': - with connect(args.config.get_libpq_dsn()) as conn: - self._create_pending_index(conn, args.config.TABLESPACE_ADDRESS_INDEX) LOG.warning('Indexing places') - indexer = Indexer(args.config.get_libpq_dsn(), tokenizer, - args.threads or psutil.cpu_count() or 1) + indexer = Indexer(args.config.get_libpq_dsn(), tokenizer, num_threads) indexer.index_full(analyse=not args.index_noanalyse) LOG.warning('Post-process tables') with connect(args.config.get_libpq_dsn()) as conn: database_import.create_search_indices(conn, args.config, - drop=args.no_updates) + drop=args.no_updates, + threads=num_threads) LOG.warning('Create search index for default country names.') country_info.create_country_names(conn, tokenizer, args.config.get_str_list('LANGUAGES')) @@ -188,27 +186,6 @@ class SetupAll: return tokenizer_factory.get_tokenizer_for_db(config) - def _create_pending_index(self, conn: Connection, tablespace: str) -> None: - """ Add a supporting index for finding places still to be indexed. - - This index is normally created at the end of the import process - for later updates. When indexing was partially done, then this - index can greatly improve speed going through already indexed data. - """ - if conn.index_exists('idx_placex_pendingsector'): - return - - with conn.cursor() as cur: - LOG.warning('Creating support index') - if tablespace: - tablespace = 'TABLESPACE ' + tablespace - cur.execute(f"""CREATE INDEX idx_placex_pendingsector - ON placex USING BTREE (rank_address,geometry_sector) - {tablespace} WHERE indexed_status > 0 - """) - conn.commit() - - def _finalize_database(self, dsn: str, offline: bool) -> None: """ Determine the database date and set the status accordingly. """ diff --git a/nominatim/db/async_connection.py b/nominatim/db/async_connection.py index a2c8fe4d..d1e542f5 100644 --- a/nominatim/db/async_connection.py +++ b/nominatim/db/async_connection.py @@ -94,7 +94,8 @@ class DBConnection: # Use a dict to hand in the parameters because async is a reserved # word in Python3. - self.conn = psycopg2.connect(**{'dsn': self.dsn, 'async': True}) + self.conn = psycopg2.connect(**{'dsn': self.dsn, 'async': True}) # type: ignore + assert self.conn self.wait() if cursor_factory is not None: diff --git a/nominatim/db/connection.py b/nominatim/db/connection.py index 44a293d4..77d463d8 100644 --- a/nominatim/db/connection.py +++ b/nominatim/db/connection.py @@ -189,7 +189,7 @@ def connect(dsn: str) -> ConnectionContext: try: conn = psycopg2.connect(dsn, connection_factory=Connection) ctxmgr = cast(ConnectionContext, contextlib.closing(conn)) - ctxmgr.connection = cast(Connection, conn) + ctxmgr.connection = conn return ctxmgr except psycopg2.OperationalError as err: raise UsageError(f"Cannot connect to database: {err}") from err diff --git a/nominatim/db/sql_preprocessor.py b/nominatim/db/sql_preprocessor.py index b450422d..31b4a8c0 100644 --- a/nominatim/db/sql_preprocessor.py +++ b/nominatim/db/sql_preprocessor.py @@ -11,6 +11,7 @@ from typing import Set, Dict, Any import jinja2 from nominatim.db.connection import Connection +from nominatim.db.async_connection import WorkerPool from nominatim.config import Configuration def _get_partitions(conn: Connection) -> Set[int]: @@ -96,3 +97,21 @@ class SQLPreprocessor: with conn.cursor() as cur: cur.execute(sql) conn.commit() + + + def run_parallel_sql_file(self, dsn: str, name: str, num_threads: int = 1, + **kwargs: Any) -> None: + """ Execure the given SQL files using parallel asynchronous connections. + The keyword arguments may supply additional parameters for + preprocessing. + + After preprocessing the SQL code is cut at lines containing only + '---'. Each chunk is sent to one of the `num_threads` workers. + """ + sql = self.env.get_template(name).render(**kwargs) + + parts = sql.split('\n---\n') + + with WorkerPool(dsn, num_threads) as pool: + for part in parts: + pool.next_free_worker().perform(part) diff --git a/nominatim/tools/check_database.py b/nominatim/tools/check_database.py index 7372a49f..437775db 100644 --- a/nominatim/tools/check_database.py +++ b/nominatim/tools/check_database.py @@ -114,9 +114,10 @@ def _get_indexes(conn: Connection) -> List[str]: indexes.extend(('idx_placex_housenumber', 'idx_osmline_parent_osm_id_with_hnr')) if conn.table_exists('place'): - indexes.extend(('idx_placex_pendingsector', - 'idx_location_area_country_place_id', - 'idx_place_osm_unique')) + indexes.extend(('idx_location_area_country_place_id', + 'idx_place_osm_unique', + 'idx_placex_rank_address_sector', + 'idx_placex_rank_boundaries_sector')) return indexes @@ -199,7 +200,7 @@ def check_tokenizer(_: Connection, config: Configuration) -> CheckResult: def check_existance_wikipedia(conn: Connection, _: Configuration) -> CheckResult: """ Checking for wikipedia/wikidata data """ - if not conn.table_exists('search_name'): + if not conn.table_exists('search_name') or not conn.table_exists('place'): return CheckState.NOT_APPLICABLE with conn.cursor() as cur: @@ -268,7 +269,7 @@ def check_database_index_valid(conn: Connection, _: Configuration) -> CheckResul WHERE pg_index.indisvalid = false AND pg_index.indexrelid = pg_class.oid""") - broken = list(cur) + broken = [c[0] for c in cur] if broken: return CheckState.FAIL, dict(indexes='\n '.join(broken)) diff --git a/nominatim/tools/database_import.py b/nominatim/tools/database_import.py index 447e90f1..f6ebe90d 100644 --- a/nominatim/tools/database_import.py +++ b/nominatim/tools/database_import.py @@ -225,7 +225,8 @@ def load_data(dsn: str, threads: int) -> None: cur.execute('ANALYSE') -def create_search_indices(conn: Connection, config: Configuration, drop: bool = False) -> None: +def create_search_indices(conn: Connection, config: Configuration, + drop: bool = False, threads: int = 1) -> None: """ Create tables that have explicit partitioning. """ @@ -243,4 +244,5 @@ def create_search_indices(conn: Connection, config: Configuration, drop: bool = sql = SQLPreprocessor(conn, config) - sql.run_sql_file(conn, 'indices.sql', drop=drop) + sql.run_parallel_sql_file(config.get_libpq_dsn(), + 'indices.sql', min(8, threads), drop=drop) diff --git a/test/python/cli/test_cmd_import.py b/test/python/cli/test_cmd_import.py index d545c760..737c4e5c 100644 --- a/test/python/cli/test_cmd_import.py +++ b/test/python/cli/test_cmd_import.py @@ -105,11 +105,8 @@ class TestCliImportWithDb: for mock in mocks: assert mock.called == 1, "Mock '{}' not called".format(mock.func_name) - assert temp_db_conn.index_exists('idx_placex_pendingsector') - # Calling it again still works for the index assert self.call_nominatim('import', '--continue', 'indexing') == 0 - assert temp_db_conn.index_exists('idx_placex_pendingsector') def test_import_continue_postprocess(self, mock_func_factory):