From: Sarah Hoffmann
Date: Wed, 9 Nov 2022 13:27:07 +0000 (+0100)
Subject: Merge remote-tracking branch 'upstream/master'
X-Git-Tag: deploy~92
X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/8d1a8e8e48702c69243f142da2059a2587cef705?hp=90a2a5605ee856fe6da06c46924558de94b37355

Merge remote-tracking branch 'upstream/master'
---

diff --git a/.github/actions/build-nominatim/action.yml b/.github/actions/build-nominatim/action.yml
index c6ff7a31..125a0f8b 100644
--- a/.github/actions/build-nominatim/action.yml
+++ b/.github/actions/build-nominatim/action.yml
@@ -23,7 +23,7 @@ runs:
       run: |
         sudo apt-get install -y -qq libboost-system-dev libboost-filesystem-dev libexpat1-dev zlib1g-dev libbz2-dev libpq-dev libproj-dev libicu-dev
         if [ "x$UBUNTUVER" == "x18" ]; then
-          pip3 install python-dotenv psycopg2==2.7.7 jinja2==2.8 psutil==5.4.2 pyicu osmium PyYAML==5.1 datrie
+          pip3 install python-dotenv psycopg2==2.7.7 jinja2==2.8 psutil==5.4.2 pyicu==2.9 osmium PyYAML==5.1 datrie
         else
           sudo apt-get install -y -qq python3-icu python3-datrie python3-pyosmium python3-jinja2 python3-psutil python3-psycopg2 python3-dotenv python3-yaml
         fi
diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml
index a26ad000..cdc7ea1e 100644
--- a/.github/workflows/ci-tests.yml
+++ b/.github/workflows/ci-tests.yml
@@ -7,11 +7,11 @@ jobs:
         runs-on: ubuntu-latest

         steps:
-            - uses: actions/checkout@v2
+            - uses: actions/checkout@v3
              with:
                 submodules: true

-            - uses: actions/cache@v2
+            - uses: actions/cache@v3
              with:
                 path: |
                    data/country_osm_grid.sql.gz
@@ -27,7 +27,7 @@ jobs:
                  mv nominatim-src.tar.bz2 Nominatim

             - name: 'Upload Artifact'
-              uses: actions/upload-artifact@v2
+              uses: actions/upload-artifact@v3
              with:
                 name: full-source
                 path: nominatim-src.tar.bz2
@@ -50,7 +50,7 @@ jobs:
                      pytest: py.test-3
                      php: 7.4
                    - ubuntu: 22
-                      postgresql: 14
+                      postgresql: 15
                      postgis: 3
                      pytest: py.test-3
                      php: 8.1
@@ -58,7 +58,7 @@
         runs-on: ubuntu-${{ matrix.ubuntu }}.04

         steps:
-            - uses: actions/download-artifact@v2
+            - uses: actions/download-artifact@v3
              with:
                 name: full-source
@@ -72,7 +72,7 @@
                  tools: phpunit, phpcs, composer
                  ini-values: opcache.jit=disable

-            - uses: actions/setup-python@v2
+            - uses: actions/setup-python@v4
              with:
                 python-version: 3.6
              if: matrix.ubuntu == 18
@@ -99,7 +99,7 @@
              if: matrix.ubuntu == 22

             - name: Install latest pylint/mypy
-              run: pip3 install -U pylint mypy types-PyYAML types-jinja2 types-psycopg2 types-psutil typing-extensions
+              run: pip3 install -U pylint mypy types-PyYAML types-jinja2 types-psycopg2 types-psutil types-requests typing-extensions

             - name: PHP linting
              run: phpcs --report-width=120 .
@@ -136,7 +136,7 @@ jobs:
         runs-on: ubuntu-20.04

         steps:
-            - uses: actions/download-artifact@v2
+            - uses: actions/download-artifact@v3
              with:
                 name: full-source
@@ -231,7 +231,7 @@ jobs:
                 OS: ${{ matrix.name }}
                 INSTALL_MODE: ${{ matrix.install_mode }}

-            - uses: actions/download-artifact@v2
+            - uses: actions/download-artifact@v3
              with:
                 name: full-source
                 path: /home/nominatim
diff --git a/docs/admin/Deployment.md b/docs/admin/Deployment.md
index 5dcbcde9..8b635546 100644
--- a/docs/admin/Deployment.md
+++ b/docs/admin/Deployment.md
@@ -99,7 +99,7 @@ Unix socket instead, change the pool configuration

 ``` ini
 ; Replace the tcp listener and add the unix socket
-listen = /var/run/php-fpm.sock
+listen = /var/run/php-fpm-nominatim.sock

 ; Ensure that the daemon runs as the correct user
 listen.owner = www-data
@@ -121,7 +121,7 @@ location @php {
     fastcgi_param SCRIPT_FILENAME "$document_root$uri.php";
     fastcgi_param PATH_TRANSLATED "$document_root$uri.php";
     fastcgi_param QUERY_STRING    $args;
-    fastcgi_pass unix:/var/run/php-fpm.sock;
+    fastcgi_pass unix:/var/run/php-fpm-nominatim.sock;
     fastcgi_index index.php;
     include fastcgi_params;
 }
@@ -131,7 +131,7 @@ location ~ [^/]\.php(/|$) {
     if (!-f $document_root$fastcgi_script_name) {
         return 404;
     }
-    fastcgi_pass unix:/var/run/php-fpm.sock;
+    fastcgi_pass unix:/var/run/php-fpm-nominatim.sock;
     fastcgi_index search.php;
     include fastcgi.conf;
 }
diff --git a/docs/admin/Import.md b/docs/admin/Import.md
index 90294959..8b6d6baa 100644
--- a/docs/admin/Import.md
+++ b/docs/admin/Import.md
@@ -79,10 +79,10 @@ This data is available as a binary download. Put it into your project directory:
 The file is about 400MB and adds around 4GB to the Nominatim database.

 !!! tip
-    If you forgot to download the wikipedia rankings, you can also add
-    importances after the import. Download the files, then run
-    `nominatim refresh --wiki-data --importance`. Updating importances for
-    a planet can take a couple of hours.
+    If you forgot to download the wikipedia rankings, then you can
+    also add importances after the import. Download the SQL files, then
+    run `nominatim refresh --wiki-data --importance`. Updating
+    importances for a planet will take a couple of hours.

 ### External postcodes
@@ -139,7 +139,7 @@ import. So this option is particularly interesting if you plan to transfer the
 database or reuse the space later.

 !!! warning
-    The datastructure for updates are also required when adding additional data
+    The data structures for updates are also required when adding additional data
     after the import, for example [TIGER housenumber data](../customize/Tiger.md).
     If you plan to use those, you must not use the `--no-updates` parameter.
     Do a normal import, add the external data and once you are done with
diff --git a/docs/api/Status.md b/docs/api/Status.md
index 8c3e25e9..1a5ff0a8 100644
--- a/docs/api/Status.md
+++ b/docs/api/Status.md
@@ -57,10 +57,11 @@ code and message, e.g.
 Possible status codes are

-   |     | message              | notes                                             |
-   |-----|----------------------|---------------------------------------------------|
-   | 700 | "No database"        | connection failed                                 |
-   | 701 | "Module failed"      | database could not load nominatim.so              |
-   | 702 | "Module call failed" | nominatim.so loaded but calling a function failed |
-   | 703 | "Query failed"       | test query against a database table failed        |
-   | 704 | "No value"           | test query worked but returned no results         |
+|     | message                        | notes                                                              |
+| --- | ------------------------------ | ------------------------------------------------------------------ |
+| 700 | "No database"                  | connection failed                                                  |
+| 701 | "Module failed"                | database could not load nominatim.so                               |
+| 702 | "Module call failed"           | nominatim.so loaded but calling a function failed                  |
+| 703 | "Query failed"                 | test query against a database table failed                         |
+| 704 | "No value"                     | test query worked but returned no results                          |
+| 705 | "Import date is not available" | No import dates were returned (enabling replication can fix this)  |
diff --git a/docs/customize/Importance.md b/docs/customize/Importance.md
new file mode 100644
index 00000000..d12bfc86
--- /dev/null
+++ b/docs/customize/Importance.md
@@ -0,0 +1,49 @@
+## Importance
+
+Search requests can yield multiple results which match equally well with
+the original query. In such cases Nominatim needs to order the results
+according to a different criterion: importance. This is a measure of how
+likely it is that a user will search for a given place. This section explains
+the sources Nominatim uses for computing the importance of a place and how to
+customize them.
+
+### How importance is computed
+
+The main value for importance is derived from page ranking values for Wikipedia
+pages for a place. For places that do not have their own
+Wikipedia page, a formula is used that derives a static importance from the
+place's [search rank](../customize/Ranking#search-rank).
+
+In a second step, a secondary importance value is added which is meant to
+represent how well-known the general area is where the place is located. It
+functions as a tie-breaker between places with very similar primary
+importance values.
+
+nominatim.org has preprocessed importance tables for the
+[primary Wikipedia rankings](https://nominatim.org/data/wikimedia-importance.sql.gz)
+and for a secondary importance based on the number of tile views on openstreetmap.org.
+
+### Customizing secondary importance
+
+The secondary importance is implemented as a simple
+[PostGIS raster](https://postgis.net/docs/raster.html) table, where Nominatim
+looks up the value for the coordinates of the centroid of a place. You can
+provide your own secondary importance raster in the form of an SQL file named
+`secondary_importance.sql.gz` in your project directory.
+
+The SQL file needs to drop and (re)create a table `secondary_importance` which
+must at a minimum contain a column `rast` of type `raster`. The raster must
+be in EPSG:4326 and contain 16bit unsigned ints
+(`raster_constraint_pixel_types(rast) = '{16BUI}'`). Any other columns in the
+table will be ignored.
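+
+For illustration only, here is a minimal sketch of what the uncompressed SQL
+file could contain. The single world-spanning tile with every cell set to 0 is
+an assumption made for the example; real files come out of the `raster2pgsql`
+command shown below and contain many tiles:
+
+```
+DROP TABLE IF EXISTS secondary_importance;
+CREATE TABLE secondary_importance (rast raster);
+-- One 16BUI tile covering the whole world at 1 degree per pixel,
+-- with every cell set to 0 (no secondary importance anywhere).
+INSERT INTO secondary_importance (rast)
+  VALUES (ST_AddBand(ST_MakeEmptyRaster(360, 180, -180, 90, 1, -1, 0, 0, 4326),
+                     '16BUI'::text, 0, NULL));
+```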
+
+You must furthermore create an index as follows:
+
+```
+CREATE INDEX ON secondary_importance USING gist(ST_ConvexHull(rast))
+```
+
+The following raster2pgsql command will create a table that conforms to
+the requirements:
+
+```
+raster2pgsql -I -C -Y -d -t 128x128 input.tiff public.secondary_importance
+```
diff --git a/docs/customize/Tiger.md b/docs/customize/Tiger.md
index 3f1c3546..758a145a 100644
--- a/docs/customize/Tiger.md
+++ b/docs/customize/Tiger.md
@@ -5,14 +5,14 @@ address set to complement the OSM house number data in the US. You can add
 TIGER data to your own Nominatim instance by following these steps. The
 entire US adds about 10GB to your database.

-  1. Get preprocessed TIGER 2021 data:
+  1. Get preprocessed TIGER data:

        cd $PROJECT_DIR
-       wget https://nominatim.org/data/tiger2021-nominatim-preprocessed.csv.tar.gz
+       wget https://nominatim.org/data/tiger-nominatim-preprocessed-latest.csv.tar.gz

   2. Import the data into your Nominatim database:

-       nominatim add-data --tiger-data tiger2021-nominatim-preprocessed.csv.tar.gz
+       nominatim add-data --tiger-data tiger-nominatim-preprocessed-latest.csv.tar.gz

   3. Enable use of the Tiger data in your `.env` by adding:
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index e89c32d5..ab7dec30 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -30,6 +30,7 @@ nav:
         - 'Configuration Settings': 'customize/Settings.md'
         - 'Per-Country Data': 'customize/Country-Settings.md'
         - 'Place Ranking' : 'customize/Ranking.md'
+        - 'Importance' : 'customize/Importance.md'
         - 'Tokenizers' : 'customize/Tokenizers.md'
         - 'Special Phrases': 'customize/Special-Phrases.md'
         - 'External data: US housenumbers from TIGER': 'customize/Tiger.md'
diff --git a/lib-php/ReverseGeocode.php b/lib-php/ReverseGeocode.php
index 77c16a5b..2aea9038 100644
--- a/lib-php/ReverseGeocode.php
+++ b/lib-php/ReverseGeocode.php
@@ -71,7 +71,8 @@ class ReverseGeocode
             $sSQL .= ' ST_Distance(linegeo,'.$sPointSQL.') as distance';
             $sSQL .= ' FROM location_property_osmline';
             $sSQL .= ' WHERE ST_DWithin('.$sPointSQL.', linegeo, '.$fSearchDiam.')';
-            $sSQL .= ' and indexed_status = 0 and startnumber is not NULL ';
+            $sSQL .= ' and indexed_status = 0 and startnumber is not NULL ';
+            $sSQL .= ' and parent_place_id != 0';
             $sSQL .= ' ORDER BY distance ASC limit 1';

             Debug::printSQL($sSQL);
diff --git a/lib-sql/functions/importance.sql b/lib-sql/functions/importance.sql
index ac3aa7f8..44e8bc8b 100644
--- a/lib-sql/functions/importance.sql
+++ b/lib-sql/functions/importance.sql
@@ -100,32 +100,55 @@ LANGUAGE plpgsql STABLE;

 CREATE OR REPLACE FUNCTION compute_importance(extratags HSTORE,
                                               country_code varchar(2),
-                                              osm_type varchar(1), osm_id BIGINT)
+                                              rank_search SMALLINT,
+                                              centroid GEOMETRY)
   RETURNS place_importance
   AS $$
 DECLARE
   match RECORD;
   result place_importance;
+  osm_views_exists BIGINT;
+  views BIGINT;
 BEGIN
-  FOR match IN SELECT * FROM get_wikipedia_match(extratags, country_code)
-               WHERE language is not NULL
+  -- add importance by wikipedia article if the place has one
+  FOR match IN
+    SELECT * FROM get_wikipedia_match(extratags, country_code)
+    WHERE language is not NULL
   LOOP
     result.importance := match.importance;
     result.wikipedia := match.language || ':' || match.title;
     RETURN result;
   END LOOP;

-  IF extratags ? 'wikidata' THEN
+  -- Nothing? Then try with the wikidata tag.
+  IF result.importance is null AND extratags ? 'wikidata' THEN
    FOR match IN SELECT * FROM wikipedia_article
                  WHERE wd_page_title = extratags->'wikidata'
-                 ORDER BY language = 'en' DESC, langcount DESC LIMIT 1 LOOP
+                 ORDER BY language = 'en' DESC, langcount DESC LIMIT 1
+    LOOP
      result.importance := match.importance;
      result.wikipedia := match.language || ':' || match.title;
      RETURN result;
    END LOOP;
  END IF;

-  RETURN null;
+  -- Still nothing? Fall back to a default.
+  IF result.importance is null THEN
+    result.importance := 0.75001 - (rank_search::float / 40);
+  END IF;
+
+{% if 'secondary_importance' in db.tables %}
+  FOR match IN
+    SELECT ST_Value(rast, centroid) as importance
+    FROM secondary_importance
+    WHERE ST_Intersects(ST_ConvexHull(rast), centroid) LIMIT 1
+  LOOP
+    -- Secondary importance as tie breaker with 0.0001 weight.
+    result.importance := result.importance + match.importance::float / 655350000;
+  END LOOP;
+{% endif %}
+
+  RETURN result;
 END;
 $$
 LANGUAGE plpgsql;
diff --git a/lib-sql/functions/placex_triggers.sql b/lib-sql/functions/placex_triggers.sql
index a2276f07..367d2149 100644
--- a/lib-sql/functions/placex_triggers.sql
+++ b/lib-sql/functions/placex_triggers.sql
@@ -965,7 +965,7 @@ BEGIN

   NEW.importance := null;
   SELECT wikipedia, importance
-    FROM compute_importance(NEW.extratags, NEW.country_code, NEW.osm_type, NEW.osm_id)
+    FROM compute_importance(NEW.extratags, NEW.country_code, NEW.rank_search, NEW.centroid)
     INTO NEW.wikipedia,NEW.importance;

 {% if debug %}RAISE WARNING 'Importance computed from wikipedia: %', NEW.importance;{% endif %}
@@ -1047,7 +1047,7 @@ BEGIN
       IF linked_place is not null THEN
         -- Recompute the ranks here as the ones from the linked place might
         -- have been shifted to accommodate surrounding boundaries.
-        SELECT place_id, osm_id, class, type, extratags,
+        SELECT place_id, osm_id, class, type, extratags, rank_search,
                centroid, geometry,
                (compute_place_rank(country_code, osm_type, class, type, admin_level,
                                    (extratags->'capital') = 'yes', null)).*
@@ -1088,7 +1088,7 @@ BEGIN
       SELECT wikipedia, importance
         FROM compute_importance(location.extratags, NEW.country_code,
-                                'N', location.osm_id)
+                                location.rank_search, NEW.centroid)
         INTO linked_wikipedia,linked_importance;

       -- Use the maximum importance if one could be computed from the linked object.
diff --git a/lib-sql/tables.sql b/lib-sql/tables.sql
index 7ef74349..d576485e 100644
--- a/lib-sql/tables.sql
+++ b/lib-sql/tables.sql
@@ -276,7 +276,7 @@ CREATE SEQUENCE file start 1;

 -- null table so it won't error
 -- deliberately no drop - importing the table is expensive and static, if it is already there better to avoid removing it
-CREATE TABLE wikipedia_article (
+CREATE TABLE IF NOT EXISTS wikipedia_article (
     language text NOT NULL,
     title text NOT NULL,
     langcount integer,
@@ -290,15 +290,12 @@ CREATE TABLE wikipedia_article (
     wd_page_title text,
     instance_of text
 );
-ALTER TABLE ONLY wikipedia_article ADD CONSTRAINT wikipedia_article_pkey PRIMARY KEY (language, title);
-CREATE INDEX idx_wikipedia_article_osm_id ON wikipedia_article USING btree (osm_type, osm_id);

-CREATE TABLE wikipedia_redirect (
+CREATE TABLE IF NOT EXISTS wikipedia_redirect (
     language text,
     from_title text,
     to_title text
 );
-ALTER TABLE ONLY wikipedia_redirect ADD CONSTRAINT wikipedia_redirect_pkey PRIMARY KEY (language, from_title);

 -- osm2pgsql does not create indexes on the middle tables for Nominatim
 -- Add one for lookup of associated street relations.
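
A note on the raster lookup added to `compute_importance` above: the raster
stores 16-bit unsigned values (at most 65535), and dividing by 655350000 caps
the secondary contribution at 0.0001, small enough to only break ties between
otherwise equal primary importances. The standalone query below sketches the
same lookup outside the function; the sample coordinates are made up, and it
assumes a populated `secondary_importance` table as described in
`docs/customize/Importance.md`:

```
-- Sketch: compute the tie-breaker contribution for one sample point.
-- A 16BUI cell value of at most 65535 divided by 655350000 yields at
-- most 0.0001, matching the weight used in compute_importance().
SELECT ST_Value(rast, ST_SetSRID(ST_MakePoint(9.52, 47.14), 4326)) / 655350000.0
       AS secondary_importance
  FROM secondary_importance
 WHERE ST_Intersects(ST_ConvexHull(rast),
                     ST_SetSRID(ST_MakePoint(9.52, 47.14), 4326))
 LIMIT 1;
```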
diff --git a/module/CMakeLists.txt b/module/CMakeLists.txt
index 9684a817..c8594298 100644
--- a/module/CMakeLists.txt
+++ b/module/CMakeLists.txt
@@ -1,6 +1,6 @@
 # just use the pgxs makefile

-foreach(suffix ${PostgreSQL_ADDITIONAL_VERSIONS} "14" "13" "12" "11" "10" "9.6")
+foreach(suffix ${PostgreSQL_ADDITIONAL_VERSIONS} "15" "14" "13" "12" "11" "10" "9.6")
     list(APPEND PG_CONFIG_HINTS
          "/usr/pgsql-${suffix}/bin")
 endforeach()
diff --git a/nominatim/clicmd/args.py b/nominatim/clicmd/args.py
index 4457db5f..2f8273d6 100644
--- a/nominatim/clicmd/args.py
+++ b/nominatim/clicmd/args.py
@@ -115,6 +115,7 @@ class NominatimArgs:
     address_levels: bool
     functions: bool
     wiki_data: bool
+    secondary_importance: bool
     importance: bool
     website: bool
     diffs: bool
diff --git a/nominatim/clicmd/refresh.py b/nominatim/clicmd/refresh.py
index dce28d98..ea605ea0 100644
--- a/nominatim/clicmd/refresh.py
+++ b/nominatim/clicmd/refresh.py
@@ -63,6 +63,8 @@ class UpdateRefresh:
                            help='Update the PL/pgSQL functions in the database')
         group.add_argument('--wiki-data', action='store_true',
                            help='Update Wikipedia/data importance numbers')
+        group.add_argument('--secondary-importance', action='store_true',
+                           help='Update secondary importance raster data')
         group.add_argument('--importance', action='store_true',
                            help='Recompute place importances (expensive!)')
         group.add_argument('--website', action='store_true',
@@ -83,7 +85,7 @@ class UpdateRefresh:
                            help='Enable debug warning statements in functions')


-    def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches
+    def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches, too-many-statements
         from ..tools import refresh, postcodes
         from ..indexer.indexer import Indexer

@@ -115,6 +117,20 @@ class UpdateRefresh:
             with connect(args.config.get_libpq_dsn()) as conn:
                 refresh.load_address_levels_from_config(conn, args.config)

+        # Attention: must come BEFORE functions
+        if args.secondary_importance:
+            with connect(args.config.get_libpq_dsn()) as conn:
+                # If the table did not exist before, then the importance code
+                # needs to be enabled.
+                if not conn.table_exists('secondary_importance'):
+                    args.functions = True
+
+            LOG.warning('Import secondary importance raster data from %s', args.project_dir)
+            if refresh.import_secondary_importance(args.config.get_libpq_dsn(),
+                                                   args.project_dir) > 0:
+                LOG.fatal('FATAL: Cannot update secondary importance raster data')
+                return 1
+
         if args.functions:
             LOG.warning('Create functions')
             with connect(args.config.get_libpq_dsn()) as conn:
diff --git a/nominatim/clicmd/replication.py b/nominatim/clicmd/replication.py
index 2d6396a1..ad201663 100644
--- a/nominatim/clicmd/replication.py
+++ b/nominatim/clicmd/replication.py
@@ -76,7 +76,8 @@ class UpdateReplication:
         LOG.warning("Initialising replication updates")
         with connect(args.config.get_libpq_dsn()) as conn:
-            replication.init_replication(conn, base_url=args.config.REPLICATION_URL)
+            replication.init_replication(conn, base_url=args.config.REPLICATION_URL,
+                                         socket_timeout=args.socket_timeout)
             if args.update_functions:
                 LOG.warning("Create functions")
                 refresh.create_functions(conn, args.config, True, False)
@@ -87,7 +88,8 @@ class UpdateReplication:
         from ..tools import replication

         with connect(args.config.get_libpq_dsn()) as conn:
-            return replication.check_for_updates(conn, base_url=args.config.REPLICATION_URL)
+            return replication.check_for_updates(conn, base_url=args.config.REPLICATION_URL,
+                                                 socket_timeout=args.socket_timeout)


     def _report_update(self, batchdate: dt.datetime,
@@ -148,7 +150,7 @@ class UpdateReplication:
         while True:
             with connect(args.config.get_libpq_dsn()) as conn:
                 start = dt.datetime.now(dt.timezone.utc)
-                state = replication.update(conn, params)
+                state = replication.update(conn, params, socket_timeout=args.socket_timeout)
                 if state is not replication.UpdateState.NO_CHANGES:
                     status.log_status(conn, start, 'import')
                     batchdate, _, _ = status.get_status(conn)
diff --git a/nominatim/clicmd/setup.py b/nominatim/clicmd/setup.py
index 29724433..344167bb 100644
--- a/nominatim/clicmd/setup.py
+++ b/nominatim/clicmd/setup.py
@@ -59,7 +59,7 @@ class SetupAll:
                             help="Do not keep tables that are only needed for "
                                  "updating the database later")
         group2.add_argument('--offline', action='store_true',
-                            help="Do not attempt to load any additional data from the internet")
+                            help="Do not attempt to load any additional data from the internet")
         group3 = parser.add_argument_group('Expert options')
         group3.add_argument('--ignore-errors', action='store_true',
                             help='Continue import even when errors in SQL are present')
@@ -96,14 +96,21 @@ class SetupAll:
                                             drop=args.no_updates,
                                             ignore_errors=args.ignore_errors)

-            self._setup_tables(args.config, args.reverse_only)
-
             LOG.warning('Importing wikipedia importance data')
             data_path = Path(args.config.WIKIPEDIA_DATA_PATH or args.project_dir)
             if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
                                                  data_path) > 0:
                 LOG.error('Wikipedia importance dump file not found. '
-                          'Will be using default importances.')
+                          'Calculating importance values of locations will not '
+                          'use Wikipedia importance data.')
+
+            LOG.warning('Importing secondary importance raster data')
+            if refresh.import_secondary_importance(args.config.get_libpq_dsn(),
+                                                   args.project_dir) != 0:
+                LOG.error('Secondary importance file not imported. '
+                          'Falling back to default ranking.')
+
+            self._setup_tables(args.config, args.reverse_only)

             if args.continue_at is None or args.continue_at == 'load-data':
                 LOG.warning('Initialise tables')
diff --git a/nominatim/indexer/indexer.py b/nominatim/indexer/indexer.py
index 5425c8ff..233423f0 100644
--- a/nominatim/indexer/indexer.py
+++ b/nominatim/indexer/indexer.py
@@ -128,58 +128,64 @@ class Indexer:
                 with conn.cursor() as cur:
                     cur.execute('ANALYZE')

-        self.index_by_rank(0, 4)
-        _analyze()
+        if self.index_by_rank(0, 4) > 0:
+            _analyze()

-        self.index_boundaries(0, 30)
-        _analyze()
+        if self.index_boundaries(0, 30) > 100:
+            _analyze()

-        self.index_by_rank(5, 25)
-        _analyze()
+        if self.index_by_rank(5, 25) > 100:
+            _analyze()

-        self.index_by_rank(26, 30)
-        _analyze()
+        if self.index_by_rank(26, 30) > 1000:
+            _analyze()

-        self.index_postcodes()
-        _analyze()
+        if self.index_postcodes() > 100:
+            _analyze()


-    def index_boundaries(self, minrank: int, maxrank: int) -> None:
+    def index_boundaries(self, minrank: int, maxrank: int) -> int:
         """ Index only administrative boundaries within the given rank range.
         """
+        total = 0
         LOG.warning("Starting indexing boundaries using %s threads",
                     self.num_threads)

         with self.tokenizer.name_analyzer() as analyzer:
             for rank in range(max(minrank, 4), min(maxrank, 26)):
-                self._index(runners.BoundaryRunner(rank, analyzer))
+                total += self._index(runners.BoundaryRunner(rank, analyzer))

+        return total
+

-    def index_by_rank(self, minrank: int, maxrank: int) -> None:
+    def index_by_rank(self, minrank: int, maxrank: int) -> int:
         """ Index all entries of placex in the given rank range (inclusive)
             in order of their address rank.

             When rank 30 is requested then also interpolations and
             places with address rank 0 will be indexed.
         """
+        total = 0
         maxrank = min(maxrank, 30)
         LOG.warning("Starting indexing rank (%i to %i) using %i threads",
                     minrank, maxrank, self.num_threads)

         with self.tokenizer.name_analyzer() as analyzer:
             for rank in range(max(1, minrank), maxrank + 1):
-                self._index(runners.RankRunner(rank, analyzer), 20 if rank == 30 else 1)
+                total += self._index(runners.RankRunner(rank, analyzer), 20 if rank == 30 else 1)

             if maxrank == 30:
-                self._index(runners.RankRunner(0, analyzer))
-                self._index(runners.InterpolationRunner(analyzer), 20)
+                total += self._index(runners.RankRunner(0, analyzer))
+                total += self._index(runners.InterpolationRunner(analyzer), 20)
+
+        return total


-    def index_postcodes(self) -> None:
+    def index_postcodes(self) -> int:
         """Index the entries of the location_postcode table.
         """
         LOG.warning("Starting indexing postcodes using %s threads", self.num_threads)

-        self._index(runners.PostcodeRunner(), 20)
+        return self._index(runners.PostcodeRunner(), 20)


     def update_status_table(self) -> None:
@@ -191,7 +197,7 @@ class Indexer:
             conn.commit()


-    def _index(self, runner: runners.Runner, batch: int = 1) -> None:
+    def _index(self, runner: runners.Runner, batch: int = 1) -> int:
         """ Index a single rank or table. `runner` describes the SQL to use
             for indexing.
            `batch` describes the number of objects that
             should be processed with a single SQL statement
@@ -233,4 +239,4 @@ class Indexer:

                 conn.commit()

-        progress.done()
+        return progress.done()
diff --git a/nominatim/indexer/progress.py b/nominatim/indexer/progress.py
index 177c262b..33df37fb 100644
--- a/nominatim/indexer/progress.py
+++ b/nominatim/indexer/progress.py
@@ -55,7 +55,7 @@ class ProgressLogger:
             self.next_info += int(places_per_sec) * self.log_interval


-    def done(self) -> None:
+    def done(self) -> int:
         """ Print final statistics about the progress.
         """
         rank_end_time = datetime.now()
@@ -70,3 +70,5 @@ class ProgressLogger:
         LOG.warning("Done %d/%d in %d @ %.3f per second - FINISHED %s\n",
                     self.done_places, self.total_places, int(diff_seconds),
                     places_per_sec, self.name)
+
+        return self.done_places
diff --git a/nominatim/tools/database_import.py b/nominatim/tools/database_import.py
index f6ebe90d..cb620d41 100644
--- a/nominatim/tools/database_import.py
+++ b/nominatim/tools/database_import.py
@@ -75,6 +75,11 @@ def setup_database_skeleton(dsn: str, rouser: Optional[str] = None) -> None:
         with conn.cursor() as cur:
             cur.execute('CREATE EXTENSION IF NOT EXISTS hstore')
             cur.execute('CREATE EXTENSION IF NOT EXISTS postgis')
+
+            postgis_version = conn.postgis_version_tuple()
+            if postgis_version[0] >= 3:
+                cur.execute('CREATE EXTENSION IF NOT EXISTS postgis_raster')
+
         conn.commit()

         _require_version('PostGIS',
diff --git a/nominatim/tools/refresh.py b/nominatim/tools/refresh.py
index 8c1e9d9b..c50493cc 100644
--- a/nominatim/tools/refresh.py
+++ b/nominatim/tools/refresh.py
@@ -15,7 +15,7 @@ from pathlib import Path
 from psycopg2 import sql as pysql

 from nominatim.config import Configuration
-from nominatim.db.connection import Connection
+from nominatim.db.connection import Connection, connect
 from nominatim.db.utils import execute_file
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.version import version_str
@@ -146,6 +146,25 @@ def import_wikipedia_articles(dsn: str, data_path: Path, ignore_errors: bool = F
     return 0

+def import_secondary_importance(dsn: str, data_path: Path, ignore_errors: bool = False) -> int:
+    """ Replaces the secondary importance raster data table with new data.
+
+        Returns 0 if all was well and 1 if the raster SQL file could not
+        be found. Throws an exception if there was an error reading the file.
+    """
+    datafile = data_path / 'secondary_importance.sql.gz'
+    if not datafile.exists():
+        return 1
+
+    with connect(dsn) as conn:
+        postgis_version = conn.postgis_version_tuple()
+        if postgis_version[0] < 3:
+            LOG.error('PostGIS version is too old for using OSM raster data.')
+            return 2
+
+    execute_file(dsn, datafile, ignore_errors=ignore_errors)
+
+    return 0

 def recompute_importance(conn: Connection) -> None:
     """ Recompute wikipedia links and importance for all entries in placex.
@@ -157,7 +176,7 @@ def recompute_importance(conn: Connection) -> None:
     cur.execute("""
         UPDATE placex SET (wikipedia, importance) =
            (SELECT wikipedia, importance
-            FROM compute_importance(extratags, country_code, osm_type, osm_id))
+            FROM compute_importance(extratags, country_code, osm_type, osm_id, centroid))
         """)
     cur.execute("""
         UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance
diff --git a/nominatim/tools/replication.py b/nominatim/tools/replication.py
index db706bf6..d93335b8 100644
--- a/nominatim/tools/replication.py
+++ b/nominatim/tools/replication.py
@@ -7,13 +7,16 @@
 """
 Functions for updating a database from a replication source.
""" -from typing import ContextManager, MutableMapping, Any, Generator, cast +from typing import ContextManager, MutableMapping, Any, Generator, cast, Iterator from contextlib import contextmanager import datetime as dt from enum import Enum import logging import time +import types +import urllib.request as urlrequest +import requests from nominatim.db import status from nominatim.db.connection import Connection from nominatim.tools.exec_utils import run_osm2pgsql @@ -22,6 +25,7 @@ from nominatim.errors import UsageError try: from osmium.replication.server import ReplicationServer from osmium import WriteHandler + from osmium import version as pyo_version except ImportError as exc: logging.getLogger().critical("pyosmium not installed. Replication functions not available.\n" "To install pyosmium via pip: pip3 install osmium") @@ -29,7 +33,8 @@ except ImportError as exc: LOG = logging.getLogger() -def init_replication(conn: Connection, base_url: str) -> None: +def init_replication(conn: Connection, base_url: str, + socket_timeout: int = 60) -> None: """ Set up replication for the server at the given base URL. """ LOG.info("Using replication source: %s", base_url) @@ -38,9 +43,8 @@ def init_replication(conn: Connection, base_url: str) -> None: # margin of error to make sure we get all data date -= dt.timedelta(hours=3) - repl = ReplicationServer(base_url) - - seq = repl.timestamp_to_sequence(date) + with _make_replication_server(base_url, socket_timeout) as repl: + seq = repl.timestamp_to_sequence(date) if seq is None: LOG.fatal("Cannot reach the configured replication service '%s'.\n" @@ -53,7 +57,8 @@ def init_replication(conn: Connection, base_url: str) -> None: LOG.warning("Updates initialised at sequence %s (%s)", seq, date) -def check_for_updates(conn: Connection, base_url: str) -> int: +def check_for_updates(conn: Connection, base_url: str, + socket_timeout: int = 60) -> int: """ Check if new data is available from the replication service at the given base URL. """ @@ -64,7 +69,8 @@ def check_for_updates(conn: Connection, base_url: str) -> int: "Please run 'nominatim replication --init' first.") return 254 - state = ReplicationServer(base_url).get_state_info() + with _make_replication_server(base_url, socket_timeout) as repl: + state = repl.get_state_info() if state is None: LOG.error("Cannot get state for URL %s.", base_url) @@ -86,7 +92,8 @@ class UpdateState(Enum): NO_CHANGES = 3 -def update(conn: Connection, options: MutableMapping[str, Any]) -> UpdateState: +def update(conn: Connection, options: MutableMapping[str, Any], + socket_timeout: int = 60) -> UpdateState: """ Update database from the next batch of data. Returns the state of updates according to `UpdateState`. """ @@ -114,7 +121,7 @@ def update(conn: Connection, options: MutableMapping[str, Any]) -> UpdateState: options['import_file'].unlink() # Read updates into file. - with _make_replication_server(options['base_url']) as repl: + with _make_replication_server(options['base_url'], socket_timeout) as repl: outhandler = WriteHandler(str(options['import_file'])) endseq = repl.apply_diffs(outhandler, startseq + 1, max_size=options['max_diff_size'] * 1024) @@ -136,14 +143,40 @@ def update(conn: Connection, options: MutableMapping[str, Any]) -> UpdateState: return UpdateState.UP_TO_DATE -def _make_replication_server(url: str) -> ContextManager[ReplicationServer]: +def _make_replication_server(url: str, timeout: int) -> ContextManager[ReplicationServer]: """ Returns a ReplicationServer in form of a context manager. 
        Creates a light wrapper around older versions of pyosmium that did
         not support the context manager interface.
     """
     if hasattr(ReplicationServer, '__enter__'):
-        return cast(ContextManager[ReplicationServer], ReplicationServer(url))
+        # Patches the open_url function for pyosmium >= 3.2
+        # where the socket timeout is no longer respected.
+        def patched_open_url(self: ReplicationServer, url: urlrequest.Request) -> Any:
+            """ Download a resource from the given URL and return a byte sequence
+                of the content.
+            """
+            get_params = {
+                'headers': {"User-Agent" : f"Nominatim (pyosmium/{pyo_version.pyosmium_release})"},
+                'timeout': timeout or None,
+                'stream': True
+            }
+
+            if self.session is not None:
+                return self.session.get(url.get_full_url(), **get_params)
+
+            @contextmanager
+            def _get_url_with_session() -> Iterator[requests.Response]:
+                with requests.Session() as session:
+                    request = session.get(url.get_full_url(), **get_params) # type: ignore
+                    yield request
+
+            return _get_url_with_session()
+
+        repl = ReplicationServer(url)
+        repl.open_url = types.MethodType(patched_open_url, repl)
+
+        return cast(ContextManager[ReplicationServer], repl)

     @contextmanager
     def get_cm() -> Generator[ReplicationServer, None, None]:
diff --git a/osm2pgsql b/osm2pgsql
index b0352aa8..6a5d2500 160000
--- a/osm2pgsql
+++ b/osm2pgsql
@@ -1 +1 @@
-Subproject commit b0352aa8f15e2739ba36d72561854a2738123770
+Subproject commit 6a5d2500e9689f55485d186306aadc55560085fd
diff --git a/test/bdd/api/search/params.feature b/test/bdd/api/search/params.feature
index 300948a9..053dbbcd 100644
--- a/test/bdd/api/search/params.feature
+++ b/test/bdd/api/search/params.feature
@@ -68,14 +68,15 @@ Feature: Search queries
           | 0 |
         Then there are duplicates

+    @fail-legacy
     Scenario: Search with bounded viewbox in right area
-        When sending json search query "bar" with address
+        When sending json search query "post" with address
           | bounded | viewbox |
           | 1       | 9,47,10,48 |
         Then result addresses contain
           | ID | town |
           | 0  | Vaduz |
-        When sending json search query "bar" with address
+        When sending json search query "post" with address
           | bounded | viewbox |
           | 1       | 9.49712,47.17122,9.52605,47.16242 |
         Then result addresses contain
@@ -118,18 +119,18 @@ Feature: Search queries
         Then result has centroid in 9.49712,47.16242,9.52605,47.17122

     Scenario: Prefer results within viewbox
-        When sending json search query "Gässle" with address
-          | accept-language |
-          | en |
-        Then result addresses contain
-          | ID | town |
-          | 0  | Balzers |
         When sending json search query "Gässle" with address
           | accept-language | viewbox |
           | en              | 9.52413,47.10759,9.53140,47.10539 |
         Then result addresses contain
           | ID | village |
           | 0  | Triesen |
+        When sending json search query "Gässle" with address
+          | accept-language | viewbox |
+          | en              | 9.45949,47.08421,9.54094,47.05466 |
+        Then result addresses contain
+          | ID | town |
+          | 0  | Balzers |

     Scenario: viewboxes cannot be points
         When sending json search query "foo"
@@ -171,10 +172,12 @@ Feature: Search queries

     Scenario: Limit number of search results
         When sending json search query "landstr"
+          | dedupe |
+          | 0      |
         Then more than 4 results are returned
         When sending json search query "landstr"
-          | limit |
-          | 4     |
+          | limit | dedupe |
+          | 4     | 0      |
         Then exactly 4 results are returned

     Scenario: Limit parameter must be a number
diff --git a/test/bdd/steps/nominatim_environment.py b/test/bdd/steps/nominatim_environment.py
index e7234788..1feafd75 100644
--- a/test/bdd/steps/nominatim_environment.py
+++ b/test/bdd/steps/nominatim_environment.py
@@ -201,19 +201,21 @@ class NominatimEnvironment:
         self.api_db_done = True

         if not self._reuse_or_drop_db(self.api_test_db):
-            testdata = Path('__file__') / '..' / '..' / 'testdb'
-            self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve())
+            testdata = (Path(__file__) / '..' / '..' / '..' / 'testdb').resolve()
+            self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata)
+            simp_file = Path(self.website_dir.name) / 'secondary_importance.sql.gz'
+            simp_file.symlink_to(testdata / 'secondary_importance.sql.gz')

             try:
                 self.run_nominatim('import', '--osm-file', str(self.api_test_file))
-                self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
+                self.run_nominatim('add-data', '--tiger-data', str(testdata / 'tiger'))
                 self.run_nominatim('freeze')

                 if self.tokenizer == 'legacy':
-                    phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
+                    phrase_file = str(testdata / 'specialphrases_testdb.sql')
                     run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
                 else:
-                    csv_path = str((testdata / 'full_en_phrases_test.csv').resolve())
+                    csv_path = str(testdata / 'full_en_phrases_test.csv')
                     self.run_nominatim('special-phrases', '--import-from-csv', csv_path)
             except:
                 self.db_drop_database(self.api_test_db)
diff --git a/test/python/cli/test_cmd_import.py b/test/python/cli/test_cmd_import.py
index 737c4e5c..d098e27e 100644
--- a/test/python/cli/test_cmd_import.py
+++ b/test/python/cli/test_cmd_import.py
@@ -40,6 +40,7 @@ class TestCliImportWithDb:
             mock_func_factory(nominatim.data.country_info, 'setup_country_tables'),
             mock_func_factory(nominatim.tools.database_import, 'import_osm_data'),
             mock_func_factory(nominatim.tools.refresh, 'import_wikipedia_articles'),
+            mock_func_factory(nominatim.tools.refresh, 'import_secondary_importance'),
             mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'),
             mock_func_factory(nominatim.tools.database_import, 'load_data'),
             mock_func_factory(nominatim.tools.database_import, 'create_tables'),
diff --git a/test/python/cli/test_cmd_refresh.py b/test/python/cli/test_cmd_refresh.py
index 7f44765b..f3f93f0f 100644
--- a/test/python/cli/test_cmd_refresh.py
+++ b/test/python/cli/test_cmd_refresh.py
@@ -71,6 +71,18 @@ class TestRefresh:

         assert self.call_nominatim('refresh', '--wiki-data') == 1

+    def test_refresh_secondary_importance_file_not_found(self):
+        assert self.call_nominatim('refresh', '--secondary-importance') == 1
+
+
+    def test_refresh_secondary_importance_new_table(self, mock_func_factory):
+        mocks = [mock_func_factory(nominatim.tools.refresh, 'import_secondary_importance'),
+                 mock_func_factory(nominatim.tools.refresh, 'create_functions')]
+
+        assert self.call_nominatim('refresh', '--secondary-importance') == 0
+        assert mocks[0].called == 1
+        assert mocks[1].called == 1
+

     def test_refresh_importance_computed_after_wiki_import(self, monkeypatch):
         calls = []
diff --git a/test/python/tools/test_refresh.py b/test/python/tools/test_refresh.py
index ac52aa36..c6be4fe7 100644
--- a/test/python/tools/test_refresh.py
+++ b/test/python/tools/test_refresh.py
@@ -17,6 +17,21 @@ def test_refresh_import_wikipedia_not_existing(dsn):
     assert refresh.import_wikipedia_articles(dsn, Path('.')) == 1

+def test_refresh_import_secondary_importance_non_existing(dsn):
+    assert refresh.import_secondary_importance(dsn, Path('.')) == 1
+
+def test_refresh_import_secondary_importance_testdb(dsn, src_dir, temp_db_conn, temp_db_cursor):
+    temp_db_cursor.execute('CREATE EXTENSION postgis')
+
+    if temp_db_conn.postgis_version_tuple()[0] < 3:
+        assert refresh.import_secondary_importance(dsn, src_dir / 'test' / 'testdb') > 0
+    else:
+        temp_db_cursor.execute('CREATE EXTENSION postgis_raster')
+        assert refresh.import_secondary_importance(dsn, src_dir / 'test' / 'testdb') == 0
+
+        assert temp_db_conn.table_exists('secondary_importance')
+
+
 @pytest.mark.parametrize("replace", (True, False))
 def test_refresh_import_wikipedia(dsn, src_dir, table_factory, temp_db_cursor, replace):
     if replace:
@@ -34,6 +49,7 @@ def test_recompute_importance(placex_table, table_factory, temp_db_conn, temp_db
     temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION compute_importance(extratags HSTORE,
                                               country_code varchar(2),
                                               osm_type varchar(1), osm_id BIGINT,
+                                              centroid GEOMETRY,
                                               OUT importance FLOAT,
                                               OUT wikipedia TEXT)
                               AS $$ SELECT 0.1::float, 'foo'::text $$ LANGUAGE SQL""")
diff --git a/test/testdb/secondary_importance.sql.gz b/test/testdb/secondary_importance.sql.gz
new file mode 100644
index 00000000..e9c115d1
Binary files /dev/null and b/test/testdb/secondary_importance.sql.gz differ
diff --git a/vagrant/Install-on-Ubuntu-18.sh b/vagrant/Install-on-Ubuntu-18.sh
index e36086e1..53284ea8 100755
--- a/vagrant/Install-on-Ubuntu-18.sh
+++ b/vagrant/Install-on-Ubuntu-18.sh
@@ -206,7 +206,7 @@ if [ "x$2" == "xinstall-nginx" ]; then #DOCS:
 sudo tee /etc/php/7.2/fpm/pool.d/www.conf << EOF_PHP_FPM_CONF
 [www]
 ; Replace the tcp listener and add the unix socket
-listen = /var/run/php7.2-fpm.sock
+listen = /var/run/php-fpm-nominatim.sock

 ; Ensure that the daemon runs as the correct user
 listen.owner = www-data
@@ -241,7 +241,7 @@ server {
     fastcgi_param SCRIPT_FILENAME "\$document_root\$uri.php";
     fastcgi_param PATH_TRANSLATED "\$document_root\$uri.php";
     fastcgi_param QUERY_STRING    \$args;
-    fastcgi_pass unix:/var/run/php7.2-fpm.sock;
+    fastcgi_pass unix:/var/run/php-fpm-nominatim.sock;
     fastcgi_index index.php;
     include fastcgi_params;
 }
@@ -251,7 +251,7 @@ server {
     if (!-f \$document_root\$fastcgi_script_name) {
         return 404;
     }
-    fastcgi_pass unix:/var/run/php7.2-fpm.sock;
+    fastcgi_pass unix:/var/run/php-fpm-nominatim.sock;
     fastcgi_index search.php;
     include fastcgi.conf;
 }
diff --git a/vagrant/Install-on-Ubuntu-20.sh b/vagrant/Install-on-Ubuntu-20.sh
index d364cccd..a8654490 100755
--- a/vagrant/Install-on-Ubuntu-20.sh
+++ b/vagrant/Install-on-Ubuntu-20.sh
@@ -197,7 +197,7 @@ if [ "x$2" == "xinstall-nginx" ]; then #DOCS:
 sudo tee /etc/php/7.4/fpm/pool.d/www.conf << EOF_PHP_FPM_CONF
 [www]
 ; Replace the tcp listener and add the unix socket
-listen = /var/run/php7.4-fpm.sock
+listen = /var/run/php-fpm-nominatim.sock

 ; Ensure that the daemon runs as the correct user
 listen.owner = www-data
@@ -232,7 +232,7 @@ server {
     fastcgi_param SCRIPT_FILENAME "\$document_root\$uri.php";
     fastcgi_param PATH_TRANSLATED "\$document_root\$uri.php";
     fastcgi_param QUERY_STRING    \$args;
-    fastcgi_pass unix:/var/run/php7.4-fpm.sock;
+    fastcgi_pass unix:/var/run/php-fpm-nominatim.sock;
     fastcgi_index index.php;
     include fastcgi_params;
 }
@@ -242,7 +242,7 @@ server {
     if (!-f \$document_root\$fastcgi_script_name) {
         return 404;
     }
-    fastcgi_pass unix:/var/run/php7.4-fpm.sock;
+    fastcgi_pass unix:/var/run/php-fpm-nominatim.sock;
     fastcgi_index search.php;
     include fastcgi.conf;
 }
@@ -250,9 +250,9 @@ server {

 EOF_NGINX_CONF
 #DOCS:```

-# If you have some errors, make sure that php7.4-fpm.sock is well under
+# If you have some errors, make sure that php-fpm-nominatim.sock is well under
 # /var/run/ and not under /var/run/php. Otherwise change the Nginx configuration
-# to /var/run/php/php7.4-fpm.sock.
+# to /var/run/php/php-fpm-nominatim.sock.
 #
 # Enable the configuration and restart Nginx
 #
diff --git a/vagrant/Install-on-Ubuntu-22.sh b/vagrant/Install-on-Ubuntu-22.sh
index 419a7313..a7ba6866 100755
--- a/vagrant/Install-on-Ubuntu-22.sh
+++ b/vagrant/Install-on-Ubuntu-22.sh
@@ -197,7 +197,7 @@ if [ "x$2" == "xinstall-nginx" ]; then #DOCS:
 sudo tee /etc/php/8.1/fpm/pool.d/www.conf << EOF_PHP_FPM_CONF
 [www]
 ; Replace the tcp listener and add the unix socket
-listen = /var/run/php8.1-fpm.sock
+listen = /var/run/php-fpm-nominatim.sock

 ; Ensure that the daemon runs as the correct user
 listen.owner = www-data
@@ -232,7 +232,7 @@ server {
     fastcgi_param SCRIPT_FILENAME "\$document_root\$uri.php";
     fastcgi_param PATH_TRANSLATED "\$document_root\$uri.php";
     fastcgi_param QUERY_STRING    \$args;
-    fastcgi_pass unix:/var/run/php8.1-fpm.sock;
+    fastcgi_pass unix:/var/run/php-fpm-nominatim.sock;
     fastcgi_index index.php;
     include fastcgi_params;
 }
@@ -242,7 +242,7 @@ server {
     if (!-f \$document_root\$fastcgi_script_name) {
         return 404;
     }
-    fastcgi_pass unix:/var/run/php7.4-fpm.sock;
+    fastcgi_pass unix:/var/run/php-fpm-nominatim.sock;
     fastcgi_index search.php;
     include fastcgi.conf;
 }
@@ -250,9 +250,9 @@ server {

 EOF_NGINX_CONF
 #DOCS:```

-# If you have some errors, make sure that php8.1-fpm.sock is well under
+# If you have some errors, make sure that php-fpm-nominatim.sock is well under
 # /var/run/ and not under /var/run/php. Otherwise change the Nginx configuration
-# to /var/run/php/php8.1-fpm.sock.
+# to /var/run/php/php-fpm-nominatim.sock.
 #
 # Enable the configuration and restart Nginx
 #