From: Sarah Hoffmann Date: Sat, 10 Dec 2022 16:31:18 +0000 (+0100) Subject: Merge remote-tracking branch 'upstream/master' X-Git-Tag: deploy~85 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/6463839b4b5c3577dcce72795513c1a75444adb4?hp=dc2c384208dfbd2d1e3018eb2a0f9f1ee12bea6d Merge remote-tracking branch 'upstream/master' --- diff --git a/.github/actions/setup-postgresql/action.yml b/.github/actions/setup-postgresql/action.yml index 19a19e17..416ced5f 100644 --- a/.github/actions/setup-postgresql/action.yml +++ b/.github/actions/setup-postgresql/action.yml @@ -15,7 +15,9 @@ runs: - name: Remove existing PostgreSQL run: | sudo apt-get purge -yq postgresql* - sudo sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list' + sudo apt install curl ca-certificates gnupg + curl https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/apt.postgresql.org.gpg >/dev/null + sudo sh -c 'echo "deb https://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list' sudo apt-get update -qq shell: bash diff --git a/.pylintrc b/.pylintrc index e8609407..e62371c6 100644 --- a/.pylintrc +++ b/.pylintrc @@ -15,4 +15,4 @@ ignored-classes=NominatimArgs,closing # typed Python is enabled. 
See also https://github.com/PyCQA/pylint/issues/5273 disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use,not-context-manager -good-names=i,x,y,fd,db,cc +good-names=i,x,y,m,fd,db,cc diff --git a/CMakeLists.txt b/CMakeLists.txt index bdc5cfdc..d4c16126 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,7 +19,7 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") project(nominatim) set(NOMINATIM_VERSION_MAJOR 4) -set(NOMINATIM_VERSION_MINOR 1) +set(NOMINATIM_VERSION_MINOR 2) set(NOMINATIM_VERSION_PATCH 0) set(NOMINATIM_VERSION "${NOMINATIM_VERSION_MAJOR}.${NOMINATIM_VERSION_MINOR}.${NOMINATIM_VERSION_PATCH}") @@ -254,7 +254,14 @@ if (BUILD_IMPORTER) install(DIRECTORY nominatim DESTINATION ${NOMINATIM_LIBDIR}/lib-python FILES_MATCHING PATTERN "*.py" + PATTERN "paths.py" EXCLUDE PATTERN __pycache__ EXCLUDE) + + configure_file(${PROJECT_SOURCE_DIR}/cmake/paths-py.tmpl paths-py.installed) + install(FILES ${PROJECT_BINARY_DIR}/paths-py.installed + DESTINATION ${NOMINATIM_LIBDIR}/lib-python/nominatim + RENAME paths.py) + install(DIRECTORY lib-sql DESTINATION ${NOMINATIM_LIBDIR}) install(FILES ${COUNTRY_GRID_FILE} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e031cd91..6d75ce57 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -64,3 +64,39 @@ Before submitting a pull request make sure that the tests pass: cd build make test ``` + +## Releases + +Nominatim follows semantic versioning. Major releases are done for large changes +that require (or at least strongly recommend) a reimport of the databases. +Minor releases can usually be applied to existing databases. Patch releases +contain bug fixes only and are released from a separate branch where the +relevant changes are cherry-picked from the master branch. 
+ +Checklist for releases: + +* [ ] increase version in `nominatim/version.py` and CMakeLists.txt +* [ ] update `ChangeLog` (copy information from patch releases from release branch) +* [ ] complete `docs/admin/Migration.md` +* [ ] update EOL dates in `SECURITY.md` +* [ ] commit and make sure CI tests pass +* [ ] test migration + * download, build and import previous version + * migrate using master version + * run updates using master version +* [ ] prepare tarball: + * `git clone --recursive https://github.com/osm-search/Nominatim` (switch to right branch!) + * `rm -r .git* osm2pgsql/.git*` + * copy country data into `data/` + * add version to base directory and package +* [ ] upload tarball to https://nominatim.org +* [ ] prepare documentation + * check out new docs branch + * change git checkout instructions to tarball download instructions or adapt version on existing ones + * build documentation and copy to https://github.com/osm-search/nominatim-org-site + * add new version to history +* [ ] check release tarball + * download tarball as per new documentation instructions + * compile and import Nominatim + * run `nominatim --version` to confirm correct version +* [ ] tag new release and add a release on github.com diff --git a/ChangeLog b/ChangeLog index c38b2a79..19e50631 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,26 @@ +4.2.0 + + * add experimental support for osm2pgsql flex style + * introduce secondary importance value to be retrieved from a raster data file + (currently still unused, to replace address importance, thanks to @tareqpi) + * add new report tool `nominatim admin --collect-os-info` + (thanks @micahcochran, @tareqpi) + * reorganise index to improve lookup performance and size + * run index creation after import in parallel + * run ANALYZE more selectively to speed up continuation of indexing + * fix crash on update when addr:interpolation receives an illegal value + * fix minimum number of retrieved results to be at least 10 + * fix 
search for combinations of special term + name (e.g Hotel Bellevue) + * do not return interpolations without a parent street on reverse search + * improve invalidation of linked places on updates + * fix address parsing for interpolation lines + * make sure socket timeouts are respected during replication + (working around a bug in some versions of pyosmium) + * update bundled osm2pgsql to 1.7.1 + * add support for PostgreSQL 15 + * typing fixes to work with latest type annotations from typeshed + * smaller improvements to documentation (thanks to @mausch) + 4.1.0 * switch to ICU tokenizer as default diff --git a/SECURITY.md b/SECURITY.md index 16dabafa..e8e6fcad 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -9,6 +9,7 @@ versions. | Version | End of support for security updates | | ------- | ----------------------------------- | +| 4.2.x | 2024-11-24 | | 4.1.x | 2024-08-05 | | 4.0.x | 2023-11-02 | | 3.7.x | 2023-04-05 | diff --git a/Vagrantfile b/Vagrantfile index 033e1507..0cc511c5 100644 --- a/Vagrantfile +++ b/Vagrantfile @@ -17,6 +17,14 @@ Vagrant.configure("2") do |config| checkout = "no" end + config.vm.provider "hyperv" do |hv, override| + hv.memory = 2048 + hv.linked_clone = true + if ENV['CHECKOUT'] != 'y' then + override.vm.synced_folder ".", "/home/vagrant/Nominatim", type: "smb", smb_host: ENV['SMB_HOST'] || ENV['COMPUTERNAME'] + end + end + config.vm.provider "virtualbox" do |vb, override| vb.gui = false vb.memory = 2048 @@ -34,7 +42,34 @@ Vagrant.configure("2") do |config| end end - config.vm.define "ubuntu", primary: true do |sub| + config.vm.define "ubuntu22", primary: true do |sub| + sub.vm.box = "generic/ubuntu2204" + sub.vm.provision :shell do |s| + s.path = "vagrant/Install-on-Ubuntu-22.sh" + s.privileged = false + s.args = [checkout] + end + end + + config.vm.define "ubuntu22-apache" do |sub| + sub.vm.box = "generic/ubuntu2204" + sub.vm.provision :shell do |s| + s.path = "vagrant/Install-on-Ubuntu-22.sh" + s.privileged = false + s.args = 
[checkout, "install-apache"] + end + end + + config.vm.define "ubuntu22-nginx" do |sub| + sub.vm.box = "generic/ubuntu2204" + sub.vm.provision :shell do |s| + s.path = "vagrant/Install-on-Ubuntu-22.sh" + s.privileged = false + s.args = [checkout, "install-nginx"] + end + end + + config.vm.define "ubuntu20" do |sub| sub.vm.box = "generic/ubuntu2004" sub.vm.provision :shell do |s| s.path = "vagrant/Install-on-Ubuntu-20.sh" @@ -43,7 +78,7 @@ Vagrant.configure("2") do |config| end end - config.vm.define "ubuntu-apache" do |sub| + config.vm.define "ubuntu20-apache" do |sub| sub.vm.box = "generic/ubuntu2004" sub.vm.provision :shell do |s| s.path = "vagrant/Install-on-Ubuntu-20.sh" @@ -52,7 +87,7 @@ Vagrant.configure("2") do |config| end end - config.vm.define "ubuntu-nginx" do |sub| + config.vm.define "ubuntu20-nginx" do |sub| sub.vm.box = "generic/ubuntu2004" sub.vm.provision :shell do |s| s.path = "vagrant/Install-on-Ubuntu-20.sh" @@ -88,23 +123,4 @@ Vagrant.configure("2") do |config| end end - config.vm.define "centos7" do |sub| - sub.vm.box = "centos/7" - sub.vm.provision :shell do |s| - s.path = "vagrant/Install-on-Centos-7.sh" - s.privileged = false - s.args = [checkout] - end - end - - config.vm.define "centos" do |sub| - sub.vm.box = "generic/centos8" - sub.vm.provision :shell do |s| - s.path = "vagrant/Install-on-Centos-8.sh" - s.privileged = false - s.args = [checkout] - end - end - - end diff --git a/cmake/paths-py.tmpl b/cmake/paths-py.tmpl new file mode 100644 index 00000000..372a4546 --- /dev/null +++ b/cmake/paths-py.tmpl @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Path settings for extra data used by Nominatim (installed version). 
+""" +from pathlib import Path + +PHPLIB_DIR = (Path('@NOMINATIM_LIBDIR@') / 'lib-php').resolve() +SQLLIB_DIR = (Path('@NOMINATIM_LIBDIR@') / 'lib-sql').resolve() +DATA_DIR = Path('@NOMINATIM_DATADIR@').resolve() +CONFIG_DIR = Path('@NOMINATIM_CONFIGDIR@').resolve() diff --git a/cmake/tool-installed.tmpl b/cmake/tool-installed.tmpl index 8825daaf..e38dafab 100644 --- a/cmake/tool-installed.tmpl +++ b/cmake/tool-installed.tmpl @@ -4,8 +4,6 @@ import os sys.path.insert(1, '@NOMINATIM_LIBDIR@/lib-python') -os.environ['NOMINATIM_NOMINATIM_TOOL'] = os.path.abspath(__file__) - from nominatim import cli from nominatim import version @@ -13,8 +11,4 @@ version.GIT_COMMIT_HASH = '@GIT_HASH@' exit(cli.nominatim(module_dir='@NOMINATIM_LIBDIR@/module', osm2pgsql_path='@NOMINATIM_LIBDIR@/osm2pgsql', - phplib_dir='@NOMINATIM_LIBDIR@/lib-php', - sqllib_dir='@NOMINATIM_LIBDIR@/lib-sql', - data_dir='@NOMINATIM_DATADIR@', - config_dir='@NOMINATIM_CONFIGDIR@', phpcgi_path='@PHPCGI_BIN@')) diff --git a/cmake/tool.tmpl b/cmake/tool.tmpl index c1ecd3f0..96c6c6dc 100755 --- a/cmake/tool.tmpl +++ b/cmake/tool.tmpl @@ -4,8 +4,6 @@ import os sys.path.insert(1, '@CMAKE_SOURCE_DIR@') -os.environ['NOMINATIM_NOMINATIM_TOOL'] = os.path.abspath(__file__) - from nominatim import cli from nominatim import version @@ -13,8 +11,4 @@ version.GIT_COMMIT_HASH = '@GIT_HASH@' exit(cli.nominatim(module_dir='@CMAKE_BINARY_DIR@/module', osm2pgsql_path='@CMAKE_BINARY_DIR@/osm2pgsql/osm2pgsql', - phplib_dir='@CMAKE_SOURCE_DIR@/lib-php', - sqllib_dir='@CMAKE_SOURCE_DIR@/lib-sql', - data_dir='@CMAKE_SOURCE_DIR@/data', - config_dir='@CMAKE_SOURCE_DIR@/settings', phpcgi_path='@PHPCGI_BIN@')) diff --git a/docs/api/Faq.md b/docs/api/Faq.md index 809f2c39..c4a64d84 100644 --- a/docs/api/Faq.md +++ b/docs/api/Faq.md @@ -59,3 +59,27 @@ suited for these kinds of queries. That said if you installed your own Nominatim instance you can use the `nominatim export` PHP script as basis to return such lists. + +#### 7. 
My result has a wrong postcode. Where does it come from? + +Most places in OSM don't have a postcode, so Nominatim tries to interpolate +one. It first looks at all the places that make up the address of the place. +If one of them has a postcode defined, this is the one to be used. When +none of the address parts has a postcode either, Nominatim interpolates one +from the surrounding objects. If the postcode for your result is wrong, then +most of the time there is an OSM object with the wrong postcode nearby. + +To find the bad postcode, go to +[https://nominatim.openstreetmap.org](https://nominatim.openstreetmap.org) +and search for your place. When you have found it, click on the 'details' link +under the result to go to the details page. There is a field 'Computed Postcode' +which should display the bad postcode. Click on the 'how?' link. A small +explanation text appears. It contains a link to a query for Overpass Turbo. +Click on that and you get a map with all places in the area that have the bad +postcode. If none is displayed, zoom the map out a bit and then click on 'Run'. + +Now go to [OpenStreetMap](https://openstreetmap.org) and fix the error you +have just found. It will take at least a day for Nominatim to catch up with +your data fix. Sometimes longer, depending on how much editing activity is in +the area. + diff --git a/docs/customize/Settings.md b/docs/customize/Settings.md index 289433d6..bb552744 100644 --- a/docs/customize/Settings.md +++ b/docs/customize/Settings.md @@ -666,7 +666,7 @@ The entries in the log file have the following format: "" Request time is the time when the request was started. The execution time is -given in ms and corresponds to the time the query took executing in PHP. +given in seconds and corresponds to the time the query took executing in PHP. type contains the name of the endpoint used. Can be used as the same time as NOMINATIM_LOG_DB. 
diff --git a/docs/customize/Tokenizers.md b/docs/customize/Tokenizers.md index c563b201..58606c29 100644 --- a/docs/customize/Tokenizers.md +++ b/docs/customize/Tokenizers.md @@ -213,6 +213,15 @@ The following is a list of sanitizers that are shipped with Nominatim. rendering: heading_level: 6 +##### clean-tiger-tags + +::: nominatim.tokenizer.sanitizers.clean_tiger_tags + selection: + members: False + rendering: + heading_level: 6 + + #### Token Analysis diff --git a/lib-sql/functions/placex_triggers.sql b/lib-sql/functions/placex_triggers.sql index a8fb9fcc..295b838e 100644 --- a/lib-sql/functions/placex_triggers.sql +++ b/lib-sql/functions/placex_triggers.sql @@ -1230,7 +1230,11 @@ BEGIN {% endif %} END IF; - IF NEW.postcode is null AND NEW.rank_search > 8 THEN + IF NEW.postcode is null AND NEW.rank_search > 8 + AND (NEW.rank_address > 0 + OR ST_GeometryType(NEW.geometry) not in ('ST_LineString','ST_MultiLineString') + OR ST_Length(NEW.geometry) < 0.02) + THEN NEW.postcode := get_nearest_postcode(NEW.country_code, NEW.geometry); END IF; diff --git a/nominatim/cli.py b/nominatim/cli.py index 8c2136f4..56ed6a07 100644 --- a/nominatim/cli.py +++ b/nominatim/cli.py @@ -100,9 +100,7 @@ class CommandlineParser: self.parser.print_help() return 1 - for arg in ('module_dir', 'osm2pgsql_path', 'phplib_dir', 'sqllib_dir', - 'data_dir', 'config_dir', 'phpcgi_path'): - setattr(args, arg, Path(kwargs[arg])) + args.phpcgi_path = Path(kwargs['phpcgi_path']) args.project_dir = Path(args.project_dir).resolve() if 'cli_args' not in kwargs: @@ -111,13 +109,10 @@ class CommandlineParser: datefmt='%Y-%m-%d %H:%M:%S', level=max(4 - args.verbose, 1) * 10) - args.config = Configuration(args.project_dir, args.config_dir, + args.config = Configuration(args.project_dir, environ=kwargs.get('environ', os.environ)) - args.config.set_libdirs(module=args.module_dir, - osm2pgsql=args.osm2pgsql_path, - php=args.phplib_dir, - sql=args.sqllib_dir, - data=args.data_dir) + 
args.config.set_libdirs(module=kwargs['module_dir'], + osm2pgsql=kwargs['osm2pgsql_path']) log = logging.getLogger() log.warning('Using project directory: %s', str(args.project_dir)) @@ -195,7 +190,7 @@ class QueryExport: if args.restrict_to_osm_relation: params.extend(('--restrict-to-osm-relation', args.restrict_to_osm_relation)) - return run_legacy_script('export.php', *params, nominatim_env=args) + return run_legacy_script('export.php', *params, config=args.config) class AdminServe: diff --git a/nominatim/clicmd/admin.py b/nominatim/clicmd/admin.py index c3ba9c0b..0c773960 100644 --- a/nominatim/clicmd/admin.py +++ b/nominatim/clicmd/admin.py @@ -88,4 +88,4 @@ class AdminFuncs: params.append('--reverse-only') if args.target == 'search': params.append('--search-only') - return run_legacy_script(*params, nominatim_env=args) + return run_legacy_script(*params, config=args.config) diff --git a/nominatim/clicmd/args.py b/nominatim/clicmd/args.py index 45599ad5..98673ed6 100644 --- a/nominatim/clicmd/args.py +++ b/nominatim/clicmd/args.py @@ -42,12 +42,6 @@ class NominatimArgs: # Basic environment set by root program. config: Configuration project_dir: Path - module_dir: Path - osm2pgsql_path: Path - phplib_dir: Path - sqllib_dir: Path - data_dir: Path - config_dir: Path phpcgi_path: Path # Global switches @@ -181,7 +175,7 @@ class NominatimArgs: from the command line arguments. The resulting dict can be further customized and then used in `run_osm2pgsql()`. 
""" - return dict(osm2pgsql=self.config.OSM2PGSQL_BINARY or self.osm2pgsql_path, + return dict(osm2pgsql=self.config.OSM2PGSQL_BINARY or self.config.lib_dir.osm2pgsql, osm2pgsql_cache=self.osm2pgsql_cache or default_cache, osm2pgsql_style=self.config.get_import_style_file(), osm2pgsql_style_path=self.config.config_dir, diff --git a/nominatim/clicmd/setup.py b/nominatim/clicmd/setup.py index 344167bb..68884fe1 100644 --- a/nominatim/clicmd/setup.py +++ b/nominatim/clicmd/setup.py @@ -87,7 +87,7 @@ class SetupAll: LOG.warning('Setting up country tables') country_info.setup_country_tables(args.config.get_libpq_dsn(), - args.data_dir, + args.config.lib_dir.data, args.no_partitions) LOG.warning('Importing OSM data file') diff --git a/nominatim/config.py b/nominatim/config.py index 7502aff7..1728c291 100644 --- a/nominatim/config.py +++ b/nominatim/config.py @@ -20,6 +20,7 @@ from dotenv import dotenv_values from nominatim.typing import StrPath from nominatim.errors import UsageError +import nominatim.paths LOG = logging.getLogger() CONFIG_CACHE : Dict[str, Any] = {} @@ -58,21 +59,22 @@ class Configuration: avoid conflicts with other environment variables. 
""" - def __init__(self, project_dir: Path, config_dir: Path, + def __init__(self, project_dir: Optional[Path], environ: Optional[Mapping[str, str]] = None) -> None: self.environ = environ or os.environ self.project_dir = project_dir - self.config_dir = config_dir - self._config = dotenv_values(str((config_dir / 'env.defaults').resolve())) - if project_dir is not None and (project_dir / '.env').is_file(): - self._config.update(dotenv_values(str((project_dir / '.env').resolve()))) + self.config_dir = nominatim.paths.CONFIG_DIR + self._config = dotenv_values(str(self.config_dir / 'env.defaults')) + if self.project_dir is not None and (self.project_dir / '.env').is_file(): + self.project_dir = self.project_dir.resolve() + self._config.update(dotenv_values(str(self.project_dir / '.env'))) class _LibDirs: module: Path osm2pgsql: Path - php: Path - sql: Path - data: Path + php = nominatim.paths.PHPLIB_DIR + sql = nominatim.paths.SQLLIB_DIR + data = nominatim.paths.DATA_DIR self.lib_dir = _LibDirs() self._private_plugins: Dict[str, object] = {} @@ -82,7 +84,7 @@ class Configuration: """ Set paths to library functions and data. """ for key, value in kwargs.items(): - setattr(self.lib_dir, key, Path(value).resolve()) + setattr(self.lib_dir, key, Path(value)) def __getattr__(self, name: str) -> str: @@ -136,6 +138,7 @@ class Configuration: cfgpath = Path(value) if not cfgpath.is_absolute(): + assert self.project_dir is not None cfgpath = self.project_dir / cfgpath return cfgpath.resolve() @@ -174,11 +177,11 @@ class Configuration: return self.find_config_file('', 'IMPORT_STYLE') - def get_os_env(self) -> Dict[str, Optional[str]]: + def get_os_env(self) -> Dict[str, str]: """ Return a copy of the OS environment with the Nominatim configuration merged in. 
""" - env = dict(self._config) + env = {k: v for k, v in self._config.items() if v is not None} env.update(self.environ) return env diff --git a/nominatim/paths.py b/nominatim/paths.py new file mode 100644 index 00000000..a34628df --- /dev/null +++ b/nominatim/paths.py @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Path settings for extra data used by Nominatim. +""" +from pathlib import Path + +PHPLIB_DIR = (Path(__file__) / '..' / '..' / 'lib-php').resolve() +SQLLIB_DIR = (Path(__file__) / '..' / '..' / 'lib-sql').resolve() +DATA_DIR = (Path(__file__) / '..' / '..' / 'data').resolve() +CONFIG_DIR = (Path(__file__) / '..' / '..' / 'settings').resolve() diff --git a/nominatim/tokenizer/factory.py b/nominatim/tokenizer/factory.py index f5159fa0..d6bc5163 100644 --- a/nominatim/tokenizer/factory.py +++ b/nominatim/tokenizer/factory.py @@ -55,6 +55,7 @@ def create_tokenizer(config: Configuration, init_db: bool = True, module_name = config.TOKENIZER # Create the directory for the tokenizer data + assert config.project_dir is not None basedir = config.project_dir / 'tokenizer' if not basedir.exists(): basedir.mkdir() @@ -80,6 +81,7 @@ def get_tokenizer_for_db(config: Configuration) -> AbstractTokenizer: The function looks up the appropriate tokenizer in the database and initialises it. """ + assert config.project_dir is not None basedir = config.project_dir / 'tokenizer' if not basedir.is_dir(): # Directory will be repopulated by tokenizer below. 
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index 1b0b2980..a50dedb2 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -106,6 +106,7 @@ class LegacyTokenizer(AbstractTokenizer): This copies all necessary data in the project directory to make sure the tokenizer remains stable even over updates. """ + assert config.project_dir is not None module_dir = _install_module(config.DATABASE_MODULE_PATH, config.lib_dir.module, config.project_dir / 'module') @@ -127,6 +128,8 @@ class LegacyTokenizer(AbstractTokenizer): def init_from_project(self, config: Configuration) -> None: """ Initialise the tokenizer from the project directory. """ + assert config.project_dir is not None + with connect(self.dsn) as conn: self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION) @@ -149,6 +152,8 @@ class LegacyTokenizer(AbstractTokenizer): def update_sql_functions(self, config: Configuration) -> None: """ Reimport the SQL functions for this tokenizer. """ + assert config.project_dir is not None + with connect(self.dsn) as conn: max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ) modulepath = config.DATABASE_MODULE_PATH or \ @@ -193,6 +198,8 @@ class LegacyTokenizer(AbstractTokenizer): This is a special migration function for updating existing databases to new software versions. """ + assert config.project_dir is not None + self.normalization = config.TERM_NORMALIZATION module_dir = _install_module(config.DATABASE_MODULE_PATH, config.lib_dir.module, diff --git a/nominatim/tokenizer/sanitizers/clean_tiger_tags.py b/nominatim/tokenizer/sanitizers/clean_tiger_tags.py new file mode 100644 index 00000000..9698a326 --- /dev/null +++ b/nominatim/tokenizer/sanitizers/clean_tiger_tags.py @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. 
+# For a full list of authors see the git log. +""" +Sanitizer that preprocesses tags from the TIGER import. + +It makes the following changes: + +* remove state reference from tiger:county +""" +from typing import Callable +import re + +from nominatim.tokenizer.sanitizers.base import ProcessInfo +from nominatim.tokenizer.sanitizers.config import SanitizerConfig + +COUNTY_MATCH = re.compile('(.*), [A-Z][A-Z]') + +def _clean_tiger_county(obj: ProcessInfo) -> None: + """ Remove the state reference from tiger:county tags. + + This transforms a name like 'Hamilton, AL' into 'Hamilton'. + If no state reference is detected at the end, the name is left as is. + """ + if not obj.address: + return + + for item in obj.address: + if item.kind == 'tiger' and item.suffix == 'county': + m = COUNTY_MATCH.fullmatch(item.name) + if m: + item.name = m[1] + # Switch kind and suffix, the split left them reversed. + item.kind = 'county' + item.suffix = 'tiger' + + return + + +def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]: + """ Create a housenumber processing function. + """ + return _clean_tiger_county diff --git a/nominatim/tools/exec_utils.py b/nominatim/tools/exec_utils.py index ed3bb53b..9c801142 100644 --- a/nominatim/tools/exec_utils.py +++ b/nominatim/tools/exec_utils.py @@ -15,6 +15,7 @@ import subprocess import urllib.request as urlrequest from urllib.parse import urlencode +from nominatim.config import Configuration from nominatim.typing import StrPath from nominatim.version import version_str from nominatim.db.connection import get_pg_env @@ -22,7 +23,7 @@ from nominatim.db.connection import get_pg_env LOG = logging.getLogger() def run_legacy_script(script: StrPath, *args: Union[int, str], - nominatim_env: Any, + config: Configuration, throw_on_fail: bool = False) -> int: """ Run a Nominatim PHP script with the given arguments. 
@@ -30,18 +31,18 @@ def run_legacy_script(script: StrPath, *args: Union[int, str], then throw a `CalledProcessError` on a non-zero exit. """ cmd = ['/usr/bin/env', 'php', '-Cq', - str(nominatim_env.phplib_dir / 'admin' / script)] + str(config.lib_dir.php / 'admin' / script)] cmd.extend([str(a) for a in args]) - env = nominatim_env.config.get_os_env() - env['NOMINATIM_DATADIR'] = str(nominatim_env.data_dir) - env['NOMINATIM_SQLDIR'] = str(nominatim_env.sqllib_dir) - env['NOMINATIM_CONFIGDIR'] = str(nominatim_env.config_dir) - env['NOMINATIM_DATABASE_MODULE_SRC_PATH'] = str(nominatim_env.module_dir) + env = config.get_os_env() + env['NOMINATIM_DATADIR'] = str(config.lib_dir.data) + env['NOMINATIM_SQLDIR'] = str(config.lib_dir.sql) + env['NOMINATIM_CONFIGDIR'] = str(config.config_dir) + env['NOMINATIM_DATABASE_MODULE_SRC_PATH'] = str(config.lib_dir.module) if not env['NOMINATIM_OSM2PGSQL_BINARY']: - env['NOMINATIM_OSM2PGSQL_BINARY'] = str(nominatim_env.osm2pgsql_path) + env['NOMINATIM_OSM2PGSQL_BINARY'] = str(config.lib_dir.osm2pgsql) - proc = subprocess.run(cmd, cwd=str(nominatim_env.project_dir), env=env, + proc = subprocess.run(cmd, cwd=str(config.project_dir), env=env, check=throw_on_fail) return proc.returncode diff --git a/nominatim/tools/migration.py b/nominatim/tools/migration.py index d5806097..147a9f9c 100644 --- a/nominatim/tools/migration.py +++ b/nominatim/tools/migration.py @@ -332,3 +332,19 @@ def add_place_deletion_todo_table(conn: Connection, **_: Any) -> None: class TEXT, type TEXT, deferred BOOLEAN)""") + + +@_migration(4, 1, 99, 1) +def split_pending_index(conn: Connection, **_: Any) -> None: + """ Reorganise indexes for pending updates. 
+ """ + if conn.table_exists('place'): + with conn.cursor() as cur: + cur.execute("""CREATE INDEX IF NOT EXISTS idx_placex_rank_address_sector + ON placex USING BTREE (rank_address, geometry_sector) + WHERE indexed_status > 0""") + cur.execute("""CREATE INDEX IF NOT EXISTS idx_placex_rank_boundaries_sector + ON placex USING BTREE (rank_search, geometry_sector) + WHERE class = 'boundary' and type = 'administrative' + and indexed_status > 0""") + cur.execute("DROP INDEX IF EXISTS idx_placex_pendingsector") diff --git a/nominatim/tools/refresh.py b/nominatim/tools/refresh.py index c50493cc..b35d3aae 100644 --- a/nominatim/tools/refresh.py +++ b/nominatim/tools/refresh.py @@ -216,6 +216,7 @@ def setup_website(basedir: Path, config: Configuration, conn: Connection) -> Non LOG.info('Creating website directory.') basedir.mkdir() + assert config.project_dir is not None template = dedent(f"""\ 0 - if ADD_TIGER_COUNTY then - local v = o:grab_tag('tiger:county') - if v ~= nil then - v, num = v:gsub(',.*', ' county') - if num == 0 then - v = v .. 
' county' - end - o:set_address('tiger:county', v) - end - end o:grab_address{match=ADDRESS_TAGS} if is_interpolation then diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index 212fdcb9..f30578a2 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -24,6 +24,7 @@ transliteration: - ":: lower ()" - "[^a-z0-9[:Space:]] >" - ":: NFC ()" + - "[:Space:]+ > ' '" sanitizers: - step: clean-housenumbers filter-kind: @@ -35,6 +36,7 @@ sanitizers: - step: clean-postcodes convert-to-address: yes default-pattern: "[A-Z0-9- ]{3,12}" + - step: clean-tiger-tags - step: split-name-list - step: strip-brace-terms - step: tag-analyzer-by-language diff --git a/settings/import-extratags.lua b/settings/import-extratags.lua index 535af3c8..7b1880ef 100644 --- a/settings/import-extratags.lua +++ b/settings/import-extratags.lua @@ -123,8 +123,7 @@ HOUSENUMBER_TAGS = tag_match{keys = {'addr:housenumber', 'addr:conscriptionnumbe INTERPOLATION_TAGS = tag_match{keys = {'addr:interpolation'}} -ADDRESS_TAGS = tag_match{keys = {'addr:*', 'is_in:*'}} -ADD_TIGER_COUNTY = true +ADDRESS_TAGS = tag_match{keys = {'addr:*', 'is_in:*', 'tiger:county'}} SAVE_EXTRA_MAINS = true diff --git a/test/bdd/osm2pgsql/import/tags.feature b/test/bdd/osm2pgsql/import/tags.feature index 1f6857f2..60d241fe 100644 --- a/test/bdd/osm2pgsql/import/tags.feature +++ b/test/bdd/osm2pgsql/import/tags.feature @@ -166,20 +166,6 @@ Feature: Tag evaluation | N10003:place | place | island | - Scenario: Shorten tiger:county tags - When loading osm data - """ - n11001 Tplace=village,tiger:county=Feebourgh%2c%%20%AL - n11002 Tplace=village,addr:state=Alabama,tiger:county=Feebourgh%2c%%20%AL - n11003 Tplace=village,tiger:county=Feebourgh - """ - Then place contains exactly - | object | class | address | - | N11001 | place | 'tiger:county': 'Feebourgh county' | - | N11002 | place | 'tiger:county': 'Feebourgh county', 'state': 'Alabama' | - | N11003 | place | 'tiger:county': 'Feebourgh 
county' | - - Scenario: Building fallbacks When loading osm data """ diff --git a/test/bdd/steps/nominatim_environment.py b/test/bdd/steps/nominatim_environment.py index 5145327c..238081c0 100644 --- a/test/bdd/steps/nominatim_environment.py +++ b/test/bdd/steps/nominatim_environment.py @@ -43,7 +43,7 @@ class NominatimEnvironment: self.code_coverage_path = config['PHPCOV'] self.code_coverage_id = 1 - self.default_config = Configuration(None, self.src_dir / 'settings').get_os_env() + self.default_config = Configuration(None).get_os_env() self.test_env = None self.template_db_done = False self.api_db_done = False @@ -105,7 +105,6 @@ class NominatimEnvironment: self.test_env['NOMINATIM_CONFIGDIR'] = str((self.src_dir / 'settings').resolve()) self.test_env['NOMINATIM_DATABASE_MODULE_SRC_PATH'] = str((self.build_dir / 'module').resolve()) self.test_env['NOMINATIM_OSM2PGSQL_BINARY'] = str((self.build_dir / 'osm2pgsql' / 'osm2pgsql').resolve()) - self.test_env['NOMINATIM_NOMINATIM_TOOL'] = str((self.build_dir / 'nominatim').resolve()) if self.tokenizer is not None: self.test_env['NOMINATIM_TOKENIZER'] = self.tokenizer if self.import_style is not None: @@ -131,13 +130,9 @@ class NominatimEnvironment: def get_test_config(self): - cfg = Configuration(Path(self.website_dir.name), self.src_dir / 'settings', - environ=self.test_env) + cfg = Configuration(Path(self.website_dir.name), environ=self.test_env) cfg.set_libdirs(module=self.build_dir / 'module', - osm2pgsql=self.build_dir / 'osm2pgsql' / 'osm2pgsql', - php=self.src_dir / 'lib-php', - sql=self.src_dir / 'lib-sql', - data=self.src_dir / 'data') + osm2pgsql=self.build_dir / 'osm2pgsql' / 'osm2pgsql') return cfg def get_libpq_dsn(self): @@ -307,10 +302,6 @@ class NominatimEnvironment: cli.nominatim(module_dir='', osm2pgsql_path=str(self.build_dir / 'osm2pgsql' / 'osm2pgsql'), - phplib_dir=str(self.src_dir / 'lib-php'), - sqllib_dir=str(self.src_dir / 'lib-sql'), - data_dir=str(self.src_dir / 'data'), - 
config_dir=str(self.src_dir / 'settings'), cli_args=cmdline, phpcgi_path='', environ=self.test_env) diff --git a/test/python/cli/conftest.py b/test/python/cli/conftest.py index 420740cf..09bfd353 100644 --- a/test/python/cli/conftest.py +++ b/test/python/cli/conftest.py @@ -53,11 +53,7 @@ def cli_call(src_dir): def _call_nominatim(*args): return nominatim.cli.nominatim(module_dir='MODULE NOT AVAILABLE', osm2pgsql_path='OSM2PGSQL NOT AVAILABLE', - phplib_dir=str(src_dir / 'lib-php'), - data_dir=str(src_dir / 'data'), phpcgi_path='/usr/bin/php-cgi', - sqllib_dir=str(src_dir / 'lib-sql'), - config_dir=str(src_dir / 'settings'), cli_args=args) return _call_nominatim diff --git a/test/python/cli/test_cli.py b/test/python/cli/test_cli.py index 07d6c31f..1072f6c9 100644 --- a/test/python/cli/test_cli.py +++ b/test/python/cli/test_cli.py @@ -82,19 +82,17 @@ def test_cli_export_command(cli_call, mock_run_legacy): ('restrict-to-osm-way', '727'), ('restrict-to-osm-relation', '197532') ]) -def test_export_parameters(src_dir, tmp_path, param, value): +def test_export_parameters(src_dir, tmp_path, param, value, monkeypatch): (tmp_path / 'admin').mkdir() (tmp_path / 'admin' / 'export.php').write_text(f"""= 0 ? 
0 : 10); """) + monkeypatch.setattr(nominatim.paths, 'PHPLIB_DIR', tmp_path) + assert nominatim.cli.nominatim(module_dir='MODULE NOT AVAILABLE', osm2pgsql_path='OSM2PGSQL NOT AVAILABLE', - phplib_dir=str(tmp_path), - data_dir=str(src_dir / 'data'), phpcgi_path='/usr/bin/php-cgi', - sqllib_dir=str(src_dir / 'lib-sql'), - config_dir=str(src_dir / 'settings'), cli_args=['export', '--' + param, value]) == 0 diff --git a/test/python/cli/test_cmd_api.py b/test/python/cli/test_cmd_api.py index 80248ac7..96415938 100644 --- a/test/python/cli/test_cmd_api.py +++ b/test/python/cli/test_cmd_api.py @@ -13,14 +13,10 @@ import nominatim.clicmd.api @pytest.mark.parametrize("endpoint", (('search', 'reverse', 'lookup', 'details', 'status'))) -def test_no_api_without_phpcgi(src_dir, endpoint): +def test_no_api_without_phpcgi(endpoint): assert nominatim.cli.nominatim(module_dir='MODULE NOT AVAILABLE', osm2pgsql_path='OSM2PGSQL NOT AVAILABLE', - phplib_dir=str(src_dir / 'lib-php'), - data_dir=str(src_dir / 'data'), phpcgi_path=None, - sqllib_dir=str(src_dir / 'lib-sql'), - config_dir=str(src_dir / 'settings'), cli_args=[endpoint]) == 1 @@ -36,24 +32,28 @@ def test_no_api_without_phpcgi(src_dir, endpoint): class TestCliApiCall: @pytest.fixture(autouse=True) - def setup_cli_call(self, cli_call): - self.call_nominatim = cli_call + def setup_cli_call(self, params, cli_call, mock_func_factory, tmp_path): + self.mock_run_api = mock_func_factory(nominatim.clicmd.api, 'run_api_script') - def test_api_commands_simple(self, mock_func_factory, params, tmp_path): + def _run(): + return cli_call(*params, '--project-dir', str(tmp_path)) + + self.run_nominatim = _run + + + def test_api_commands_simple(self, tmp_path, params): (tmp_path / 'website').mkdir() (tmp_path / 'website' / (params[0] + '.php')).write_text('') - mock_run_api = mock_func_factory(nominatim.clicmd.api, 'run_api_script') - assert self.call_nominatim(*params, '--project-dir', str(tmp_path)) == 0 + assert self.run_nominatim() == 0 - 
assert mock_run_api.called == 1 - assert mock_run_api.last_args[0] == params[0] + assert self.mock_run_api.called == 1 + assert self.mock_run_api.last_args[0] == params[0] - def test_bad_project_idr(self, mock_func_factory, params): - mock_run_api = mock_func_factory(nominatim.clicmd.api, 'run_api_script') + def test_bad_project_dir(self): + assert self.run_nominatim() == 1 - assert self.call_nominatim(*params) == 1 QUERY_PARAMS = { 'search': ('--query', 'somewhere'), diff --git a/test/python/cli/test_cmd_replication.py b/test/python/cli/test_cmd_replication.py index 9fd4f5b3..a22d077a 100644 --- a/test/python/cli/test_cmd_replication.py +++ b/test/python/cli/test_cmd_replication.py @@ -15,6 +15,7 @@ import pytest import nominatim.cli import nominatim.indexer.indexer import nominatim.tools.replication +import nominatim.tools.refresh from nominatim.db import status @pytest.fixture @@ -107,7 +108,7 @@ class TestCliReplication: def test_replication_update_once_no_index(self, update_mock): assert self.call_nominatim('--once', '--no-index') == 0 - assert str(update_mock.last_args[1]['osm2pgsql']) == 'OSM2PGSQL NOT AVAILABLE' + assert str(update_mock.last_args[1]['osm2pgsql']).endswith('OSM2PGSQL NOT AVAILABLE') def test_replication_update_custom_osm2pgsql(self, monkeypatch, update_mock): diff --git a/test/python/config/test_config.py b/test/python/config/test_config.py index a9cbb48d..a003065d 100644 --- a/test/python/config/test_config.py +++ b/test/python/config/test_config.py @@ -14,23 +14,23 @@ from nominatim.config import Configuration, flatten_config_list from nominatim.errors import UsageError @pytest.fixture -def make_config(src_dir): +def make_config(): """ Create a configuration object from the given project directory. 
""" def _mk_config(project_dir=None): - return Configuration(project_dir, src_dir / 'settings') + return Configuration(project_dir) return _mk_config @pytest.fixture -def make_config_path(src_dir, tmp_path): +def make_config_path(tmp_path): """ Create a configuration object with project and config directories in a temporary directory. """ def _mk_config(): (tmp_path / 'project').mkdir() (tmp_path / 'config').mkdir() - conf = Configuration(tmp_path / 'project', src_dir / 'settings') + conf = Configuration(tmp_path / 'project') conf.config_dir = tmp_path / 'config' return conf diff --git a/test/python/config/test_config_load_module.py b/test/python/config/test_config_load_module.py index df6c4794..7bc91fd7 100644 --- a/test/python/config/test_config_load_module.py +++ b/test/python/config/test_config_load_module.py @@ -21,7 +21,7 @@ def test_config(src_dir, tmp_path): """ (tmp_path / 'project').mkdir() (tmp_path / 'config').mkdir() - conf = Configuration(tmp_path / 'project', src_dir / 'settings') + conf = Configuration(tmp_path / 'project') conf.config_dir = tmp_path / 'config' return conf diff --git a/test/python/conftest.py b/test/python/conftest.py index 40526295..31463746 100644 --- a/test/python/conftest.py +++ b/test/python/conftest.py @@ -107,24 +107,18 @@ def table_factory(temp_db_cursor): @pytest.fixture -def def_config(src_dir): - cfg = Configuration(None, src_dir / 'settings') - cfg.set_libdirs(module='.', osm2pgsql='.', - php=src_dir / 'lib-php', - sql=src_dir / 'lib-sql', - data=src_dir / 'data') +def def_config(): + cfg = Configuration(None) + cfg.set_libdirs(module='.', osm2pgsql='.') return cfg @pytest.fixture -def project_env(src_dir, tmp_path): +def project_env(tmp_path): projdir = tmp_path / 'project' projdir.mkdir() - cfg = Configuration(projdir, src_dir / 'settings') - cfg.set_libdirs(module='.', osm2pgsql='.', - php=src_dir / 'lib-php', - sql=src_dir / 'lib-sql', - data=src_dir / 'data') + cfg = Configuration(projdir) + 
cfg.set_libdirs(module='.', osm2pgsql='.') return cfg @@ -214,9 +208,8 @@ def osmline_table(temp_db_with_extensions, table_factory): @pytest.fixture def sql_preprocessor_cfg(tmp_path, table_factory, temp_db_with_extensions): table_factory('country_name', 'partition INT', ((0, ), (1, ), (2, ))) - cfg = Configuration(None, SRC_DIR.resolve() / 'settings') - cfg.set_libdirs(module='.', osm2pgsql='.', php=SRC_DIR / 'lib-php', - sql=tmp_path, data=SRC_DIR / 'data') + cfg = Configuration(None) + cfg.set_libdirs(module='.', osm2pgsql='.', sql=tmp_path) return cfg diff --git a/test/python/tokenizer/sanitizers/test_clean_tiger_tags.py b/test/python/tokenizer/sanitizers/test_clean_tiger_tags.py new file mode 100644 index 00000000..fc17ad24 --- /dev/null +++ b/test/python/tokenizer/sanitizers/test_clean_tiger_tags.py @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Tests for sanitizer that clean up TIGER tags. 
+""" +import pytest + +from nominatim.tokenizer.place_sanitizer import PlaceSanitizer +from nominatim.data.place_info import PlaceInfo + +class TestCleanTigerTags: + + @pytest.fixture(autouse=True) + def setup_country(self, def_config): + self.config = def_config + + + def run_sanitizer_on(self, addr): + place = PlaceInfo({'address': addr}) + _, outaddr = PlaceSanitizer([{'step': 'clean-tiger-tags'}], self.config).process_names(place) + + return sorted([(p.name, p.kind, p.suffix) for p in outaddr]) + + @pytest.mark.parametrize('inname,outname', [('Hamilton, AL', 'Hamilton'), + ('Little, Borough, CA', 'Little, Borough')]) + def test_well_formatted(self, inname, outname): + assert self.run_sanitizer_on({'tiger:county': inname})\ + == [(outname, 'county', 'tiger')] + + + @pytest.mark.parametrize('name', ('Hamilton', 'Big, Road', '')) + def test_badly_formatted(self, name): + assert self.run_sanitizer_on({'tiger:county': name})\ + == [(name, 'county', 'tiger')] + + + def test_unmatched(self): + assert self.run_sanitizer_on({'tiger:country': 'US'})\ + == [('US', 'tiger', 'country')] diff --git a/test/python/tools/test_exec_utils.py b/test/python/tools/test_exec_utils.py index 26ea92b2..f73aec30 100644 --- a/test/python/tools/test_exec_utils.py +++ b/test/python/tools/test_exec_utils.py @@ -12,31 +12,28 @@ import subprocess import pytest +from nominatim.config import Configuration import nominatim.tools.exec_utils as exec_utils +import nominatim.paths class TestRunLegacyScript: @pytest.fixture(autouse=True) - def setup_nominatim_env(self, tmp_path, def_config): + def setup_nominatim_env(self, tmp_path, monkeypatch): tmp_phplib_dir = tmp_path / 'phplib' tmp_phplib_dir.mkdir() (tmp_phplib_dir / 'admin').mkdir() - class _NominatimEnv: - config = def_config - phplib_dir = tmp_phplib_dir - data_dir = Path('data') - project_dir = Path('.') - sqllib_dir = Path('lib-sql') - config_dir = Path('settings') - module_dir = 'module' - osm2pgsql_path = 'osm2pgsql' + 
monkeypatch.setattr(nominatim.paths, 'PHPLIB_DIR', tmp_phplib_dir) - self.testenv = _NominatimEnv + self.phplib_dir = tmp_phplib_dir + self.config = Configuration(tmp_path) + self.config.set_libdirs(module='.', osm2pgsql='default_osm2pgsql', + php=tmp_phplib_dir) def mk_script(self, code): - codefile = self.testenv.phplib_dir / 'admin' / 't.php' + codefile = self.phplib_dir / 'admin' / 't.php' codefile.write_text('