From: Sarah Hoffmann Date: Sun, 29 Sep 2024 09:44:04 +0000 (+0200) Subject: Merge remote-tracking branch 'upstream/master' X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/0af8dac3d35a94afe0b6ad775f3226d8d147501d?hp=-c Merge remote-tracking branch 'upstream/master' --- 0af8dac3d35a94afe0b6ad775f3226d8d147501d diff --combined CMakeLists.txt index 4b4e3fc9,e31362fe..e6d59520 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@@ -19,7 -19,7 +19,7 @@@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_ project(nominatim) set(NOMINATIM_VERSION_MAJOR 4) - set(NOMINATIM_VERSION_MINOR 4) + set(NOMINATIM_VERSION_MINOR 5) set(NOMINATIM_VERSION_PATCH 0) set(NOMINATIM_VERSION "${NOMINATIM_VERSION_MAJOR}.${NOMINATIM_VERSION_MINOR}.${NOMINATIM_VERSION_PATCH}") @@@ -44,7 -44,6 +44,6 @@@ endif( set(BUILD_IMPORTER on CACHE BOOL "Build everything for importing/updating the database") set(BUILD_API on CACHE BOOL "Build everything for the API server") - set(BUILD_MODULE off CACHE BOOL "Build PostgreSQL module for legacy tokenizer") set(BUILD_TESTS on CACHE BOOL "Build test suite") set(BUILD_OSM2PGSQL on CACHE BOOL "Build osm2pgsql (expert only)") set(INSTALL_MUNIN_PLUGINS on CACHE BOOL "Install Munin plugins for supervising Nominatim") @@@ -74,25 -73,6 +73,6 @@@ if (BUILD_IMPORTER OR BUILD_API find_package(PythonInterp 3.7 REQUIRED) endif() - #----------------------------------------------------------------------------- - # PHP - #----------------------------------------------------------------------------- - - # Setting PHP binary variable as to command line (prevailing) or auto detect - - if (BUILD_API) - if (NOT PHP_BIN) - find_program (PHP_BIN php) - endif() - # sanity check if PHP binary exists - if (NOT EXISTS ${PHP_BIN}) - message(WARNING "PHP binary not found. Only Python frontend can be used.") - set(PHP_BIN "") - else() - message (STATUS "Using PHP binary " ${PHP_BIN}) - endif() - endif() - #----------------------------------------------------------------------------- # import scripts and utilities (importer only) #----------------------------------------------------------------------------- @@@ -113,27 -93,6 +93,27 @@@ if (BUILD_IMPORTER ${PROJECT_BINARY_DIR}/nominatim) endif() +#----------------------------------------------------------------------------- +# Targets for running a development webserver from the build directory. +#----------------------------------------------------------------------------- + +if (BUILD_API) + set(WEBSITEFILES + 403.html + 509.html + crossdomain.xml + favicon.ico + nominatim.xml + robots.txt + taginfo.json + ) + + foreach (webfile ${WEBSITEFILES}) + configure_file(${PROJECT_SOURCE_DIR}/website/${webfile} + ${PROJECT_BINARY_DIR}/website/${webfile}) + endforeach() +endif() + #----------------------------------------------------------------------------- # Tests #----------------------------------------------------------------------------- @@@ -146,8 -105,6 +126,6 @@@ if (BUILD_TESTS find_program(PYTHON_BEHAVE behave) find_program(PYLINT NAMES pylint3 pylint) find_program(PYTEST NAMES pytest py.test-3 py.test) - find_program(PHPCS phpcs) - find_program(PHPUNIT phpunit) if (PYTHON_BEHAVE) message(STATUS "Using Python behave binary ${PYTHON_BEHAVE}") @@@ -162,24 -119,6 +140,6 @@@ message(WARNING "behave not found. BDD tests disabled." ) endif() - if (PHPUNIT) - message(STATUS "Using phpunit binary ${PHPUNIT}") - add_test(NAME php - COMMAND ${PHPUNIT} ./ - WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/test/php) - else() - message(WARNING "phpunit not found. PHP unit tests disabled." ) - endif() - - if (PHPCS) - message(STATUS "Using phpcs binary ${PHPCS}") - add_test(NAME phpcs - COMMAND ${PHPCS} --report-width=120 --colors lib-php - WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) - else() - message(WARNING "phpcs not found. PHP linting tests disabled." ) - endif() - if (PYLINT) message(STATUS "Using pylint binary ${PYLINT}") add_test(NAME pylint @@@ -199,14 -138,6 +159,6 @@@ endif() endif() - #----------------------------------------------------------------------------- - # Postgres module - #----------------------------------------------------------------------------- - - if (BUILD_MODULE) - add_subdirectory(module) - endif() - #----------------------------------------------------------------------------- # Installation #----------------------------------------------------------------------------- @@@ -224,11 -155,7 +176,7 @@@ if (BUILD_IMPORTER DESTINATION ${CMAKE_INSTALL_BINDIR} RENAME nominatim) - if (EXISTS ${PHP_BIN}) - configure_file(${PROJECT_SOURCE_DIR}/cmake/paths-py.tmpl paths-py.installed) - else() - configure_file(${PROJECT_SOURCE_DIR}/cmake/paths-py-no-php.tmpl paths-py.installed) - endif() + configure_file(${PROJECT_SOURCE_DIR}/cmake/paths-py-no-php.tmpl paths-py.installed) foreach (submodule nominatim_db nominatim_api) install(DIRECTORY src/${submodule} @@@ -259,15 -186,6 +207,6 @@@ if (BUILD_OSM2PGSQL endif() endif() - if (BUILD_MODULE) - install(PROGRAMS ${PROJECT_BINARY_DIR}/module/nominatim.so - DESTINATION ${NOMINATIM_LIBDIR}/module) - endif() - - if (BUILD_API AND EXISTS ${PHP_BIN}) - install(DIRECTORY lib-php DESTINATION ${NOMINATIM_LIBDIR}) - endif() - install(FILES settings/env.defaults settings/address-levels.json settings/phrase-settings.json diff --combined src/nominatim_api/search/db_search_builder.py index 0269cf1f,1ac6db2b..b4346ee6 --- a/src/nominatim_api/search/db_search_builder.py +++ b/src/nominatim_api/search/db_search_builder.py @@@ -167,8 -167,7 +167,7 @@@ class SearchBuilder expected_count = sum(t.count for t in hnrs) partials = {t.token: t.addr_count for trange in address - for t in self.query.get_partials_list(trange) - if t.is_indexed} + for t in self.query.get_partials_list(trange)} if not partials: # can happen when none of the partials is indexed @@@ -219,23 -218,19 +218,19 @@@ addr_partials = [t for r in address for t in self.query.get_partials_list(r)] addr_tokens = list({t.token for t in addr_partials}) - partials_indexed = all(t.is_indexed for t in name_partials.values()) \ - and all(t.is_indexed for t in addr_partials) exp_count = min(t.count for t in name_partials.values()) / (2**(len(name_partials) - 1)) - if (len(name_partials) > 3 or exp_count < 8000) and partials_indexed: + if (len(name_partials) > 3 or exp_count < 8000): yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens) return - addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 30000 + addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 50000 # Partial term to frequent. Try looking up by rare full names first. name_fulls = self.query.get_tokens(name, TokenType.WORD) if name_fulls: fulls_count = sum(t.count for t in name_fulls) - if partials_indexed: - penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed) - if fulls_count < 50000 or addr_count < 30000: + if fulls_count < 80000 or addr_count < 50000: yield penalty,fulls_count / (2**len(addr_tokens)), \ self.get_full_name_ranking(name_fulls, addr_partials, fulls_count > 30000 / max(1, len(addr_tokens))) @@@ -243,8 -238,7 +238,7 @@@ # To catch remaining results, lookup by name and address # We only do this if there is a reasonable number of results expected. exp_count = exp_count / (2**len(addr_tokens)) if addr_tokens else exp_count - if exp_count < 10000 and addr_count < 20000\ - and all(t.is_indexed for t in name_partials.values()): + if exp_count < 10000 and addr_count < 20000: penalty += 0.35 * max(1 if name_fulls else 0.1, 5 - len(name_partials) - len(addr_tokens)) yield penalty, exp_count,\ @@@ -260,11 -254,10 +254,10 @@@ addr_restrict_tokens = [] addr_lookup_tokens = [] for t in addr_partials: - if t.is_indexed: - if t.addr_count > 20000: - addr_restrict_tokens.append(t.token) - else: - addr_lookup_tokens.append(t.token) + if t.addr_count > 20000: + addr_restrict_tokens.append(t.token) + else: + addr_lookup_tokens.append(t.token) if addr_restrict_tokens: lookup.append(dbf.FieldLookup('nameaddress_vector', @@@ -287,9 -280,14 +280,9 @@@ # This might yield wrong results, nothing we can do about that. if use_lookup: addr_restrict_tokens = [] - addr_lookup_tokens = [] - for t in addr_partials: - if t.addr_count > 20000: - addr_restrict_tokens.append(t.token) - else: - addr_lookup_tokens.append(t.token) + addr_lookup_tokens = [t.token for t in addr_partials if t.is_indexed] else: - addr_restrict_tokens = [t.token for t in addr_partials if t.is_indexed] + addr_restrict_tokens = [t.token for t in addr_partials] addr_lookup_tokens = [] return dbf.lookup_by_any_name([t.token for t in name_fulls], diff --combined src/nominatim_api/search/icu_tokenizer.py index 7bd2b092,1aadc97e..c2a26510 --- a/src/nominatim_api/search/icu_tokenizer.py +++ b/src/nominatim_api/search/icu_tokenizer.py @@@ -123,7 -123,7 +123,7 @@@ class ICUToken(qmod.Token) lookup_word = row.word_token return ICUToken(penalty=penalty, token=row.word_id, count=max(1, count), - lookup_word=lookup_word, is_indexed=True, + lookup_word=lookup_word, word_token=row.word_token, info=row.info, addr_count=max(1, addr_count)) @@@ -208,12 -208,7 +208,12 @@@ class ICUQueryAnalyzer(AbstractQueryAna standardized form search will work with. All information removed at this stage is inevitably lost. """ - return cast(str, self.normalizer.transliterate(text)) + norm = cast(str, self.normalizer.transliterate(text)) + numspaces = norm.count(' ') + if numspaces > 4 and len(norm) <= (numspaces + 1) * 3: + return '' + + return norm def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]: @@@ -264,7 -259,9 +264,9 @@@ if len(part.token) <= 4 and part[0].isdigit()\ and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER): query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER, - ICUToken(0.5, 0, 1, 1, part.token, True, part.token, None)) + ICUToken(penalty=0.5, token=0, + count=1, addr_count=1, lookup_word=part.token, + word_token=part.token, info=None)) def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None: diff --combined src/nominatim_db/tokenizer/icu_tokenizer.py index 4eee2c73,1b95a901..452bf26c --- a/src/nominatim_db/tokenizer/icu_tokenizer.py +++ b/src/nominatim_db/tokenizer/icu_tokenizer.py @@@ -13,7 -13,6 +13,6 @@@ from typing import Optional, Sequence, import itertools import logging from pathlib import Path - from textwrap import dedent from psycopg.types.json import Jsonb from psycopg import sql as pysql @@@ -64,7 -63,6 +63,6 @@@ class ICUTokenizer(AbstractTokenizer) """ self.loader = ICURuleLoader(config) - self._install_php(config.lib_dir.php, overwrite=True) self._save_config() if init_db: @@@ -81,8 -79,6 +79,6 @@@ with connect(self.dsn) as conn: self.loader.load_config_from_db(conn) - self._install_php(config.lib_dir.php, overwrite=False) - def finalize_import(self, config: Configuration) -> None: """ Do any required postprocessing to make the tokenizer data ready @@@ -190,7 -186,6 +186,7 @@@ END) as info FROM word LEFT JOIN word_frequencies wf ON word.word_id = wf.id + ORDER BY word_id """) drop_tables(conn, 'word_frequencies') @@@ -282,22 -277,6 +278,6 @@@ return list(s[0].split('@')[0] for s in cur) - def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None: - """ Install the php script for the tokenizer. - """ - if phpdir is not None: - assert self.loader is not None - php_file = self.data_dir / "tokenizer.php" - - if not php_file.exists() or overwrite: - php_file.write_text(dedent(f"""\ - None: """ Save the configuration that needs to remain stable for the given database as database properties.