From: Sarah Hoffmann Date: Thu, 8 Aug 2024 09:11:04 +0000 (+0200) Subject: Merge remote-tracking branch 'upstream/master' X-Git-Tag: deploy~4 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/a28e158bddaed6c690c13df01bfd18a6dd647485?ds=sidebyside;hp=-c Merge remote-tracking branch 'upstream/master' --- a28e158bddaed6c690c13df01bfd18a6dd647485 diff --combined CMakeLists.txt index 7011e463,dd5c3110..83e34403 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@@ -46,8 -46,6 +46,6 @@@ set(BUILD_IMPORTER on CACHE BOOL "Buil set(BUILD_API on CACHE BOOL "Build everything for the API server") set(BUILD_MODULE off CACHE BOOL "Build PostgreSQL module for legacy tokenizer") set(BUILD_TESTS on CACHE BOOL "Build test suite") - set(BUILD_DOCS on CACHE BOOL "Build documentation") - set(BUILD_MANPAGE on CACHE BOOL "Build Manual Page") set(BUILD_OSM2PGSQL on CACHE BOOL "Build osm2pgsql (expert only)") set(INSTALL_MUNIN_PLUGINS on CACHE BOOL "Install Munin plugins for supervising Nominatim") @@@ -72,7 -70,7 +70,7 @@@ endif( # python (imports/updates only) #----------------------------------------------------------------------------- - if (BUILD_IMPORTER) + if (BUILD_IMPORTER OR BUILD_API) find_package(PythonInterp 3.7 REQUIRED) endif() @@@ -115,27 -113,6 +113,27 @@@ if (BUILD_IMPORTER ${PROJECT_BINARY_DIR}/nominatim) endif() +#----------------------------------------------------------------------------- +# Targets for running a development webserver from the build directory. +#----------------------------------------------------------------------------- + +if (BUILD_API) + set(WEBSITEFILES + 403.html + 509.html + crossdomain.xml + favicon.ico + nominatim.xml + robots.txt + taginfo.json + ) + + foreach (webfile ${WEBSITEFILES}) + configure_file(${PROJECT_SOURCE_DIR}/website/${webfile} + ${PROJECT_BINARY_DIR}/website/${webfile}) + endforeach() +endif() + #----------------------------------------------------------------------------- # Tests #----------------------------------------------------------------------------- @@@ -209,22 -186,6 +207,6 @@@ if (BUILD_MODULE add_subdirectory(module) endif() - #----------------------------------------------------------------------------- - # Documentation - #----------------------------------------------------------------------------- - - if (BUILD_DOCS) - add_subdirectory(docs) - endif() - - #----------------------------------------------------------------------------- - # Manual page - #----------------------------------------------------------------------------- - - if (BUILD_MANPAGE) - add_subdirectory(man) - endif() - #----------------------------------------------------------------------------- # Installation #----------------------------------------------------------------------------- @@@ -242,20 -203,22 +224,22 @@@ if (BUILD_IMPORTER DESTINATION ${CMAKE_INSTALL_BINDIR} RENAME nominatim) - install(DIRECTORY nominatim - DESTINATION ${NOMINATIM_LIBDIR}/lib-python - FILES_MATCHING PATTERN "*.py" - PATTERN "paths.py" EXCLUDE - PATTERN __pycache__ EXCLUDE) - if (EXISTS ${PHP_BIN}) configure_file(${PROJECT_SOURCE_DIR}/cmake/paths-py.tmpl paths-py.installed) else() configure_file(${PROJECT_SOURCE_DIR}/cmake/paths-py-no-php.tmpl paths-py.installed) endif() - install(FILES ${PROJECT_BINARY_DIR}/paths-py.installed - DESTINATION ${NOMINATIM_LIBDIR}/lib-python/nominatim - RENAME paths.py) + + foreach (submodule nominatim_db nominatim_api) + install(DIRECTORY src/${submodule} + DESTINATION ${NOMINATIM_LIBDIR}/lib-python + FILES_MATCHING PATTERN "*.py" + PATTERN 
"paths.py" EXCLUDE + PATTERN __pycache__ EXCLUDE) + install(FILES ${PROJECT_BINARY_DIR}/paths-py.installed + DESTINATION ${NOMINATIM_LIBDIR}/lib-python/${submodule} + RENAME paths.py) + endforeach() install(DIRECTORY lib-sql DESTINATION ${NOMINATIM_LIBDIR}) diff --combined src/nominatim_api/search/icu_tokenizer.py index f6590f5b,971e95be..7bd2b092 --- a/src/nominatim_api/search/icu_tokenizer.py +++ b/src/nominatim_api/search/icu_tokenizer.py @@@ -2,7 -2,7 +2,7 @@@ # # This file is part of Nominatim. (https://nominatim.org) # - # Copyright (C) 2023 by the Nominatim developer community. + # Copyright (C) 2024 by the Nominatim developer community. # For a full list of authors see the git log. """ Implementation of query analysis for the ICU tokenizer. @@@ -16,12 -16,12 +16,12 @@@ from icu import Transliterato import sqlalchemy as sa - from nominatim.typing import SaRow - from nominatim.api.connection import SearchConnection - from nominatim.api.logging import log - from nominatim.api.search import query as qmod - from nominatim.api.search.query_analyzer_factory import AbstractQueryAnalyzer - from nominatim.db.sqlalchemy_types import Json + from ..typing import SaRow + from ..sql.sqlalchemy_types import Json + from ..connection import SearchConnection + from ..logging import log + from ..search import query as qmod + from ..search.query_analyzer_factory import AbstractQueryAnalyzer DB_TO_TOKEN_TYPE = { @@@ -208,12 -208,7 +208,12 @@@ class ICUQueryAnalyzer(AbstractQueryAna standardized form search will work with. All information removed at this stage is inevitably lost. """ - return cast(str, self.normalizer.transliterate(text)) + norm = cast(str, self.normalizer.transliterate(text)) + numspaces = norm.count(' ') + if numspaces > 4 and len(norm) <= (numspaces + 1) * 3: + return '' + + return norm def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]: diff --combined src/nominatim_db/tokenizer/icu_tokenizer.py index 70273b90,7cd96d59..4eee2c73 --- a/src/nominatim_db/tokenizer/icu_tokenizer.py +++ b/src/nominatim_db/tokenizer/icu_tokenizer.py @@@ -1,8 -1,8 +1,8 @@@ - # SPDX-License-Identifier: GPL-2.0-only + # SPDX-License-Identifier: GPL-3.0-or-later # # This file is part of Nominatim. (https://nominatim.org) # - # Copyright (C) 2022 by the Nominatim developer community. + # Copyright (C) 2024 by the Nominatim developer community. # For a full list of authors see the git log. 
""" Tokenizer implementing normalisation as used before Nominatim 4 but using @@@ -11,21 -11,23 +11,23 @@@ libICU instead of the PostgreSQL module from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \ Dict, Set, Iterable import itertools - import json import logging from pathlib import Path from textwrap import dedent - from nominatim.db.connection import connect, Connection, Cursor - from nominatim.config import Configuration - from nominatim.db.utils import CopyBuffer - from nominatim.db.sql_preprocessor import SQLPreprocessor - from nominatim.data.place_info import PlaceInfo - from nominatim.tokenizer.icu_rule_loader import ICURuleLoader - from nominatim.tokenizer.place_sanitizer import PlaceSanitizer - from nominatim.data.place_name import PlaceName - from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis - from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer + from psycopg.types.json import Jsonb + from psycopg import sql as pysql + + from ..db.connection import connect, Connection, Cursor, server_version_tuple,\ + drop_tables, table_exists, execute_scalar + from ..config import Configuration + from ..db.sql_preprocessor import SQLPreprocessor + from ..data.place_info import PlaceInfo + from ..data.place_name import PlaceName + from .icu_rule_loader import ICURuleLoader + from .place_sanitizer import PlaceSanitizer + from .icu_token_analysis import ICUTokenAnalysis + from .base import AbstractAnalyzer, AbstractTokenizer DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization" @@@ -108,19 -110,18 +110,18 @@@ class ICUTokenizer(AbstractTokenizer) """ Recompute frequencies for all name words. """ with connect(self.dsn) as conn: - if not conn.table_exists('search_name'): + if not table_exists(conn, 'search_name'): return with conn.cursor() as cur: cur.execute('ANALYSE search_name') if threads > 1: - cur.execute('SET max_parallel_workers_per_gather TO %s', - (min(threads, 6),)) + cur.execute(pysql.SQL('SET max_parallel_workers_per_gather TO {}') + .format(pysql.Literal(min(threads, 6),))) - if conn.server_version_tuple() < (12, 0): + if server_version_tuple(conn) < (12, 0): LOG.info('Computing word frequencies') - cur.drop_table('word_frequencies') - cur.drop_table('addressword_frequencies') + drop_tables(conn, 'word_frequencies', 'addressword_frequencies') cur.execute("""CREATE TEMP TABLE word_frequencies AS SELECT unnest(name_vector) as id, count(*) FROM search_name GROUP BY id""") @@@ -152,19 -153,16 +153,16 @@@ $$ LANGUAGE plpgsql IMMUTABLE; """) LOG.info('Update word table with recomputed frequencies') - cur.drop_table('tmp_word') + drop_tables(conn, 'tmp_word') cur.execute("""CREATE TABLE tmp_word AS SELECT word_id, word_token, type, word, word_freq_update(word_id, info) as info FROM word """) - cur.drop_table('word_frequencies') - cur.drop_table('addressword_frequencies') + drop_tables(conn, 'word_frequencies', 'addressword_frequencies') else: LOG.info('Computing word frequencies') - cur.drop_table('word_frequencies') - cur.execute('ANALYSE search_name') - cur.execute('ANALYSE word') + drop_tables(conn, 'word_frequencies') cur.execute(""" CREATE TEMP TABLE word_frequencies AS WITH word_freq AS MATERIALIZED ( @@@ -184,7 -182,7 +182,7 @@@ cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)') cur.execute('ANALYSE word_frequencies') LOG.info('Update word table with recomputed frequencies') - cur.drop_table('tmp_word') + drop_tables(conn, 'tmp_word') cur.execute("""CREATE TABLE tmp_word AS SELECT word_id, 
word_token, type, word, (CASE WHEN wf.info is null THEN word.info @@@ -192,9 -190,8 +190,9 @@@ END) as info FROM word LEFT JOIN word_frequencies wf ON word.word_id = wf.id + ORDER BY word_id """) - cur.drop_table('word_frequencies') + drop_tables(conn, 'word_frequencies') with conn.cursor() as cur: cur.execute('SET max_parallel_workers_per_gather TO 0') @@@ -213,7 -210,7 +211,7 @@@ """ Remove unused house numbers. """ with connect(self.dsn) as conn: - if not conn.table_exists('search_name'): + if not table_exists(conn, 'search_name'): return with conn.cursor(name="hnr_counter") as cur: cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token) @@@ -314,8 -311,7 +312,7 @@@ frequencies. """ with connect(self.dsn) as conn: - with conn.cursor() as cur: - cur.drop_table('word') + drop_tables(conn, 'word') sqlp = SQLPreprocessor(conn, config) sqlp.run_string(conn, """ CREATE TABLE word ( @@@ -373,8 -369,8 +370,8 @@@ """ Rename all tables and indexes used by the tokenizer. """ with connect(self.dsn) as conn: + drop_tables(conn, 'word') with conn.cursor() as cur: - cur.drop_table('word') cur.execute(f"ALTER TABLE {old} RENAME TO word") for idx in ('word_token', 'word_id'): cur.execute(f"""ALTER INDEX idx_{old}_{idx} @@@ -396,7 -392,7 +393,7 @@@ class ICUNameAnalyzer(AbstractAnalyzer) def __init__(self, dsn: str, sanitizer: PlaceSanitizer, token_analysis: ICUTokenAnalysis) -> None: - self.conn: Optional[Connection] = connect(dsn).connection + self.conn: Optional[Connection] = connect(dsn) self.conn.autocommit = True self.sanitizer = sanitizer self.token_analysis = token_analysis @@@ -538,9 -534,7 +535,7 @@@ if terms: with self.conn.cursor() as cur: - cur.execute_values("""SELECT create_postcode_word(pc, var) - FROM (VALUES %s) AS v(pc, var)""", - terms) + cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms) @@@ -583,18 -577,15 +578,15 @@@ to_add = new_phrases - existing_phrases added = 0 - with CopyBuffer() as copystr: + with cursor.copy('COPY word(word_token, type, word, info) FROM STDIN') as copy: for word, cls, typ, oper in to_add: term = self._search_normalized(word) if term: - copystr.add(term, 'S', word, - json.dumps({'class': cls, 'type': typ, - 'op': oper if oper in ('in', 'near') else None})) + copy.write_row((term, 'S', word, + Jsonb({'class': cls, 'type': typ, + 'op': oper if oper in ('in', 'near') else None}))) added += 1 - copystr.copy_out(cursor, 'word', - columns=['word_token', 'type', 'word', 'info']) - return added @@@ -607,11 -598,11 +599,11 @@@ to_delete = existing_phrases - new_phrases if to_delete: - cursor.execute_values( - """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op) - WHERE type = 'S' and word = name - and info->>'class' = in_class and info->>'type' = in_type - and ((op = '-' and info->>'op' is null) or op = info->>'op') + cursor.executemany( + """ DELETE FROM word + WHERE type = 'S' and word = %s + and info->>'class' = %s and info->>'type' = %s + and %s = coalesce(info->>'op', '-') """, to_delete) return len(to_delete) @@@ -658,7 -649,7 +650,7 @@@ gone_tokens.update(existing_tokens[False] & word_tokens) if gone_tokens: cur.execute("""DELETE FROM word - USING unnest(%s) as token + USING unnest(%s::text[]) as token WHERE type = 'C' and word = %s and word_token = token""", (list(gone_tokens), country_code)) @@@ -671,12 -662,12 +663,12 @@@ if internal: sql = """INSERT INTO word (word_token, type, word, info) (SELECT token, 'C', %s, '{"internal": "yes"}' - FROM unnest(%s) as token) + FROM unnest(%s::text[]) as token) 
""" else: sql = """INSERT INTO word (word_token, type, word) (SELECT token, 'C', %s - FROM unnest(%s) as token) + FROM unnest(%s::text[]) as token) """ cur.execute(sql, (country_code, list(new_tokens))) @@@ -736,11 -727,10 +728,10 @@@ if norm_name: result = self._cache.housenumbers.get(norm_name, result) if result[0] is None: - with self.conn.cursor() as cur: - hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, )) + hid = execute_scalar(self.conn, "SELECT getorcreate_hnr_id(%s)", (norm_name, )) - result = hid, norm_name - self._cache.housenumbers[norm_name] = result + result = hid, norm_name + self._cache.housenumbers[norm_name] = result else: # Otherwise use the analyzer to determine the canonical name. # Per convention we use the first variant as the 'lookup name', the @@@ -751,11 -741,10 +742,10 @@@ if result[0] is None: variants = analyzer.compute_variants(word_id) if variants: - with self.conn.cursor() as cur: - hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)", + hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)", (word_id, list(variants))) - result = hid, variants[0] - self._cache.housenumbers[word_id] = result + result = hid, variants[0] + self._cache.housenumbers[word_id] = result return result