set(BUILD_API on CACHE BOOL "Build everything for the API server")
set(BUILD_MODULE off CACHE BOOL "Build PostgreSQL module for legacy tokenizer")
set(BUILD_TESTS on CACHE BOOL "Build test suite")
- set(BUILD_DOCS on CACHE BOOL "Build documentation")
- set(BUILD_MANPAGE on CACHE BOOL "Build Manual Page")
set(BUILD_OSM2PGSQL on CACHE BOOL "Build osm2pgsql (expert only)")
set(INSTALL_MUNIN_PLUGINS on CACHE BOOL "Install Munin plugins for supervising Nominatim")
- # python (imports/updates only)
+ # python (importer and API frontend)
#-----------------------------------------------------------------------------
- if (BUILD_IMPORTER)
+ if (BUILD_IMPORTER OR BUILD_API)
find_package(PythonInterp 3.7 REQUIRED)
endif()
${PROJECT_BINARY_DIR}/nominatim)
endif()
+#-----------------------------------------------------------------------------
+# Targets for running a development webserver from the build directory.
+#-----------------------------------------------------------------------------
+
+if (BUILD_API)
+ set(WEBSITEFILES
+ 403.html
+ 509.html
+ crossdomain.xml
+ favicon.ico
+ nominatim.xml
+ robots.txt
+ taginfo.json
+ )
+
+ foreach (webfile ${WEBSITEFILES})
+ configure_file(${PROJECT_SOURCE_DIR}/website/${webfile}
+ ${PROJECT_BINARY_DIR}/website/${webfile})
+ endforeach()
+endif()
+
#-----------------------------------------------------------------------------
# Tests
#-----------------------------------------------------------------------------
add_subdirectory(module)
endif()
- #-----------------------------------------------------------------------------
- # Documentation
- #-----------------------------------------------------------------------------
-
- if (BUILD_DOCS)
- add_subdirectory(docs)
- endif()
-
- #-----------------------------------------------------------------------------
- # Manual page
- #-----------------------------------------------------------------------------
-
- if (BUILD_MANPAGE)
- add_subdirectory(man)
- endif()
-
#-----------------------------------------------------------------------------
# Installation
#-----------------------------------------------------------------------------
DESTINATION ${CMAKE_INSTALL_BINDIR}
RENAME nominatim)
- install(DIRECTORY nominatim
- DESTINATION ${NOMINATIM_LIBDIR}/lib-python
- FILES_MATCHING PATTERN "*.py"
- PATTERN "paths.py" EXCLUDE
- PATTERN __pycache__ EXCLUDE)
-
if (EXISTS ${PHP_BIN})
configure_file(${PROJECT_SOURCE_DIR}/cmake/paths-py.tmpl paths-py.installed)
else()
configure_file(${PROJECT_SOURCE_DIR}/cmake/paths-py-no-php.tmpl paths-py.installed)
endif()
- install(FILES ${PROJECT_BINARY_DIR}/paths-py.installed
- DESTINATION ${NOMINATIM_LIBDIR}/lib-python/nominatim
- RENAME paths.py)
+
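+ # Install the nominatim_db and nominatim_api packages and place the generated paths.py in each.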
+ foreach (submodule nominatim_db nominatim_api)
+ install(DIRECTORY src/${submodule}
+ DESTINATION ${NOMINATIM_LIBDIR}/lib-python
+ FILES_MATCHING PATTERN "*.py"
+ PATTERN "paths.py" EXCLUDE
+ PATTERN __pycache__ EXCLUDE)
+ install(FILES ${PROJECT_BINARY_DIR}/paths-py.installed
+ DESTINATION ${NOMINATIM_LIBDIR}/lib-python/${submodule}
+ RENAME paths.py)
+ endforeach()
install(DIRECTORY lib-sql DESTINATION ${NOMINATIM_LIBDIR})
- # SPDX-License-Identifier: GPL-2.0-only
+ # SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
- # Copyright (C) 2022 by the Nominatim developer community.
+ # Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
Dict, Set, Iterable
import itertools
- import json
import logging
from pathlib import Path
from textwrap import dedent
- from nominatim.db.connection import connect, Connection, Cursor
- from nominatim.config import Configuration
- from nominatim.db.utils import CopyBuffer
- from nominatim.db.sql_preprocessor import SQLPreprocessor
- from nominatim.data.place_info import PlaceInfo
- from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
- from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
- from nominatim.data.place_name import PlaceName
- from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
- from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
+ from psycopg.types.json import Jsonb
+ from psycopg import sql as pysql
+
+ from ..db.connection import connect, Connection, Cursor, server_version_tuple,\
+ drop_tables, table_exists, execute_scalar
+ from ..config import Configuration
+ from ..db.sql_preprocessor import SQLPreprocessor
+ from ..data.place_info import PlaceInfo
+ from ..data.place_name import PlaceName
+ from .icu_rule_loader import ICURuleLoader
+ from .place_sanitizer import PlaceSanitizer
+ from .icu_token_analysis import ICUTokenAnalysis
+ from .base import AbstractAnalyzer, AbstractTokenizer
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
""" Recompute frequencies for all name words.
"""
with connect(self.dsn) as conn:
- if not conn.table_exists('search_name'):
+ if not table_exists(conn, 'search_name'):
return
with conn.cursor() as cur:
cur.execute('ANALYSE search_name')
if threads > 1:
- cur.execute('SET max_parallel_workers_per_gather TO %s',
- (min(threads, 6),))
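+ # psycopg 3 binds parameters server-side and SET does not accept bound
+ # parameters, so the value is spliced in safely via the sql composition helpers.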
+ cur.execute(pysql.SQL('SET max_parallel_workers_per_gather TO {}')
+ .format(pysql.Literal(min(threads, 6),)))
- if conn.server_version_tuple() < (12, 0):
+ if server_version_tuple(conn) < (12, 0):
LOG.info('Computing word frequencies')
- cur.drop_table('word_frequencies')
- cur.drop_table('addressword_frequencies')
+ drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
cur.execute("""CREATE TEMP TABLE word_frequencies AS
SELECT unnest(name_vector) as id, count(*)
FROM search_name GROUP BY id""")
$$ LANGUAGE plpgsql IMMUTABLE;
""")
LOG.info('Update word table with recomputed frequencies')
- cur.drop_table('tmp_word')
+ drop_tables(conn, 'tmp_word')
cur.execute("""CREATE TABLE tmp_word AS
SELECT word_id, word_token, type, word,
word_freq_update(word_id, info) as info
FROM word
""")
- cur.drop_table('word_frequencies')
- cur.drop_table('addressword_frequencies')
+ drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
else:
LOG.info('Computing word frequencies')
- cur.drop_table('word_frequencies')
- cur.execute('ANALYSE search_name')
- cur.execute('ANALYSE word')
+ drop_tables(conn, 'word_frequencies')
cur.execute("""
CREATE TEMP TABLE word_frequencies AS
WITH word_freq AS MATERIALIZED (
cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
cur.execute('ANALYSE word_frequencies')
LOG.info('Update word table with recomputed frequencies')
- cur.drop_table('tmp_word')
+ drop_tables(conn, 'tmp_word')
cur.execute("""CREATE TABLE tmp_word AS
SELECT word_id, word_token, type, word,
(CASE WHEN wf.info is null THEN word.info
END) as info
FROM word LEFT JOIN word_frequencies wf
ON word.word_id = wf.id
+ ORDER BY word_id
""")
- cur.drop_table('word_frequencies')
+ drop_tables(conn, 'word_frequencies')
with conn.cursor() as cur:
cur.execute('SET max_parallel_workers_per_gather TO 0')
""" Remove unused house numbers.
"""
with connect(self.dsn) as conn:
- if not conn.table_exists('search_name'):
+ if not table_exists(conn, 'search_name'):
return
with conn.cursor(name="hnr_counter") as cur:
cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
frequencies.
"""
with connect(self.dsn) as conn:
- with conn.cursor() as cur:
- cur.drop_table('word')
+ drop_tables(conn, 'word')
sqlp = SQLPreprocessor(conn, config)
sqlp.run_string(conn, """
CREATE TABLE word (
""" Rename all tables and indexes used by the tokenizer.
"""
with connect(self.dsn) as conn:
+ drop_tables(conn, 'word')
with conn.cursor() as cur:
- cur.drop_table('word')
cur.execute(f"ALTER TABLE {old} RENAME TO word")
for idx in ('word_token', 'word_id'):
cur.execute(f"""ALTER INDEX idx_{old}_{idx}
def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
token_analysis: ICUTokenAnalysis) -> None:
- self.conn: Optional[Connection] = connect(dsn).connection
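+ # connect() now returns the psycopg Connection directly, so the .connection indirection is gone.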
+ self.conn: Optional[Connection] = connect(dsn)
self.conn.autocommit = True
self.sanitizer = sanitizer
self.token_analysis = token_analysis
if terms:
with self.conn.cursor() as cur:
- cur.execute_values("""SELECT create_postcode_word(pc, var)
- FROM (VALUES %s) AS v(pc, var)""",
- terms)
+ cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)
to_add = new_phrases - existing_phrases
added = 0
- with CopyBuffer() as copystr:
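+ # Stream the new phrase rows into the word table via COPY, wrapping the info payload as Jsonb.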
+ with cursor.copy('COPY word(word_token, type, word, info) FROM STDIN') as copy:
for word, cls, typ, oper in to_add:
term = self._search_normalized(word)
if term:
- copystr.add(term, 'S', word,
- json.dumps({'class': cls, 'type': typ,
- 'op': oper if oper in ('in', 'near') else None}))
+ copy.write_row((term, 'S', word,
+ Jsonb({'class': cls, 'type': typ,
+ 'op': oper if oper in ('in', 'near') else None})))
added += 1
- copystr.copy_out(cursor, 'word',
- columns=['word_token', 'type', 'word', 'info'])
-
return added
to_delete = existing_phrases - new_phrases
if to_delete:
- cursor.execute_values(
- """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
- WHERE type = 'S' and word = name
- and info->>'class' = in_class and info->>'type' = in_type
- and ((op = '-' and info->>'op' is null) or op = info->>'op')
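+ # executemany() runs the DELETE once per phrase tuple; '-' stands in for phrases without an operator.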
+ cursor.executemany(
+ """ DELETE FROM word
+ WHERE type = 'S' and word = %s
+ and info->>'class' = %s and info->>'type' = %s
+ and %s = coalesce(info->>'op', '-')
""", to_delete)
return len(to_delete)
gone_tokens.update(existing_tokens[False] & word_tokens)
if gone_tokens:
cur.execute("""DELETE FROM word
- USING unnest(%s) as token
+ USING unnest(%s::text[]) as token
WHERE type = 'C' and word = %s
and word_token = token""",
(list(gone_tokens), country_code))
if internal:
sql = """INSERT INTO word (word_token, type, word, info)
(SELECT token, 'C', %s, '{"internal": "yes"}'
- FROM unnest(%s) as token)
+ FROM unnest(%s::text[]) as token)
"""
else:
sql = """INSERT INTO word (word_token, type, word)
(SELECT token, 'C', %s
- FROM unnest(%s) as token)
+ FROM unnest(%s::text[]) as token)
"""
cur.execute(sql, (country_code, list(new_tokens)))
if norm_name:
result = self._cache.housenumbers.get(norm_name, result)
if result[0] is None:
- with self.conn.cursor() as cur:
- hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
+ hid = execute_scalar(self.conn, "SELECT getorcreate_hnr_id(%s)", (norm_name, ))
- result = hid, norm_name
- self._cache.housenumbers[norm_name] = result
+ result = hid, norm_name
+ self._cache.housenumbers[norm_name] = result
else:
# Otherwise use the analyzer to determine the canonical name.
# Per convention we use the first variant as the 'lookup name', the
if result[0] is None:
variants = analyzer.compute_variants(word_id)
if variants:
- with self.conn.cursor() as cur:
- hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
+ hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)",
(word_id, list(variants)))
- result = hid, variants[0]
- self._cache.housenumbers[word_id] = result
+ result = hid, variants[0]
+ self._cache.housenumbers[word_id] = result
return result