git.openstreetmap.org Git - nominatim.git/commitdiff
Merge remote-tracking branch 'upstream/master'
author Sarah Hoffmann <lonvia@denofr.de>
Thu, 8 Aug 2024 09:11:04 +0000 (11:11 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Thu, 8 Aug 2024 09:11:04 +0000 (11:11 +0200)
CMakeLists.txt
src/nominatim_api/search/icu_tokenizer.py
src/nominatim_db/tokenizer/icu_tokenizer.py

diff --combined CMakeLists.txt
index 7011e463779a986f50967b17808e80d37e91ec6d,dd5c3110e3d5e89b750281220a5ce9745bf8371f..83e3440338d5fdd2a91385bf381bee9d13587a13
@@@ -46,8 -46,6 +46,6 @@@ set(BUILD_IMPORTER on CACHE BOOL "Buil
  set(BUILD_API on CACHE BOOL "Build everything for the API server")
  set(BUILD_MODULE off CACHE BOOL "Build PostgreSQL module for legacy tokenizer")
  set(BUILD_TESTS on CACHE BOOL "Build test suite")
- set(BUILD_DOCS on CACHE BOOL "Build documentation")
- set(BUILD_MANPAGE on CACHE BOOL "Build Manual Page")
  set(BUILD_OSM2PGSQL on CACHE BOOL "Build osm2pgsql (expert only)")
  set(INSTALL_MUNIN_PLUGINS on CACHE BOOL "Install Munin plugins for supervising Nominatim")
  
@@@ -72,7 -70,7 +70,7 @@@ endif(
  #  python (imports/updates only)
  #-----------------------------------------------------------------------------
  
- if (BUILD_IMPORTER)
+ if (BUILD_IMPORTER OR BUILD_API)
      find_package(PythonInterp 3.7 REQUIRED)
  endif()
  
@@@ -115,27 -113,6 +113,27 @@@ if (BUILD_IMPORTER
                    ${PROJECT_BINARY_DIR}/nominatim)
  endif()
  
 +#-----------------------------------------------------------------------------
 +# Targets for running a development webserver from the build directory.
 +#-----------------------------------------------------------------------------
 +
 +if (BUILD_API)
 +   set(WEBSITEFILES
 +       403.html
 +       509.html
 +       crossdomain.xml
 +       favicon.ico
 +       nominatim.xml
 +       robots.txt
 +       taginfo.json
 +   )
 +
 +   foreach (webfile ${WEBSITEFILES})
 +       configure_file(${PROJECT_SOURCE_DIR}/website/${webfile}
 +                      ${PROJECT_BINARY_DIR}/website/${webfile})
 +   endforeach()
 +endif()
 +
  #-----------------------------------------------------------------------------
  # Tests
  #-----------------------------------------------------------------------------
@@@ -209,22 -186,6 +207,6 @@@ if (BUILD_MODULE
      add_subdirectory(module)
  endif()
  
- #-----------------------------------------------------------------------------
- # Documentation
- #-----------------------------------------------------------------------------
- if (BUILD_DOCS)
-    add_subdirectory(docs)
- endif()
- #-----------------------------------------------------------------------------
- # Manual page
- #-----------------------------------------------------------------------------
- if (BUILD_MANPAGE)
-    add_subdirectory(man)
- endif()
  #-----------------------------------------------------------------------------
  # Installation
  #-----------------------------------------------------------------------------
@@@ -242,20 -203,22 +224,22 @@@ if (BUILD_IMPORTER
              DESTINATION ${CMAKE_INSTALL_BINDIR}
              RENAME nominatim)
  
-     install(DIRECTORY nominatim
-             DESTINATION ${NOMINATIM_LIBDIR}/lib-python
-             FILES_MATCHING PATTERN "*.py"
-             PATTERN "paths.py" EXCLUDE
-             PATTERN __pycache__ EXCLUDE)
      if (EXISTS ${PHP_BIN})
          configure_file(${PROJECT_SOURCE_DIR}/cmake/paths-py.tmpl paths-py.installed)
      else()
          configure_file(${PROJECT_SOURCE_DIR}/cmake/paths-py-no-php.tmpl paths-py.installed)
      endif()
-     install(FILES ${PROJECT_BINARY_DIR}/paths-py.installed
-             DESTINATION ${NOMINATIM_LIBDIR}/lib-python/nominatim
-             RENAME paths.py)
+     foreach (submodule nominatim_db nominatim_api)
+         install(DIRECTORY src/${submodule}
+                 DESTINATION ${NOMINATIM_LIBDIR}/lib-python
+                 FILES_MATCHING PATTERN "*.py"
+                 PATTERN "paths.py" EXCLUDE
+                 PATTERN __pycache__ EXCLUDE)
+         install(FILES ${PROJECT_BINARY_DIR}/paths-py.installed
+                 DESTINATION ${NOMINATIM_LIBDIR}/lib-python/${submodule}
+                 RENAME paths.py)
+     endforeach()
  
      install(DIRECTORY lib-sql DESTINATION ${NOMINATIM_LIBDIR})
  
diff --combined src/nominatim_api/search/icu_tokenizer.py
index f6590f5b36f87f0ac81f80dd5c1ef7a12fb8a726,971e95beec1a6935b58e7c9cc4879d9797a73f1b..7bd2b09259d70facf28c5505f2c923f5e7aab46e
@@@ -2,7 -2,7 +2,7 @@@
  #
  # This file is part of Nominatim. (https://nominatim.org)
  #
- # Copyright (C) 2023 by the Nominatim developer community.
+ # Copyright (C) 2024 by the Nominatim developer community.
  # For a full list of authors see the git log.
  """
  Implementation of query analysis for the ICU tokenizer.
@@@ -16,12 -16,12 +16,12 @@@ from icu import Transliterato
  
  import sqlalchemy as sa
  
- from nominatim.typing import SaRow
- from nominatim.api.connection import SearchConnection
- from nominatim.api.logging import log
- from nominatim.api.search import query as qmod
- from nominatim.api.search.query_analyzer_factory import AbstractQueryAnalyzer
- from nominatim.db.sqlalchemy_types import Json
+ from ..typing import SaRow
+ from ..sql.sqlalchemy_types import Json
+ from ..connection import SearchConnection
+ from ..logging import log
+ from ..search import query as qmod
+ from ..search.query_analyzer_factory import AbstractQueryAnalyzer
  
  
  DB_TO_TOKEN_TYPE = {
@@@ -208,12 -208,7 +208,12 @@@ class ICUQueryAnalyzer(AbstractQueryAna
              standardized form search will work with. All information removed
              at this stage is inevitably lost.
          """
 -        return cast(str, self.normalizer.transliterate(text))
 +        norm = cast(str, self.normalizer.transliterate(text))
 +        numspaces = norm.count(' ')
 +        if numspaces > 4 and len(norm) <= (numspaces + 1) * 3:
 +            return ''
 +
 +        return norm
  
  
      def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
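
Note on the normalize() hunk above: the API tokenizer now rejects queries made
up of many very short tokens by returning an empty normalized string instead of
handing them on to the search. A minimal sketch of the check, restated outside
the class purely for illustration (the helper name is made up here; the
threshold values are the ones from the diff):

    def looks_like_noise(norm: str) -> bool:
        # More than four spaces and tokens averaging roughly two characters,
        # e.g. 'a b c d e f', are treated as unsearchable noise.
        numspaces = norm.count(' ')
        return numspaces > 4 and len(norm) <= (numspaces + 1) * 3

    assert looks_like_noise('a b c d e f')             # six one-letter tokens
    assert not looks_like_noise('main street berlin')  # normal query is kept
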
diff --combined src/nominatim_db/tokenizer/icu_tokenizer.py
index 70273b90e0af59e43a6a58108ffb427b31b5a654,7cd96d591fcec97483a721542829f4da90e52430..4eee2c73b0c9a02001c8c0e49b9b88ab6da36c3b
@@@ -1,8 -1,8 +1,8 @@@
- # SPDX-License-Identifier: GPL-2.0-only
+ # SPDX-License-Identifier: GPL-3.0-or-later
  #
  # This file is part of Nominatim. (https://nominatim.org)
  #
- # Copyright (C) 2022 by the Nominatim developer community.
+ # Copyright (C) 2024 by the Nominatim developer community.
  # For a full list of authors see the git log.
  """
  Tokenizer implementing normalisation as used before Nominatim 4 but using
@@@ -11,21 -11,23 +11,23 @@@ libICU instead of the PostgreSQL module
  from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
                     Dict, Set, Iterable
  import itertools
- import json
  import logging
  from pathlib import Path
  from textwrap import dedent
  
- from nominatim.db.connection import connect, Connection, Cursor
- from nominatim.config import Configuration
- from nominatim.db.utils import CopyBuffer
- from nominatim.db.sql_preprocessor import SQLPreprocessor
- from nominatim.data.place_info import PlaceInfo
- from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
- from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
- from nominatim.data.place_name import PlaceName
- from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
- from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
+ from psycopg.types.json import Jsonb
+ from psycopg import sql as pysql
+ from ..db.connection import connect, Connection, Cursor, server_version_tuple,\
+                             drop_tables, table_exists, execute_scalar
+ from ..config import Configuration
+ from ..db.sql_preprocessor import SQLPreprocessor
+ from ..data.place_info import PlaceInfo
+ from ..data.place_name import PlaceName
+ from .icu_rule_loader import ICURuleLoader
+ from .place_sanitizer import PlaceSanitizer
+ from .icu_token_analysis import ICUTokenAnalysis
+ from .base import AbstractAnalyzer, AbstractTokenizer
  
  DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
  
@@@ -108,19 -110,18 +110,18 @@@ class ICUTokenizer(AbstractTokenizer)
          """ Recompute frequencies for all name words.
          """
          with connect(self.dsn) as conn:
-             if not conn.table_exists('search_name'):
+             if not table_exists(conn, 'search_name'):
                  return
  
              with conn.cursor() as cur:
                  cur.execute('ANALYSE search_name')
                  if threads > 1:
-                     cur.execute('SET max_parallel_workers_per_gather TO %s',
-                                 (min(threads, 6),))
+                     cur.execute(pysql.SQL('SET max_parallel_workers_per_gather TO {}')
+                                      .format(pysql.Literal(min(threads, 6),)))
  
-                 if conn.server_version_tuple() < (12, 0):
+                 if server_version_tuple(conn) < (12, 0):
                      LOG.info('Computing word frequencies')
-                     cur.drop_table('word_frequencies')
-                     cur.drop_table('addressword_frequencies')
+                     drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
                      cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                       SELECT unnest(name_vector) as id, count(*)
                                       FROM search_name GROUP BY id""")
                                     $$ LANGUAGE plpgsql IMMUTABLE;
                                  """)
                      LOG.info('Update word table with recomputed frequencies')
-                     cur.drop_table('tmp_word')
+                     drop_tables(conn, 'tmp_word')
                      cur.execute("""CREATE TABLE tmp_word AS
                                      SELECT word_id, word_token, type, word,
                                             word_freq_update(word_id, info) as info
                                      FROM word
                                  """)
-                     cur.drop_table('word_frequencies')
-                     cur.drop_table('addressword_frequencies')
+                     drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
                  else:
                      LOG.info('Computing word frequencies')
-                     cur.drop_table('word_frequencies')
-                     cur.execute('ANALYSE search_name')
-                     cur.execute('ANALYSE word')
+                     drop_tables(conn, 'word_frequencies')
                      cur.execute("""
                        CREATE TEMP TABLE word_frequencies AS
                        WITH word_freq AS MATERIALIZED (
                      cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
                      cur.execute('ANALYSE word_frequencies')
                      LOG.info('Update word table with recomputed frequencies')
-                     cur.drop_table('tmp_word')
+                     drop_tables(conn, 'tmp_word')
                      cur.execute("""CREATE TABLE tmp_word AS
                                      SELECT word_id, word_token, type, word,
                                             (CASE WHEN wf.info is null THEN word.info
                                              END) as info
                                      FROM word LEFT JOIN word_frequencies wf
                                           ON word.word_id = wf.id
 +                                    ORDER BY word_id
                                  """)
-                     cur.drop_table('word_frequencies')
+                     drop_tables(conn, 'word_frequencies')
  
              with conn.cursor() as cur:
                  cur.execute('SET max_parallel_workers_per_gather TO 0')
          """ Remove unused house numbers.
          """
          with connect(self.dsn) as conn:
-             if not conn.table_exists('search_name'):
+             if not table_exists(conn, 'search_name'):
                  return
              with conn.cursor(name="hnr_counter") as cur:
                  cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
              frequencies.
          """
          with connect(self.dsn) as conn:
-             with conn.cursor() as cur:
-                 cur.drop_table('word')
+             drop_tables(conn, 'word')
              sqlp = SQLPreprocessor(conn, config)
              sqlp.run_string(conn, """
                  CREATE TABLE word (
          """ Rename all tables and indexes used by the tokenizer.
          """
          with connect(self.dsn) as conn:
+             drop_tables(conn, 'word')
              with conn.cursor() as cur:
-                 cur.drop_table('word')
                  cur.execute(f"ALTER TABLE {old} RENAME TO word")
                  for idx in ('word_token', 'word_id'):
                      cur.execute(f"""ALTER INDEX idx_{old}_{idx}
@@@ -396,7 -392,7 +393,7 @@@ class ICUNameAnalyzer(AbstractAnalyzer)
  
      def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
                   token_analysis: ICUTokenAnalysis) -> None:
-         self.conn: Optional[Connection] = connect(dsn).connection
+         self.conn: Optional[Connection] = connect(dsn)
          self.conn.autocommit = True
          self.sanitizer = sanitizer
          self.token_analysis = token_analysis
  
          if terms:
              with self.conn.cursor() as cur:
-                 cur.execute_values("""SELECT create_postcode_word(pc, var)
-                                       FROM (VALUES %s) AS v(pc, var)""",
-                                    terms)
+                 cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)
  
  
  
          to_add = new_phrases - existing_phrases
  
          added = 0
-         with CopyBuffer() as copystr:
+         with cursor.copy('COPY word(word_token, type, word, info) FROM STDIN') as copy:
              for word, cls, typ, oper in to_add:
                  term = self._search_normalized(word)
                  if term:
-                     copystr.add(term, 'S', word,
-                                 json.dumps({'class': cls, 'type': typ,
-                                             'op': oper if oper in ('in', 'near') else None}))
+                     copy.write_row((term, 'S', word,
+                                     Jsonb({'class': cls, 'type': typ,
+                                            'op': oper if oper in ('in', 'near') else None})))
                      added += 1
  
-             copystr.copy_out(cursor, 'word',
-                              columns=['word_token', 'type', 'word', 'info'])
          return added
  
  
          to_delete = existing_phrases - new_phrases
  
          if to_delete:
-             cursor.execute_values(
-                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
-                     WHERE type = 'S' and word = name
-                           and info->>'class' = in_class and info->>'type' = in_type
-                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
+             cursor.executemany(
+                 """ DELETE FROM word
+                       WHERE type = 'S' and word = %s
+                             and info->>'class' = %s and info->>'type' = %s
+                             and %s = coalesce(info->>'op', '-')
                  """, to_delete)
  
          return len(to_delete)
                  gone_tokens.update(existing_tokens[False] & word_tokens)
              if gone_tokens:
                  cur.execute("""DELETE FROM word
-                                USING unnest(%s) as token
+                                USING unnest(%s::text[]) as token
                                 WHERE type = 'C' and word = %s
                                       and word_token = token""",
                              (list(gone_tokens), country_code))
                  if internal:
                      sql = """INSERT INTO word (word_token, type, word, info)
                                 (SELECT token, 'C', %s, '{"internal": "yes"}'
-                                   FROM unnest(%s) as token)
+                                   FROM unnest(%s::text[]) as token)
                             """
                  else:
                      sql = """INSERT INTO word (word_token, type, word)
                                     (SELECT token, 'C', %s
-                                     FROM unnest(%s) as token)
+                                     FROM unnest(%s::text[]) as token)
                            """
                  cur.execute(sql, (country_code, list(new_tokens)))
  
              if norm_name:
                  result = self._cache.housenumbers.get(norm_name, result)
                  if result[0] is None:
-                     with self.conn.cursor() as cur:
-                         hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
+                     hid = execute_scalar(self.conn, "SELECT getorcreate_hnr_id(%s)", (norm_name, ))
  
-                         result = hid, norm_name
-                         self._cache.housenumbers[norm_name] = result
+                     result = hid, norm_name
+                     self._cache.housenumbers[norm_name] = result
          else:
              # Otherwise use the analyzer to determine the canonical name.
              # Per convention we use the first variant as the 'lookup name', the
                  if result[0] is None:
                      variants = analyzer.compute_variants(word_id)
                      if variants:
-                         with self.conn.cursor() as cur:
-                             hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
+                         hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)",
                                               (word_id, list(variants)))
-                             result = hid, variants[0]
-                             self._cache.housenumbers[word_id] = result
+                         result = hid, variants[0]
+                         self._cache.housenumbers[word_id] = result
  
          return result
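
More generally, this file is ported from psycopg2 to psycopg 3:
execute_values() becomes a plain executemany(), manually json.dumps()ed
payloads become Jsonb() adapters, the CopyBuffer helper becomes the
cursor.copy() context manager, and list parameters passed to unnest() gain an
explicit ::text[] cast. A small hedged sketch of these idioms against the word
table from the diff (function name and sample rows are made up for this note):

    from psycopg import connect
    from psycopg.types.json import Jsonb

    def write_special_phrases(dsn: str, phrases) -> None:
        with connect(dsn) as conn, conn.cursor() as cur:
            # COPY through a context manager replaces CopyBuffer.copy_out().
            with cur.copy('COPY word(word_token, type, word, info) FROM STDIN') as copy:
                for token, word, cls, typ, oper in phrases:
                    # Jsonb() lets psycopg adapt the dict to jsonb directly,
                    # so no json.dumps() is needed any more.
                    copy.write_row((token, 'S', word,
                                    Jsonb({'class': cls, 'type': typ, 'op': oper})))
            # executemany() with ordinary %s placeholders replaces
            # psycopg2.extras.execute_values(): one parameter tuple per row.
            cur.executemany('DELETE FROM word WHERE type = %s AND word = %s',
                            [('S', 'bar'), ('S', 'pub')])
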