From: Sarah Hoffmann <lonvia@denofr.de>
Date: Sun, 29 Sep 2024 09:44:04 +0000 (+0200)
Subject: Merge remote-tracking branch 'upstream/master'
X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/0af8dac3d35a94afe0b6ad775f3226d8d147501d?hp=-c

Merge remote-tracking branch 'upstream/master'
---

0af8dac3d35a94afe0b6ad775f3226d8d147501d
diff --combined CMakeLists.txt
index 4b4e3fc9,e31362fe..e6d59520
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@@ -19,7 -19,7 +19,7 @@@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_
  project(nominatim)
  
  set(NOMINATIM_VERSION_MAJOR 4)
- set(NOMINATIM_VERSION_MINOR 4)
+ set(NOMINATIM_VERSION_MINOR 5)
  set(NOMINATIM_VERSION_PATCH 0)
  
  set(NOMINATIM_VERSION "${NOMINATIM_VERSION_MAJOR}.${NOMINATIM_VERSION_MINOR}.${NOMINATIM_VERSION_PATCH}")
@@@ -44,7 -44,6 +44,6 @@@ endif(
  
  set(BUILD_IMPORTER on CACHE BOOL "Build everything for importing/updating the database")
  set(BUILD_API on CACHE BOOL "Build everything for the API server")
- set(BUILD_MODULE off CACHE BOOL "Build PostgreSQL module for legacy tokenizer")
  set(BUILD_TESTS on CACHE BOOL "Build test suite")
  set(BUILD_OSM2PGSQL on CACHE BOOL "Build osm2pgsql (expert only)")
  set(INSTALL_MUNIN_PLUGINS on CACHE BOOL "Install Munin plugins for supervising Nominatim")
@@@ -74,25 -73,6 +73,6 @@@ if (BUILD_IMPORTER OR BUILD_API
      find_package(PythonInterp 3.7 REQUIRED)
  endif()
  
- #-----------------------------------------------------------------------------
- # PHP
- #-----------------------------------------------------------------------------
- 
- # Setting PHP binary variable as to command line (prevailing) or auto detect
- 
- if (BUILD_API)
-     if (NOT PHP_BIN)
-          find_program (PHP_BIN php)
-     endif()
-     # sanity check if PHP binary exists
-     if (NOT EXISTS ${PHP_BIN})
-         message(WARNING "PHP binary not found. Only Python frontend can be used.")
-         set(PHP_BIN "")
-     else()
-         message (STATUS "Using PHP binary " ${PHP_BIN})
-     endif()
- endif()
- 
  #-----------------------------------------------------------------------------
  # import scripts and utilities (importer only)
  #-----------------------------------------------------------------------------
@@@ -113,27 -93,6 +93,27 @@@ if (BUILD_IMPORTER
                    ${PROJECT_BINARY_DIR}/nominatim)
  endif()
  
 +#-----------------------------------------------------------------------------
 +# Targets for running a development webserver from the build directory.
 +#-----------------------------------------------------------------------------
 +
 +if (BUILD_API)
 +   set(WEBSITEFILES
 +       403.html
 +       509.html
 +       crossdomain.xml
 +       favicon.ico
 +       nominatim.xml
 +       robots.txt
 +       taginfo.json
 +   )
 +
 +   foreach (webfile ${WEBSITEFILES})
 +       configure_file(${PROJECT_SOURCE_DIR}/website/${webfile}
 +                      ${PROJECT_BINARY_DIR}/website/${webfile})
 +   endforeach()
 +endif()
 +
  #-----------------------------------------------------------------------------
  # Tests
  #-----------------------------------------------------------------------------
@@@ -146,8 -105,6 +126,6 @@@ if (BUILD_TESTS
      find_program(PYTHON_BEHAVE behave)
      find_program(PYLINT NAMES pylint3 pylint)
      find_program(PYTEST NAMES pytest py.test-3 py.test)
-     find_program(PHPCS phpcs)
-     find_program(PHPUNIT phpunit)
  
      if (PYTHON_BEHAVE)
          message(STATUS "Using Python behave binary ${PYTHON_BEHAVE}")
@@@ -162,24 -119,6 +140,6 @@@
          message(WARNING "behave not found. BDD tests disabled." )
      endif()
  
-     if (PHPUNIT)
-         message(STATUS "Using phpunit binary ${PHPUNIT}")
-         add_test(NAME php
-                  COMMAND ${PHPUNIT} ./
-                  WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/test/php)
-     else()
-         message(WARNING "phpunit not found. PHP unit tests disabled." )
-     endif()
- 
-     if (PHPCS)
-         message(STATUS "Using phpcs binary ${PHPCS}")
-         add_test(NAME phpcs
-                  COMMAND ${PHPCS} --report-width=120 --colors lib-php
-                  WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
-     else()
-         message(WARNING "phpcs not found. PHP linting tests disabled." )
-     endif()
- 
      if (PYLINT)
          message(STATUS "Using pylint binary ${PYLINT}")
          add_test(NAME pylint
@@@ -199,14 -138,6 +159,6 @@@
      endif()
  endif()
  
- #-----------------------------------------------------------------------------
- # Postgres module
- #-----------------------------------------------------------------------------
- 
- if (BUILD_MODULE)
-     add_subdirectory(module)
- endif()
- 
  #-----------------------------------------------------------------------------
  # Installation
  #-----------------------------------------------------------------------------
@@@ -224,11 -155,7 +176,7 @@@ if (BUILD_IMPORTER
              DESTINATION ${CMAKE_INSTALL_BINDIR}
              RENAME nominatim)
  
-     if (EXISTS ${PHP_BIN})
-         configure_file(${PROJECT_SOURCE_DIR}/cmake/paths-py.tmpl paths-py.installed)
-     else()
-         configure_file(${PROJECT_SOURCE_DIR}/cmake/paths-py-no-php.tmpl paths-py.installed)
-     endif()
+     configure_file(${PROJECT_SOURCE_DIR}/cmake/paths-py-no-php.tmpl paths-py.installed)
  
      foreach (submodule nominatim_db nominatim_api)
          install(DIRECTORY src/${submodule}
@@@ -259,15 -186,6 +207,6 @@@ if (BUILD_OSM2PGSQL
      endif()
  endif()
  
- if (BUILD_MODULE)
-     install(PROGRAMS ${PROJECT_BINARY_DIR}/module/nominatim.so
-             DESTINATION ${NOMINATIM_LIBDIR}/module)
- endif()
- 
- if (BUILD_API AND EXISTS ${PHP_BIN})
-     install(DIRECTORY lib-php DESTINATION ${NOMINATIM_LIBDIR})
- endif()
- 
  install(FILES settings/env.defaults
                settings/address-levels.json
                settings/phrase-settings.json
diff --combined src/nominatim_api/search/db_search_builder.py
index 0269cf1f,1ac6db2b..b4346ee6
--- a/src/nominatim_api/search/db_search_builder.py
+++ b/src/nominatim_api/search/db_search_builder.py
@@@ -167,8 -167,7 +167,7 @@@ class SearchBuilder
          expected_count = sum(t.count for t in hnrs)
  
          partials = {t.token: t.addr_count for trange in address
-                        for t in self.query.get_partials_list(trange)
-                        if t.is_indexed}
+                        for t in self.query.get_partials_list(trange)}
  
          if not partials:
              # can happen when none of the partials is indexed
@@@ -219,23 -218,19 +218,19 @@@
          addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
          addr_tokens = list({t.token for t in addr_partials})
  
-         partials_indexed = all(t.is_indexed for t in name_partials.values()) \
-                            and all(t.is_indexed for t in addr_partials)
          exp_count = min(t.count for t in name_partials.values()) / (2**(len(name_partials) - 1))
  
-         if (len(name_partials) > 3 or exp_count < 8000) and partials_indexed:
+         if (len(name_partials) > 3 or exp_count < 8000):
              yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens)
              return
  
 -        addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 30000
 +        addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 50000
          # Partial term to frequent. Try looking up by rare full names first.
          name_fulls = self.query.get_tokens(name, TokenType.WORD)
          if name_fulls:
              fulls_count = sum(t.count for t in name_fulls)
-             if partials_indexed:
-                 penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
  
 -            if fulls_count < 50000 or addr_count < 30000:
 +            if fulls_count < 80000 or addr_count < 50000:
                  yield penalty,fulls_count / (2**len(addr_tokens)), \
                      self.get_full_name_ranking(name_fulls, addr_partials,
                                                 fulls_count > 30000 / max(1, len(addr_tokens)))
@@@ -243,8 -238,7 +238,7 @@@
          # To catch remaining results, lookup by name and address
          # We only do this if there is a reasonable number of results expected.
          exp_count = exp_count / (2**len(addr_tokens)) if addr_tokens else exp_count
-         if exp_count < 10000 and addr_count < 20000\
-            and all(t.is_indexed for t in name_partials.values()):
+         if exp_count < 10000 and addr_count < 20000:
              penalty += 0.35 * max(1 if name_fulls else 0.1,
                                    5 - len(name_partials) - len(addr_tokens))
              yield penalty, exp_count,\
@@@ -260,11 -254,10 +254,10 @@@
          addr_restrict_tokens = []
          addr_lookup_tokens = []
          for t in addr_partials:
-             if t.is_indexed:
-                 if t.addr_count > 20000:
-                     addr_restrict_tokens.append(t.token)
-                 else:
-                     addr_lookup_tokens.append(t.token)
+             if t.addr_count > 20000:
+                 addr_restrict_tokens.append(t.token)
+             else:
+                 addr_lookup_tokens.append(t.token)
  
          if addr_restrict_tokens:
              lookup.append(dbf.FieldLookup('nameaddress_vector',
@@@ -287,9 -280,14 +280,9 @@@
          # This might yield wrong results, nothing we can do about that.
          if use_lookup:
              addr_restrict_tokens = []
 -            addr_lookup_tokens = []
 -            for t in addr_partials:
 -                if t.addr_count > 20000:
 -                    addr_restrict_tokens.append(t.token)
 -                else:
 -                    addr_lookup_tokens.append(t.token)
 +            addr_lookup_tokens = [t.token for t in addr_partials if t.is_indexed]
          else:
-             addr_restrict_tokens = [t.token for t in addr_partials if t.is_indexed]
+             addr_restrict_tokens = [t.token for t in addr_partials]
              addr_lookup_tokens = []
  
          return dbf.lookup_by_any_name([t.token for t in name_fulls],
diff --combined src/nominatim_api/search/icu_tokenizer.py
index 7bd2b092,1aadc97e..c2a26510
--- a/src/nominatim_api/search/icu_tokenizer.py
+++ b/src/nominatim_api/search/icu_tokenizer.py
@@@ -123,7 -123,7 +123,7 @@@ class ICUToken(qmod.Token)
              lookup_word = row.word_token
  
          return ICUToken(penalty=penalty, token=row.word_id, count=max(1, count),
-                         lookup_word=lookup_word, is_indexed=True,
+                         lookup_word=lookup_word,
                          word_token=row.word_token, info=row.info,
                          addr_count=max(1, addr_count))
  
@@@ -208,12 -208,7 +208,12 @@@ class ICUQueryAnalyzer(AbstractQueryAna
              standardized form search will work with. All information removed
              at this stage is inevitably lost.
          """
 -        return cast(str, self.normalizer.transliterate(text))
 +        norm = cast(str, self.normalizer.transliterate(text))
 +        numspaces = norm.count(' ')
 +        if numspaces > 4 and len(norm) <= (numspaces + 1) * 3:
 +            return ''
 +
 +        return norm
  
  
      def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
@@@ -264,7 -259,9 +264,9 @@@
              if len(part.token) <= 4 and part[0].isdigit()\
                 and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
                  query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
-                                 ICUToken(0.5, 0, 1, 1, part.token, True, part.token, None))
+                                 ICUToken(penalty=0.5, token=0,
+                                          count=1, addr_count=1, lookup_word=part.token,
+                                          word_token=part.token, info=None))
  
  
      def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
diff --combined src/nominatim_db/tokenizer/icu_tokenizer.py
index 4eee2c73,1b95a901..452bf26c
--- a/src/nominatim_db/tokenizer/icu_tokenizer.py
+++ b/src/nominatim_db/tokenizer/icu_tokenizer.py
@@@ -13,7 -13,6 +13,6 @@@ from typing import Optional, Sequence, 
  import itertools
  import logging
  from pathlib import Path
- from textwrap import dedent
  
  from psycopg.types.json import Jsonb
  from psycopg import sql as pysql
@@@ -64,7 -63,6 +63,6 @@@ class ICUTokenizer(AbstractTokenizer)
          """
          self.loader = ICURuleLoader(config)
  
-         self._install_php(config.lib_dir.php, overwrite=True)
          self._save_config()
  
          if init_db:
@@@ -81,8 -79,6 +79,6 @@@
          with connect(self.dsn) as conn:
              self.loader.load_config_from_db(conn)
  
-         self._install_php(config.lib_dir.php, overwrite=False)
- 
  
      def finalize_import(self, config: Configuration) -> None:
          """ Do any required postprocessing to make the tokenizer data ready
@@@ -190,7 -186,6 +186,7 @@@
                                              END) as info
                                      FROM word LEFT JOIN word_frequencies wf
                                           ON word.word_id = wf.id
 +                                    ORDER BY word_id
                                  """)
                      drop_tables(conn, 'word_frequencies')
  
@@@ -282,22 -277,6 +278,6 @@@
              return list(s[0].split('@')[0] for s in cur)
  
  
-     def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None:
-         """ Install the php script for the tokenizer.
-         """
-         if phpdir is not None:
-             assert self.loader is not None
-             php_file = self.data_dir / "tokenizer.php"
- 
-             if not php_file.exists() or overwrite:
-                 php_file.write_text(dedent(f"""\
-                     <?php
-                     @define('CONST_Max_Word_Frequency', 10000000);
-                     @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
-                     @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
-                     require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
- 
- 
      def _save_config(self) -> None:
          """ Save the configuration that needs to remain stable for the given
              database as database properties.