project(nominatim)
set(NOMINATIM_VERSION_MAJOR 4)
- set(NOMINATIM_VERSION_MINOR 4)
+ set(NOMINATIM_VERSION_MINOR 5)
set(NOMINATIM_VERSION_PATCH 0)
set(NOMINATIM_VERSION "${NOMINATIM_VERSION_MAJOR}.${NOMINATIM_VERSION_MINOR}.${NOMINATIM_VERSION_PATCH}")
set(BUILD_IMPORTER on CACHE BOOL "Build everything for importing/updating the database")
set(BUILD_API on CACHE BOOL "Build everything for the API server")
- set(BUILD_MODULE off CACHE BOOL "Build PostgreSQL module for legacy tokenizer")
set(BUILD_TESTS on CACHE BOOL "Build test suite")
set(BUILD_OSM2PGSQL on CACHE BOOL "Build osm2pgsql (expert only)")
set(INSTALL_MUNIN_PLUGINS on CACHE BOOL "Install Munin plugins for supervising Nominatim")
find_package(PythonInterp 3.7 REQUIRED)
endif()
- #-----------------------------------------------------------------------------
- # PHP
- #-----------------------------------------------------------------------------
-
- # Setting PHP binary variable as to command line (prevailing) or auto detect
-
- if (BUILD_API)
- if (NOT PHP_BIN)
- find_program (PHP_BIN php)
- endif()
- # sanity check if PHP binary exists
- if (NOT EXISTS ${PHP_BIN})
- message(WARNING "PHP binary not found. Only Python frontend can be used.")
- set(PHP_BIN "")
- else()
- message (STATUS "Using PHP binary " ${PHP_BIN})
- endif()
- endif()
-
#-----------------------------------------------------------------------------
# import scripts and utilities (importer only)
#-----------------------------------------------------------------------------
${PROJECT_BINARY_DIR}/nominatim)
endif()
+#-----------------------------------------------------------------------------
+# Targets for running a development webserver from the build directory.
+#-----------------------------------------------------------------------------
+
+if (BUILD_API)
+ set(WEBSITEFILES
+ 403.html
+ 509.html
+ crossdomain.xml
+ favicon.ico
+ nominatim.xml
+ robots.txt
+ taginfo.json
+ )
+
+ foreach (webfile ${WEBSITEFILES})
+ configure_file(${PROJECT_SOURCE_DIR}/website/${webfile}
+ ${PROJECT_BINARY_DIR}/website/${webfile})
+ endforeach()
+endif()
+
#-----------------------------------------------------------------------------
# Tests
#-----------------------------------------------------------------------------
find_program(PYTHON_BEHAVE behave)
find_program(PYLINT NAMES pylint3 pylint)
find_program(PYTEST NAMES pytest py.test-3 py.test)
- find_program(PHPCS phpcs)
- find_program(PHPUNIT phpunit)
if (PYTHON_BEHAVE)
message(STATUS "Using Python behave binary ${PYTHON_BEHAVE}")
message(WARNING "behave not found. BDD tests disabled." )
endif()
- if (PHPUNIT)
- message(STATUS "Using phpunit binary ${PHPUNIT}")
- add_test(NAME php
- COMMAND ${PHPUNIT} ./
- WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/test/php)
- else()
- message(WARNING "phpunit not found. PHP unit tests disabled." )
- endif()
-
- if (PHPCS)
- message(STATUS "Using phpcs binary ${PHPCS}")
- add_test(NAME phpcs
- COMMAND ${PHPCS} --report-width=120 --colors lib-php
- WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
- else()
- message(WARNING "phpcs not found. PHP linting tests disabled." )
- endif()
-
if (PYLINT)
message(STATUS "Using pylint binary ${PYLINT}")
add_test(NAME pylint
endif()
endif()
- #-----------------------------------------------------------------------------
- # Postgres module
- #-----------------------------------------------------------------------------
-
- if (BUILD_MODULE)
- add_subdirectory(module)
- endif()
-
#-----------------------------------------------------------------------------
# Installation
#-----------------------------------------------------------------------------
DESTINATION ${CMAKE_INSTALL_BINDIR}
RENAME nominatim)
- if (EXISTS ${PHP_BIN})
- configure_file(${PROJECT_SOURCE_DIR}/cmake/paths-py.tmpl paths-py.installed)
- else()
- configure_file(${PROJECT_SOURCE_DIR}/cmake/paths-py-no-php.tmpl paths-py.installed)
- endif()
+ configure_file(${PROJECT_SOURCE_DIR}/cmake/paths-py-no-php.tmpl paths-py.installed)
foreach (submodule nominatim_db nominatim_api)
install(DIRECTORY src/${submodule}
endif()
endif()
- if (BUILD_MODULE)
- install(PROGRAMS ${PROJECT_BINARY_DIR}/module/nominatim.so
- DESTINATION ${NOMINATIM_LIBDIR}/module)
- endif()
-
- if (BUILD_API AND EXISTS ${PHP_BIN})
- install(DIRECTORY lib-php DESTINATION ${NOMINATIM_LIBDIR})
- endif()
-
install(FILES settings/env.defaults
settings/address-levels.json
settings/phrase-settings.json
expected_count = sum(t.count for t in hnrs)
partials = {t.token: t.addr_count for trange in address
- for t in self.query.get_partials_list(trange)
- if t.is_indexed}
+ for t in self.query.get_partials_list(trange)}
if not partials:
# can happen when none of the address parts yields any partial tokens
addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
addr_tokens = list({t.token for t in addr_partials})
- partials_indexed = all(t.is_indexed for t in name_partials.values()) \
- and all(t.is_indexed for t in addr_partials)
exp_count = min(t.count for t in name_partials.values()) / (2**(len(name_partials) - 1))
- if (len(name_partials) > 3 or exp_count < 8000) and partials_indexed:
+ if (len(name_partials) > 3 or exp_count < 8000):
yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens)
return
- addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 30000
+ addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 50000
# Partial term too frequent. Try looking up by rare full names first.
name_fulls = self.query.get_tokens(name, TokenType.WORD)
if name_fulls:
fulls_count = sum(t.count for t in name_fulls)
- if partials_indexed:
- penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
- if fulls_count < 50000 or addr_count < 30000:
+ if fulls_count < 80000 or addr_count < 50000:
yield penalty,fulls_count / (2**len(addr_tokens)), \
self.get_full_name_ranking(name_fulls, addr_partials,
fulls_count > 30000 / max(1, len(addr_tokens)))
# To catch remaining results, lookup by name and address
# We only do this if there is a reasonable number of results expected.
exp_count = exp_count / (2**len(addr_tokens)) if addr_tokens else exp_count
- if exp_count < 10000 and addr_count < 20000\
- and all(t.is_indexed for t in name_partials.values()):
+ if exp_count < 10000 and addr_count < 20000:
penalty += 0.35 * max(1 if name_fulls else 0.1,
5 - len(name_partials) - len(addr_tokens))
yield penalty, exp_count,\
addr_restrict_tokens = []
addr_lookup_tokens = []
for t in addr_partials:
- if t.is_indexed:
- if t.addr_count > 20000:
- addr_restrict_tokens.append(t.token)
- else:
- addr_lookup_tokens.append(t.token)
+ if t.addr_count > 20000:
+ addr_restrict_tokens.append(t.token)
+ else:
+ addr_lookup_tokens.append(t.token)
if addr_restrict_tokens:
lookup.append(dbf.FieldLookup('nameaddress_vector',
# This might yield wrong results, nothing we can do about that.
if use_lookup:
addr_restrict_tokens = []
- addr_lookup_tokens = []
- for t in addr_partials:
- if t.addr_count > 20000:
- addr_restrict_tokens.append(t.token)
- else:
- addr_lookup_tokens.append(t.token)
+ addr_lookup_tokens = [t.token for t in addr_partials if t.is_indexed]
else:
- addr_restrict_tokens = [t.token for t in addr_partials if t.is_indexed]
+ addr_restrict_tokens = [t.token for t in addr_partials]
addr_lookup_tokens = []
return dbf.lookup_by_any_name([t.token for t in name_fulls],
lookup_word = row.word_token
return ICUToken(penalty=penalty, token=row.word_id, count=max(1, count),
- lookup_word=lookup_word, is_indexed=True,
+ lookup_word=lookup_word,
word_token=row.word_token, info=row.info,
addr_count=max(1, addr_count))
standardized form search will work with. All information removed
at this stage is inevitably lost.
"""
- return cast(str, self.normalizer.transliterate(text))
+ norm = cast(str, self.normalizer.transliterate(text))
+ numspaces = norm.count(' ')
+ if numspaces > 4 and len(norm) <= (numspaces + 1) * 3:
+ return ''
+
+ return norm
def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
if len(part.token) <= 4 and part[0].isdigit()\
and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
- ICUToken(0.5, 0, 1, 1, part.token, True, part.token, None))
+ ICUToken(penalty=0.5, token=0,
+ count=1, addr_count=1, lookup_word=part.token,
+ word_token=part.token, info=None))
def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
import itertools
import logging
from pathlib import Path
- from textwrap import dedent
from psycopg.types.json import Jsonb
from psycopg import sql as pysql
"""
self.loader = ICURuleLoader(config)
- self._install_php(config.lib_dir.php, overwrite=True)
self._save_config()
if init_db:
with connect(self.dsn) as conn:
self.loader.load_config_from_db(conn)
- self._install_php(config.lib_dir.php, overwrite=False)
-
def finalize_import(self, config: Configuration) -> None:
""" Do any required postprocessing to make the tokenizer data ready
END) as info
FROM word LEFT JOIN word_frequencies wf
ON word.word_id = wf.id
+ ORDER BY word_id
""")
drop_tables(conn, 'word_frequencies')
return list(s[0].split('@')[0] for s in cur)
- def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None:
- """ Install the php script for the tokenizer.
- """
- if phpdir is not None:
- assert self.loader is not None
- php_file = self.data_dir / "tokenizer.php"
-
- if not php_file.exists() or overwrite:
- php_file.write_text(dedent(f"""\
- <?php
- @define('CONST_Max_Word_Frequency', 10000000);
- @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
- @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
- require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
-
-
def _save_config(self) -> None:
""" Save the configuration that needs to remain stable for the given
database as database properties.