From faeee7528f710dc98bf14a08e592137d3e6d37f2 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sun, 16 Jul 2023 20:12:53 +0200 Subject: [PATCH] move warm script to python code --- lib-php/admin/warm.php | 115 ------------------------ nominatim/clicmd/admin.py | 30 +++++-- nominatim/tokenizer/base.py | 8 ++ nominatim/tokenizer/icu_tokenizer.py | 12 +++ nominatim/tokenizer/legacy_tokenizer.py | 10 +++ test/python/cli/test_cmd_admin.py | 11 --- 6 files changed, 53 insertions(+), 133 deletions(-) delete mode 100644 lib-php/admin/warm.php diff --git a/lib-php/admin/warm.php b/lib-php/admin/warm.php deleted file mode 100644 index 32f78f46..00000000 --- a/lib-php/admin/warm.php +++ /dev/null @@ -1,115 +0,0 @@ -connect(); - -$bVerbose = $aResult['verbose']; - -function print_results($aResults, $bVerbose) -{ - if ($bVerbose) { - if ($aResults && count($aResults)) { - echo $aResults[0]['langaddress']."\n"; - } else { - echo "\n"; - } - } else { - echo '.'; - } -} - -if (!$aResult['search-only']) { - $oReverseGeocode = new Nominatim\ReverseGeocode($oDB); - $oReverseGeocode->setZoom(20); - $oPlaceLookup = new Nominatim\PlaceLookup($oDB); - $oPlaceLookup->setIncludeAddressDetails(true); - $oPlaceLookup->setLanguagePreference(array('en')); - - echo 'Warm reverse: '; - if ($bVerbose) { - echo "\n"; - } - for ($i = 0; $i < 1000; $i++) { - $fLat = rand(-9000, 9000) / 100; - $fLon = rand(-18000, 18000) / 100; - if ($bVerbose) { - echo "$fLat, $fLon = "; - } - - $oLookup = $oReverseGeocode->lookup($fLat, $fLon); - $aSearchResults = $oLookup ? 
$oPlaceLookup->lookup(array($oLookup->iId => $oLookup)) : null; - print_results($aSearchResults, $bVerbose); - } - echo "\n"; -} - -if (!$aResult['reverse-only']) { - $oGeocode = new Nominatim\Geocode($oDB); - - echo 'Warm search: '; - if ($bVerbose) { - echo "\n"; - } - - $oTokenizer = new \Nominatim\Tokenizer($oDB); - - $aWords = $oTokenizer->mostFrequentWords(1000); - - $sSQL = 'SELECT word FROM word WHERE word is not null ORDER BY search_name_count DESC LIMIT 1000'; - foreach ($aWords as $sWord) { - if ($bVerbose) { - echo "$sWord = "; - } - - $oGeocode->setLanguagePreference(array('en')); - $oGeocode->setQuery($sWord); - $aSearchResults = $oGeocode->lookup(); - print_results($aSearchResults, $bVerbose); - } - echo "\n"; -} diff --git a/nominatim/clicmd/admin.py b/nominatim/clicmd/admin.py index 0c773960..a84b0db2 100644 --- a/nominatim/clicmd/admin.py +++ b/nominatim/clicmd/admin.py @@ -9,9 +9,11 @@ Implementation of the 'admin' subcommand. """ import logging import argparse +import random -from nominatim.tools.exec_utils import run_legacy_script +from nominatim.db.connection import connect from nominatim.clicmd.args import NominatimArgs +import nominatim.api as napi # Do not repeat documentation of subcommand classes. 
# pylint: disable=C0111
@@ -81,11 +83,25 @@ class AdminFuncs:
         return 1
 
 
+
     def _warm(self, args: NominatimArgs) -> int:
         LOG.warning('Warming database caches')
-        params = ['warm.php']
-        if args.target == 'reverse':
-            params.append('--reverse-only')
-        if args.target == 'search':
-            params.append('--search-only')
-        return run_legacy_script(*params, config=args.config)
+
+        api = napi.NominatimAPI(args.project_dir)
+
+        if args.target != 'search':  # target 'search' means search-only: no reverse warm-up
+            for _ in range(1000):  # NOTE(review): confirm coordinate order expected by reverse()
+                api.reverse((random.uniform(-90, 90), random.uniform(-180, 180)),
+                            address_details=True)
+
+        if args.target != 'reverse':  # target 'reverse' means reverse-only: no search warm-up
+            from ..tokenizer import factory as tokenizer_factory
+
+            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+            with connect(args.config.get_libpq_dsn()) as conn:
+                words = tokenizer.most_frequent_words(conn, 1000)
+
+            for word in words:
+                api.search(word)
+
+        return 0
diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py
index afbd1914..f0fd9dd0 100644
--- a/nominatim/tokenizer/base.py
+++ b/nominatim/tokenizer/base.py
@@ -13,6 +13,7 @@ from typing import List, Tuple, Dict, Any, Optional, Iterable
 from pathlib import Path
 
 from nominatim.config import Configuration
+from nominatim.db.connection import Connection
 from nominatim.data.place_info import PlaceInfo
 from nominatim.typing import Protocol
 
@@ -233,6 +234,13 @@ class AbstractTokenizer(ABC):
         """
 
 
+    @abstractmethod
+    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
+        """ Return a list of the `num` most frequent full words
+            in the database.
+        """
+
+
 class TokenizerModule(Protocol):
     """ Interface that must be exported by modules that implement their
         own tokenizer.
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index b6e64637..39c1cbc6 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -183,6 +183,18 @@ class ICUTokenizer(AbstractTokenizer):
                                      self.loader.make_token_analysis())
 
 
+    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
+        """ Return a list of the `num` most frequent full words in the database
+            (rows of type 'W', ranked by their summed 'count'; '@...' suffixes cut).
+        """
+        with conn.cursor() as cur:  # '->>' yields text, castable to int on any PostgreSQL
+            cur.execute("""SELECT word, sum((info->>'count')::int) as count
+                             FROM word WHERE type = 'W'
+                             GROUP BY word
+                             ORDER BY count DESC NULLS LAST LIMIT %s""", (num,))
+            return [row[0].split('@')[0] for row in cur]
+
+
     def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
         """ Install the php script for the tokenizer.
         """
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py
index e09700d9..1b68a494 100644
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -256,6 +256,16 @@ class LegacyTokenizer(AbstractTokenizer):
         return LegacyNameAnalyzer(self.dsn, normalizer)
 
 
+    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
+        """ Return a list of the `num` most frequent full words in the database
+            (non-null words, ranked by search_name_count).
+        """
+        with conn.cursor() as cur:
+            cur.execute(""" SELECT word FROM word WHERE word is not null
+                            ORDER BY search_name_count DESC LIMIT %s""", (num,))
+            return [row[0] for row in cur]
+
+
     def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
         """ Install the php script for the tokenizer.
""" diff --git a/test/python/cli/test_cmd_admin.py b/test/python/cli/test_cmd_admin.py index 696e2dd2..75ae3cd2 100644 --- a/test/python/cli/test_cmd_admin.py +++ b/test/python/cli/test_cmd_admin.py @@ -19,17 +19,6 @@ import nominatim.tools.migration import nominatim.clicmd.admin -@pytest.mark.parametrize("params", [('--warm', ), - ('--warm', '--reverse-only'), - ('--warm', '--search-only')]) -def test_admin_command_legacy(cli_call, mock_func_factory, params): - mock_run_legacy = mock_func_factory(nominatim.clicmd.admin, 'run_legacy_script') - - assert cli_call('admin', *params) == 0 - - assert mock_run_legacy.called == 1 - - def test_admin_command_check_database(cli_call, mock_func_factory): mock = mock_func_factory(nominatim.tools.check_database, 'check_database') -- 2.39.5