From: Sarah Hoffmann
Date: Tue, 19 Oct 2021 09:21:16 +0000 (+0200)
Subject: make word recount a tokenizer-specific function
X-Git-Tag: v4.0.0~16^2~2
X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/e8e2502e2f9d2275b8d567341400672adea9fea3?ds=inline

make word recount a tokenizer-specific function
---

diff --git a/lib-sql/words_from_search_name.sql b/lib-sql/words_from_search_name.sql
deleted file mode 100644
index b7727dc6..00000000
--- a/lib-sql/words_from_search_name.sql
+++ /dev/null
@@ -1,11 +0,0 @@
-DROP TABLE IF EXISTS word_frequencies;
-CREATE TABLE word_frequencies AS
-  SELECT unnest(name_vector) as id, count(*) FROM search_name GROUP BY id;
-
-CREATE INDEX idx_word_frequencies ON word_frequencies(id);
-
-UPDATE word SET search_name_count = count
-  FROM word_frequencies
-  WHERE word_token like ' %' and word_id = id;
-
-DROP TABLE word_frequencies;
diff --git a/nominatim/clicmd/refresh.py b/nominatim/clicmd/refresh.py
index aa540f6b..e7d7d7ba 100644
--- a/nominatim/clicmd/refresh.py
+++ b/nominatim/clicmd/refresh.py
@@ -71,8 +71,8 @@ class UpdateRefresh:
                           "Postcode updates on a frozen database is not possible.")
 
         if args.word_counts:
-            LOG.warning('Recompute frequency of full-word search terms')
-            refresh.recompute_word_counts(args.config.get_libpq_dsn(), args.sqllib_dir)
+            LOG.warning('Recompute word statistics')
+            self._get_tokenizer(args.config).update_statistics()
 
         if args.address_levels:
             cfg = Path(args.config.ADDRESS_LEVEL_CONFIG)
diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py
index 02bc312f..94fac1fc 100644
--- a/nominatim/tokenizer/base.py
+++ b/nominatim/tokenizer/base.py
@@ -205,6 +205,16 @@ class AbstractTokenizer(ABC):
         pass
 
 
+    @abstractmethod
+    def update_statistics(self) -> None:
+        """ Recompute any tokenizer statistics necessary for efficient lookup.
+            This function is meant to be called from time to time by the user
+            to improve performance. However, the tokenizer must not depend on
+            it to be called in order to work.
+        """
+        pass
+
+
     @abstractmethod
     def name_analyzer(self) -> AbstractAnalyzer:
         """ Create a new analyzer for tokenizing names and queries
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 12d1eccd..686fbd79 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -93,6 +93,25 @@ class LegacyICUTokenizer(AbstractTokenizer):
         return None
 
 
+    def update_statistics(self):
+        """ Recompute frequencies for all name words.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table("word_frequencies")
+                LOG.info("Computing word frequencies")
+                cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                 SELECT unnest(name_vector) as id, count(*)
+                                 FROM search_name GROUP BY id""")
+                cur.execute("CREATE INDEX ON word_frequencies(id)")
+                LOG.info("Update word table with recomputed frequencies")
+                cur.execute("""UPDATE word
+                               SET info = info || jsonb_build_object('count', count)
+                               FROM word_frequencies WHERE word_id = id""")
+                cur.drop_table("word_frequencies")
+            conn.commit()
+
+
     def name_analyzer(self):
         """ Create a new analyzer for tokenizing names and queries
             using this tokinzer. Analyzers are context managers and should
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py
index c935f20d..d901a68d 100644
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -186,6 +186,24 @@ class LegacyTokenizer(AbstractTokenizer):
             self._save_config(conn, config)
 
 
+    def update_statistics(self):
+        """ Recompute the frequency of full words.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table("word_frequencies")
+                LOG.info("Computing word frequencies")
+                cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                 SELECT unnest(name_vector) as id, count(*)
+                                 FROM search_name GROUP BY id""")
+                cur.execute("CREATE INDEX ON word_frequencies(id)")
+                LOG.info("Update word table with recomputed frequencies")
+                cur.execute("""UPDATE word SET search_name_count = count
+                               FROM word_frequencies
+                               WHERE word_token like ' %' and word_id = id""")
+                cur.drop_table("word_frequencies")
+            conn.commit()
+
     def name_analyzer(self):
         """ Create a new analyzer for tokenizing names and queries
             using this tokinzer. Analyzers are context managers and should
diff --git a/nominatim/tools/refresh.py b/nominatim/tools/refresh.py
index 5aaee0c8..00ae5dc9 100644
--- a/nominatim/tools/refresh.py
+++ b/nominatim/tools/refresh.py
@@ -14,12 +14,6 @@ from nominatim.version import NOMINATIM_VERSION
 
 LOG = logging.getLogger()
 
-def recompute_word_counts(dsn, sql_dir):
-    """ Compute the frequency of full-word search terms.
-    """
-    execute_file(dsn, sql_dir / 'words_from_search_name.sql')
-
-
 def _add_address_level_rows_from_entry(rows, entry):
     """ Converts a single entry from the JSON format for address rank
         descriptions into a flat format suitable for inserting into a
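
Note on the change: recomputing word counts is no longer a fixed SQL file run by the refresh tool; each tokenizer now ships its own update_statistics() implementation and 'nominatim refresh --word-counts' simply dispatches to whichever tokenizer the database was set up with. The same hook can also be reached from Python. Below is a minimal sketch, assuming get_tokenizer_for_db() from nominatim.tokenizer.factory and an already constructed Configuration object, both of which are outside this diff.

    # Minimal sketch: trigger the tokenizer-specific word recount from Python.
    # Assumes 'config' is a Configuration for an existing Nominatim project;
    # get_tokenizer_for_db() loads the tokenizer the database was imported with
    # (ICU or legacy), so the matching update_statistics() implementation runs.
    from nominatim.tokenizer import factory as tokenizer_factory

    def refresh_word_counts(config):
        tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
        tokenizer.update_statistics()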