From: Sarah Hoffmann Date: Tue, 19 Nov 2024 10:56:33 +0000 (+0100) Subject: Merge remote-tracking branch 'upstream/master' X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/57598a048e9124b905572ed8dc4fa9465b5d38a6?hp=-c Merge remote-tracking branch 'upstream/master' --- 57598a048e9124b905572ed8dc4fa9465b5d38a6 diff --combined src/nominatim_db/tokenizer/icu_tokenizer.py index 19818adb,5595fcb2..16122d08 --- a/src/nominatim_db/tokenizer/icu_tokenizer.py +++ b/src/nominatim_db/tokenizer/icu_tokenizer.py @@@ -17,7 -17,7 +17,7 @@@ from pathlib import Pat from psycopg.types.json import Jsonb from psycopg import sql as pysql - from ..db.connection import connect, Connection, Cursor, server_version_tuple, \ + from ..db.connection import connect, Connection, Cursor, \ drop_tables, table_exists, execute_scalar from ..config import Configuration from ..db.sql_preprocessor import SQLPreprocessor @@@ -110,81 -110,37 +110,38 @@@ class ICUTokenizer(AbstractTokenizer) cur.execute(pysql.SQL('SET max_parallel_workers_per_gather TO {}') .format(pysql.Literal(min(threads, 6),))) - if server_version_tuple(conn) < (12, 0): - LOG.info('Computing word frequencies') - drop_tables(conn, 'word_frequencies', 'addressword_frequencies') - cur.execute("""CREATE TEMP TABLE word_frequencies AS - SELECT unnest(name_vector) as id, count(*) - FROM search_name GROUP BY id""") - cur.execute('CREATE INDEX ON word_frequencies(id)') - cur.execute("""CREATE TEMP TABLE addressword_frequencies AS - SELECT unnest(nameaddress_vector) as id, count(*) - FROM search_name GROUP BY id""") - cur.execute('CREATE INDEX ON addressword_frequencies(id)') - cur.execute(""" - CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER, - INOUT info JSONB) - AS $$ - DECLARE rec RECORD; - BEGIN - IF info is null THEN - info = '{}'::jsonb; - END IF; - FOR rec IN SELECT count FROM word_frequencies WHERE id = wid - LOOP - info = info || jsonb_build_object('count', rec.count); - END LOOP; - FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid - LOOP - info = info || jsonb_build_object('addr_count', rec.count); - END LOOP; - IF info = '{}'::jsonb THEN - info = null; - END IF; - END; - $$ LANGUAGE plpgsql IMMUTABLE; - """) - LOG.info('Update word table with recomputed frequencies') - drop_tables(conn, 'tmp_word') - cur.execute("""CREATE TABLE tmp_word AS - SELECT word_id, word_token, type, word, - word_freq_update(word_id, info) as info - FROM word - """) - drop_tables(conn, 'word_frequencies', 'addressword_frequencies') - else: - LOG.info('Computing word frequencies') - drop_tables(conn, 'word_frequencies') - cur.execute(""" - CREATE TEMP TABLE word_frequencies AS - WITH word_freq AS MATERIALIZED ( - SELECT unnest(name_vector) as id, count(*) - FROM search_name GROUP BY id), - addr_freq AS MATERIALIZED ( - SELECT unnest(nameaddress_vector) as id, count(*) - FROM search_name GROUP BY id) - SELECT coalesce(a.id, w.id) as id, - (CASE WHEN w.count is null THEN '{}'::JSONB - ELSE jsonb_build_object('count', w.count) END - || - CASE WHEN a.count is null THEN '{}'::JSONB - ELSE jsonb_build_object('addr_count', a.count) END) as info - FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id; - """) - cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)') - cur.execute('ANALYSE word_frequencies') - LOG.info('Update word table with recomputed frequencies') - drop_tables(conn, 'tmp_word') - cur.execute("""CREATE TABLE tmp_word AS - SELECT word_id, word_token, type, word, - (CASE WHEN wf.info is null THEN word.info - ELSE coalesce(word.info, '{}'::jsonb) || wf.info - END) as info - FROM word LEFT JOIN word_frequencies wf - ON word.word_id = wf.id - ORDER BY word_id - """) - drop_tables(conn, 'word_frequencies') + LOG.info('Computing word frequencies') + drop_tables(conn, 'word_frequencies') + cur.execute(""" + CREATE TEMP TABLE word_frequencies AS + WITH word_freq AS MATERIALIZED ( + SELECT unnest(name_vector) as id, count(*) + FROM search_name GROUP BY id), + addr_freq AS MATERIALIZED ( + SELECT unnest(nameaddress_vector) as id, count(*) + FROM search_name GROUP BY id) + SELECT coalesce(a.id, w.id) as id, + (CASE WHEN w.count is null THEN '{}'::JSONB + ELSE jsonb_build_object('count', w.count) END + || + CASE WHEN a.count is null THEN '{}'::JSONB + ELSE jsonb_build_object('addr_count', a.count) END) as info + FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id; + """) + cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)') + cur.execute('ANALYSE word_frequencies') + LOG.info('Update word table with recomputed frequencies') + drop_tables(conn, 'tmp_word') + cur.execute("""CREATE TABLE tmp_word AS + SELECT word_id, word_token, type, word, + (CASE WHEN wf.info is null THEN word.info + ELSE coalesce(word.info, '{}'::jsonb) || wf.info + END) as info + FROM word LEFT JOIN word_frequencies wf + ON word.word_id = wf.id ++ ORDER BY word_id + """) + drop_tables(conn, 'word_frequencies') with conn.cursor() as cur: cur.execute('SET max_parallel_workers_per_gather TO 0')