X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/81eed0680cd2a8b5c53217ab2996a43ab17f5056..6a748204fffd43722788aacdd341ba8961a5a4fb:/nominatim/tokenizer/icu_tokenizer.py?ds=inline diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index cbbaf71f..4b9dac69 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -67,7 +67,7 @@ class ICUTokenizer(AbstractTokenizer): if init_db: self.update_sql_functions(config) - self._setup_db_tables(config, 'word') + self._setup_db_tables(config) self._create_base_indices(config, 'word') @@ -104,7 +104,7 @@ class ICUTokenizer(AbstractTokenizer): self.init_from_project(config) - def update_statistics(self, config: Configuration) -> None: + def update_statistics(self, config: Configuration, threads: int = 2) -> None: """ Recompute frequencies for all name words. """ with connect(self.dsn) as conn: @@ -112,22 +112,93 @@ class ICUTokenizer(AbstractTokenizer): return with conn.cursor() as cur: - LOG.info('Computing word frequencies') - cur.drop_table('word_frequencies') - cur.execute("""CREATE TEMP TABLE word_frequencies AS - SELECT unnest(name_vector) as id, count(*) - FROM search_name GROUP BY id""") - cur.execute('CREATE INDEX ON word_frequencies(id)') - LOG.info('Update word table with recomputed frequencies') - cur.drop_table('tmp_word') - cur.execute("""CREATE TABLE tmp_word AS - SELECT word_id, word_token, type, word, - (CASE WHEN wf.count is null THEN info - ELSE info || jsonb_build_object('count', wf.count) - END) as info - FROM word LEFT JOIN word_frequencies wf - ON word.word_id = wf.id""") - cur.drop_table('word_frequencies') + cur.execute('ANALYSE search_name') + if threads > 1: + cur.execute('SET max_parallel_workers_per_gather TO %s', + (min(threads, 6),)) + + if conn.server_version_tuple() < (12, 0): + LOG.info('Computing word frequencies') + cur.drop_table('word_frequencies') + cur.drop_table('addressword_frequencies') + cur.execute("""CREATE TEMP TABLE word_frequencies AS + SELECT unnest(name_vector) as id, count(*) + FROM search_name GROUP BY id""") + cur.execute('CREATE INDEX ON word_frequencies(id)') + cur.execute("""CREATE TEMP TABLE addressword_frequencies AS + SELECT unnest(nameaddress_vector) as id, count(*) + FROM search_name GROUP BY id""") + cur.execute('CREATE INDEX ON addressword_frequencies(id)') + cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER, + INOUT info JSONB) + AS $$ + DECLARE rec RECORD; + BEGIN + IF info is null THEN + info = '{}'::jsonb; + END IF; + FOR rec IN SELECT count FROM word_frequencies WHERE id = wid + LOOP + info = info || jsonb_build_object('count', rec.count); + END LOOP; + FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid + LOOP + info = info || jsonb_build_object('addr_count', rec.count); + END LOOP; + IF info = '{}'::jsonb THEN + info = null; + END IF; + END; + $$ LANGUAGE plpgsql IMMUTABLE; + """) + LOG.info('Update word table with recomputed frequencies') + cur.drop_table('tmp_word') + cur.execute("""CREATE TABLE tmp_word AS + SELECT word_id, word_token, type, word, + word_freq_update(word_id, info) as info + FROM word + """) + cur.drop_table('word_frequencies') + cur.drop_table('addressword_frequencies') + else: + LOG.info('Computing word frequencies') + cur.drop_table('word_frequencies') + cur.execute(""" + CREATE TEMP TABLE word_frequencies AS + WITH word_freq AS MATERIALIZED ( + SELECT unnest(name_vector) as id, count(*) + FROM search_name GROUP BY id), + addr_freq AS MATERIALIZED ( + SELECT unnest(nameaddress_vector) as id, count(*) + FROM search_name GROUP BY id) + SELECT coalesce(a.id, w.id) as id, + (CASE WHEN w.count is null THEN '{}'::JSONB + ELSE jsonb_build_object('count', w.count) END + || + CASE WHEN a.count is null THEN '{}'::JSONB + ELSE jsonb_build_object('addr_count', a.count) END) as info + FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id; + """) + cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)') + cur.execute('ANALYSE word_frequencies') + LOG.info('Update word table with recomputed frequencies') + cur.drop_table('tmp_word') + cur.execute("""CREATE TABLE tmp_word AS + SELECT word_id, word_token, type, word, + (CASE WHEN wf.info is null THEN word.info + ELSE coalesce(word.info, '{}'::jsonb) || wf.info + END) as info + FROM word LEFT JOIN word_frequencies wf + ON word.word_id = wf.id + """) + cur.drop_table('word_frequencies') + + with conn.cursor() as cur: + cur.execute('SET max_parallel_workers_per_gather TO 0') + + sqlp = SQLPreprocessor(conn, config) + sqlp.run_string(conn, + 'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"') conn.commit() self._create_base_indices(config, 'tmp_word') self._create_lookup_indices(config, 'tmp_word') @@ -210,19 +281,20 @@ class ICUTokenizer(AbstractTokenizer): return list(s[0].split('@')[0] for s in cur) - def _install_php(self, phpdir: Path, overwrite: bool = True) -> None: + def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None: """ Install the php script for the tokenizer. """ - assert self.loader is not None - php_file = self.data_dir / "tokenizer.php" + if phpdir is not None: + assert self.loader is not None + php_file = self.data_dir / "tokenizer.php" - if not php_file.exists() or overwrite: - php_file.write_text(dedent(f"""\ - None: @@ -234,28 +306,29 @@ class ICUTokenizer(AbstractTokenizer): self.loader.save_config_to_db(conn) - def _setup_db_tables(self, config: Configuration, table_name: str) -> None: + def _setup_db_tables(self, config: Configuration) -> None: """ Set up the word table and fill it with pre-computed word frequencies. """ with connect(self.dsn) as conn: with conn.cursor() as cur: - cur.drop_table(table_name) + cur.drop_table('word') sqlp = SQLPreprocessor(conn, config) sqlp.run_string(conn, """ - CREATE TABLE {{table_name}} ( + CREATE TABLE word ( word_id INTEGER, word_token text NOT NULL, type text NOT NULL, word text, info jsonb ) {{db.tablespace.search_data}}; - GRANT SELECT ON {{table_name}} TO "{{config.DATABASE_WEBUSER}}"; + GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}"; - DROP SEQUENCE IF EXISTS seq_{{table_name}}; - CREATE SEQUENCE seq_{{table_name}} start 1; - GRANT SELECT ON seq_{{table_name}} to "{{config.DATABASE_WEBUSER}}"; - """, table_name=table_name) + DROP SEQUENCE IF EXISTS seq_word; + CREATE SEQUENCE seq_word start 1; + GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}"; + """) + conn.commit() def _create_base_indices(self, config: Configuration, table_name: str) -> None: @@ -276,10 +349,11 @@ class ICUTokenizer(AbstractTokenizer): """, table_name=table_name, idx_name=name, column_type=ctype) + conn.commit() def _create_lookup_indices(self, config: Configuration, table_name: str) -> None: - """ Create addtional indexes used when running the API. + """ Create additional indexes used when running the API. """ with connect(self.dsn) as conn: sqlp = SQLPreprocessor(conn, config) @@ -289,6 +363,7 @@ class ICUTokenizer(AbstractTokenizer): ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}} """, table_name=table_name) + conn.commit() def _move_temporary_word_table(self, old: str) -> None: @@ -637,10 +712,11 @@ class ICUNameAnalyzer(AbstractAnalyzer): token_info.add_street(self._retrieve_full_tokens(item.name)) elif item.kind == 'place': if not item.suffix: - token_info.add_place(self._compute_partial_tokens(item.name)) + token_info.add_place(itertools.chain(*self._compute_name_tokens([item]))) elif not item.kind.startswith('_') and not item.suffix and \ item.kind not in ('country', 'full', 'inclusion'): - token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name)) + token_info.add_address_term(item.kind, + itertools.chain(*self._compute_name_tokens([item]))) def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]: @@ -681,36 +757,6 @@ class ICUNameAnalyzer(AbstractAnalyzer): return result - def _compute_partial_tokens(self, name: str) -> List[int]: - """ Normalize the given term, split it into partial words and return - then token list for them. - """ - assert self.conn is not None - norm_name = self._search_normalized(name) - - tokens = [] - need_lookup = [] - for partial in norm_name.split(): - token = self._cache.partials.get(partial) - if token: - tokens.append(token) - else: - need_lookup.append(partial) - - if need_lookup: - with self.conn.cursor() as cur: - cur.execute("""SELECT word, getorcreate_partial_word(word) - FROM unnest(%s) word""", - (need_lookup, )) - - for partial, token in cur: - assert token is not None - tokens.append(token) - self._cache.partials[partial] = token - - return tokens - - def _retrieve_full_tokens(self, name: str) -> List[int]: """ Get the full name token for the given name, if it exists. The name is only retrieved for the standard analyser. @@ -882,8 +928,9 @@ class _TokenInfo: def add_address_term(self, key: str, partials: Iterable[int]) -> None: """ Add additional address terms. """ - if partials: - self.address_tokens[key] = self._mk_array(partials) + array = self._mk_array(partials) + if len(array) > 2: + self.address_tokens[key] = array def set_postcode(self, postcode: Optional[str]) -> None: """ Set the postcode to the given one.