+++ /dev/null
-DROP TABLE IF EXISTS word_frequencies;
-CREATE TABLE word_frequencies AS
- SELECT unnest(name_vector) as id, count(*) FROM search_name GROUP BY id;
-
-CREATE INDEX idx_word_frequencies ON word_frequencies(id);
-
-UPDATE word SET search_name_count = count
- FROM word_frequencies
- WHERE word_token like ' %' and word_id = id;
-
-DROP TABLE word_frequencies;
"Postcode updates on a frozen database is not possible.")
if args.word_counts:
- LOG.warning('Recompute frequency of full-word search terms')
- refresh.recompute_word_counts(args.config.get_libpq_dsn(), args.sqllib_dir)
+ LOG.warning('Recompute word statistics')
+ self._get_tokenizer(args.config).update_statistics()
if args.address_levels:
cfg = Path(args.config.ADDRESS_LEVEL_CONFIG)
pass
+ @abstractmethod
+ def update_statistics(self) -> None:
+ """ Recompute any tokenizer statistics necessary for efficient lookup.
+ This function is meant to be called from time to time by the user
+ to improve performance. However, the tokenizer must not depend on
+ it to be called in order to work.
+
+ The function takes no arguments and returns nothing. Which
+ statistics are recomputed is left to the implementing tokenizer.
+ """
+ pass
+
+
@abstractmethod
def name_analyzer(self) -> AbstractAnalyzer:
""" Create a new analyzer for tokenizing names and queries
return None
+ def update_statistics(self):
+ """ Recompute frequencies for all name words.
+
+ Counts how often each id occurs in the `name_vector` arrays of
+ the `search_name` table and stores the result under the key
+ 'count' in the `info` column of the `word` rows with the
+ matching `word_id`. The function opens its own connection
+ from `self.dsn` and commits the change itself.
+ """
+ with connect(self.dsn) as conn:
+ with conn.cursor() as cur:
+ # Make sure the table name is free before recreating it.
+ # Assumes drop_table() tolerates a missing table - TODO confirm.
+ cur.drop_table("word_frequencies")
+ LOG.info("Computing word frequencies")
+ # One row per id with the number of its occurrences across all
+ # name_vector arrays. The table is a TEMP table of this session.
+ cur.execute("""CREATE TEMP TABLE word_frequencies AS
+ SELECT unnest(name_vector) as id, count(*)
+ FROM search_name GROUP BY id""")
+ # Presumably speeds up the join on id in the UPDATE below.
+ cur.execute("CREATE INDEX ON word_frequencies(id)")
+ LOG.info("Update word table with recomputed frequencies")
+ # Only words that have a row in word_frequencies are touched;
+ # all other words keep whatever count they had before.
+ # NOTE(review): assumes info is a non-NULL jsonb value. A NULL
+ # info would stay NULL after the concatenation - confirm.
+ cur.execute("""UPDATE word
+ SET info = info || jsonb_build_object('count', count)
+ FROM word_frequencies WHERE word_id = id""")
+ # The helper table is only needed for the update above.
+ cur.drop_table("word_frequencies")
+ conn.commit()
+
+
def name_analyzer(self):
""" Create a new analyzer for tokenizing names and queries
using this tokinzer. Analyzers are context managers and should
self._save_config(conn, config)
+ def update_statistics(self):
+ """ Recompute the frequency of full words.
+
+ Counts how often each id occurs in the `name_vector` arrays of
+ the `search_name` table and writes the result into the
+ `search_name_count` column of the matching `word` rows. Only
+ rows whose `word_token` starts with a blank are updated. The
+ function opens its own connection from `self.dsn` and commits
+ the change itself.
+ """
+ with connect(self.dsn) as conn:
+ with conn.cursor() as cur:
+ # Make sure the table name is free before recreating it.
+ # Assumes drop_table() tolerates a missing table - TODO confirm.
+ cur.drop_table("word_frequencies")
+ LOG.info("Computing word frequencies")
+ # One row per id with the number of its occurrences across all
+ # name_vector arrays. The table is a TEMP table of this session.
+ cur.execute("""CREATE TEMP TABLE word_frequencies AS
+ SELECT unnest(name_vector) as id, count(*)
+ FROM search_name GROUP BY id""")
+ # Presumably speeds up the join on id in the UPDATE below.
+ cur.execute("CREATE INDEX ON word_frequencies(id)")
+ LOG.info("Update word table with recomputed frequencies")
+ # The leading blank in word_token presumably marks full-word
+ # tokens (see docstring) - verify against the tokenizer.
+ # Words without a row in word_frequencies keep their old count.
+ cur.execute("""UPDATE word SET search_name_count = count
+ FROM word_frequencies
+ WHERE word_token like ' %' and word_id = id""")
+ # The helper table is only needed for the update above.
+ cur.drop_table("word_frequencies")
+ conn.commit()
+
def name_analyzer(self):
""" Create a new analyzer for tokenizing names and queries
using this tokinzer. Analyzers are context managers and should
LOG = logging.getLogger()
-def recompute_word_counts(dsn, sql_dir):
- """ Compute the frequency of full-word search terms.
- """
- execute_file(dsn, sql_dir / 'words_from_search_name.sql')
-
-
def _add_address_level_rows_from_entry(rows, entry):
""" Converts a single entry from the JSON format for address rank
descriptions into a flat format suitable for inserting into a