From ec7184c53315711b02ac66a05cf04a618e1d3ee3 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 19 Oct 2021 11:50:06 +0200 Subject: [PATCH] icu: no longer precompute terms The ICU analyzer no longer drops frequent partials, so it is no longer necessary to know the frequencies in advance. --- nominatim/tokenizer/icu_tokenizer.py | 38 ---------------------------- 1 file changed, 38 deletions(-) diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 686fbd79..2af0bcb2 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -2,7 +2,6 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using libICU instead of the PostgreSQL module. """ -from collections import Counter import itertools import json import logging @@ -161,43 +160,6 @@ class LegacyICUTokenizer(AbstractTokenizer): sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql') conn.commit() - LOG.warning("Precomputing word tokens") - - # get partial words and their frequencies - words = self._count_partial_terms(conn) - - # copy them back into the word table - with CopyBuffer() as copystr: - for term, cnt in words.items(): - copystr.add('w', term, json.dumps({'count': cnt})) - - with conn.cursor() as cur: - copystr.copy_out(cur, 'word', - columns=['type', 'word_token', 'info']) - cur.execute("""UPDATE word SET word_id = nextval('seq_word') - WHERE word_id is null and type = 'w'""") - - conn.commit() - - def _count_partial_terms(self, conn): - """ Count the partial terms from the names in the place table. - """ - words = Counter() - analysis = self.loader.make_token_analysis() - - with conn.cursor(name="words") as cur: - cur.execute(""" SELECT v, count(*) FROM - (SELECT svals(name) as v FROM place)x - WHERE length(v) < 75 GROUP BY v""") - - for name, cnt in cur: - word = analysis.search.transliterate(name) - if word and ' ' in word: - for term in set(word.split()): - words[term] += cnt - - return words - class LegacyICUNameAnalyzer(AbstractAnalyzer): """ The legacy analyzer uses the ICU library for splitting names. -- 2.39.5