Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
-from collections import Counter
import itertools
import json
import logging
self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
-    def finalize_import(self, _):
+    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
+        with connect(self.dsn) as conn:
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
-            with conn.cursor() as cur:
-                cur.drop_table("word_frequencies")
-                LOG.info("Computing word frequencies")
-                cur.execute("""CREATE TEMP TABLE word_frequencies AS
-                                 SELECT unnest(name_vector) as id, count(*)
-                                 FROM search_name GROUP BY id""")
-                cur.execute("CREATE INDEX ON word_frequencies(id)")
-                LOG.info("Update word table with recomputed frequencies")
-                cur.execute("""UPDATE word
-                               SET info = info || jsonb_build_object('count', count)
-                               FROM word_frequencies WHERE word_id = id""")
-                cur.drop_table("word_frequencies")
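+            # Frequencies can only be recomputed when the search_name table
+            # exists; reverse-only imports do not have it.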
+            if conn.table_exists('search_name'):
+                with conn.cursor() as cur:
+                    cur.drop_table("word_frequencies")
+                    LOG.info("Computing word frequencies")
+                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                     SELECT unnest(name_vector) as id, count(*)
+                                     FROM search_name GROUP BY id""")
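+                    # index the temporary table so the UPDATE below can join
+                    # on the token id efficiently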
+ cur.execute("CREATE INDEX ON word_frequencies(id)")
+ LOG.info("Update word table with recomputed frequencies")
+ cur.execute("""UPDATE word
+ SET info = info || jsonb_build_object('count', count)
+ FROM word_frequencies WHERE word_id = id""")
+ cur.drop_table("word_frequencies")
conn.commit()
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()
- LOG.warning("Precomputing word tokens")
-
- # get partial words and their frequencies
- words = self._count_partial_terms(conn)
-
- # copy them back into the word table
- with CopyBuffer() as copystr:
- for term, cnt in words.items():
- copystr.add('w', term, json.dumps({'count': cnt}))
-
- with conn.cursor() as cur:
- copystr.copy_out(cur, 'word',
- columns=['type', 'word_token', 'info'])
- cur.execute("""UPDATE word SET word_id = nextval('seq_word')
- WHERE word_id is null and type = 'w'""")
-
- conn.commit()
-
- def _count_partial_terms(self, conn):
- """ Count the partial terms from the names in the place table.
- """
- words = Counter()
- analysis = self.loader.make_token_analysis()
-
- with conn.cursor(name="words") as cur:
- cur.execute(""" SELECT v, count(*) FROM
- (SELECT svals(name) as v FROM place)x
- WHERE length(v) < 75 GROUP BY v""")
-
- for name, cnt in cur:
- word = analysis.search.transliterate(name)
- if word and ' ' in word:
- for term in set(word.split()):
- words[term] += cnt
-
- return words
-
class LegacyICUNameAnalyzer(AbstractAnalyzer):
""" The legacy analyzer uses the ICU library for splitting names.