From 83775289523eda29fe8d82ff2e92c6faa5c76898 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Tue, 20 Jul 2021 10:27:06 +0200
Subject: [PATCH] new word table layout for icu tokenizer

The table now directly reflects the different token types.
Extra information is saved in a json structure that may be
dynamically extended in the future without affecting the
table layout.
---
 lib-sql/tokenizer/icu_tokenizer_tables.sql  | 17 +++++++++++++++++
 nominatim/tokenizer/legacy_icu_tokenizer.py |  2 +-
 2 files changed, 18 insertions(+), 1 deletion(-)
 create mode 100644 lib-sql/tokenizer/icu_tokenizer_tables.sql

diff --git a/lib-sql/tokenizer/icu_tokenizer_tables.sql b/lib-sql/tokenizer/icu_tokenizer_tables.sql
new file mode 100644
index 00000000..13e1bdb0
--- /dev/null
+++ b/lib-sql/tokenizer/icu_tokenizer_tables.sql
@@ -0,0 +1,17 @@
+-- Word table for the ICU tokenizer: 'type' holds the token type, 'info'
+-- holds extra information as json so it can be extended without a new layout.
+DROP TABLE IF EXISTS word;
+CREATE TABLE word (
+  word_id INTEGER,
+  word_token text NOT NULL,
+  type text NOT NULL,
+  info jsonb
+) {{db.tablespace.search_data}};
+
+CREATE INDEX idx_word_word_token ON word
+    USING BTREE (word_token) {{db.tablespace.search_index}};
+GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
+
+DROP SEQUENCE IF EXISTS seq_word;
+CREATE SEQUENCE seq_word start 1;
+GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index 6d3d11c1..59ad09aa 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -152,7 +152,7 @@ class LegacyICUTokenizer:
         """
         with connect(self.dsn) as conn:
             sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
             conn.commit()
 
             LOG.warning("Precomputing word tokens")
-- 
2.39.5