From eb6814d74e509fb49986989bb4c60539a2871d76 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 21 Jul 2021 11:37:14 +0200 Subject: [PATCH] convert word info column to json before copying --- nominatim/db/utils.py | 1 + nominatim/tokenizer/legacy_icu_tokenizer.py | 9 ++--- test/python/test_db_utils.py | 37 +++++++++++++++++++++ 3 files changed, 43 insertions(+), 4 deletions(-) diff --git a/nominatim/db/utils.py b/nominatim/db/utils.py index 9a4a41a5..bb7faa25 100644 --- a/nominatim/db/utils.py +++ b/nominatim/db/utils.py @@ -65,6 +65,7 @@ _SQL_TRANSLATION = {ord(u'\\'): u'\\\\', ord(u'\t'): u'\\t', ord(u'\n'): u'\\n'} + class CopyBuffer: """ Data collector for the copy_from command. """ diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index 14fa5b60..e019ef67 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -4,6 +4,7 @@ libICU instead of the PostgreSQL module. """ from collections import Counter import itertools +import json import logging import re from textwrap import dedent @@ -173,7 +174,7 @@ class LegacyICUTokenizer: # copy them back into the word table with CopyBuffer() as copystr: for k, v in words.items(): - copystr.add('w', k, {'count': v}) + copystr.add('w', k, json.dumps({'count': v})) with conn.cursor() as cur: copystr.copy_out(cur, 'word', @@ -287,7 +288,7 @@ class LegacyICUNameAnalyzer: to_delete.append(word) else: copystr.add(self.name_processor.get_search_normalized(postcode), - 'P', {'postcode': postcode}) + 'P', json.dumps({'postcode': postcode})) if to_delete: cur.execute("""DELETE FROM WORD @@ -337,8 +338,8 @@ class LegacyICUNameAnalyzer: term = self.name_processor.get_search_normalized(word) if term: copystr.add(term, 'S', - {'word': word, 'class': cls, 'type': typ, - 'op': oper if oper in ('in', 'near') else None}) + json.dumps({'word': word, 'class': cls, 'type': typ, + 'op': oper if oper in ('in', 'near') else None})) added += 1 copystr.copy_out(cursor, 'word', diff --git a/test/python/test_db_utils.py b/test/python/test_db_utils.py index 545cc58f..9eea7ed1 100644 --- a/test/python/test_db_utils.py +++ b/test/python/test_db_utils.py @@ -1,6 +1,8 @@ """ Tests for DB utility functions in db.utils """ +import json + import pytest import nominatim.db.utils as db_utils @@ -115,3 +117,38 @@ class TestCopyBuffer: +class TestCopyBufferJson: + TABLE_NAME = 'copytable' + + @pytest.fixture(autouse=True) + def setup_test_table(self, table_factory): + table_factory(self.TABLE_NAME, 'colA INT, colB JSONB') + + + def table_rows(self, cursor): + cursor.execute('SELECT * FROM ' + self.TABLE_NAME) + results = {k: v for k,v in cursor} + + assert len(results) == cursor.rowcount + + return results + + + def test_json_object(self, temp_db_cursor): + with db_utils.CopyBuffer() as buf: + buf.add(1, json.dumps({'test': 'value', 'number': 1})) + + buf.copy_out(temp_db_cursor, self.TABLE_NAME) + + assert self.table_rows(temp_db_cursor) == \ + {1: {'test': 'value', 'number': 1}} + + + def test_json_object_special_chras(self, temp_db_cursor): + with db_utils.CopyBuffer() as buf: + buf.add(1, json.dumps({'te\tst': 'va\nlue', 'nu"mber': None})) + + buf.copy_out(temp_db_cursor, self.TABLE_NAME) + + assert self.table_rows(temp_db_cursor) == \ + {1: {'te\tst': 'va\nlue', 'nu"mber': None}} -- 2.39.5