From 8b8dfc46ebee5f78a91685dc83dc9382d21aad0e Mon Sep 17 00:00:00 2001
From: AntoJvlt <antonin.jolivat@gmail.com>
Date: Mon, 17 May 2021 12:40:50 +0200
Subject: [PATCH] Added --no-replace command for special phrases importation
 and added corresponding tests

---
 nominatim/clicmd/special_phrases.py           |  5 ++++-
 nominatim/tokenizer/legacy_icu_tokenizer.py   |  4 ++--
 nominatim/tokenizer/legacy_tokenizer.py       |  4 ++--
 .../tools/special_phrases/sp_importer.py      | 17 +++++++++++------
 test/python/dummy_tokenizer.py                |  2 +-
 test/python/test_cli.py                       | 19 ++++++++++++++-----
 test/python/test_tokenizer_legacy.py          | 19 ++++++++++++++++---
 test/python/test_tokenizer_legacy_icu.py      | 19 ++++++++++++++++---
 .../test_tools_import_special_phrases.py      | 15 +++++++++------
 test/python/test_tools_sp_csv_loader.py       |  1 -
 test/python/test_tools_sp_wiki_loader.py      |  2 +-
 11 files changed, 76 insertions(+), 31 deletions(-)

diff --git a/nominatim/clicmd/special_phrases.py b/nominatim/clicmd/special_phrases.py
index 66fb41ed..b20a4101 100644
--- a/nominatim/clicmd/special_phrases.py
+++ b/nominatim/clicmd/special_phrases.py
@@ -27,6 +27,8 @@ class ImportSpecialPhrases:
                            help='Import special phrases from the OSM wiki to the database.')
         group.add_argument('--import-from-csv', metavar='FILE',
                            help='Import special phrases from a CSV file.')
+        group.add_argument('--no-replace', action='store_true',
+                           help='Keep the old phrases and only add the new ones.')
 
     @staticmethod
     def run(args):
@@ -51,7 +53,8 @@ class ImportSpecialPhrases:
         from ..tokenizer import factory as tokenizer_factory
 
         tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+        should_replace = not args.no_replace
         with connect(args.config.get_libpq_dsn()) as db_connection:
             SPImporter(
                 args.config, args.phplib_dir, db_connection, loader
-            ).import_phrases(tokenizer)
+            ).import_phrases(tokenizer, should_replace)
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index 065fdb03..e07602d9 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -306,7 +306,7 @@ class LegacyICUNameAnalyzer:
             #                WHERE word_id is null and type = 'postcode'""")
 
 
-    def update_special_phrases(self, phrases):
+    def update_special_phrases(self, phrases, should_replace):
         """ Replace the search index for special phrases with the new phrases.
         """
         norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
@@ -345,7 +345,7 @@ class LegacyICUNameAnalyzer:
                               columns=['word', 'word_token', 'class', 'type',
                                        'operator', 'search_name_count'])
 
-            if to_delete:
+            if to_delete and should_replace:
                 psycopg2.extras.execute_values(
                     cur,
                     """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py
index 438a5aff..5bd45c51 100644
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -314,7 +314,7 @@ class LegacyNameAnalyzer:
                                  FROM location_postcode) x""")
 
 
-    def update_special_phrases(self, phrases):
+    def update_special_phrases(self, phrases, should_replace):
         """ Replace the search index for special phrases with the new phrases.
         """
         norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
@@ -343,7 +343,7 @@ class LegacyNameAnalyzer:
                            FROM (VALUES %s) as v(name, class, type, op))""",
                     to_add)
 
-            if to_delete:
+            if to_delete and should_replace:
                 psycopg2.extras.execute_values(
                     cur,
                     """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
diff --git a/nominatim/tools/special_phrases/sp_importer.py b/nominatim/tools/special_phrases/sp_importer.py
index e82f721e..ef4e85bd 100644
--- a/nominatim/tools/special_phrases/sp_importer.py
+++ b/nominatim/tools/special_phrases/sp_importer.py
@@ -23,7 +23,7 @@ LOG = logging.getLogger()
 class SPImporter():
     # pylint: disable-msg=too-many-instance-attributes
     """
-        Class handling the process of special phrases importations into the database.
+        Class handling the process of special phrases importation into the database.
 
         Take a sp loader which load the phrases from an external source.
     """
@@ -42,10 +42,14 @@ class SPImporter():
         #special phrases class/type on the wiki.
         self.table_phrases_to_delete = set()
 
-    def import_phrases(self, tokenizer):
+    def import_phrases(self, tokenizer, should_replace):
         """
-            Iterate through all specified languages and
-            extract corresponding special phrases from the wiki.
+            Iterate through all SpecialPhrases extracted from the
+            loader and import them into the database.
+
+            If should_replace is set to True only the loaded phrases
+            will be kept into the database. All other phrases already
+            in the database will be removed.
         """
         LOG.warning('Special phrases importation starting')
         self._fetch_existing_place_classtype_tables()
@@ -60,11 +64,12 @@ class SPImporter():
                     class_type_pairs.update(result)
 
         self._create_place_classtype_table_and_indexes(class_type_pairs)
-        self._remove_non_existent_tables_from_db()
+        if should_replace:
+            self._remove_non_existent_tables_from_db()
         self.db_connection.commit()
 
         with tokenizer.name_analyzer() as analyzer:
-            analyzer.update_special_phrases(self.word_phrases)
+            analyzer.update_special_phrases(self.word_phrases, should_replace)
 
         LOG.warning('Import done.')
         self.statistics_handler.notify_import_done()
diff --git a/test/python/dummy_tokenizer.py b/test/python/dummy_tokenizer.py
index 6352a644..2e61a245 100644
--- a/test/python/dummy_tokenizer.py
+++ b/test/python/dummy_tokenizer.py
@@ -54,7 +54,7 @@ class DummyNameAnalyzer:
     def add_postcodes_from_db(self):
         pass
 
-    def update_special_phrases(self, phrases):
+    def update_special_phrases(self, phrases, should_replace):
         self.analyser_cache['special_phrases'] = phrases
 
     def add_country_names(self, code, names):
diff --git a/test/python/test_cli.py b/test/python/test_cli.py
index 9810aee7..1d89ec69 100644
--- a/test/python/test_cli.py
+++ b/test/python/test_cli.py
@@ -255,18 +255,27 @@ def test_index_command(mock_func_factory, temp_db_cursor, tokenizer_mock,
     assert bnd_mock.called == do_bnds
     assert rank_mock.called == do_ranks
 
-def test_special_phrases_wiki_command(temp_db, mock_func_factory, tokenizer_mock):
+@pytest.mark.parametrize("no_replace", [(True), (False)])
+def test_special_phrases_wiki_command(temp_db, mock_func_factory, tokenizer_mock, no_replace):
     func = mock_func_factory(nominatim.clicmd.special_phrases.SPImporter, 'import_phrases')
 
-    call_nominatim('special-phrases', '--import-from-wiki')
+    if no_replace:
+        call_nominatim('special-phrases', '--import-from-wiki', '--no-replace')
+    else:
+        call_nominatim('special-phrases', '--import-from-wiki')
 
     assert func.called == 1
 
-def test_special_phrases_csv_command(temp_db, mock_func_factory, tokenizer_mock):
+@pytest.mark.parametrize("no_replace", [(True), (False)])
+def test_special_phrases_csv_command(temp_db, mock_func_factory, tokenizer_mock, no_replace):
     func = mock_func_factory(nominatim.clicmd.special_phrases.SPImporter, 'import_phrases')
-    testdata = Path('__file__') / '..' / '..' / 'testdb'
+    testdata = SRC_DIR / 'test' / 'testdb'
     csv_path = str((testdata / 'full_en_phrases_test.csv').resolve())
-    call_nominatim('special-phrases', '--import-from-csv', csv_path)
+
+    if no_replace:
+        call_nominatim('special-phrases', '--import-from-csv', csv_path, '--no-replace')
+    else:
+        call_nominatim('special-phrases', '--import-from-csv', csv_path)
 
     assert func.called == 1
 
diff --git a/test/python/test_tokenizer_legacy.py b/test/python/test_tokenizer_legacy.py
index c567a4c1..80147172 100644
--- a/test/python/test_tokenizer_legacy.py
+++ b/test/python/test_tokenizer_legacy.py
@@ -209,7 +209,7 @@ def test_update_special_phrase_empty_table(analyzer, word_table, temp_db_cursor,
         ("KÃ¶nig bei", "amenity", "royal", "near"),
         ("KÃ¶nige", "amenity", "royal", "-"),
         ("strasse", "highway", "primary", "in")
-    ])
+    ], True)
 
     assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
                                      FROM word WHERE class != 'place'""") \
@@ -226,11 +226,24 @@ def test_update_special_phrase_delete_all(analyzer, word_table, temp_db_cursor,
 
     assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
 
-    analyzer.update_special_phrases([])
+    analyzer.update_special_phrases([], True)
 
     assert 0 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
 
 
+def test_update_special_phrases_no_replace(analyzer, word_table, temp_db_cursor,
+                                          make_standard_name):
+    temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
+                              VALUES (' foo', 'foo', 'amenity', 'prison', 'in'),
+                                     (' bar', 'bar', 'highway', 'road', null)""")
+
+    assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
+
+    analyzer.update_special_phrases([], False)
+
+    assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
+
+
 def test_update_special_phrase_modify(analyzer, word_table, temp_db_cursor,
                                       make_standard_name):
     temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
@@ -243,7 +256,7 @@ def test_update_special_phrase_modify(analyzer, word_table, temp_db_cursor,
       ('prison', 'amenity', 'prison', 'in'),
       ('bar', 'highway', 'road', '-'),
       ('garden', 'leisure', 'garden', 'near')
-    ])
+    ], True)
 
     assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
                                      FROM word WHERE class != 'place'""") \
diff --git a/test/python/test_tokenizer_legacy_icu.py b/test/python/test_tokenizer_legacy_icu.py
index 836f15b9..92a83249 100644
--- a/test/python/test_tokenizer_legacy_icu.py
+++ b/test/python/test_tokenizer_legacy_icu.py
@@ -159,7 +159,7 @@ def test_update_special_phrase_empty_table(analyzer, word_table, temp_db_cursor)
             ("KÃ¶nig bei", "amenity", "royal", "near"),
             ("KÃ¶nige", "amenity", "royal", "-"),
             ("street", "highway", "primary", "in")
-        ])
+        ], True)
 
     assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
                                      FROM word WHERE class != 'place'""") \
@@ -176,11 +176,24 @@ def test_update_special_phrase_delete_all(analyzer, word_table, temp_db_cursor):
     assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
 
     with analyzer() as a:
-        a.update_special_phrases([])
+        a.update_special_phrases([], True)
 
     assert 0 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
 
 
+def test_update_special_phrases_no_replace(analyzer, word_table, temp_db_cursor,):
+    temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
+                              VALUES (' FOO', 'foo', 'amenity', 'prison', 'in'),
+                                     (' BAR', 'bar', 'highway', 'road', null)""")
+
+    assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
+
+    with analyzer() as a:
+        a.update_special_phrases([], False)
+
+    assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""")
+
+
 def test_update_special_phrase_modify(analyzer, word_table, temp_db_cursor):
     temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
                               VALUES (' FOO', 'foo', 'amenity', 'prison', 'in'),
@@ -193,7 +206,7 @@ def test_update_special_phrase_modify(analyzer, word_table, temp_db_cursor):
           ('prison', 'amenity', 'prison', 'in'),
           ('bar', 'highway', 'road', '-'),
           ('garden', 'leisure', 'garden', 'near')
-        ])
+        ], True)
 
     assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
                                      FROM word WHERE class != 'place'""") \
diff --git a/test/python/test_tools_import_special_phrases.py b/test/python/test_tools_import_special_phrases.py
index 6f451ade..1b4ab191 100644
--- a/test/python/test_tools_import_special_phrases.py
+++ b/test/python/test_tools_import_special_phrases.py
@@ -185,8 +185,9 @@ def test_remove_non_existent_tables_from_db(sp_importer, default_phrases,
             tables_result[0][0] == 'place_classtype_testclasstypetable_to_keep'
         )
 
+@pytest.mark.parametrize("should_replace", [(True), (False)])
 def test_import_phrases(monkeypatch, temp_db_conn, def_config, sp_importer,
-                          placex_table, tokenizer_mock):
+                        placex_table, tokenizer_mock, should_replace):
     """
         Check that the main import_phrases() method is well executed.
         It should create the place_classtype table, the place_id and centroid indexes,
@@ -202,10 +203,10 @@ def test_import_phrases(monkeypatch, temp_db_conn, def_config, sp_importer,
             CREATE TABLE place_classtype_wrongclass_wrongtype();""")
     
     monkeypatch.setattr('nominatim.tools.special_phrases.sp_wiki_loader.SPWikiLoader._get_wiki_content',
-                    mock_get_wiki_content)
+                        mock_get_wiki_content)
 
     tokenizer = tokenizer_mock()
-    sp_importer.import_phrases(tokenizer)
+    sp_importer.import_phrases(tokenizer, should_replace)
 
     assert len(tokenizer.analyser_cache['special_phrases']) == 18
 
@@ -216,7 +217,8 @@ def test_import_phrases(monkeypatch, temp_db_conn, def_config, sp_importer,
     assert check_placeid_and_centroid_indexes(temp_db_conn, class_test, type_test)
     assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, class_test, type_test)
     assert check_table_exist(temp_db_conn, 'amenity', 'animal_shelter')
-    assert not check_table_exist(temp_db_conn, 'wrong_class', 'wrong_type')
+    if should_replace:
+        assert not check_table_exist(temp_db_conn, 'wrong_class', 'wrong_type')
 
     #Format (query, should_return_something_bool) use to easily execute all asserts
     queries_tests = set()
@@ -237,7 +239,8 @@ def test_import_phrases(monkeypatch, temp_db_conn, def_config, sp_importer,
         WHERE table_schema='public'
         AND table_name = 'place_classtype_wrongclass_wrongtype';
     """
-    queries_tests.add((query_wrong_table, False))
+    if should_replace:
+        queries_tests.add((query_wrong_table, False))
 
     with temp_db_conn.cursor() as temp_db_cursor:
         for query in queries_tests:
@@ -247,7 +250,7 @@ def test_import_phrases(monkeypatch, temp_db_conn, def_config, sp_importer,
             else:
                 assert not temp_db_cursor.fetchone()
 
-def mock_get_wiki_content(lang):
+def mock_get_wiki_content(self, lang):
     """
         Mock the _get_wiki_content() method to return
         static xml test file content.
diff --git a/test/python/test_tools_sp_csv_loader.py b/test/python/test_tools_sp_csv_loader.py
index 628dfd16..ccc9996d 100644
--- a/test/python/test_tools_sp_csv_loader.py
+++ b/test/python/test_tools_sp_csv_loader.py
@@ -16,7 +16,6 @@ def test_parse_csv(sp_csv_loader):
     phrases = sp_csv_loader.parse_csv()
     assert check_phrases_content(phrases)
 
-
 def test_next(sp_csv_loader):
     """
         Test objects returned from the next() method.
diff --git a/test/python/test_tools_sp_wiki_loader.py b/test/python/test_tools_sp_wiki_loader.py
index ad8e22c3..b4f6d529 100644
--- a/test/python/test_tools_sp_wiki_loader.py
+++ b/test/python/test_tools_sp_wiki_loader.py
@@ -47,7 +47,7 @@ def sp_wiki_loader(monkeypatch, def_config):
                         mock_get_wiki_content)
     return loader
 
-def mock_get_wiki_content(lang):
+def mock_get_wiki_content(self, lang):
     """
         Mock the _get_wiki_content() method to return
         static xml test file content.
-- 
2.39.5