From: AntoJvlt
Date: Mon, 17 May 2021 11:52:35 +0000 (+0200)
Subject: Resolve conflicts
X-Git-Tag: v4.0.0~79^2~1
X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/3206bf59df0213d24bd3e11df7dd2abaebf89911?ds=inline;hp=-c

Resolve conflicts
---

3206bf59df0213d24bd3e11df7dd2abaebf89911
diff --combined docs/admin/Import.md
index def513aa,2686942e..264b46f4
--- a/docs/admin/Import.md
+++ b/docs/admin/Import.md
@@@ -83,15 -83,19 +83,19 @@@ The file is about 400MB and adds aroun
`nominatim refresh --wiki-data --importance`. Updating importances for a planet
can take a couple of hours.

- ### Great Britain, USA postcodes
+ ### External postcodes

- Nominatim can use postcodes from an external source to improve searches that
- involve a GB or US postcode. This data can be optionally downloaded into the
- project directory:
+ Nominatim can use postcodes from an external source to improve searching with
+ postcodes. We provide precomputed postcode sets for the US (using TIGER data)
+ and the UK (using the [CodePoint OpenData set](https://osdatahub.os.uk/downloads/open/CodePointOpen)).
+ This data can be optionally downloaded into the project directory:

     cd $PROJECT_DIR
-     wget https://www.nominatim.org/data/gb_postcode_data.sql.gz
-     wget https://www.nominatim.org/data/us_postcode_data.sql.gz
+     wget https://www.nominatim.org/data/gb_postcodes.csv.gz
+     wget https://www.nominatim.org/data/us_postcodes.csv.gz
+
+ You can also add your own custom postcode sources, see
+ [Customization of postcodes](Customization.md#external-postcode-data).

## Choosing the data to import
@@@ -248,6 -252,9 +252,9 @@@ to verify that your installation is wor
`http://localhost:8088/status.php` and you should see the message `OK`.
You can also run a search query, e.g. `http://localhost:8088/search.php?q=Berlin`.

+ Note that search queries are not supported for reverse-only imports. You can run a
+ reverse query, e.g. `http://localhost:8088/reverse.php?lat=27.1750090510034&lon=78.04209025`.
+
To run Nominatim via webservers like Apache or nginx, please
read the [Deployment chapter](Deployment.md).
@@@ -270,60 -277,7 +277,60 @@@ If you want to be able to search for pl
[special key phrases](https://wiki.openstreetmap.org/wiki/Nominatim/Special_Phrases)
you also need to import these key phrases like this:

-     nominatim special-phrases --import-from-wiki
 +```sh
 +nominatim special-phrases --import-from-wiki
 +```

Note that this command downloads the phrases from the wiki link above. You need
internet access for the step.
 +
 +You can also import phrases from a CSV file using the following command:
 +
 +```sh
 +nominatim special-phrases --import-from-csv <csv file>
 +```
 +
 +Note that the two previous commands replace the phrases in your database:
 +if you import phrases from a CSV file, only the phrases present in the CSV
 +file are kept in the database, and all other phrases are removed.
 +
 +If you only want to add new phrases without touching the existing ones, add
 +the argument `--no-replace` to the import command. For example:
 +
 +```sh
 +nominatim special-phrases --import-from-csv <csv file> --no-replace
 +```
 +
 +This adds the phrases from the CSV file to the database without removing
 +existing ones.
 +
 +## Installing Tiger housenumber data for the US
 +
 +Nominatim is able to use the official [TIGER](https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
 +address set to complement the OSM house number data in the US.
You can add +TIGER data to your own Nominatim instance by following these steps. The +entire US adds about 10GB to your database. + + 1. Get preprocessed TIGER 2020 data: + + cd $PROJECT_DIR + wget https://nominatim.org/data/tiger2020-nominatim-preprocessed.tar.gz + + 2. Import the data into your Nominatim database: + + nominatim add-data --tiger-data tiger2020-nominatim-preprocessed.tar.gz + + 3. Enable use of the Tiger data in your `.env` by adding: + + echo NOMINATIM_USE_US_TIGER_DATA=yes >> .env + + 4. Apply the new settings: + + nominatim refresh --functions + + +See the [developer's guide](../develop/data-sources.md#us-census-tiger) for more +information on how the data got preprocessed. + diff --combined nominatim/tokenizer/legacy_icu_tokenizer.py index e07602d9,7205ddef..156e99ec --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@@ -263,6 -263,16 +263,16 @@@ class LegacyICUNameAnalyzer """ return self.normalizer.transliterate(phrase) + @staticmethod + def normalize_postcode(postcode): + """ Convert the postcode to a standardized form. + + This function must yield exactly the same result as the SQL function + 'token_normalized_postcode()'. + """ + return postcode.strip().upper() + + @functools.lru_cache(maxsize=1024) def make_standard_word(self, name): """ Create the normalised version of the input. @@@ -285,28 -295,47 +295,47 @@@ return self.transliterator.transliterate(hnr) - def add_postcodes_from_db(self): - """ Add postcodes from the location_postcode table to the word table. + def update_postcodes_from_db(self): + """ Update postcode tokens in the word table from the location_postcode + table. """ + to_delete = [] copystr = io.StringIO() with self.conn.cursor() as cur: - cur.execute("SELECT distinct(postcode) FROM location_postcode") - for (postcode, ) in cur: - copystr.write(postcode) - copystr.write('\t ') - copystr.write(self.transliterator.transliterate(postcode)) - copystr.write('\tplace\tpostcode\t0\n') - - copystr.seek(0) - cur.copy_from(copystr, 'word', - columns=['word', 'word_token', 'class', 'type', - 'search_name_count']) - # Don't really need an ID for postcodes.... - # cur.execute("""UPDATE word SET word_id = nextval('seq_word') - # WHERE word_id is null and type = 'postcode'""") + # This finds us the rows in location_postcode and word that are + # missing in the other table. + cur.execute("""SELECT * FROM + (SELECT pc, word FROM + (SELECT distinct(postcode) as pc FROM location_postcode) p + FULL JOIN + (SELECT word FROM word + WHERE class ='place' and type = 'postcode') w + ON pc = word) x + WHERE pc is null or word is null""") + + for postcode, word in cur: + if postcode is None: + to_delete.append(word) + else: + copystr.write(postcode) + copystr.write('\t ') + copystr.write(self.transliterator.transliterate(postcode)) + copystr.write('\tplace\tpostcode\t0\n') + + if to_delete: + cur.execute("""DELETE FROM WORD + WHERE class ='place' and type = 'postcode' + and word = any(%s) + """, (to_delete, )) + + if copystr.getvalue(): + copystr.seek(0) + cur.copy_from(copystr, 'word', + columns=['word', 'word_token', 'class', 'type', + 'search_name_count']) - def update_special_phrases(self, phrases): + def update_special_phrases(self, phrases, should_replace): """ Replace the search index for special phrases with the new phrases. 
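            If should_replace is False, already existing phrases are left
            untouched and only new phrases are added; otherwise phrases
            that are missing from the new set are deleted.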
""" norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3]) @@@ -345,7 -374,7 +374,7 @@@ columns=['word', 'word_token', 'class', 'type', 'operator', 'search_name_count']) - if to_delete: + if to_delete and should_replace: psycopg2.extras.execute_values( cur, """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op) @@@ -435,22 -464,25 +464,25 @@@ def _add_postcode(self, postcode): """ Make sure the normalized postcode is present in the word table. """ - if re.search(r'[:,;]', postcode) is None and not postcode in self._cache.postcodes: - term = self.make_standard_word(postcode) - if not term: - return - - with self.conn.cursor() as cur: - # no word_id needed for postcodes - cur.execute("""INSERT INTO word (word, word_token, class, type, - search_name_count) - (SELECT pc, %s, 'place', 'postcode', 0 - FROM (VALUES (%s)) as v(pc) - WHERE NOT EXISTS - (SELECT * FROM word - WHERE word = pc and class='place' and type='postcode')) - """, (' ' + term, postcode)) - self._cache.postcodes.add(postcode) + if re.search(r'[:,;]', postcode) is None: + postcode = self.normalize_postcode(postcode) + + if postcode not in self._cache.postcodes: + term = self.make_standard_word(postcode) + if not term: + return + + with self.conn.cursor() as cur: + # no word_id needed for postcodes + cur.execute("""INSERT INTO word (word, word_token, class, type, + search_name_count) + (SELECT pc, %s, 'place', 'postcode', 0 + FROM (VALUES (%s)) as v(pc) + WHERE NOT EXISTS + (SELECT * FROM word + WHERE word = pc and class='place' and type='postcode')) + """, (' ' + term, postcode)) + self._cache.postcodes.add(postcode) @staticmethod def _split_housenumbers(hnrs): diff --combined nominatim/tokenizer/legacy_tokenizer.py index 5bd45c51,3808c68e..4c03678d --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@@ -305,16 -305,54 +305,54 @@@ class LegacyNameAnalyzer return self.normalizer.transliterate(phrase) - def add_postcodes_from_db(self): - """ Add postcodes from the location_postcode table to the word table. + @staticmethod + def normalize_postcode(postcode): + """ Convert the postcode to a standardized form. + + This function must yield exactly the same result as the SQL function + 'token_normalized_postcode()'. + """ + return postcode.strip().upper() + + + def update_postcodes_from_db(self): + """ Update postcode tokens in the word table from the location_postcode + table. """ with self.conn.cursor() as cur: - cur.execute("""SELECT count(create_postcode_id(pc)) - FROM (SELECT distinct(postcode) as pc - FROM location_postcode) x""") + # This finds us the rows in location_postcode and word that are + # missing in the other table. + cur.execute("""SELECT * FROM + (SELECT pc, word FROM + (SELECT distinct(postcode) as pc FROM location_postcode) p + FULL JOIN + (SELECT word FROM word + WHERE class ='place' and type = 'postcode') w + ON pc = word) x + WHERE pc is null or word is null""") + + to_delete = [] + to_add = [] + + for postcode, word in cur: + if postcode is None: + to_delete.append(word) + else: + to_add.append(postcode) + + if to_delete: + cur.execute("""DELETE FROM WORD + WHERE class ='place' and type = 'postcode' + and word = any(%s) + """, (to_delete, )) + if to_add: + cur.execute("""SELECT count(create_postcode_id(pc)) + FROM unnest(%s) as pc + """, (to_add, )) + - def update_special_phrases(self, phrases): + def update_special_phrases(self, phrases, should_replace): """ Replace the search index for special phrases with the new phrases. 
""" norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3]) @@@ -343,7 -381,7 +381,7 @@@ FROM (VALUES %s) as v(name, class, type, op))""", to_add) - if to_delete: + if to_delete and should_replace: psycopg2.extras.execute_values( cur, """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op) @@@ -416,12 -454,8 +454,8 @@@ def _add_postcode(self, postcode): """ Make sure the normalized postcode is present in the word table. """ - def _create_postcode_from_db(pcode): - with self.conn.cursor() as cur: - cur.execute('SELECT create_postcode_id(%s)', (pcode, )) - if re.search(r'[:,;]', postcode) is None: - self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db) + self._cache.add_postcode(self.conn, self.normalize_postcode(postcode)) class _TokenInfo: @@@ -552,16 -586,19 +586,19 @@@ class _TokenCache FROM generate_series(1, 100) as i""") self._cached_housenumbers = {str(r[0]) : r[1] for r in cur} - # Get postcodes that are already saved - postcodes = OrderedDict() - with conn.cursor() as cur: - cur.execute("""SELECT word FROM word - WHERE class ='place' and type = 'postcode'""") - for row in cur: - postcodes[row[0]] = None - self.postcodes = _LRU(maxsize=32, init_data=postcodes) + # For postcodes remember the ones that have already been added + self.postcodes = set() def get_housenumber(self, number): """ Get a housenumber token from the cache. """ return self._cached_housenumbers.get(number) + + + def add_postcode(self, conn, postcode): + """ Make sure the given postcode is in the database. + """ + if postcode not in self.postcodes: + with conn.cursor() as cur: + cur.execute('SELECT create_postcode_id(%s)', (postcode, )) + self.postcodes.add(postcode) diff --combined test/bdd/steps/nominatim_environment.py index 937978b0,7eb6f3dd..0e986f21 --- a/test/bdd/steps/nominatim_environment.py +++ b/test/bdd/steps/nominatim_environment.py @@@ -9,6 -9,7 +9,7 @@@ sys.path.insert(1, str((Path(__file__) from nominatim import cli from nominatim.config import Configuration + from nominatim.db.connection import _Connection from nominatim.tools import refresh from nominatim.tokenizer import factory as tokenizer_factory from steps.utils import run_script @@@ -54,7 -55,7 +55,7 @@@ class NominatimEnvironment dbargs['user'] = self.db_user if self.db_pass: dbargs['password'] = self.db_pass - conn = psycopg2.connect(**dbargs) + conn = psycopg2.connect(connection_factory=_Connection, **dbargs) return conn def next_code_coverage_file(self): @@@ -110,8 -111,13 +111,13 @@@ self.website_dir.cleanup() self.website_dir = tempfile.TemporaryDirectory() + + try: + conn = self.connect_database(dbname) + except: + conn = False refresh.setup_website(Path(self.website_dir.name) / 'website', - self.get_test_config()) + self.get_test_config(), conn) def get_test_config(self): @@@ -200,8 -206,11 +206,8 @@@ phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve()) run_script(['psql', '-d', self.api_test_db, '-f', phrase_file]) else: - # XXX Temporary use the wiki while there is no CSV import - # available. - self.test_env['NOMINATIM_LANGUAGES'] = 'en' - self.run_nominatim('special-phrases', '--import-from-wiki') - del self.test_env['NOMINATIM_LANGUAGES'] + csv_path = str((testdata / 'full_en_phrases_test.csv').resolve()) + self.run_nominatim('special-phrases', '--import-from-csv', csv_path) except: self.db_drop_database(self.api_test_db) raise @@@ -228,13 -237,13 +234,13 @@@ """ Setup a test against a fresh, empty test database. 
""" self.setup_template_db() - self.write_nominatim_config(self.test_db) conn = self.connect_database(self.template_db) conn.set_isolation_level(0) cur = conn.cursor() cur.execute('DROP DATABASE IF EXISTS {}'.format(self.test_db)) cur.execute('CREATE DATABASE {} TEMPLATE = {}'.format(self.test_db, self.template_db)) conn.close() + self.write_nominatim_config(self.test_db) context.db = self.connect_database(self.test_db) context.db.autocommit = True psycopg2.extras.register_hstore(context.db, globally=False) diff --combined test/python/dummy_tokenizer.py index 2e61a245,0a86ba8d..18e322ca --- a/test/python/dummy_tokenizer.py +++ b/test/python/dummy_tokenizer.py @@@ -51,10 -51,13 +51,13 @@@ class DummyNameAnalyzer def close(self): pass - def add_postcodes_from_db(self): + def normalize_postcode(self, postcode): + return postcode + + def update_postcodes_from_db(self): pass - def update_special_phrases(self, phrases): + def update_special_phrases(self, phrases, should_replace): self.analyser_cache['special_phrases'] = phrases def add_country_names(self, code, names): diff --combined test/python/test_cli.py index 1d89ec69,e8d2e052..27ea0d5d --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@@ -120,7 -120,7 +120,7 @@@ def test_import_full(temp_db, mock_func mock_func_factory(nominatim.tools.database_import, 'create_search_indices'), mock_func_factory(nominatim.tools.database_import, 'create_country_names'), mock_func_factory(nominatim.tools.refresh, 'load_address_levels_from_file'), - mock_func_factory(nominatim.tools.postcodes, 'import_postcodes'), + mock_func_factory(nominatim.tools.postcodes, 'update_postcodes'), mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'), mock_func_factory(nominatim.tools.refresh, 'setup_website'), mock_func_factory(nominatim.db.properties, 'set_property') @@@ -143,7 -143,7 +143,7 @@@ def test_import_continue_load_data(temp mock_func_factory(nominatim.tools.database_import, 'load_data'), mock_func_factory(nominatim.tools.database_import, 'create_search_indices'), mock_func_factory(nominatim.tools.database_import, 'create_country_names'), - mock_func_factory(nominatim.tools.postcodes, 'import_postcodes'), + mock_func_factory(nominatim.tools.postcodes, 'update_postcodes'), mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'), mock_func_factory(nominatim.tools.refresh, 'setup_website'), mock_func_factory(nominatim.db.properties, 'set_property') @@@ -255,45 -255,34 +255,51 @@@ def test_index_command(mock_func_factor assert bnd_mock.called == do_bnds assert rank_mock.called == do_ranks -def test_special_phrases_command(temp_db, mock_func_factory, tokenizer_mock): - func = mock_func_factory(nominatim.clicmd.special_phrases.SpecialPhrasesImporter, 'import_from_wiki') +@pytest.mark.parametrize("no_replace", [(True), (False)]) +def test_special_phrases_wiki_command(temp_db, mock_func_factory, tokenizer_mock, no_replace): + func = mock_func_factory(nominatim.clicmd.special_phrases.SPImporter, 'import_phrases') - call_nominatim('special-phrases', '--import-from-wiki') + if no_replace: + call_nominatim('special-phrases', '--import-from-wiki', '--no-replace') + else: + call_nominatim('special-phrases', '--import-from-wiki') + + assert func.called == 1 + +@pytest.mark.parametrize("no_replace", [(True), (False)]) +def test_special_phrases_csv_command(temp_db, mock_func_factory, tokenizer_mock, no_replace): + func = mock_func_factory(nominatim.clicmd.special_phrases.SPImporter, 'import_phrases') + testdata = SRC_DIR / 'test' / 'testdb' + 
csv_path = str((testdata / 'full_en_phrases_test.csv').resolve()) + + if no_replace: + call_nominatim('special-phrases', '--import-from-csv', csv_path, '--no-replace') + else: + call_nominatim('special-phrases', '--import-from-csv', csv_path) assert func.called == 1 @pytest.mark.parametrize("command,func", [ - ('postcodes', 'update_postcodes'), ('word-counts', 'recompute_word_counts'), ('address-levels', 'load_address_levels_from_file'), ('wiki-data', 'import_wikipedia_articles'), ('importance', 'recompute_importance'), ('website', 'setup_website'), ]) - def test_refresh_command(mock_func_factory, temp_db, command, func): + def test_refresh_command(mock_func_factory, temp_db, command, func, tokenizer_mock): func_mock = mock_func_factory(nominatim.tools.refresh, func) assert 0 == call_nominatim('refresh', '--' + command) assert func_mock.called == 1 + def test_refresh_postcodes(mock_func_factory, temp_db, tokenizer_mock): + func_mock = mock_func_factory(nominatim.tools.postcodes, 'update_postcodes') + idx_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_postcodes') + + assert 0 == call_nominatim('refresh', '--postcodes') + assert func_mock.called == 1 + def test_refresh_create_functions(mock_func_factory, temp_db, tokenizer_mock): func_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions') @@@ -302,7 -291,7 +308,7 @@@ assert tokenizer_mock.update_sql_functions_called - def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db): + def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db, tokenizer_mock): calls = [] monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles', lambda *args, **kwargs: calls.append('import') or 0) diff --combined test/python/test_tokenizer_legacy.py index 80147172,15ae50a4..76b51f71 --- a/test/python/test_tokenizer_legacy.py +++ b/test/python/test_tokenizer_legacy.py @@@ -77,12 -77,12 +77,12 @@@ def make_standard_name(temp_db_cursor) @pytest.fixture - def create_postcode_id(table_factory, temp_db_cursor): - table_factory('out_postcode_table', 'postcode TEXT') - + def create_postcode_id(temp_db_cursor): temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION create_postcode_id(postcode TEXT) RETURNS BOOLEAN AS $$ - INSERT INTO out_postcode_table VALUES (postcode) RETURNING True; + INSERT INTO word (word_token, word, class, type) + VALUES (' ' || postcode, postcode, 'place', 'postcode') + RETURNING True; $$ LANGUAGE SQL""") @@@ -192,27 -192,38 +192,38 @@@ def test_normalize(analyzer) assert analyzer.normalize('TEsT') == 'test' - def test_add_postcodes_from_db(analyzer, table_factory, temp_db_cursor, - create_postcode_id): + def test_update_postcodes_from_db_empty(analyzer, table_factory, word_table, + create_postcode_id): table_factory('location_postcode', 'postcode TEXT', content=(('1234',), ('12 34',), ('AB23',), ('1234',))) - analyzer.add_postcodes_from_db() + analyzer.update_postcodes_from_db() + + assert word_table.count() == 3 + assert word_table.get_postcodes() == {'1234', '12 34', 'AB23'} + + + def test_update_postcodes_from_db_add_and_remove(analyzer, table_factory, word_table, + create_postcode_id): + table_factory('location_postcode', 'postcode TEXT', + content=(('1234',), ('45BC', ), ('XX45', ))) + word_table.add_postcode(' 1234', '1234') + word_table.add_postcode(' 5678', '5678') + + analyzer.update_postcodes_from_db() - assert temp_db_cursor.row_set("SELECT * from out_postcode_table") \ - == set((('1234', ), ('12 34', ), ('AB23',))) + assert word_table.count() == 3 + 
assert word_table.get_postcodes() == {'1234', '45BC', 'XX45'} - def test_update_special_phrase_empty_table(analyzer, word_table, temp_db_cursor, - make_standard_name): + def test_update_special_phrase_empty_table(analyzer, word_table, make_standard_name): analyzer.update_special_phrases([ ("König bei", "amenity", "royal", "near"), ("Könige", "amenity", "royal", "-"), ("strasse", "highway", "primary", "in") - ]) + ], True) - assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator - FROM word WHERE class != 'place'""") \ + assert word_table.get_special() \ == set(((' könig bei', 'könig bei', 'amenity', 'royal', 'near'), (' könige', 'könige', 'amenity', 'royal', None), (' strasse', 'strasse', 'highway', 'primary', 'in'))) @@@ -220,46 -231,29 +231,42 @@@ def test_update_special_phrase_delete_all(analyzer, word_table, temp_db_cursor, make_standard_name): - temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator) - VALUES (' foo', 'foo', 'amenity', 'prison', 'in'), - (' bar', 'bar', 'highway', 'road', null)""") + word_table.add_special(' foo', 'foo', 'amenity', 'prison', 'in') + word_table.add_special(' bar', 'bar', 'highway', 'road', None) - assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""") + assert word_table.count_special() == 2 - analyzer.update_special_phrases([]) + analyzer.update_special_phrases([], True) - assert 0 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""") + assert word_table.count_special() == 0 +def test_update_special_phrases_no_replace(analyzer, word_table, temp_db_cursor, + make_standard_name): + temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator) + VALUES (' foo', 'foo', 'amenity', 'prison', 'in'), + (' bar', 'bar', 'highway', 'road', null)""") + + assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""") + + analyzer.update_special_phrases([], False) + + assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""") + + - def test_update_special_phrase_modify(analyzer, word_table, temp_db_cursor, - make_standard_name): - temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator) - VALUES (' foo', 'foo', 'amenity', 'prison', 'in'), - (' bar', 'bar', 'highway', 'road', null)""") + def test_update_special_phrase_modify(analyzer, word_table, make_standard_name): + word_table.add_special(' foo', 'foo', 'amenity', 'prison', 'in') + word_table.add_special(' bar', 'bar', 'highway', 'road', None) - assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""") + assert word_table.count_special() == 2 analyzer.update_special_phrases([ ('prison', 'amenity', 'prison', 'in'), ('bar', 'highway', 'road', '-'), ('garden', 'leisure', 'garden', 'near') - ]) + ], True) - assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator - FROM word WHERE class != 'place'""") \ + assert word_table.get_special() \ == set(((' prison', 'prison', 'amenity', 'prison', 'in'), (' bar', 'bar', 'highway', 'road', None), (' garden', 'garden', 'leisure', 'garden', 'near'))) @@@ -273,21 -267,17 +280,17 @@@ def test_process_place_names(analyzer, @pytest.mark.parametrize('pc', ['12345', 'AB 123', '34-345']) - def test_process_place_postcode(analyzer, temp_db_cursor, create_postcode_id, pc): - + def test_process_place_postcode(analyzer, create_postcode_id, word_table, pc): info = analyzer.process_place({'address': {'postcode' : pc}}) - 
assert temp_db_cursor.row_set("SELECT * from out_postcode_table") \ - == set(((pc, ),)) + assert word_table.get_postcodes() == {pc, } @pytest.mark.parametrize('pc', ['12:23', 'ab;cd;f', '123;836']) - def test_process_place_bad_postcode(analyzer, temp_db_cursor, create_postcode_id, - pc): - + def test_process_place_bad_postcode(analyzer, create_postcode_id, word_table, pc): info = analyzer.process_place({'address': {'postcode' : pc}}) - assert 0 == temp_db_cursor.scalar("SELECT count(*) from out_postcode_table") + assert not word_table.get_postcodes() @pytest.mark.parametrize('hnr', ['123a', '1', '101']) diff --combined test/python/test_tokenizer_legacy_icu.py index 92a83249,8dc5c830..abb058d2 --- a/test/python/test_tokenizer_legacy_icu.py +++ b/test/python/test_tokenizer_legacy_icu.py @@@ -141,16 -141,28 +141,28 @@@ def test_make_standard_hnr(analyzer) assert a._make_standard_hnr('iv') == 'IV' - def test_add_postcodes_from_db(analyzer, word_table, table_factory, temp_db_cursor): + def test_update_postcodes_from_db_empty(analyzer, table_factory, word_table): table_factory('location_postcode', 'postcode TEXT', content=(('1234',), ('12 34',), ('AB23',), ('1234',))) with analyzer() as a: - a.add_postcodes_from_db() + a.update_postcodes_from_db() - assert temp_db_cursor.row_set("""SELECT word, word_token from word - """) \ - == set((('1234', ' 1234'), ('12 34', ' 12 34'), ('AB23', ' AB23'))) + assert word_table.count() == 3 + assert word_table.get_postcodes() == {'1234', '12 34', 'AB23'} + + + def test_update_postcodes_from_db_add_and_remove(analyzer, table_factory, word_table): + table_factory('location_postcode', 'postcode TEXT', + content=(('1234',), ('45BC', ), ('XX45', ))) + word_table.add_postcode(' 1234', '1234') + word_table.add_postcode(' 5678', '5678') + + with analyzer() as a: + a.update_postcodes_from_db() + + assert word_table.count() == 3 + assert word_table.get_postcodes() == {'1234', '45BC', 'XX45'} def test_update_special_phrase_empty_table(analyzer, word_table, temp_db_cursor): @@@ -159,7 -171,7 +171,7 @@@ ("König bei", "amenity", "royal", "near"), ("Könige", "amenity", "royal", "-"), ("street", "highway", "primary", "in") - ]) + ], True) assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator FROM word WHERE class != 'place'""") \ @@@ -176,24 -188,11 +188,24 @@@ def test_update_special_phrase_delete_a assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""") with analyzer() as a: - a.update_special_phrases([]) + a.update_special_phrases([], True) assert 0 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""") +def test_update_special_phrases_no_replace(analyzer, word_table, temp_db_cursor,): + temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator) + VALUES (' FOO', 'foo', 'amenity', 'prison', 'in'), + (' BAR', 'bar', 'highway', 'road', null)""") + + assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""") + + with analyzer() as a: + a.update_special_phrases([], False) + + assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""") + + def test_update_special_phrase_modify(analyzer, word_table, temp_db_cursor): temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator) VALUES (' FOO', 'foo', 'amenity', 'prison', 'in'), @@@ -206,7 -205,7 +218,7 @@@ ('prison', 'amenity', 'prison', 'in'), ('bar', 'highway', 'road', '-'), ('garden', 'leisure', 'garden', 'near') - ]) + ], True) assert 
temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator FROM word WHERE class != 'place'""") \ @@@ -224,22 -223,19 +236,19 @@@ def test_process_place_names(analyzer, @pytest.mark.parametrize('pc', ['12345', 'AB 123', '34-345']) - def test_process_place_postcode(analyzer, temp_db_cursor, pc): + def test_process_place_postcode(analyzer, word_table, pc): with analyzer() as a: info = a.process_place({'address': {'postcode' : pc}}) - assert temp_db_cursor.row_set("""SELECT word FROM word - WHERE class = 'place' and type = 'postcode'""") \ - == set(((pc, ),)) + assert word_table.get_postcodes() == {pc, } @pytest.mark.parametrize('pc', ['12:23', 'ab;cd;f', '123;836']) - def test_process_place_bad_postcode(analyzer, temp_db_cursor, pc): + def test_process_place_bad_postcode(analyzer, word_table, pc): with analyzer() as a: info = a.process_place({'address': {'postcode' : pc}}) - assert 0 == temp_db_cursor.scalar("""SELECT count(*) FROM word - WHERE class = 'place' and type = 'postcode'""") + assert not word_table.get_postcodes() @pytest.mark.parametrize('hnr', ['123a', '1', '101'])
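
Both tokenizers in this commit move from add-only postcode handling to a full
reconciliation of the `word` table against `location_postcode`. The sketch
below models that reconciliation rule in plain Python; `sync_postcodes` is a
hypothetical helper written for illustration only and is not part of the
commit.

```python
# Sketch of the rule update_postcodes_from_db() implements with a FULL JOIN:
# postcodes present in location_postcode but missing from the word table are
# added, and word entries without a matching postcode are deleted. Plain sets
# stand in for the two tables here.

def sync_postcodes(location_postcodes, word_postcodes):
    """Return (to_add, to_delete) making word_postcodes equal location_postcodes."""
    to_add = set(location_postcodes) - set(word_postcodes)
    to_delete = set(word_postcodes) - set(location_postcodes)
    return to_add, to_delete

# Mirrors test_update_postcodes_from_db_add_and_remove: '45BC' and 'XX45' get
# added, the stale '5678' gets deleted, and '1234' stays untouched.
assert sync_postcodes({'1234', '45BC', 'XX45'}, {'1234', '5678'}) == (
    {'45BC', 'XX45'}, {'5678'})
```

Keep in mind that `normalize_postcode()` (strip and upper-case) runs before
postcodes reach the word table and, as its docstring notes, must stay in sync
with the SQL function `token_normalized_postcode()`.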