From: AntoJvlt
Date: Mon, 17 May 2021 11:52:35 +0000 (+0200)
Subject: Resolve conflicts
X-Git-Tag: v4.0.0~79^2~1
X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/3206bf59df0213d24bd3e11df7dd2abaebf89911?ds=inline;hp=-c

Resolve conflicts
---

3206bf59df0213d24bd3e11df7dd2abaebf89911
diff --combined docs/admin/Import.md
index def513aa,2686942e..264b46f4
--- a/docs/admin/Import.md
+++ b/docs/admin/Import.md
@@@ -83,15 -83,19 +83,19 @@@ The file is about 400MB and adds aroun
`nominatim refresh --wiki-data --importance`. Updating importances for a planet
can take a couple of hours.

- ### Great Britain, USA postcodes
+ ### External postcodes

- Nominatim can use postcodes from an external source to improve searches that
- involve a GB or US postcode. This data can be optionally downloaded into the
- project directory:
+ Nominatim can use postcodes from an external source to improve searching with
+ postcodes. We provide precomputed postcode sets for the US (using TIGER data)
+ and the UK (using the [CodePoint OpenData set](https://osdatahub.os.uk/downloads/open/CodePointOpen)).
+ This data can be optionally downloaded into the project directory:

     cd $PROJECT_DIR
-     wget https://www.nominatim.org/data/gb_postcode_data.sql.gz
-     wget https://www.nominatim.org/data/us_postcode_data.sql.gz
+     wget https://www.nominatim.org/data/gb_postcodes.csv.gz
+     wget https://www.nominatim.org/data/us_postcodes.csv.gz
+
+ You can also add your own custom postcode sources, see
+ [Customization of postcodes](Customization.md#external-postcode-data).

## Choosing the data to import
@@@ -248,6 -252,9 +252,9 @@@ to verify that your installation is wor
`http://localhost:8088/status.php` and you should see the message `OK`.
You can also run a search query, e.g. `http://localhost:8088/search.php?q=Berlin`.

+ Note that search queries are not supported for reverse-only imports. You can run a
+ reverse query, e.g. `http://localhost:8088/reverse.php?lat=27.1750090510034&lon=78.04209025`.
+
To run Nominatim via webservers like Apache or nginx, please
read the [Deployment chapter](Deployment.md).
@@@ -270,60 -277,7 +277,60 @@@ If you want to be able to search for pl
[special key phrases](https://wiki.openstreetmap.org/wiki/Nominatim/Special_Phrases)
you also need to import these key phrases like this:

-     nominatim special-phrases --import-from-wiki
 +```sh
 +nominatim special-phrases --import-from-wiki
 +```

Note that this command downloads the phrases from the wiki link above. You need
internet access for the step.
 +
 +You can also import phrases from a CSV file using the following command:
 +
 +```sh
 +nominatim special-phrases --import-from-csv <csv file>
 +```
 +
 +Note that the two previous commands replace the phrases in your database:
 +if you import phrases from a CSV file, only the phrases present in the CSV
 +file are kept in the database, and all other phrases are removed.
 +
 +If you only want to add new phrases without touching the existing ones, add
 +the argument `--no-replace` to the import command. For example:
 +
 +```sh
 +nominatim special-phrases --import-from-csv <csv file> --no-replace
 +```
 +
 +This adds the phrases from the CSV file to the database without removing
 +existing ones.
 +
 +## Installing Tiger housenumber data for the US
 +
 +Nominatim is able to use the official [TIGER](https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
 +address set to complement the OSM house number data in the US.
You can add +TIGER data to your own Nominatim instance by following these steps. The +entire US adds about 10GB to your database. + + 1. Get preprocessed TIGER 2020 data: + + cd $PROJECT_DIR + wget https://nominatim.org/data/tiger2020-nominatim-preprocessed.tar.gz + + 2. Import the data into your Nominatim database: + + nominatim add-data --tiger-data tiger2020-nominatim-preprocessed.tar.gz + + 3. Enable use of the Tiger data in your `.env` by adding: + + echo NOMINATIM_USE_US_TIGER_DATA=yes >> .env + + 4. Apply the new settings: + + nominatim refresh --functions + + +See the [developer's guide](../develop/data-sources.md#us-census-tiger) for more +information on how the data got preprocessed. + diff --combined nominatim/tokenizer/legacy_icu_tokenizer.py index e07602d9,7205ddef..156e99ec --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@@ -263,6 -263,16 +263,16 @@@ class LegacyICUNameAnalyzer """ return self.normalizer.transliterate(phrase) + @staticmethod + def normalize_postcode(postcode): + """ Convert the postcode to a standardized form. + + This function must yield exactly the same result as the SQL function + 'token_normalized_postcode()'. + """ + return postcode.strip().upper() + + @functools.lru_cache(maxsize=1024) def make_standard_word(self, name): """ Create the normalised version of the input. @@@ -285,28 -295,47 +295,47 @@@ return self.transliterator.transliterate(hnr) - def add_postcodes_from_db(self): - """ Add postcodes from the location_postcode table to the word table. + def update_postcodes_from_db(self): + """ Update postcode tokens in the word table from the location_postcode + table. """ + to_delete = [] copystr = io.StringIO() with self.conn.cursor() as cur: - cur.execute("SELECT distinct(postcode) FROM location_postcode") - for (postcode, ) in cur: - copystr.write(postcode) - copystr.write('\t ') - copystr.write(self.transliterator.transliterate(postcode)) - copystr.write('\tplace\tpostcode\t0\n') - - copystr.seek(0) - cur.copy_from(copystr, 'word', - columns=['word', 'word_token', 'class', 'type', - 'search_name_count']) - # Don't really need an ID for postcodes.... - # cur.execute("""UPDATE word SET word_id = nextval('seq_word') - # WHERE word_id is null and type = 'postcode'""") + # This finds us the rows in location_postcode and word that are + # missing in the other table. + cur.execute("""SELECT * FROM + (SELECT pc, word FROM + (SELECT distinct(postcode) as pc FROM location_postcode) p + FULL JOIN + (SELECT word FROM word + WHERE class ='place' and type = 'postcode') w + ON pc = word) x + WHERE pc is null or word is null""") + + for postcode, word in cur: + if postcode is None: + to_delete.append(word) + else: + copystr.write(postcode) + copystr.write('\t ') + copystr.write(self.transliterator.transliterate(postcode)) + copystr.write('\tplace\tpostcode\t0\n') + + if to_delete: + cur.execute("""DELETE FROM WORD + WHERE class ='place' and type = 'postcode' + and word = any(%s) + """, (to_delete, )) + + if copystr.getvalue(): + copystr.seek(0) + cur.copy_from(copystr, 'word', + columns=['word', 'word_token', 'class', 'type', + 'search_name_count']) - def update_special_phrases(self, phrases): + def update_special_phrases(self, phrases, should_replace): """ Replace the search index for special phrases with the new phrases. 
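            If should_replace is False, already existing phrases are left
            untouched and only new phrases are added; otherwise phrases
            that are missing from the new set are deleted.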
""" norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3]) @@@ -345,7 -374,7 +374,7 @@@ columns=['word', 'word_token', 'class', 'type', 'operator', 'search_name_count']) - if to_delete: + if to_delete and should_replace: psycopg2.extras.execute_values( cur, """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op) @@@ -435,22 -464,25 +464,25 @@@ def _add_postcode(self, postcode): """ Make sure the normalized postcode is present in the word table. """ - if re.search(r'[:,;]', postcode) is None and not postcode in self._cache.postcodes: - term = self.make_standard_word(postcode) - if not term: - return - - with self.conn.cursor() as cur: - # no word_id needed for postcodes - cur.execute("""INSERT INTO word (word, word_token, class, type, - search_name_count) - (SELECT pc, %s, 'place', 'postcode', 0 - FROM (VALUES (%s)) as v(pc) - WHERE NOT EXISTS - (SELECT * FROM word - WHERE word = pc and class='place' and type='postcode')) - """, (' ' + term, postcode)) - self._cache.postcodes.add(postcode) + if re.search(r'[:,;]', postcode) is None: + postcode = self.normalize_postcode(postcode) + + if postcode not in self._cache.postcodes: + term = self.make_standard_word(postcode) + if not term: + return + + with self.conn.cursor() as cur: + # no word_id needed for postcodes + cur.execute("""INSERT INTO word (word, word_token, class, type, + search_name_count) + (SELECT pc, %s, 'place', 'postcode', 0 + FROM (VALUES (%s)) as v(pc) + WHERE NOT EXISTS + (SELECT * FROM word + WHERE word = pc and class='place' and type='postcode')) + """, (' ' + term, postcode)) + self._cache.postcodes.add(postcode) @staticmethod def _split_housenumbers(hnrs): diff --combined nominatim/tokenizer/legacy_tokenizer.py index 5bd45c51,3808c68e..4c03678d --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@@ -305,16 -305,54 +305,54 @@@ class LegacyNameAnalyzer return self.normalizer.transliterate(phrase) - def add_postcodes_from_db(self): - """ Add postcodes from the location_postcode table to the word table. + @staticmethod + def normalize_postcode(postcode): + """ Convert the postcode to a standardized form. + + This function must yield exactly the same result as the SQL function + 'token_normalized_postcode()'. + """ + return postcode.strip().upper() + + + def update_postcodes_from_db(self): + """ Update postcode tokens in the word table from the location_postcode + table. """ with self.conn.cursor() as cur: - cur.execute("""SELECT count(create_postcode_id(pc)) - FROM (SELECT distinct(postcode) as pc - FROM location_postcode) x""") + # This finds us the rows in location_postcode and word that are + # missing in the other table. + cur.execute("""SELECT * FROM + (SELECT pc, word FROM + (SELECT distinct(postcode) as pc FROM location_postcode) p + FULL JOIN + (SELECT word FROM word + WHERE class ='place' and type = 'postcode') w + ON pc = word) x + WHERE pc is null or word is null""") + + to_delete = [] + to_add = [] + + for postcode, word in cur: + if postcode is None: + to_delete.append(word) + else: + to_add.append(postcode) + + if to_delete: + cur.execute("""DELETE FROM WORD + WHERE class ='place' and type = 'postcode' + and word = any(%s) + """, (to_delete, )) + if to_add: + cur.execute("""SELECT count(create_postcode_id(pc)) + FROM unnest(%s) as pc + """, (to_add, )) + - def update_special_phrases(self, phrases): + def update_special_phrases(self, phrases, should_replace): """ Replace the search index for special phrases with the new phrases. 
""" norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3]) @@@ -343,7 -381,7 +381,7 @@@ FROM (VALUES %s) as v(name, class, type, op))""", to_add) - if to_delete: + if to_delete and should_replace: psycopg2.extras.execute_values( cur, """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op) @@@ -416,12 -454,8 +454,8 @@@ def _add_postcode(self, postcode): """ Make sure the normalized postcode is present in the word table. """ - def _create_postcode_from_db(pcode): - with self.conn.cursor() as cur: - cur.execute('SELECT create_postcode_id(%s)', (pcode, )) - if re.search(r'[:,;]', postcode) is None: - self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db) + self._cache.add_postcode(self.conn, self.normalize_postcode(postcode)) class _TokenInfo: @@@ -552,16 -586,19 +586,19 @@@ class _TokenCache FROM generate_series(1, 100) as i""") self._cached_housenumbers = {str(r[0]) : r[1] for r in cur} - # Get postcodes that are already saved - postcodes = OrderedDict() - with conn.cursor() as cur: - cur.execute("""SELECT word FROM word - WHERE class ='place' and type = 'postcode'""") - for row in cur: - postcodes[row[0]] = None - self.postcodes = _LRU(maxsize=32, init_data=postcodes) + # For postcodes remember the ones that have already been added + self.postcodes = set() def get_housenumber(self, number): """ Get a housenumber token from the cache. """ return self._cached_housenumbers.get(number) + + + def add_postcode(self, conn, postcode): + """ Make sure the given postcode is in the database. + """ + if postcode not in self.postcodes: + with conn.cursor() as cur: + cur.execute('SELECT create_postcode_id(%s)', (postcode, )) + self.postcodes.add(postcode) diff --combined test/bdd/steps/nominatim_environment.py index 937978b0,7eb6f3dd..0e986f21 --- a/test/bdd/steps/nominatim_environment.py +++ b/test/bdd/steps/nominatim_environment.py @@@ -9,6 -9,7 +9,7 @@@ sys.path.insert(1, str((Path(__file__) from nominatim import cli from nominatim.config import Configuration + from nominatim.db.connection import _Connection from nominatim.tools import refresh from nominatim.tokenizer import factory as tokenizer_factory from steps.utils import run_script @@@ -54,7 -55,7 +55,7 @@@ class NominatimEnvironment dbargs['user'] = self.db_user if self.db_pass: dbargs['password'] = self.db_pass - conn = psycopg2.connect(**dbargs) + conn = psycopg2.connect(connection_factory=_Connection, **dbargs) return conn def next_code_coverage_file(self): @@@ -110,8 -111,13 +111,13 @@@ self.website_dir.cleanup() self.website_dir = tempfile.TemporaryDirectory() + + try: + conn = self.connect_database(dbname) + except: + conn = False refresh.setup_website(Path(self.website_dir.name) / 'website', - self.get_test_config()) + self.get_test_config(), conn) def get_test_config(self): @@@ -200,8 -206,11 +206,8 @@@ phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve()) run_script(['psql', '-d', self.api_test_db, '-f', phrase_file]) else: - # XXX Temporary use the wiki while there is no CSV import - # available. - self.test_env['NOMINATIM_LANGUAGES'] = 'en' - self.run_nominatim('special-phrases', '--import-from-wiki') - del self.test_env['NOMINATIM_LANGUAGES'] + csv_path = str((testdata / 'full_en_phrases_test.csv').resolve()) + self.run_nominatim('special-phrases', '--import-from-csv', csv_path) except: self.db_drop_database(self.api_test_db) raise @@@ -228,13 -237,13 +234,13 @@@ """ Setup a test against a fresh, empty test database. 
""" self.setup_template_db() - self.write_nominatim_config(self.test_db) conn = self.connect_database(self.template_db) conn.set_isolation_level(0) cur = conn.cursor() cur.execute('DROP DATABASE IF EXISTS {}'.format(self.test_db)) cur.execute('CREATE DATABASE {} TEMPLATE = {}'.format(self.test_db, self.template_db)) conn.close() + self.write_nominatim_config(self.test_db) context.db = self.connect_database(self.test_db) context.db.autocommit = True psycopg2.extras.register_hstore(context.db, globally=False) diff --combined test/python/dummy_tokenizer.py index 2e61a245,0a86ba8d..18e322ca --- a/test/python/dummy_tokenizer.py +++ b/test/python/dummy_tokenizer.py @@@ -51,10 -51,13 +51,13 @@@ class DummyNameAnalyzer def close(self): pass - def add_postcodes_from_db(self): + def normalize_postcode(self, postcode): + return postcode + + def update_postcodes_from_db(self): pass - def update_special_phrases(self, phrases): + def update_special_phrases(self, phrases, should_replace): self.analyser_cache['special_phrases'] = phrases def add_country_names(self, code, names): diff --combined test/python/test_cli.py index 1d89ec69,e8d2e052..27ea0d5d --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@@ -120,7 -120,7 +120,7 @@@ def test_import_full(temp_db, mock_func mock_func_factory(nominatim.tools.database_import, 'create_search_indices'), mock_func_factory(nominatim.tools.database_import, 'create_country_names'), mock_func_factory(nominatim.tools.refresh, 'load_address_levels_from_file'), - mock_func_factory(nominatim.tools.postcodes, 'import_postcodes'), + mock_func_factory(nominatim.tools.postcodes, 'update_postcodes'), mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'), mock_func_factory(nominatim.tools.refresh, 'setup_website'), mock_func_factory(nominatim.db.properties, 'set_property') @@@ -143,7 -143,7 +143,7 @@@ def test_import_continue_load_data(temp mock_func_factory(nominatim.tools.database_import, 'load_data'), mock_func_factory(nominatim.tools.database_import, 'create_search_indices'), mock_func_factory(nominatim.tools.database_import, 'create_country_names'), - mock_func_factory(nominatim.tools.postcodes, 'import_postcodes'), + mock_func_factory(nominatim.tools.postcodes, 'update_postcodes'), mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'), mock_func_factory(nominatim.tools.refresh, 'setup_website'), mock_func_factory(nominatim.db.properties, 'set_property') @@@ -255,45 -255,34 +255,51 @@@ def test_index_command(mock_func_factor assert bnd_mock.called == do_bnds assert rank_mock.called == do_ranks -def test_special_phrases_command(temp_db, mock_func_factory, tokenizer_mock): - func = mock_func_factory(nominatim.clicmd.special_phrases.SpecialPhrasesImporter, 'import_from_wiki') +@pytest.mark.parametrize("no_replace", [(True), (False)]) +def test_special_phrases_wiki_command(temp_db, mock_func_factory, tokenizer_mock, no_replace): + func = mock_func_factory(nominatim.clicmd.special_phrases.SPImporter, 'import_phrases') - call_nominatim('special-phrases', '--import-from-wiki') + if no_replace: + call_nominatim('special-phrases', '--import-from-wiki', '--no-replace') + else: + call_nominatim('special-phrases', '--import-from-wiki') + + assert func.called == 1 + +@pytest.mark.parametrize("no_replace", [(True), (False)]) +def test_special_phrases_csv_command(temp_db, mock_func_factory, tokenizer_mock, no_replace): + func = mock_func_factory(nominatim.clicmd.special_phrases.SPImporter, 'import_phrases') + testdata = SRC_DIR / 'test' / 'testdb' + 
csv_path = str((testdata / 'full_en_phrases_test.csv').resolve()) + + if no_replace: + call_nominatim('special-phrases', '--import-from-csv', csv_path, '--no-replace') + else: + call_nominatim('special-phrases', '--import-from-csv', csv_path) assert func.called == 1 @pytest.mark.parametrize("command,func", [ - ('postcodes', 'update_postcodes'), ('word-counts', 'recompute_word_counts'), ('address-levels', 'load_address_levels_from_file'), ('wiki-data', 'import_wikipedia_articles'), ('importance', 'recompute_importance'), ('website', 'setup_website'), ]) - def test_refresh_command(mock_func_factory, temp_db, command, func): + def test_refresh_command(mock_func_factory, temp_db, command, func, tokenizer_mock): func_mock = mock_func_factory(nominatim.tools.refresh, func) assert 0 == call_nominatim('refresh', '--' + command) assert func_mock.called == 1 + def test_refresh_postcodes(mock_func_factory, temp_db, tokenizer_mock): + func_mock = mock_func_factory(nominatim.tools.postcodes, 'update_postcodes') + idx_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_postcodes') + + assert 0 == call_nominatim('refresh', '--postcodes') + assert func_mock.called == 1 + def test_refresh_create_functions(mock_func_factory, temp_db, tokenizer_mock): func_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions') @@@ -302,7 -291,7 +308,7 @@@ assert tokenizer_mock.update_sql_functions_called - def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db): + def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db, tokenizer_mock): calls = [] monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles', lambda *args, **kwargs: calls.append('import') or 0) diff --combined test/python/test_tokenizer_legacy.py index 80147172,15ae50a4..76b51f71 --- a/test/python/test_tokenizer_legacy.py +++ b/test/python/test_tokenizer_legacy.py @@@ -77,12 -77,12 +77,12 @@@ def make_standard_name(temp_db_cursor) @pytest.fixture - def create_postcode_id(table_factory, temp_db_cursor): - table_factory('out_postcode_table', 'postcode TEXT') - + def create_postcode_id(temp_db_cursor): temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION create_postcode_id(postcode TEXT) RETURNS BOOLEAN AS $$ - INSERT INTO out_postcode_table VALUES (postcode) RETURNING True; + INSERT INTO word (word_token, word, class, type) + VALUES (' ' || postcode, postcode, 'place', 'postcode') + RETURNING True; $$ LANGUAGE SQL""") @@@ -192,27 -192,38 +192,38 @@@ def test_normalize(analyzer) assert analyzer.normalize('TEsT') == 'test' - def test_add_postcodes_from_db(analyzer, table_factory, temp_db_cursor, - create_postcode_id): + def test_update_postcodes_from_db_empty(analyzer, table_factory, word_table, + create_postcode_id): table_factory('location_postcode', 'postcode TEXT', content=(('1234',), ('12 34',), ('AB23',), ('1234',))) - analyzer.add_postcodes_from_db() + analyzer.update_postcodes_from_db() + + assert word_table.count() == 3 + assert word_table.get_postcodes() == {'1234', '12 34', 'AB23'} + + + def test_update_postcodes_from_db_add_and_remove(analyzer, table_factory, word_table, + create_postcode_id): + table_factory('location_postcode', 'postcode TEXT', + content=(('1234',), ('45BC', ), ('XX45', ))) + word_table.add_postcode(' 1234', '1234') + word_table.add_postcode(' 5678', '5678') + + analyzer.update_postcodes_from_db() - assert temp_db_cursor.row_set("SELECT * from out_postcode_table") \ - == set((('1234', ), ('12 34', ), ('AB23',))) + assert word_table.count() == 3 + 
assert word_table.get_postcodes() == {'1234', '45BC', 'XX45'} - def test_update_special_phrase_empty_table(analyzer, word_table, temp_db_cursor, - make_standard_name): + def test_update_special_phrase_empty_table(analyzer, word_table, make_standard_name): analyzer.update_special_phrases([ ("König bei", "amenity", "royal", "near"), ("Könige", "amenity", "royal", "-"), ("strasse", "highway", "primary", "in") - ]) + ], True) - assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator - FROM word WHERE class != 'place'""") \ + assert word_table.get_special() \ == set(((' könig bei', 'könig bei', 'amenity', 'royal', 'near'), (' könige', 'könige', 'amenity', 'royal', None), (' strasse', 'strasse', 'highway', 'primary', 'in'))) @@@ -220,46 -231,29 +231,42 @@@ def test_update_special_phrase_delete_all(analyzer, word_table, temp_db_cursor, make_standard_name): - temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator) - VALUES (' foo', 'foo', 'amenity', 'prison', 'in'), - (' bar', 'bar', 'highway', 'road', null)""") + word_table.add_special(' foo', 'foo', 'amenity', 'prison', 'in') + word_table.add_special(' bar', 'bar', 'highway', 'road', None) - assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""") + assert word_table.count_special() == 2 - analyzer.update_special_phrases([]) + analyzer.update_special_phrases([], True) - assert 0 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""") + assert word_table.count_special() == 0 +def test_update_special_phrases_no_replace(analyzer, word_table, temp_db_cursor, + make_standard_name): + temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator) + VALUES (' foo', 'foo', 'amenity', 'prison', 'in'), + (' bar', 'bar', 'highway', 'road', null)""") + + assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""") + + analyzer.update_special_phrases([], False) + + assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""") + + - def test_update_special_phrase_modify(analyzer, word_table, temp_db_cursor, - make_standard_name): - temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator) - VALUES (' foo', 'foo', 'amenity', 'prison', 'in'), - (' bar', 'bar', 'highway', 'road', null)""") + def test_update_special_phrase_modify(analyzer, word_table, make_standard_name): + word_table.add_special(' foo', 'foo', 'amenity', 'prison', 'in') + word_table.add_special(' bar', 'bar', 'highway', 'road', None) - assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""") + assert word_table.count_special() == 2 analyzer.update_special_phrases([ ('prison', 'amenity', 'prison', 'in'), ('bar', 'highway', 'road', '-'), ('garden', 'leisure', 'garden', 'near') - ]) + ], True) - assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator - FROM word WHERE class != 'place'""") \ + assert word_table.get_special() \ == set(((' prison', 'prison', 'amenity', 'prison', 'in'), (' bar', 'bar', 'highway', 'road', None), (' garden', 'garden', 'leisure', 'garden', 'near'))) @@@ -273,21 -267,17 +280,17 @@@ def test_process_place_names(analyzer, @pytest.mark.parametrize('pc', ['12345', 'AB 123', '34-345']) - def test_process_place_postcode(analyzer, temp_db_cursor, create_postcode_id, pc): - + def test_process_place_postcode(analyzer, create_postcode_id, word_table, pc): info = analyzer.process_place({'address': {'postcode' : pc}}) - 
assert temp_db_cursor.row_set("SELECT * from out_postcode_table") \ - == set(((pc, ),)) + assert word_table.get_postcodes() == {pc, } @pytest.mark.parametrize('pc', ['12:23', 'ab;cd;f', '123;836']) - def test_process_place_bad_postcode(analyzer, temp_db_cursor, create_postcode_id, - pc): - + def test_process_place_bad_postcode(analyzer, create_postcode_id, word_table, pc): info = analyzer.process_place({'address': {'postcode' : pc}}) - assert 0 == temp_db_cursor.scalar("SELECT count(*) from out_postcode_table") + assert not word_table.get_postcodes() @pytest.mark.parametrize('hnr', ['123a', '1', '101']) diff --combined test/python/test_tokenizer_legacy_icu.py index 92a83249,8dc5c830..abb058d2 --- a/test/python/test_tokenizer_legacy_icu.py +++ b/test/python/test_tokenizer_legacy_icu.py @@@ -141,16 -141,28 +141,28 @@@ def test_make_standard_hnr(analyzer) assert a._make_standard_hnr('iv') == 'IV' - def test_add_postcodes_from_db(analyzer, word_table, table_factory, temp_db_cursor): + def test_update_postcodes_from_db_empty(analyzer, table_factory, word_table): table_factory('location_postcode', 'postcode TEXT', content=(('1234',), ('12 34',), ('AB23',), ('1234',))) with analyzer() as a: - a.add_postcodes_from_db() + a.update_postcodes_from_db() - assert temp_db_cursor.row_set("""SELECT word, word_token from word - """) \ - == set((('1234', ' 1234'), ('12 34', ' 12 34'), ('AB23', ' AB23'))) + assert word_table.count() == 3 + assert word_table.get_postcodes() == {'1234', '12 34', 'AB23'} + + + def test_update_postcodes_from_db_add_and_remove(analyzer, table_factory, word_table): + table_factory('location_postcode', 'postcode TEXT', + content=(('1234',), ('45BC', ), ('XX45', ))) + word_table.add_postcode(' 1234', '1234') + word_table.add_postcode(' 5678', '5678') + + with analyzer() as a: + a.update_postcodes_from_db() + + assert word_table.count() == 3 + assert word_table.get_postcodes() == {'1234', '45BC', 'XX45'} def test_update_special_phrase_empty_table(analyzer, word_table, temp_db_cursor): @@@ -159,7 -171,7 +171,7 @@@ ("König bei", "amenity", "royal", "near"), ("Könige", "amenity", "royal", "-"), ("street", "highway", "primary", "in") - ]) + ], True) assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator FROM word WHERE class != 'place'""") \ @@@ -176,24 -188,11 +188,24 @@@ def test_update_special_phrase_delete_a assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""") with analyzer() as a: - a.update_special_phrases([]) + a.update_special_phrases([], True) assert 0 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""") +def test_update_special_phrases_no_replace(analyzer, word_table, temp_db_cursor,): + temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator) + VALUES (' FOO', 'foo', 'amenity', 'prison', 'in'), + (' BAR', 'bar', 'highway', 'road', null)""") + + assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""") + + with analyzer() as a: + a.update_special_phrases([], False) + + assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'""") + + def test_update_special_phrase_modify(analyzer, word_table, temp_db_cursor): temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator) VALUES (' FOO', 'foo', 'amenity', 'prison', 'in'), @@@ -206,7 -205,7 +218,7 @@@ ('prison', 'amenity', 'prison', 'in'), ('bar', 'highway', 'road', '-'), ('garden', 'leisure', 'garden', 'near') - ]) + ], True) assert 
temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator FROM word WHERE class != 'place'""") \ @@@ -224,22 -223,19 +236,19 @@@ def test_process_place_names(analyzer, @pytest.mark.parametrize('pc', ['12345', 'AB 123', '34-345']) - def test_process_place_postcode(analyzer, temp_db_cursor, pc): + def test_process_place_postcode(analyzer, word_table, pc): with analyzer() as a: info = a.process_place({'address': {'postcode' : pc}}) - assert temp_db_cursor.row_set("""SELECT word FROM word - WHERE class = 'place' and type = 'postcode'""") \ - == set(((pc, ),)) + assert word_table.get_postcodes() == {pc, } @pytest.mark.parametrize('pc', ['12:23', 'ab;cd;f', '123;836']) - def test_process_place_bad_postcode(analyzer, temp_db_cursor, pc): + def test_process_place_bad_postcode(analyzer, word_table, pc): with analyzer() as a: info = a.process_place({'address': {'postcode' : pc}}) - assert 0 == temp_db_cursor.scalar("""SELECT count(*) FROM word - WHERE class = 'place' and type = 'postcode'""") + assert not word_table.get_postcodes() @pytest.mark.parametrize('hnr', ['123a', '1', '101'])
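
Both tokenizers in this commit move from add-only postcode handling to a full
reconciliation of the `word` table against `location_postcode`. The sketch
below models that reconciliation rule in plain Python; `sync_postcodes` is a
hypothetical helper written for illustration only and is not part of the
commit.

```python
# Sketch of the rule update_postcodes_from_db() implements with a FULL JOIN:
# postcodes present in location_postcode but missing from the word table are
# added, and word entries without a matching postcode are deleted. Plain sets
# stand in for the two tables here.

def sync_postcodes(location_postcodes, word_postcodes):
    """Return (to_add, to_delete) making word_postcodes equal location_postcodes."""
    to_add = set(location_postcodes) - set(word_postcodes)
    to_delete = set(word_postcodes) - set(location_postcodes)
    return to_add, to_delete

# Mirrors test_update_postcodes_from_db_add_and_remove: '45BC' and 'XX45' get
# added, the stale '5678' gets deleted, and '1234' stays untouched.
assert sync_postcodes({'1234', '45BC', 'XX45'}, {'1234', '5678'}) == (
    {'45BC', 'XX45'}, {'5678'})
```

Keep in mind that `normalize_postcode()` (strip and upper-case) runs before
postcodes reach the word table and, as its docstring notes, must stay in sync
with the SQL function `token_normalized_postcode()`.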