From 79da96b369aa86fdcec21c4d0eb8465b6ed54fc5 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Wed, 1 Sep 2021 23:51:53 +0200
Subject: [PATCH] read partition and languages from config file

Load settings/country_settings.yaml once during setup and use it to
fill the partition and country_default_language_code columns of the
country_name table. A country with no configured language or with
more than one gets a NULL default language. When partitions are
ignored, every country is assigned partition 0.

To make this work, country_default_language_code becomes a text
column and the YAML file is fixed up: 'an' gets its languages, the
key 'no' is quoted so that YAML does not read it as a boolean and
'xk' gets the missing space after 'languages:'.

The configuration file is read explicitly as UTF-8 because it
contains non-ASCII country names.
---
 data/country_name.sql                  |  2 +-
 nominatim/clicmd/setup.py              |  2 +
 nominatim/tools/country_info.py        | 52 +++++++++++++++++++++---
 settings/country_settings.yaml         |  6 +--
 test/python/conftest.py                |  2 +-
 test/python/test_tools_country_info.py | 55 ++++++++++++++++++++++++++
 6 files changed, 109 insertions(+), 10 deletions(-)
 create mode 100644 test/python/test_tools_country_info.py

diff --git a/data/country_name.sql b/data/country_name.sql
index eac46caf..d2bac3b0 100644
--- a/data/country_name.sql
+++ b/data/country_name.sql
@@ -5,7 +5,7 @@
 CREATE TABLE public.country_name (
     country_code character varying(2),
     name public.hstore,
-    country_default_language_code character varying(2),
+    country_default_language_code text,
     partition integer
 );
 
diff --git a/nominatim/clicmd/setup.py b/nominatim/clicmd/setup.py
index 3d04a57a..014c9dc9 100644
--- a/nominatim/clicmd/setup.py
+++ b/nominatim/clicmd/setup.py
@@ -55,6 +55,8 @@ class SetupAll:
         from ..tools import database_import, refresh, postcodes, freeze, country_info
         from ..indexer.indexer import Indexer
 
+        country_info.setup_country_config(args.config.config_dir / 'country_settings.yaml')
+
         if args.continue_at is None:
             files = args.get_osm_file_list()
 
diff --git a/nominatim/tools/country_info.py b/nominatim/tools/country_info.py
index 1b61ae68..1f7b7996 100644
--- a/nominatim/tools/country_info.py
+++ b/nominatim/tools/country_info.py
@@ -2,10 +2,36 @@
 Functions for importing and managing static country information.
 """
 import psycopg2.extras
+import yaml
 
 from nominatim.db import utils as db_utils
 from nominatim.db.connection import connect
 
+class _CountryInfo:
+    """ Caches country-specific properties from the configuration file.
+    """
+
+    def __init__(self):
+        self._info = {}
+
+    def load(self, configfile):
+        if not self._info:
+            self._info = yaml.safe_load(configfile.read_text(encoding='utf-8'))
+
+    def items(self):
+        return self._info.items()
+
+
+_COUNTRY_INFO = _CountryInfo()
+
+def setup_country_config(configfile):
+    """ Load country properties from the configuration file.
+        Needs to be called before using any other functions in this
+        file.
+    """
+    _COUNTRY_INFO.load(configfile)
+
+
 def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
     """ Create and populate the tables with basic static data that provides
         the background for geocoding. Data is assumed to not yet exist.
@@ -13,11 +39,27 @@ def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
     db_utils.execute_file(dsn, sql_dir / 'country_name.sql')
     db_utils.execute_file(dsn, sql_dir / 'country_osm_grid.sql.gz')
 
-    if ignore_partitions:
-        with connect(dsn) as conn:
-            with conn.cursor() as cur:
-                cur.execute('UPDATE country_name SET partition = 0')
-            conn.commit()
+    params = []
+    for ccode, props in _COUNTRY_INFO.items():
+        if ccode is not None and props is not None:
+            if ignore_partitions:
+                partition = 0
+            else:
+                partition = props.get('partition')
+            if ',' in (props.get('languages', ',') or ','):
+                lang = None
+            else:
+                lang = props['languages']
+            params.append((ccode, partition, lang))
+
+    with connect(dsn) as conn:
+        with conn.cursor() as cur:
+            cur.execute_values(
+                """ UPDATE country_name
+                    SET partition = part, country_default_language_code = lang
+                    FROM (VALUES %s) AS v (cc, part, lang)
+                    WHERE country_code = v.cc""", params)
+        conn.commit()
 
 
 def create_country_names(conn, tokenizer, languages=None):
diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml
index 3fe07eea..77b137a1 100644
--- a/settings/country_settings.yaml
+++ b/settings/country_settings.yaml
@@ -36,7 +36,7 @@ am:
 # Netherlands Antilles (De Nederlandse Antillen)
 an:
     partition: 58
-    languages:
+    languages: nl, en, pap
 
 # Angola (Angola)
 ao:
@@ -834,7 +834,7 @@ nl:
     languages: nl
 
 # Norway (Norge)
-no:
+"no":
     partition: 60
     languages: nb, nn, no, se
 
@@ -1226,7 +1226,7 @@ ws:
 # Kosovo (Kosova / Kosovo)
 xk:
     partition: 59
-    languages:sq, sr
+    languages: sq, sr
 
 # Yemen (اليمن)
 ye:
diff --git a/test/python/conftest.py b/test/python/conftest.py
index 1fca4b62..2fc97726 100644
--- a/test/python/conftest.py
+++ b/test/python/conftest.py
@@ -5,7 +5,7 @@ from pathlib import Path
 import psycopg2
 import pytest
 
-SRC_DIR = Path(__file__) / '..' / '..' / '..'
+SRC_DIR = (Path(__file__) / '..' / '..' / '..').resolve()
 
 # always test against the source
 sys.path.insert(0, str(SRC_DIR.resolve()))
diff --git a/test/python/test_tools_country_info.py b/test/python/test_tools_country_info.py
new file mode 100644
index 00000000..59737769
--- /dev/null
+++ b/test/python/test_tools_country_info.py
@@ -0,0 +1,55 @@
+"""
+Tests for functions that handle country properties.
+"""
+
+import pytest
+
+from nominatim.tools import country_info
+
+@pytest.fixture(autouse=True)
+def read_config(def_config):
+    country_info.setup_country_config(def_config.config_dir / 'country_settings.yaml')
+
+@pytest.mark.parametrize("no_partitions", (True, False))
+def test_setup_country_tables(src_dir, temp_db_with_extensions, dsn, temp_db_cursor,
+                              def_config, no_partitions):
+    country_info.setup_country_tables(dsn, src_dir / 'data', no_partitions)
+
+    assert temp_db_cursor.table_exists('country_name')
+    assert temp_db_cursor.table_rows('country_name') == \
+             temp_db_cursor.scalar('SELECT count(DISTINCT country_code) FROM country_name')
+
+    partitions = temp_db_cursor.row_set("SELECT DISTINCT partition FROM country_name")
+    if no_partitions:
+        assert partitions == {(0, )}
+    else:
+        assert len(partitions) > 10
+
+    assert temp_db_cursor.table_exists('country_osm_grid')
+    assert temp_db_cursor.table_rows('country_osm_grid') > 100
+
+
+@pytest.mark.parametrize("languages", (None, ' fr,en'))
+def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cursor,
+                              table_factory, tokenizer_mock, languages):
+
+    table_factory('country_name', 'country_code varchar(2), name hstore',
+                  content=(('us', '"name"=>"us1","name:af"=>"us2"'),
+                           ('fr', '"name"=>"Fra", "name:en"=>"Fren"')))
+
+    assert temp_db_cursor.scalar("SELECT count(*) FROM country_name") == 2
+
+    tokenizer = tokenizer_mock()
+
+    country_info.create_country_names(temp_db_conn, tokenizer, languages)
+
+    assert len(tokenizer.analyser_cache['countries']) == 2
+
+    result_set = {k: set(v.values()) for k, v in tokenizer.analyser_cache['countries']}
+
+    if languages:
+        assert result_set == {'us' : set(('us', 'us1', 'United States')),
+                              'fr' : set(('fr', 'Fra', 'Fren'))}
+    else:
+        assert result_set == {'us' : set(('us', 'us1', 'us2', 'United States')),
+                              'fr' : set(('fr', 'Fra', 'Fren'))}
-- 
2.39.5