LANGUAGE plpgsql;
-CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT,
- lookup_country_code varchar(2))
- RETURNS INTEGER
- AS $$
-DECLARE
- lookup_token TEXT;
- return_word_id INTEGER;
-BEGIN
- lookup_token := ' '||trim(lookup_word);
- SELECT min(word_id) FROM word
- WHERE word_token = lookup_token and country_code=lookup_country_code
- INTO return_word_id;
- IF return_word_id IS NULL THEN
- return_word_id := nextval('seq_word');
- INSERT INTO word VALUES (return_word_id, lookup_token, null,
- null, null, lookup_country_code, 0);
- END IF;
- RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
-
CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
lookup_class text, lookup_type text)
RETURNS INTEGER
LANGUAGE plpgsql STABLE STRICT;
-CREATE OR REPLACE FUNCTION create_country(src HSTORE, country_code varchar(2))
- RETURNS VOID
- AS $$
-DECLARE
- s TEXT;
- w INTEGER;
- words TEXT[];
- item RECORD;
- j INTEGER;
-BEGIN
- FOR item IN SELECT (each(src)).* LOOP
-
- s := make_standard_name(item.value);
- w := getorcreate_country(s, country_code);
-
- words := regexp_split_to_array(item.value, E'[,;()]');
- IF array_upper(words, 1) != 1 THEN
- FOR j IN 1..array_upper(words, 1) LOOP
- s := make_standard_name(words[j]);
- IF s != '' THEN
- w := getorcreate_country(s, country_code);
- END IF;
- END LOOP;
- END IF;
- END LOOP;
-END;
-$$
-LANGUAGE plpgsql;
-
-
CREATE OR REPLACE FUNCTION make_keywords(src HSTORE)
RETURNS INTEGER[]
AS $$
database_import.create_search_indices(conn, args.config,
drop=args.no_updates)
LOG.warning('Create search index for default country names.')
- database_import.create_country_names(conn, args.config)
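+        # Default country names now go through the tokenizer's name analyzer.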
+ database_import.create_country_names(conn, tokenizer,
+ args.config.LANGUAGES)
webdir = args.project_dir / 'website'
LOG.warning('Setup website at %s', webdir)
FROM (SELECT distinct(postcode) as pc
FROM location_postcode) x""")
+
+ def add_country_names(self, country_code, names):
+ """ Add names for the given country to the search index.
+ """
+ with self.conn.cursor() as cur:
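+            # Insert one word entry per name; the NOT EXISTS guard skips
+            # tokens that already exist for this country, keeping the word
+            # table free of duplicates.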
+            cur.execute(
+                """INSERT INTO word (word_id, word_token, country_code)
+                   (SELECT nextval('seq_word'), lookup_token, %s
+                      FROM (SELECT ' ' || make_standard_name(n) AS lookup_token
+                              FROM unnest(%s) n) y
+                     WHERE NOT EXISTS (SELECT * FROM word
+                                       WHERE word_token = lookup_token
+                                         AND country_code = %s))
+                """, (country_code, names, country_code))
+
+
def process_place(self, place):
""" Determine tokenizer information about the given place.
"""
token_info = _TokenInfo(self._cache)
- token_info.add_names(self.conn, place.get('name'), place.get('country_feature'))
+ names = place.get('name')
+
+ if names:
+ token_info.add_names(self.conn, names)
+
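+            # Feed the names of country boundaries into the country-name
+            # search index as well; country codes are exactly two letters.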
+            country_feature = place.get('country_feature')
+            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
+                self.add_country_names(country_feature.lower(),
+                                       list(names.values()))
address = place.get('address')
self.data = {}
- def add_names(self, conn, names, country_feature):
+ def add_names(self, conn, names):
""" Add token information for the names of the place.
"""
- if not names:
- return
-
with conn.cursor() as cur:
# Create the token IDs for all names.
self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
(names, ))
- # Add country tokens to word table if necessary.
- if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
- cur.execute("SELECT create_country(%s, %s)",
- (names, country_feature.lower()))
-
def add_housenumbers(self, conn, hnrs):
""" Extract housenumber information from the address.
"""
def _get_place(name):
with conn.cursor() as cur:
- cur.execute("""SELECT (addr_ids_from_name(%s) || getorcreate_name_id(make_standard_name(%s), ''))::text,
+ cur.execute("""SELECT (addr_ids_from_name(%s)
+ || getorcreate_name_id(make_standard_name(%s), ''))::text,
word_ids_from_name(%s)::text""",
(name, name, name))
return cur.fetchone()
from pathlib import Path
import psutil
+import psycopg2.extras
from nominatim.db.connection import connect, get_pg_env
from nominatim.db import utils as db_utils
sql.run_sql_file(conn, 'indices.sql', drop=drop)
-def create_country_names(conn, config):
- """ Create search index for default country names.
+def create_country_names(conn, tokenizer, languages=None):
+ """ Add default country names to search index. `languages` is a comma-
+ separated list of language codes as used in OSM. If `languages` is not
+ empty then only name translations for the given languages are added
+ to the index.
"""
+    if languages:
+        languages = [lang.strip() for lang in languages.split(',')]
+
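+    # 'name' is always indexed; 'name:XX' only when XX is a requested language.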
+    def _include_key(key):
+        return key == 'name' or \
+               (key.startswith('name:')
+                and (not languages or key[5:] in languages))
with conn.cursor() as cur:
- cur.execute("""SELECT getorcreate_country(make_standard_name('uk'), 'gb')""")
- cur.execute("""SELECT getorcreate_country(make_standard_name('united states'), 'us')""")
- cur.execute("""SELECT COUNT(*) FROM
- (SELECT getorcreate_country(make_standard_name(country_code),
- country_code) FROM country_name WHERE country_code is not null) AS x""")
- cur.execute("""SELECT COUNT(*) FROM
- (SELECT getorcreate_country(make_standard_name(name->'name'), country_code)
- FROM country_name WHERE name ? 'name') AS x""")
- sql_statement = """SELECT COUNT(*) FROM (SELECT getorcreate_country(make_standard_name(v),
- country_code) FROM (SELECT country_code, skeys(name)
- AS k, svals(name) AS v FROM country_name) x WHERE k"""
-
- languages = config.LANGUAGES
-
- if languages:
- sql_statement = "{} IN (".format(sql_statement)
- delim = ''
- for language in languages.split(','):
- sql_statement = "{}{}'name:{}'".format(sql_statement, delim, language)
- delim = ', '
- sql_statement = '{})'.format(sql_statement)
- else:
- sql_statement = "{} LIKE 'name:%'".format(sql_statement)
- sql_statement = "{}) v".format(sql_statement)
- cur.execute(sql_statement)
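+        # Have psycopg2 return the hstore `name` column as a Python dict.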
+ psycopg2.extras.register_hstore(cur)
+ cur.execute("""SELECT country_code, name FROM country_name
+ WHERE country_code is not null""")
+
+ with tokenizer.name_analyzer() as analyzer:
+ for code, name in cur:
+ names = [code]
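+                # Keep the hard-coded aliases from the old SQL implementation.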
+ if code == 'gb':
+ names.append('UK')
+ if code == 'us':
+ names.append('United States')
+
+                # Country name translations, restricted to the requested languages.
+                if name:
+                    names.extend(v for k, v in name.items() if _include_key(k))
+
+ analyzer.add_country_names(code, names)
+
conn.commit()
def mk_table(name, definition='id INT', content=None):
temp_db_cursor.execute('CREATE TABLE {} ({})'.format(name, definition))
if content is not None:
- if not isinstance(content, str):
- content = '),('.join([str(x) for x in content])
- temp_db_cursor.execute("INSERT INTO {} VALUES ({})".format(name, content))
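+            # execute_values expects an iterable of row tuples.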
+ psycopg2.extras.execute_values(
+ temp_db_cursor, "INSERT INTO {} VALUES %s".format(name), content)
return mk_table
@pytest.fixture
def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
- table_factory('country_name', 'partition INT', (0, 1, 2))
+ table_factory('country_name', 'partition INT', ((0, ), (1, ), (2, )))
cfg = Configuration(None, SRC_DIR.resolve() / 'settings')
cfg.set_libdirs(module='.', osm2pgsql='.', php=SRC_DIR / 'lib-php',
sql=tmp_path, data=SRC_DIR / 'data')
@pytest.fixture
-def tokenizer_mock(monkeypatch, property_table, temp_db_conn, dsn):
+def tokenizer_mock(monkeypatch, property_table, temp_db_conn, tmp_path):
""" Sets up the configuration so that the test dummy tokenizer will be
- loaded.
+ loaded when the tokenizer factory is used. Also returns a factory
+ with which a new dummy tokenizer may be created.
"""
monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy')
monkeypatch.setattr(importlib, "import_module", _import_dummy)
properties.set_property(temp_db_conn, 'tokenizer', 'dummy')
+
+ def _create_tokenizer():
+ return dummy_tokenizer.DummyTokenizer(None, None)
+
+ return _create_tokenizer
self.dsn = dsn
self.data_dir = data_dir
self.init_state = None
+ self.analyser_cache = {}
def init_new_db(self, config):
def name_analyzer(self):
- return DummyNameAnalyzer()
+ return DummyNameAnalyzer(self.analyser_cache)
class DummyNameAnalyzer:
self.close()
+ def __init__(self, cache):
+ self.analyser_cache = cache
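+        # Shared with the tokenizer so tests can inspect recorded calls.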
+ cache['countries'] = []
+
+
def close(self):
- """ Free all resources used by the analyzer.
- """
pass
def add_postcodes_from_db(self):
pass
- def process_place(self, place):
- """ Determine tokenizer information about the given place.
- Returns a JSON-serialisable structure that will be handed into
- the database via the token_info field.
- """
+ def add_country_names(self, code, names):
+ self.analyser_cache['countries'].append((code, names))
+
+ def process_place(self, place):
return {}
'location_property_tiger', 'location_property_osmline',
'location_postcode', 'search_name', 'location_road_23')
for table in tables:
- table_factory(table, content=(1, 2, 3))
+ table_factory(table, content=((1, ), (2, ), (3, )))
+ assert temp_db_cursor.table_rows(table) == 3
database_import.truncate_data_tables(temp_db_conn)
assert temp_db_cursor.table_rows('placex') == 30
assert temp_db_cursor.table_rows('location_property_osmline') == 1
-@pytest.mark.parametrize("languages", (False, True))
-def test_create_country_names(temp_db_conn, temp_db_cursor, def_config,
- temp_db_with_extensions, monkeypatch, languages):
- if languages:
- monkeypatch.setenv('NOMINATIM_LANGUAGES', 'fr,en')
- temp_db_cursor.execute("""CREATE FUNCTION make_standard_name (name TEXT)
- RETURNS TEXT AS $$ SELECT 'a'::TEXT $$ LANGUAGE SQL
- """)
- temp_db_cursor.execute('CREATE TABLE country_name (country_code varchar(2), name hstore)')
- temp_db_cursor.execute('CREATE TABLE word (code varchar(2))')
- temp_db_cursor.execute("""INSERT INTO country_name VALUES ('us',
- '"name"=>"us","name:af"=>"us"')""")
- temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT,
- lookup_country_code varchar(2))
- RETURNS INTEGER
- AS $$
- BEGIN
- INSERT INTO word VALUES (lookup_country_code);
- RETURN 5;
- END;
- $$
- LANGUAGE plpgsql;
- """)
- database_import.create_country_names(temp_db_conn, def_config)
+
+@pytest.mark.parametrize("languages", (None, ' fr,en'))
+def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cursor,
+ table_factory, tokenizer_mock, languages):
+
+ table_factory('country_name', 'country_code varchar(2), name hstore',
+ content=(('us', '"name"=>"us1","name:af"=>"us2"'),
+ ('fr', '"name"=>"Fra", "name:en"=>"Fren"')))
+
+ assert temp_db_cursor.scalar("SELECT count(*) FROM country_name") == 2
+
+ tokenizer = tokenizer_mock()
+
+ database_import.create_country_names(temp_db_conn, tokenizer, languages)
+
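+    # Expect one add_country_names() call per row in country_name.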
+ assert len(tokenizer.analyser_cache['countries']) == 2
+
+ result_set = {k: set(v) for k, v in tokenizer.analyser_cache['countries']}
+
if languages:
- assert temp_db_cursor.table_rows('word') == 4
+        assert result_set == {'us': {'us', 'us1', 'United States'},
+                              'fr': {'fr', 'Fra', 'Fren'}}
else:
- assert temp_db_cursor.table_rows('word') == 5
+        assert result_set == {'us': {'us', 'us1', 'us2', 'United States'},
+                              'fr': {'fr', 'Fra', 'Fren'}}
return tmp_path
@pytest.fixture
-def conn(temp_db_conn, table_factory, monkeypatch):
- monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', '.')
- table_factory('country_name', 'partition INT', (0, 1, 2))
+def conn(sql_preprocessor, temp_db_conn):
return temp_db_conn