From ffc2d82b0ed150d52a718dc563f9399062e579a7 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Sun, 25 Apr 2021 18:26:36 +0200
Subject: [PATCH] move postcode normalization into tokenizer

---
 lib-sql/functions/interpolation.sql     | 63 +++++++++++++++----------
 lib-sql/functions/placex_triggers.sql   |  4 --
 lib-sql/tokenizer/legacy_tokenizer.sql  | 35 ++++++++------
 nominatim/clicmd/setup.py               |  3 +-
 nominatim/indexer/indexer.py            |  2 +-
 nominatim/indexer/runners.py            | 31 +++++++++---
 nominatim/tokenizer/legacy_tokenizer.py | 53 +++++++++++++++++++++
 nominatim/tools/postcodes.py            | 25 +++++-----
 test/python/conftest.py                 |  2 +-
 test/python/dummy_tokenizer.py          |  3 ++
 test/python/test_indexing.py            | 17 +++++--
 test/python/test_tools_postcodes.py     | 17 ++++---
 12 files changed, 181 insertions(+), 74 deletions(-)

diff --git a/lib-sql/functions/interpolation.sql b/lib-sql/functions/interpolation.sql
index a797cad3..8bfc307b 100644
--- a/lib-sql/functions/interpolation.sql
+++ b/lib-sql/functions/interpolation.sql
@@ -12,39 +12,47 @@ $$ LANGUAGE plpgsql IMMUTABLE;
 
 
+CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
+RETURNS HSTORE
+  AS $$
+DECLARE
+  location RECORD;
+  waynodes BIGINT[];
+BEGIN
+  IF akeys(in_address) != ARRAY['interpolation'] THEN
+    RETURN in_address;
+  END IF;
+
+  SELECT nodes INTO waynodes FROM planet_osm_ways WHERE id = wayid;
+  FOR location IN
+    SELECT placex.address, placex.osm_id FROM placex
+     WHERE osm_type = 'N' and osm_id = ANY(waynodes)
+           and placex.address is not null
+           and (placex.address ? 'street' or placex.address ? 'place')
+           and indexed_status < 100
+  LOOP
+    -- mark it as a derived address
+    RETURN location.address || in_address || hstore('_inherited', '');
+  END LOOP;
+
+  RETURN in_address;
+END;
+$$
+LANGUAGE plpgsql STABLE;
+
+
+
 -- find the parent road of the cut road parts
-CREATE OR REPLACE FUNCTION get_interpolation_parent(wayid BIGINT, street TEXT,
+CREATE OR REPLACE FUNCTION get_interpolation_parent(street TEXT,
                                                     place TEXT,
                                                     partition SMALLINT,
                                                     centroid GEOMETRY, geom GEOMETRY)
   RETURNS BIGINT
   AS $$
 DECLARE
-  addr_street TEXT;
-  addr_place TEXT;
   parent_place_id BIGINT;
-
-  waynodes BIGINT[];
-  location RECORD;
 BEGIN
-  addr_street = street;
-  addr_place = place;
-
-  IF addr_street is null and addr_place is null THEN
-    select nodes from planet_osm_ways where id = wayid INTO waynodes;
-    FOR location IN SELECT placex.address from placex
-                    where osm_type = 'N' and osm_id = ANY(waynodes)
-                          and placex.address is not null
-                          and (placex.address ? 'street' or placex.address ? 'place')
-                          and indexed_status < 100
-                    limit 1 LOOP
-      addr_street = location.address->'street';
-      addr_place = location.address->'place';
-    END LOOP;
-  END IF;
-
-  parent_place_id := find_parent_for_address(addr_street, addr_place,
-                                             partition, centroid);
+  parent_place_id := find_parent_for_address(street, place, partition, centroid);
 
   IF parent_place_id is null THEN
     FOR location IN SELECT place_id FROM placex
@@ -147,17 +155,20 @@ BEGIN
   NEW.interpolationtype = NEW.address->'interpolation';
 
   place_centroid := ST_PointOnSurface(NEW.linegeo);
-  NEW.parent_place_id = get_interpolation_parent(NEW.osm_id, NEW.address->'street',
+  NEW.parent_place_id = get_interpolation_parent(NEW.address->'street',
                                                  NEW.address->'place',
                                                  NEW.partition, place_centroid, NEW.linegeo);
 
   IF NEW.address is not NULL AND NEW.address ? 'postcode' AND NEW.address->'postcode' not similar to '%(,|;)%' THEN
     interpol_postcode := NEW.address->'postcode';
-    housenum := getorcreate_postcode_id(NEW.address->'postcode');
   ELSE
     interpol_postcode := NULL;
   END IF;
 
+  IF NEW.address ? '_inherited' THEN
+    NEW.address := hstore('interpolation', NEW.interpolationtype);
+  END IF;
+
   -- if the line was newly inserted, split the line as necessary
   IF OLD.indexed_status = 1 THEN
     select nodes from planet_osm_ways where id = NEW.osm_id INTO waynodes;
diff --git a/lib-sql/functions/placex_triggers.sql b/lib-sql/functions/placex_triggers.sql
index ca9ab5cc..922d79bd 100644
--- a/lib-sql/functions/placex_triggers.sql
+++ b/lib-sql/functions/placex_triggers.sql
@@ -817,10 +817,6 @@ BEGIN
   IF NEW.address is not NULL THEN
     addr_street := NEW.address->'street';
     addr_place := NEW.address->'place';
-
-    IF NEW.address ? 'postcode' and NEW.address->'postcode' not similar to '%(:|,|;)%' THEN
-      i := getorcreate_postcode_id(NEW.address->'postcode');
-    END IF;
   END IF;
 
   NEW.postcode := null;
diff --git a/lib-sql/tokenizer/legacy_tokenizer.sql b/lib-sql/tokenizer/legacy_tokenizer.sql
index 916ba9ae..4c22424e 100644
--- a/lib-sql/tokenizer/legacy_tokenizer.sql
+++ b/lib-sql/tokenizer/legacy_tokenizer.sql
@@ -34,6 +34,13 @@ AS $$
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
+CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
+  RETURNS TEXT
+AS $$
+  SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode)) END;
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
 -- Return token info that should be saved permanently in the database.
 CREATE OR REPLACE FUNCTION token_strip_info(info JSONB)
   RETURNS JSONB
@@ -133,26 +140,26 @@ $$ LANGUAGE plpgsql;
 
 
 
-CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
-  RETURNS INTEGER
+CREATE OR REPLACE FUNCTION create_postcode_id(postcode TEXT)
+  RETURNS BOOLEAN
   AS $$
 DECLARE
+  r RECORD;
   lookup_token TEXT;
-  lookup_word TEXT;
   return_word_id INTEGER;
 BEGIN
-  lookup_word := upper(trim(postcode));
-  lookup_token := ' ' || make_standard_name(lookup_word);
-  SELECT min(word_id) FROM word
-    WHERE word_token = lookup_token and word = lookup_word
+  lookup_token := ' ' || make_standard_name(postcode);
+  FOR r IN
+    SELECT word_id FROM word
+     WHERE word_token = lookup_token and word = postcode
           and class='place' and type='postcode'
-    INTO return_word_id;
-  IF return_word_id IS NULL THEN
-    return_word_id := nextval('seq_word');
-    INSERT INTO word VALUES (return_word_id, lookup_token, lookup_word,
-                             'place', 'postcode', null, 0);
-  END IF;
-  RETURN return_word_id;
+  LOOP
+    RETURN false;
+  END LOOP;
+
+  INSERT INTO word VALUES (nextval('seq_word'), lookup_token, postcode,
+                           'place', 'postcode', null, 0);
+  RETURN true;
 END;
 $$ LANGUAGE plpgsql;
 
diff --git a/nominatim/clicmd/setup.py b/nominatim/clicmd/setup.py
index 499eff76..1ce9cf3e 100644
--- a/nominatim/clicmd/setup.py
+++ b/nominatim/clicmd/setup.py
@@ -116,7 +116,8 @@ class SetupAll:
 
         if args.continue_at is None or args.continue_at == 'load-data':
             LOG.warning('Calculate postcodes')
-            postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir)
+            postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir,
+                                       tokenizer)
 
         if args.continue_at is None or args.continue_at in ('load-data', 'indexing'):
             if args.continue_at is not None and args.continue_at != 'load-data':
diff --git a/nominatim/indexer/indexer.py b/nominatim/indexer/indexer.py
index ea7b0e59..d0c6ea0c 100644
--- a/nominatim/indexer/indexer.py
+++ b/nominatim/indexer/indexer.py
@@ -147,7 +147,7 @@ class Indexer:
 
             if maxrank == 30:
                 self._index(runners.RankRunner(0, analyzer))
-                self._index(runners.InterpolationRunner(), 20)
+                self._index(runners.InterpolationRunner(analyzer), 20)
                 self._index(runners.RankRunner(30, analyzer), 20)
             else:
                 self._index(runners.RankRunner(maxrank, analyzer))
diff --git a/nominatim/indexer/runners.py b/nominatim/indexer/runners.py
index 2bf9e516..75429fe4 100644
--- a/nominatim/indexer/runners.py
+++ b/nominatim/indexer/runners.py
@@ -25,7 +25,7 @@ class AbstractPlacexRunner:
                    SET indexed_status = 0, address = v.addr, token_info = v.ti
                    FROM (VALUES {}) as v(id, addr, ti)
                    WHERE place_id = v.id
-               """.format(','.join(["(%s, %s::hstore, %s::json)"] * num_places))
+               """.format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
 
 
     def index_places(self, worker, places):
@@ -82,6 +82,10 @@ class InterpolationRunner:
         location_property_osmline.
     """
 
+    def __init__(self, analyzer):
+        self.analyzer = analyzer
+
+
     @staticmethod
     def name():
         return "interpolation lines (location_property_osmline)"
@@ -93,15 +97,30 @@ class InterpolationRunner:
 
     @staticmethod
     def sql_get_objects():
-        return """SELECT place_id FROM location_property_osmline
+        return """SELECT place_id, get_interpolation_address(address, osm_id) as address
+                  FROM location_property_osmline
                   WHERE indexed_status > 0
                   ORDER BY geometry_sector"""
 
+
     @staticmethod
-    def index_places(worker, ids):
-        worker.perform(""" UPDATE location_property_osmline
-                           SET indexed_status = 0 WHERE place_id IN ({})
-                       """.format(','.join((str(i[0]) for i in ids))))
+    @functools.lru_cache(maxsize=1)
+    def _index_sql(num_places):
+        return """ UPDATE location_property_osmline
+                   SET indexed_status = 0, address = v.addr, token_info = v.ti
+                   FROM (VALUES {}) as v(id, addr, ti)
+                   WHERE place_id = v.id
+               """.format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
+
+
+    def index_places(self, worker, places):
+        values = []
+        for place in places:
+            values.extend((place[x] for x in ('place_id', 'address')))
+            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+
+        worker.perform(self._index_sql(len(places)), values)
+
 
 
 class PostcodeRunner:
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py
index 6ffdc4ef..b95a6208 100644
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -1,6 +1,7 @@
 """
 Tokenizer implementing normalisation as used before Nominatim 4.
 """
+from collections import OrderedDict
 import logging
 import re
 import shutil
@@ -213,6 +214,15 @@ class LegacyNameAnalyzer:
         self.conn.close()
         self.conn = None
 
+
+    def add_postcodes_from_db(self):
+        """ Add postcodes from the location_postcode table to the word table.
+        """
+        with self.conn.cursor() as cur:
+            cur.execute("""SELECT count(create_postcode_id(pc))
+                           FROM (SELECT distinct(postcode) as pc
+                                 FROM location_postcode) x""")
+
 
     def process_place(self, place):
         """ Determine tokenizer information about the given place.
@@ -226,11 +236,25 @@ class LegacyNameAnalyzer:
 
         address = place.get('address')
         if address:
+            self._add_postcode(address.get('postcode'))
             token_info.add_housenumbers(self.conn, address)
 
         return token_info.data
 
 
+    def _add_postcode(self, postcode):
+        """ Make sure the normalized postcode is present in the word table.
+        """
+        if not postcode or re.search(r'[:,;]', postcode) is not None:
+            return
+
+        def _create_postcode_from_db(pcode):
+            with self.conn.cursor() as cur:
+                cur.execute('SELECT create_postcode_id(%s)', (pcode, ))
+
+        self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)
+
+
 class _TokenInfo:
     """ Collect token information to be sent back to the database.
     """
@@ -285,6 +309,32 @@ class _TokenInfo:
             self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
 
 
+class _LRU:
+    """ Least recently used cache that accepts a generator function to
+        produce the item when there is a cache miss.
+    """
+
+    def __init__(self, maxsize=128):
+        self.data = OrderedDict()
+        self.maxsize = maxsize
+
+    def get(self, key, generator):
+        """ Get the item with the given key from the cache. If nothing
+            is found in the cache, generate the value through the
+            generator function and store it in the cache.
+        """
+        value = self.data.get(key)
+        if value is not None:
+            self.data.move_to_end(key)
+        else:
+            value = generator(key)
+            if len(self.data) >= self.maxsize:
+                self.data.popitem(last=False)
+            self.data[key] = value
+
+        return value
+
+
 class _TokenCache:
     """ Cache for token information to avoid repeated database queries.
 
@@ -292,6 +342,9 @@ class _TokenCache:
         analyzer.
     """
     def __init__(self, conn):
+        # various LRU caches
+        self.postcodes = _LRU(maxsize=32)
+
         # Lookup houseunumbers up to 100 and cache them
         with conn.cursor() as cur:
             cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
diff --git a/nominatim/tools/postcodes.py b/nominatim/tools/postcodes.py
index 0a568cba..78bd8cb9 100644
--- a/nominatim/tools/postcodes.py
+++ b/nominatim/tools/postcodes.py
@@ -6,7 +6,7 @@ of artificial postcode centroids.
 from nominatim.db.utils import execute_file
 from nominatim.db.connection import connect
 
-def import_postcodes(dsn, project_dir):
+def import_postcodes(dsn, project_dir, tokenizer):
     """ Set up the initial list of postcodes.
     """
 
@@ -41,10 +41,11 @@
                 INSERT INTO location_postcode
                  (place_id, indexed_status, country_code, postcode, geometry)
                 SELECT nextval('seq_place'), 1, country_code,
-                       upper(trim (both ' ' from address->'postcode')) as pc,
+                       token_normalized_postcode(address->'postcode') as pc,
                        ST_Centroid(ST_Collect(ST_Centroid(geometry)))
                 FROM placex
-                WHERE address ? 'postcode' AND address->'postcode' NOT SIMILAR TO '%(,|;)%'
+                WHERE address ? 'postcode'
+                      and token_normalized_postcode(address->'postcode') is not null
                       AND geometry IS NOT null
                 GROUP BY country_code, pc
             """)
@@ -52,9 +53,10 @@
             cur.execute("""
                 INSERT INTO location_postcode
                  (place_id, indexed_status, country_code, postcode, geometry)
-                SELECT nextval('seq_place'), 1, 'us', postcode,
+                SELECT nextval('seq_place'), 1, 'us',
+                       token_normalized_postcode(postcode),
                        ST_SetSRID(ST_Point(x,y),4326)
-                FROM us_postcode WHERE postcode NOT IN
+                FROM us_postcode WHERE token_normalized_postcode(postcode) NOT IN
                 (SELECT postcode FROM location_postcode
                  WHERE country_code = 'us')
             """)
@@ -62,8 +64,9 @@
             cur.execute("""
                 INSERT INTO location_postcode
                  (place_id, indexed_status, country_code, postcode, geometry)
-                SELECT nextval('seq_place'), 1, 'gb', postcode, geometry
-                FROM gb_postcode WHERE postcode NOT IN
+                SELECT nextval('seq_place'), 1, 'gb',
+                       token_normalized_postcode(postcode), geometry
+                FROM gb_postcode WHERE token_normalized_postcode(postcode) NOT IN
                 (SELECT postcode FROM location_postcode
                  WHERE country_code = 'gb')
             """)
@@ -72,9 +75,7 @@
                 DELETE FROM word WHERE class='place' and type='postcode'
                     and word NOT IN (SELECT postcode FROM location_postcode)
             """)
-
-        cur.execute("""
-            SELECT count(getorcreate_postcode_id(v)) FROM
-            (SELECT distinct(postcode) as v FROM location_postcode) p
-            """)
         conn.commit()
+
+    with tokenizer.name_analyzer() as analyzer:
+        analyzer.add_postcodes_from_db()
diff --git a/test/python/conftest.py b/test/python/conftest.py
index d4649c24..f43f09d0 100644
--- a/test/python/conftest.py
+++ b/test/python/conftest.py
@@ -299,7 +299,7 @@ def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
 
 
 @pytest.fixture
-def tokenizer_mock(monkeypatch, property_table, temp_db_conn):
+def tokenizer_mock(monkeypatch, property_table, temp_db_conn, dsn):
     """ Sets up the configuration so that the test dummy tokenizer
         will be loaded.
     """
diff --git a/test/python/dummy_tokenizer.py b/test/python/dummy_tokenizer.py
index 013016c8..ceea4a7e 100644
--- a/test/python/dummy_tokenizer.py
+++ b/test/python/dummy_tokenizer.py
@@ -43,6 +43,9 @@ class DummyNameAnalyzer:
         """
         pass
 
+    def add_postcodes_from_db(self):
+        pass
+
     def process_place(self, place):
         """ Determine tokenizer information about the given place.
 
diff --git a/test/python/test_indexing.py b/test/python/test_indexing.py
index d6876906..ff84e379 100644
--- a/test/python/test_indexing.py
+++ b/test/python/test_indexing.py
@@ -33,6 +33,9 @@ class IndexerTestDB:
                                               geometry_sector INTEGER)""")
             cur.execute("""CREATE TABLE location_property_osmline (
                                place_id BIGINT,
+                               osm_id BIGINT,
+                               address HSTORE,
+                               token_info JSONB,
                                indexed_status SMALLINT,
                                indexed_date TIMESTAMP,
                                geometry_sector INTEGER)""")
@@ -61,6 +64,14 @@ class IndexerTestDB:
                            END;
                            $$ LANGUAGE plpgsql STABLE;
                         """)
+            cur.execute("""CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
+                           RETURNS HSTORE AS $$
+                           BEGIN
+                             RETURN in_address;
+                           END;
+                           $$ LANGUAGE plpgsql STABLE;
+                        """)
+
             for table in ('placex', 'location_property_osmline', 'location_postcode'):
                 cur.execute("""CREATE TRIGGER {0}_update BEFORE UPDATE ON {0}
                                FOR EACH ROW EXECUTE PROCEDURE date_update()
@@ -91,9 +102,9 @@ class IndexerTestDB:
         next_id = next(self.osmline_id)
         with self.conn.cursor() as cur:
             cur.execute("""INSERT INTO location_property_osmline
-                             (place_id, indexed_status, geometry_sector)
-                           VALUES (%s, 1, %s)""",
-                        (next_id, sector))
+                             (place_id, osm_id, indexed_status, geometry_sector)
+                           VALUES (%s, %s, 1, %s)""",
+                        (next_id, next_id, sector))
         return next_id
 
     def add_postcode(self, country, postcode):
diff --git a/test/python/test_tools_postcodes.py b/test/python/test_tools_postcodes.py
index 1fc060b0..37b47dfa 100644
--- a/test/python/test_tools_postcodes.py
+++ b/test/python/test_tools_postcodes.py
@@ -5,6 +5,11 @@ Tests for functions to maintain the artificial postcode table.
 import pytest
 
 from nominatim.tools import postcodes
+import dummy_tokenizer
+
+@pytest.fixture
+def tokenizer():
+    return dummy_tokenizer.DummyTokenizer(None, None)
 
 @pytest.fixture
 def postcode_table(temp_db_with_extensions, temp_db_cursor, table_factory,
@@ -20,26 +25,26 @@ def postcode_table(temp_db_with_extensions, temp_db_cursor, table_factory,
                            postcode TEXT,
                            geometry GEOMETRY(Geometry, 4326)""")
     temp_db_cursor.execute('CREATE SEQUENCE seq_place')
-    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
-                              RETURNS INTEGER AS $$ BEGIN RETURN 1; END; $$ LANGUAGE plpgsql;
+    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
+                              RETURNS TEXT AS $$ BEGIN RETURN postcode; END; $$ LANGUAGE plpgsql;
                            """)
 
 
-def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path):
-    postcodes.import_postcodes(dsn, tmp_path)
+def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
+    postcodes.import_postcodes(dsn, tmp_path, tokenizer)
 
     assert temp_db_cursor.table_exists('gb_postcode')
    assert temp_db_cursor.table_exists('us_postcode')
     assert temp_db_cursor.table_rows('location_postcode') == 0
 
 
-def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path):
+def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
     temp_db_cursor.execute("""
         INSERT INTO placex (place_id, country_code, address, geometry)
         VALUES (1, 'xx', '"postcode"=>"9486"', 'SRID=4326;POINT(10 12)')
         """)
 
-    postcodes.import_postcodes(dsn, tmp_path)
+    postcodes.import_postcodes(dsn, tmp_path, tokenizer)
 
     rows = temp_db_cursor.row_set(""" SELECT postcode, country_code,
                                       ST_X(geometry), ST_Y(geometry)
-- 
2.39.5
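For reference, a minimal usage sketch of the reworked postcode import (not part of the patch itself). It mirrors the updated tests and the setup.py hunk above: import_postcodes() now takes a tokenizer, and postcode tokens are created through the tokenizer's name analyzer (add_postcodes_from_db) rather than by getorcreate_postcode_id() inside SQL. The DSN and project directory below are placeholder values, and DummyTokenizer is the helper from test/python/, so this is a test-style sketch rather than production wiring.

    from nominatim.tools import postcodes
    import dummy_tokenizer  # test helper shipped in test/python/

    # Placeholder connection and project directory - adjust for a real setup.
    dsn = 'dbname=test_nominatim'
    project_dir = '/tmp/nominatim-project'

    # The dummy tokenizer's analyzer provides a no-op add_postcodes_from_db(),
    # which import_postcodes() now calls after filling location_postcode.
    tokenizer = dummy_tokenizer.DummyTokenizer(None, None)
    postcodes.import_postcodes(dsn, project_dir, tokenizer)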