From d8ed1bfc601c6eb19b2db50df466654e843f0777 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sun, 25 Apr 2021 11:47:29 +0200 Subject: [PATCH] move houseunumber handling to tokenizer Normalization and token computation are now done in the tokenizer. The tokenizer keeps a cache to the hundred most used house numbers to keep the numbers of calls to the database low. --- lib-sql/functions/placex_triggers.sql | 29 ++++------- lib-sql/tokenizer/legacy_tokenizer.sql | 36 ++++++++++---- nominatim/indexer/indexer.py | 3 ++ nominatim/tokenizer/legacy_tokenizer.py | 64 +++++++++++++++++++++++-- nominatim/tools/migration.py | 3 ++ 5 files changed, 101 insertions(+), 34 deletions(-) diff --git a/lib-sql/functions/placex_triggers.sql b/lib-sql/functions/placex_triggers.sql index 4f79d8e3..ca9ab5cc 100644 --- a/lib-sql/functions/placex_triggers.sql +++ b/lib-sql/functions/placex_triggers.sql @@ -292,7 +292,6 @@ CREATE OR REPLACE FUNCTION create_poi_search_terms(obj_place_id BIGINT, parent_place_id BIGINT, address HSTORE, country TEXT, - housenumber TEXT, token_info JSONB, geometry GEOMETRY, OUT name_vector INTEGER[], @@ -302,6 +301,7 @@ DECLARE parent_name_vector INTEGER[]; parent_address_vector INTEGER[]; addr_place_ids INTEGER[]; + hnr_vector INTEGER[]; addr_item RECORD; parent_address_place_ids BIGINT[]; @@ -358,9 +358,10 @@ BEGIN -- This is unusual for the search_name table but prevents that the place -- is returned when we only search for the street/place. - IF housenumber is not null and not nameaddress_vector <@ parent_address_vector THEN - name_vector := array_merge(name_vector, - ARRAY[getorcreate_housenumber_id(make_standard_name(housenumber))]); + hnr_vector := token_get_housenumber_search_tokens(token_info); + + IF hnr_vector is not null and not nameaddress_vector <@ parent_address_vector THEN + name_vector := array_merge(name_vector, hnr_vector); END IF; IF not address ? 'street' and address ? 'place' THEN @@ -370,7 +371,7 @@ BEGIN nameaddress_vector := array_merge(nameaddress_vector, addr_place_ids); -- If there is a housenumber, also add the place name as a name, -- so we can search it by the usual housenumber+place algorithms. - IF housenumber is not null THEN + IF hnr_vector is not null THEN name_vector := array_merge(name_vector, ARRAY[getorcreate_name_id(make_standard_name(address->'place'))]); END IF; @@ -812,21 +813,8 @@ BEGIN {% if debug %}RAISE WARNING 'Copy over address tags';{% endif %} -- housenumber is a computed field, so start with an empty value - NEW.housenumber := NULL; + NEW.housenumber := token_normalized_housenumber(NEW.token_info); IF NEW.address is not NULL THEN - IF NEW.address ? 'conscriptionnumber' THEN - IF NEW.address ? 'streetnumber' THEN - NEW.housenumber := (NEW.address->'conscriptionnumber') || '/' || (NEW.address->'streetnumber'); - ELSE - NEW.housenumber := NEW.address->'conscriptionnumber'; - END IF; - ELSEIF NEW.address ? 'streetnumber' THEN - NEW.housenumber := NEW.address->'streetnumber'; - ELSEIF NEW.address ? 'housenumber' THEN - NEW.housenumber := NEW.address->'housenumber'; - END IF; - NEW.housenumber := create_housenumber_id(NEW.housenumber); - addr_street := NEW.address->'street'; addr_place := NEW.address->'place'; @@ -940,8 +928,7 @@ BEGIN SELECT * INTO name_vector, nameaddress_vector FROM create_poi_search_terms(NEW.place_id, NEW.partition, NEW.parent_place_id, - NEW.address, - NEW.country_code, NEW.housenumber, + NEW.address, NEW.country_code, NEW.token_info, NEW.centroid); IF array_length(name_vector, 1) is not NULL THEN diff --git a/lib-sql/tokenizer/legacy_tokenizer.sql b/lib-sql/tokenizer/legacy_tokenizer.sql index d78c800c..916ba9ae 100644 --- a/lib-sql/tokenizer/legacy_tokenizer.sql +++ b/lib-sql/tokenizer/legacy_tokenizer.sql @@ -7,6 +7,7 @@ AS $$ SELECT (info->>'names')::INTEGER[] $$ LANGUAGE SQL IMMUTABLE STRICT; + -- Get tokens for matching the place name against others. -- -- This should usually be restricted to full name tokens. @@ -17,6 +18,22 @@ AS $$ $$ LANGUAGE SQL IMMUTABLE STRICT; +-- Return the housenumber tokens applicable for the place. +CREATE OR REPLACE FUNCTION token_get_housenumber_search_tokens(info JSONB) + RETURNS INTEGER[] +AS $$ + SELECT (info->>'hnr_tokens')::INTEGER[] +$$ LANGUAGE SQL IMMUTABLE STRICT; + + +-- Return the housenumber in the form that it can be matched during search. +CREATE OR REPLACE FUNCTION token_normalized_housenumber(info JSONB) + RETURNS TEXT +AS $$ + SELECT info->>'hnr'; +$$ LANGUAGE SQL IMMUTABLE STRICT; + + -- Return token info that should be saved permanently in the database. CREATE OR REPLACE FUNCTION token_strip_info(info JSONB) RETURNS JSONB @@ -75,26 +92,25 @@ END; $$ LANGUAGE plpgsql; + -- Create housenumber tokens from an OSM addr:housenumber. -- The housnumber is split at comma and semicolon as necessary. -- The function returns the normalized form of the housenumber suitable -- for comparison. -CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT) - RETURNS TEXT +CREATE OR REPLACE FUNCTION create_housenumbers(housenumbers TEXT[], + OUT tokens TEXT, + OUT normtext TEXT) AS $$ -DECLARE - normtext TEXT; BEGIN - SELECT array_to_string(array_agg(trans), ';') - INTO normtext - FROM (SELECT lookup_word as trans, getorcreate_housenumber_id(lookup_word) + SELECT array_to_string(array_agg(trans), ';'), array_agg(tid)::TEXT + INTO normtext, tokens + FROM (SELECT lookup_word as trans, getorcreate_housenumber_id(lookup_word) as tid FROM (SELECT make_standard_name(h) as lookup_word - FROM regexp_split_to_table(housenumber, '[,;]') h) x) y; - - return normtext; + FROM unnest(housenumbers) h) x) y; END; $$ LANGUAGE plpgsql STABLE STRICT; + CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT) RETURNS INTEGER AS $$ diff --git a/nominatim/indexer/indexer.py b/nominatim/indexer/indexer.py index 28e81c81..ea7b0e59 100644 --- a/nominatim/indexer/indexer.py +++ b/nominatim/indexer/indexer.py @@ -4,6 +4,8 @@ Main work horse for indexing (computing addresses) the database. import logging import select +import psycopg2.extras + from nominatim.indexer.progress import ProgressLogger from nominatim.indexer import runners from nominatim.db.async_connection import DBConnection @@ -176,6 +178,7 @@ class Indexer: LOG.warning("Starting %s (using batch size %s)", runner.name(), batch) with connect(self.dsn) as conn: + psycopg2.extras.register_hstore(conn) with conn.cursor() as cur: total_tuples = cur.scalar(runner.sql_count_objects()) LOG.debug("Total number of rows: %i", total_tuples) diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index 73a911b4..6ffdc4ef 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -195,6 +195,8 @@ class LegacyNameAnalyzer: self.conn.autocommit = True psycopg2.extras.register_hstore(self.conn) + self._cache = _TokenCache(self.conn) + def __enter__(self): return self @@ -217,16 +219,23 @@ class LegacyNameAnalyzer: Returns a JSON-serialisable structure that will be handed into the database via the token_info field. """ - token_info = _TokenInfo() + token_info = _TokenInfo(self._cache) token_info.add_names(self.conn, place.get('name'), place.get('country_feature')) + address = place.get('address') + + if address: + token_info.add_housenumbers(self.conn, address) + return token_info.data class _TokenInfo: - - def __init__(self): + """ Collect token information to be sent back to the database. + """ + def __init__(self, cache): + self.cache = cache self.data = {} @@ -245,3 +254,52 @@ class _TokenInfo: if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): cur.execute("SELECT create_country(%s, %s)", (names, country_feature.lower())) + + + def add_housenumbers(self, conn, address): + """ Extract housenumber information from the address. + """ + hnrs = [v for k, v in address.items() + if k in ('housenumber', 'streetnumber', 'conscriptionnumber')] + + if not hnrs: + return + + if len(hnrs) == 1: + token = self.cache.get_housenumber(hnrs[0]) + if token is not None: + self.data['hnr_tokens'] = token + self.data['hnr'] = hnrs[0] + return + + # split numbers if necessary + simple_list = [] + for hnr in hnrs: + simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr))) + + if len(simple_list) > 1: + simple_list = list(set(simple_list)) + + with conn.cursor() as cur: + cur.execute("SELECT (create_housenumbers(%s)).* ", (simple_list, )) + self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone() + + +class _TokenCache: + """ Cache for token information to avoid repeated database queries. + + This cache is not thread-safe and needs to be instantiated per + analyzer. + """ + def __init__(self, conn): + # Lookup houseunumbers up to 100 and cache them + with conn.cursor() as cur: + cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text + FROM generate_series(1, 100) as i""") + self._cached_housenumbers = {str(r[0]) : r[1] for r in cur} + + + def get_housenumber(self, number): + """ Get a housenumber token from the cache. + """ + return self._cached_housenumbers.get(number) diff --git a/nominatim/tools/migration.py b/nominatim/tools/migration.py index c8011eeb..0396ff87 100644 --- a/nominatim/tools/migration.py +++ b/nominatim/tools/migration.py @@ -129,6 +129,9 @@ def change_housenumber_transliteration(conn, **_): The database schema switched from saving raw housenumbers in placex.housenumber to saving transliterated ones. + + Note: the function create_housenumber_id() has been dropped in later + versions. """ with conn.cursor() as cur: cur.execute("""CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT) -- 2.39.5