Normalization and token computation are now done in the tokenizer.
The tokenizer keeps a cache of the hundred most used house numbers
to keep the number of calls to the database low.
parent_place_id BIGINT,
address HSTORE,
country TEXT,
- housenumber TEXT,
token_info JSONB,
geometry GEOMETRY,
OUT name_vector INTEGER[],
parent_name_vector INTEGER[];
parent_address_vector INTEGER[];
addr_place_ids INTEGER[];
+ hnr_vector INTEGER[];
addr_item RECORD;
parent_address_place_ids BIGINT[];
-- This is unusual for the search_name table but prevents that the place
-- is returned when we only search for the street/place.
- IF housenumber is not null and not nameaddress_vector <@ parent_address_vector THEN
- name_vector := array_merge(name_vector,
- ARRAY[getorcreate_housenumber_id(make_standard_name(housenumber))]);
+ hnr_vector := token_get_housenumber_search_tokens(token_info);
+
+ IF hnr_vector is not null and not nameaddress_vector <@ parent_address_vector THEN
+ name_vector := array_merge(name_vector, hnr_vector);
END IF;
IF not address ? 'street' and address ? 'place' THEN
nameaddress_vector := array_merge(nameaddress_vector, addr_place_ids);
-- If there is a housenumber, also add the place name as a name,
-- so we can search it by the usual housenumber+place algorithms.
- IF housenumber is not null THEN
+ IF hnr_vector is not null THEN
name_vector := array_merge(name_vector,
ARRAY[getorcreate_name_id(make_standard_name(address->'place'))]);
END IF;
{% if debug %}RAISE WARNING 'Copy over address tags';{% endif %}
-- housenumber is a computed field, so start with an empty value
- NEW.housenumber := NULL;
+ NEW.housenumber := token_normalized_housenumber(NEW.token_info);
IF NEW.address is not NULL THEN
- IF NEW.address ? 'conscriptionnumber' THEN
- IF NEW.address ? 'streetnumber' THEN
- NEW.housenumber := (NEW.address->'conscriptionnumber') || '/' || (NEW.address->'streetnumber');
- ELSE
- NEW.housenumber := NEW.address->'conscriptionnumber';
- END IF;
- ELSEIF NEW.address ? 'streetnumber' THEN
- NEW.housenumber := NEW.address->'streetnumber';
- ELSEIF NEW.address ? 'housenumber' THEN
- NEW.housenumber := NEW.address->'housenumber';
- END IF;
- NEW.housenumber := create_housenumber_id(NEW.housenumber);
-
addr_street := NEW.address->'street';
addr_place := NEW.address->'place';
SELECT * INTO name_vector, nameaddress_vector
FROM create_poi_search_terms(NEW.place_id,
NEW.partition, NEW.parent_place_id,
- NEW.address,
- NEW.country_code, NEW.housenumber,
+ NEW.address, NEW.country_code,
NEW.token_info, NEW.centroid);
IF array_length(name_vector, 1) is not NULL THEN
SELECT (info->>'names')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;
+
-- Get tokens for matching the place name against others.
--
-- This should usually be restricted to full name tokens.
$$ LANGUAGE SQL IMMUTABLE STRICT;
+-- Return the housenumber tokens applicable for the place.
+CREATE OR REPLACE FUNCTION token_get_housenumber_search_tokens(info JSONB)
+ RETURNS INTEGER[]
+AS $$
+ SELECT (info->>'hnr_tokens')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+-- Return the housenumber in a form that can be matched during search.
+CREATE OR REPLACE FUNCTION token_normalized_housenumber(info JSONB)
+ RETURNS TEXT
+AS $$
+ SELECT info->>'hnr';
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
-- Return token info that should be saved permanently in the database.
CREATE OR REPLACE FUNCTION token_strip_info(info JSONB)
RETURNS JSONB
$$
LANGUAGE plpgsql;
+
-- Create housenumber tokens from an OSM addr:housenumber.
-- The housnumber is split at comma and semicolon as necessary.
-- The function returns the normalized form of the housenumber suitable
-- for comparison.
-CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)
- RETURNS TEXT
+CREATE OR REPLACE FUNCTION create_housenumbers(housenumbers TEXT[],
+ OUT tokens TEXT,
+ OUT normtext TEXT)
AS $$
-DECLARE
- normtext TEXT;
BEGIN
- SELECT array_to_string(array_agg(trans), ';')
- INTO normtext
- FROM (SELECT lookup_word as trans, getorcreate_housenumber_id(lookup_word)
+ SELECT array_to_string(array_agg(trans), ';'), array_agg(tid)::TEXT
+ INTO normtext, tokens
+ FROM (SELECT lookup_word as trans, getorcreate_housenumber_id(lookup_word) as tid
FROM (SELECT make_standard_name(h) as lookup_word
- FROM regexp_split_to_table(housenumber, '[,;]') h) x) y;
-
- return normtext;
+ FROM unnest(housenumbers) h) x) y;
END;
$$ LANGUAGE plpgsql STABLE STRICT;
+
CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
RETURNS INTEGER
AS $$
import logging
import select
+import psycopg2.extras
+
from nominatim.indexer.progress import ProgressLogger
from nominatim.indexer import runners
from nominatim.db.async_connection import DBConnection
LOG.warning("Starting %s (using batch size %s)", runner.name(), batch)
with connect(self.dsn) as conn:
+ psycopg2.extras.register_hstore(conn)
with conn.cursor() as cur:
total_tuples = cur.scalar(runner.sql_count_objects())
LOG.debug("Total number of rows: %i", total_tuples)
self.conn.autocommit = True
psycopg2.extras.register_hstore(self.conn)
+ self._cache = _TokenCache(self.conn)
+
def __enter__(self):
return self
Returns a JSON-serialisable structure that will be handed into
the database via the token_info field.
"""
- token_info = _TokenInfo()
+ token_info = _TokenInfo(self._cache)
token_info.add_names(self.conn, place.get('name'), place.get('country_feature'))
+ address = place.get('address')
+
+ if address:
+ token_info.add_housenumbers(self.conn, address)
+
return token_info.data
class _TokenInfo:
-
- def __init__(self):
+ """ Collect token information to be sent back to the database.
+ """
+ def __init__(self, cache):
+ self.cache = cache
self.data = {}
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
cur.execute("SELECT create_country(%s, %s)",
(names, country_feature.lower()))
+
+
+ def add_housenumbers(self, conn, address):
+ """ Extract housenumber information from the address.
+ """
+ hnrs = [v for k, v in address.items()
+ if k in ('housenumber', 'streetnumber', 'conscriptionnumber')]
+
+ if not hnrs:
+ return
+
+ if len(hnrs) == 1:
+ token = self.cache.get_housenumber(hnrs[0])
+ if token is not None:
+ self.data['hnr_tokens'] = token
+ self.data['hnr'] = hnrs[0]
+ return
+
+ # split numbers if necessary
+ simple_list = []
+ for hnr in hnrs:
+ simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
+
+ if len(simple_list) > 1:
+ simple_list = list(set(simple_list))
+
+ with conn.cursor() as cur:
+ cur.execute("SELECT (create_housenumbers(%s)).* ", (simple_list, ))
+ self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
+
+
+class _TokenCache:
+ """ Cache for token information to avoid repeated database queries.
+
+ This cache is not thread-safe and needs to be instantiated per
+ analyzer.
+ """
+ def __init__(self, conn):
+ # Look up housenumbers up to 100 and cache them
+ with conn.cursor() as cur:
+ cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
+ FROM generate_series(1, 100) as i""")
+ self._cached_housenumbers = {str(r[0]) : r[1] for r in cur}
+
+
+ def get_housenumber(self, number):
+ """ Get a housenumber token from the cache.
+ """
+ return self._cached_housenumbers.get(number)
The database schema switched from saving raw housenumbers in
placex.housenumber to saving transliterated ones.
+
+ Note: the function create_housenumber_id() has been dropped in later
+ versions.
"""
with conn.cursor() as cur:
cur.execute("""CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)