LANGUAGE plpgsql IMMUTABLE;
+CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
+RETURNS HSTORE
+ AS $$
+DECLARE
+ location RECORD;
+ waynodes BIGINT[];
+BEGIN
+ IF akeys(in_address) != ARRAY['interpolation'] THEN
+ RETURN in_address;
+ END IF;
+
+ SELECT nodes INTO waynodes FROM planet_osm_ways WHERE id = wayid;
+ FOR location IN
+ SELECT placex.address, placex.osm_id FROM placex
+ WHERE osm_type = 'N' and osm_id = ANY(waynodes)
+ and placex.address is not null
+ and (placex.address ? 'street' or placex.address ? 'place')
+ and indexed_status < 100
+ LOOP
+ -- mark it as a derived address
+ RETURN location.address || in_address || hstore('_inherited', '');
+ END LOOP;
+
+ RETURN in_address;
+END;
+$$
+LANGUAGE plpgsql STABLE;
+
+
+
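The function above implements a simple inheritance rule: an interpolation line whose address consists of nothing but the `interpolation` key borrows the address of one of its member nodes and is tagged with `_inherited`, so the trigger can recognise the derived data later (see the `NEW.address ? '_inherited'` check further down). A minimal Python sketch of that merge, illustration only and not part of the patch, with plain dicts standing in for hstore values and the planet_osm_ways/placex lookup replaced by a parameter:

    def inherit_interpolation_address(in_address, node_address):
        # Only an address consisting solely of the 'interpolation' key
        # may inherit; anything else is returned unchanged.
        if set(in_address) != {'interpolation'}:
            return in_address
        if not node_address:
            return in_address
        # hstore '||' lets the right-hand operand win on duplicate keys,
        # so the way's own keys override the node's.
        return {**node_address, **in_address, '_inherited': ''}

    assert inherit_interpolation_address(
        {'interpolation': 'odd'}, {'street': 'Main St'}
    ) == {'street': 'Main St', 'interpolation': 'odd', '_inherited': ''}

    # An interpolation that already carries other address keys is left untouched.
    assert inherit_interpolation_address(
        {'interpolation': 'even', 'street': 'High St'}, {'street': 'Main St'}
    ) == {'interpolation': 'even', 'street': 'High St'}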
-- find the parent road of the cut road parts
-CREATE OR REPLACE FUNCTION get_interpolation_parent(wayid BIGINT, street TEXT,
+CREATE OR REPLACE FUNCTION get_interpolation_parent(street TEXT,
place TEXT, partition SMALLINT,
centroid GEOMETRY, geom GEOMETRY)
RETURNS BIGINT
AS $$
DECLARE
- addr_street TEXT;
- addr_place TEXT;
parent_place_id BIGINT;
-
- waynodes BIGINT[];
-
location RECORD;
BEGIN
- addr_street = street;
- addr_place = place;
-
- IF addr_street is null and addr_place is null THEN
- select nodes from planet_osm_ways where id = wayid INTO waynodes;
- FOR location IN SELECT placex.address from placex
- where osm_type = 'N' and osm_id = ANY(waynodes)
- and placex.address is not null
- and (placex.address ? 'street' or placex.address ? 'place')
- and indexed_status < 100
- limit 1 LOOP
- addr_street = location.address->'street';
- addr_place = location.address->'place';
- END LOOP;
- END IF;
-
- parent_place_id := find_parent_for_address(addr_street, addr_place,
- partition, centroid);
+ parent_place_id := find_parent_for_address(street, place, partition, centroid);
IF parent_place_id is null THEN
FOR location IN SELECT place_id FROM placex
NEW.interpolationtype = NEW.address->'interpolation';
place_centroid := ST_PointOnSurface(NEW.linegeo);
- NEW.parent_place_id = get_interpolation_parent(NEW.osm_id, NEW.address->'street',
+ NEW.parent_place_id = get_interpolation_parent(NEW.address->'street',
NEW.address->'place',
NEW.partition, place_centroid, NEW.linegeo);
IF NEW.address is not NULL AND NEW.address ? 'postcode' AND NEW.address->'postcode' not similar to '%(,|;)%' THEN
interpol_postcode := NEW.address->'postcode';
- housenum := getorcreate_postcode_id(NEW.address->'postcode');
ELSE
interpol_postcode := NULL;
END IF;
+ IF NEW.address ? '_inherited' THEN
+ NEW.address := hstore('interpolation', NEW.interpolationtype);
+ END IF;
+
-- if the line was newly inserted, split the line as necessary
IF OLD.indexed_status = 1 THEN
select nodes from planet_osm_ways where id = NEW.osm_id INTO waynodes;
IF NEW.address is not NULL THEN
addr_street := NEW.address->'street';
addr_place := NEW.address->'place';
-
- IF NEW.address ? 'postcode' and NEW.address->'postcode' not similar to '%(:|,|;)%' THEN
- i := getorcreate_postcode_id(NEW.address->'postcode');
- END IF;
END IF;
NEW.postcode := null;
$$ LANGUAGE SQL IMMUTABLE STRICT;
+CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
+ RETURNS TEXT
+AS $$
+ SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode)) END;
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
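token_normalized_postcode() is now the single place where postcode normalisation happens. The same contract expressed in Python, illustration only: values containing a comma or semicolon (multi-value postcodes) are rejected, everything else is trimmed and upper-cased; STRICT means NULL in, NULL out.

    import re

    def normalized_postcode(postcode):
        # Mirrors the SQL CASE expression above.
        if postcode is None or re.search(r'[,;]', postcode):
            return None
        return postcode.strip().upper()

    assert normalized_postcode(' ab1 2cd ') == 'AB1 2CD'
    assert normalized_postcode('12345;67890') is None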
-- Return token info that should be saved permanently in the database.
CREATE OR REPLACE FUNCTION token_strip_info(info JSONB)
RETURNS JSONB
LANGUAGE plpgsql;
-CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
- RETURNS INTEGER
+CREATE OR REPLACE FUNCTION create_postcode_id(postcode TEXT)
+ RETURNS BOOLEAN
AS $$
DECLARE
+ r RECORD;
lookup_token TEXT;
- lookup_word TEXT;
return_word_id INTEGER;
BEGIN
- lookup_word := upper(trim(postcode));
- lookup_token := ' ' || make_standard_name(lookup_word);
- SELECT min(word_id) FROM word
- WHERE word_token = lookup_token and word = lookup_word
+ lookup_token := ' ' || make_standard_name(postcode);
+ FOR r IN
+ SELECT word_id FROM word
+ WHERE word_token = lookup_token and word = postcode
and class='place' and type='postcode'
- INTO return_word_id;
- IF return_word_id IS NULL THEN
- return_word_id := nextval('seq_word');
- INSERT INTO word VALUES (return_word_id, lookup_token, lookup_word,
- 'place', 'postcode', null, 0);
- END IF;
- RETURN return_word_id;
+ LOOP
+ RETURN false;
+ END LOOP;
+
+ INSERT INTO word VALUES (nextval('seq_word'), lookup_token, postcode,
+ 'place', 'postcode', null, 0);
+ RETURN true;
END;
$$
LANGUAGE plpgsql;
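create_postcode_id() replaces getorcreate_postcode_id(): instead of returning a word id it only guarantees that the postcode row exists and reports whether a new row was inserted. A pure-Python sketch of that contract, illustration only, with an in-memory set standing in for the word table and a crude, hypothetical stand-in for make_standard_name():

    word_table = set()                       # stands in for the word table

    def create_postcode_id(postcode):
        token = ' ' + postcode.lower()       # hypothetical make_standard_name()
        if (token, postcode) in word_table:
            return False                     # already present, nothing to do
        word_table.add((token, postcode))
        return True                          # a new row was created

    assert create_postcode_id('AB1 2CD') is True
    assert create_postcode_id('AB1 2CD') is False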
if args.continue_at is None or args.continue_at == 'load-data':
LOG.warning('Calculate postcodes')
- postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir)
+ postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir,
+ tokenizer)
if args.continue_at is None or args.continue_at in ('load-data', 'indexing'):
if args.continue_at is not None and args.continue_at != 'load-data':
if maxrank == 30:
self._index(runners.RankRunner(0, analyzer))
- self._index(runners.InterpolationRunner(), 20)
+ self._index(runners.InterpolationRunner(analyzer), 20)
self._index(runners.RankRunner(30, analyzer), 20)
else:
self._index(runners.RankRunner(maxrank, analyzer))
SET indexed_status = 0, address = v.addr, token_info = v.ti
FROM (VALUES {}) as v(id, addr, ti)
WHERE place_id = v.id
- """.format(','.join(["(%s, %s::hstore, %s::json)"] * num_places))
+ """.format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
def index_places(self, worker, places):
location_property_osmline.
"""
+ def __init__(self, analyzer):
+ self.analyzer = analyzer
+
+
@staticmethod
def name():
return "interpolation lines (location_property_osmline)"
@staticmethod
def sql_get_objects():
- return """SELECT place_id FROM location_property_osmline
+ return """SELECT place_id, get_interpolation_address(address, osm_id) as address
+ FROM location_property_osmline
WHERE indexed_status > 0
ORDER BY geometry_sector"""
+
@staticmethod
- def index_places(worker, ids):
- worker.perform(""" UPDATE location_property_osmline
- SET indexed_status = 0 WHERE place_id IN ({})
- """.format(','.join((str(i[0]) for i in ids))))
+ @functools.lru_cache(maxsize=1)
+ def _index_sql(num_places):
+ return """ UPDATE location_property_osmline
+ SET indexed_status = 0, address = v.addr, token_info = v.ti
+ FROM (VALUES {}) as v(id, addr, ti)
+ WHERE place_id = v.id
+ """.format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
+
+
+ def index_places(self, worker, places):
+ values = []
+ for place in places:
+ values.extend((place[x] for x in ('place_id', 'address')))
+ values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+
+ worker.perform(self._index_sql(len(places)), values)
+
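InterpolationRunner.index_places() now mirrors the placex runner: for each selected interpolation line it binds three values, the place_id, the (possibly inherited) address and the analyzer output wrapped as JSON, and _index_sql() expands one "(%s, %s::hstore, %s::jsonb)" tuple per place. The runner relies on module-level imports of functools and psycopg2.extras that this excerpt does not show. A rough sketch of the statement and flattened parameter list handed to worker.perform() for two places, with made-up sample data for readability:

    sql = """ UPDATE location_property_osmline
                  SET indexed_status = 0, address = v.addr, token_info = v.ti
                  FROM (VALUES (%s, %s::hstore, %s::jsonb),(%s, %s::hstore, %s::jsonb)) as v(id, addr, ti)
                  WHERE place_id = v.id
              """
    # Values are flattened in the same order the loop above produces:
    # [id1, addr1, token_info1, id2, addr2, token_info2]
    values = [1001, '"interpolation"=>"odd","street"=>"Main St"', '{"hnr": "2;4;6"}',
              1002, '"interpolation"=>"even"',                    '{"hnr": "1;3"}']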
class PostcodeRunner:
"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
+from collections import OrderedDict
import logging
import re
import shutil
self.conn.close()
self.conn = None
+
+ def add_postcodes_from_db(self):
+ """ Add postcodes from the location_postcode table to the word table.
+ """
+ with self.conn.cursor() as cur:
+ cur.execute("""SELECT count(create_postcode_id(pc))
+ FROM (SELECT distinct(postcode) as pc
+ FROM location_postcode) x""")
+
def process_place(self, place):
""" Determine tokenizer information about the given place.
address = place.get('address')
if address:
+ self._add_postcode(address.get('postcode'))
token_info.add_housenumbers(self.conn, address)
return token_info.data
+ def _add_postcode(self, postcode):
+ """ Make sure the normalized postcode is present in the word table.
+ """
+ if not postcode or re.search(r'[:,;]', postcode) is not None:
+ return
+
+ def _create_postcode_from_db(pcode):
+ with self.conn.cursor() as cur:
+ cur.execute('SELECT create_postcode_id(%s)', (pcode, ))
+ # return the boolean result so the LRU stores a non-None value
+ # and does not repeat the query for this postcode
+ return cur.fetchone()[0]
+
+ self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)
+
+
class _TokenInfo:
""" Collect token information to be sent back to the database.
"""
self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
+class _LRU:
+ """ Least recently used cache that accepts a generator function to
+ produce the item when there is a cache miss.
+ """
+
+ def __init__(self, maxsize=128):
+ self.data = OrderedDict()
+ self.maxsize = maxsize
+
+ def get(self, key, generator):
+ """ Get the item with the given key from the cache. If nothing
+ is found in the cache, generate the value through the
+ generator function and store it in the cache.
+ """
+ value = self.data.get(key)
+ if value is not None:
+ self.data.move_to_end(key)
+ else:
+ value = generator(key)
+ if len(self.data) >= self.maxsize:
+ self.data.popitem(last=False)
+ self.data[key] = value
+
+ return value
+
+
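A short usage sketch for _LRU (assuming the class above is in scope; illustration only): the generator runs only on a miss, a hit moves the key to the most-recently-used end, and once maxsize entries are stored the oldest one is evicted.

    calls = []

    def expensive_lookup(key):
        calls.append(key)           # stands in for a database round trip
        return key.upper()

    cache = _LRU(maxsize=2)
    assert cache.get('ab1', expensive_lookup) == 'AB1'   # miss: generator runs
    assert cache.get('ab1', expensive_lookup) == 'AB1'   # hit: no second call
    assert calls == ['ab1']

    cache.get('cd2', expensive_lookup)
    cache.get('ef3', expensive_lookup)                   # evicts 'ab1'
    assert 'ab1' not in cache.data and 'ef3' in cache.data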
class _TokenCache:
""" Cache for token information to avoid repeated database queries.
analyzer.
"""
def __init__(self, conn):
+ # various LRU caches
+ self.postcodes = _LRU(maxsize=32)
+
# Lookup housenumbers up to 100 and cache them
with conn.cursor() as cur:
cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
from nominatim.db.utils import execute_file
from nominatim.db.connection import connect
-def import_postcodes(dsn, project_dir):
+def import_postcodes(dsn, project_dir, tokenizer):
""" Set up the initial list of postcodes.
"""
INSERT INTO location_postcode
(place_id, indexed_status, country_code, postcode, geometry)
SELECT nextval('seq_place'), 1, country_code,
- upper(trim (both ' ' from address->'postcode')) as pc,
+ token_normalized_postcode(address->'postcode') as pc,
ST_Centroid(ST_Collect(ST_Centroid(geometry)))
FROM placex
- WHERE address ? 'postcode' AND address->'postcode' NOT SIMILAR TO '%(,|;)%'
+ WHERE address ? 'postcode'
+ and token_normalized_postcode(address->'postcode') is not null
AND geometry IS NOT null
GROUP BY country_code, pc
""")
cur.execute("""
INSERT INTO location_postcode
(place_id, indexed_status, country_code, postcode, geometry)
- SELECT nextval('seq_place'), 1, 'us', postcode,
+ SELECT nextval('seq_place'), 1, 'us',
+ token_normalized_postcode(postcode),
ST_SetSRID(ST_Point(x,y),4326)
- FROM us_postcode WHERE postcode NOT IN
+ FROM us_postcode WHERE token_normalized_postcode(postcode) NOT IN
(SELECT postcode FROM location_postcode
WHERE country_code = 'us')
""")
cur.execute("""
INSERT INTO location_postcode
(place_id, indexed_status, country_code, postcode, geometry)
- SELECT nextval('seq_place'), 1, 'gb', postcode, geometry
- FROM gb_postcode WHERE postcode NOT IN
+ SELECT nextval('seq_place'), 1, 'gb',
+ token_normalized_postcode(postcode), geometry
+ FROM gb_postcode WHERE token_normalized_postcode(postcode) NOT IN
(SELECT postcode FROM location_postcode
WHERE country_code = 'gb')
""")
DELETE FROM word WHERE class='place' and type='postcode'
and word NOT IN (SELECT postcode FROM location_postcode)
""")
-
- cur.execute("""
- SELECT count(getorcreate_postcode_id(v)) FROM
- (SELECT distinct(postcode) as v FROM location_postcode) p
- """)
conn.commit()
+
+ with tokenizer.name_analyzer() as analyzer:
+ analyzer.add_postcodes_from_db()
@pytest.fixture
-def tokenizer_mock(monkeypatch, property_table, temp_db_conn):
+def tokenizer_mock(monkeypatch, property_table, temp_db_conn, dsn):
""" Sets up the configuration so that the test dummy tokenizer will be
loaded.
"""
"""
pass
+ def add_postcodes_from_db(self):
+ pass
+
def process_place(self, place):
""" Determine tokenizer information about the given place.
geometry_sector INTEGER)""")
cur.execute("""CREATE TABLE location_property_osmline (
place_id BIGINT,
+ osm_id BIGINT,
+ address HSTORE,
+ token_info JSONB,
indexed_status SMALLINT,
indexed_date TIMESTAMP,
geometry_sector INTEGER)""")
END;
$$ LANGUAGE plpgsql STABLE;
""")
+ cur.execute("""CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
+ RETURNS HSTORE AS $$
+ BEGIN
+ RETURN in_address;
+ END;
+ $$ LANGUAGE plpgsql STABLE;
+ """)
+
for table in ('placex', 'location_property_osmline', 'location_postcode'):
cur.execute("""CREATE TRIGGER {0}_update BEFORE UPDATE ON {0}
FOR EACH ROW EXECUTE PROCEDURE date_update()
next_id = next(self.osmline_id)
with self.conn.cursor() as cur:
cur.execute("""INSERT INTO location_property_osmline
- (place_id, indexed_status, geometry_sector)
- VALUES (%s, 1, %s)""",
- (next_id, sector))
+ (place_id, osm_id, indexed_status, geometry_sector)
+ VALUES (%s, %s, 1, %s)""",
+ (next_id, next_id, sector))
return next_id
def add_postcode(self, country, postcode):
import pytest
from nominatim.tools import postcodes
+import dummy_tokenizer
+
+@pytest.fixture
+def tokenizer():
+ return dummy_tokenizer.DummyTokenizer(None, None)
@pytest.fixture
def postcode_table(temp_db_with_extensions, temp_db_cursor, table_factory,
postcode TEXT,
geometry GEOMETRY(Geometry, 4326)""")
temp_db_cursor.execute('CREATE SEQUENCE seq_place')
- temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
- RETURNS INTEGER AS $$ BEGIN RETURN 1; END; $$ LANGUAGE plpgsql;
+ temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
+ RETURNS TEXT AS $$ BEGIN RETURN postcode; END; $$ LANGUAGE plpgsql;
""")
-def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path):
- postcodes.import_postcodes(dsn, tmp_path)
+def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
+ postcodes.import_postcodes(dsn, tmp_path, tokenizer)
assert temp_db_cursor.table_exists('gb_postcode')
assert temp_db_cursor.table_exists('us_postcode')
assert temp_db_cursor.table_rows('location_postcode') == 0
-def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path):
+def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
temp_db_cursor.execute("""
INSERT INTO placex (place_id, country_code, address, geometry)
VALUES (1, 'xx', '"postcode"=>"9486"', 'SRID=4326;POINT(10 12)')
""")
- postcodes.import_postcodes(dsn, tmp_path)
+ postcodes.import_postcodes(dsn, tmp_path, tokenizer)
rows = temp_db_cursor.row_set(""" SELECT postcode, country_code,
ST_X(geometry), ST_Y(geometry)