"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
+from collections import OrderedDict
import logging
import re
import shutil
self.conn.close()
self.conn = None
+
+ def add_postcodes_from_db(self):
+ """ Add postcodes from the location_postcode table to the word table.
+ """
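+        # Runs the database function create_postcode_id() once for every
+        # distinct postcode, which enters the postcode into the word table.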
+ with self.conn.cursor() as cur:
+ cur.execute("""SELECT count(create_postcode_id(pc))
+ FROM (SELECT distinct(postcode) as pc
+ FROM location_postcode) x""")
+
def process_place(self, place):
""" Determine tokenizer information about the given place.
address = place.get('address')
if address:
+ self._add_postcode(address.get('postcode'))
token_info.add_housenumbers(self.conn, address)
+ token_info.add_address_parent(self.conn, address.get('street'),
+ address.get('place'))
+ token_info.add_address_parts(self.conn, address)
return token_info.data
+ def _add_postcode(self, postcode):
+ """ Make sure the normalized postcode is present in the word table.
+ """
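+        # Values containing separator characters are most likely lists of
+        # several postcodes, so skip them.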
+ if not postcode or re.search(r'[:,;]', postcode) is not None:
+ return
+
+        def _create_postcode_from_db(pcode):
+            with self.conn.cursor() as cur:
+                cur.execute('SELECT create_postcode_id(%s)', (pcode, ))
+            # Return a marker value: _LRU treats a stored None as a miss,
+            # so a bare return would rerun the query on every call.
+            return True
+
+ self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)
+
+
class _TokenInfo:
""" Collect token information to be sent back to the database.
"""
self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
+ def add_address_parent(self, conn, street, place):
+ """ Extract the tokens for street and place terms.
+ """
+ def _get_streetplace(name):
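+            # Returns a pair of (search tokens, match tokens): the first
+            # column adds the id of the full standardised name to the
+            # partial-word ids, the second contains the plain word ids.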
+ with conn.cursor() as cur:
+ cur.execute("""SELECT (addr_ids_from_name(%s) || getorcreate_name_id(make_standard_name(%s), ''))::text,
+ word_ids_from_name(%s)::text""",
+ (name, name, name))
+ return cur.fetchone()
+
+ if street:
+ self.data['street_search'], self.data['street_match'] = \
+ self.cache.streets.get(street, _get_streetplace)
+
+        if place:
+            self.data['place_search'], self.data['place_match'] = \
+                self.cache.places.get(place, _get_streetplace)
+
+
+ def add_address_parts(self, conn, address):
+ """ Extract address terms.
+ """
+ def _get_address_term(name):
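+            # Returns a pair of (search token ids, match token ids).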
+ with conn.cursor() as cur:
+ cur.execute("""SELECT addr_ids_from_name(%s)::text,
+ word_ids_from_name(%s)::text""",
+ (name, name))
+ return cur.fetchone()
+
+ tokens = {}
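+        # Skip internal keys ('_'-prefixed) and parts that are handled
+        # separately: country, street, place, postcode, full addresses
+        # and the various housenumber variants.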
+ for key, value in address.items():
+ if not key.startswith('_') and \
+ key not in ('country', 'street', 'place', 'postcode', 'full',
+ 'housenumber', 'streetnumber', 'conscriptionnumber'):
+ tokens[key] = self.cache.address_terms.get(value, _get_address_term)
+
+ if tokens:
+ self.data['addr'] = tokens
+
+
+class _LRU:
+ """ Least recently used cache that accepts a generator function to
+ produce the item when there is a cache miss.
+ """
+
+ def __init__(self, maxsize=128):
+ self.data = OrderedDict()
+ self.maxsize = maxsize
+
+ def get(self, key, generator):
+ """ Get the item with the given key from the cache. If nothing
+ is found in the cache, generate the value through the
+ generator function and store it in the cache.
+ """
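+        # None doubles as the miss marker, so generator functions must
+        # return a non-None value for caching to take effect.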
+ value = self.data.get(key)
+ if value is not None:
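+            # Cache hit: mark the entry as most recently used.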
+ self.data.move_to_end(key)
+ else:
+ value = generator(key)
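+            # Cache full? Evict the least recently used entry, which sits
+            # at the front of the OrderedDict.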
+ if len(self.data) >= self.maxsize:
+ self.data.popitem(last=False)
+ self.data[key] = value
+
+ return value
+
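+# Illustrative use of _LRU (example only, not part of the module):
+#
+#   cache = _LRU(maxsize=2)
+#   cache.get('ab', len)    # miss: len('ab') is computed and stored
+#   cache.get('ab', len)    # hit: returns the cached 2, len is not called
+#   cache.get('cdef', len)
+#   cache.get('g', len)     # cache full: 'ab' is evicted as oldest entry
+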
+
class _TokenCache:
""" Cache for token information to avoid repeated database queries.
analyzer.
"""
def __init__(self, conn):
+ # various LRU caches
+ self.postcodes = _LRU(maxsize=32)
+ self.streets = _LRU(maxsize=256)
+ self.places = _LRU(maxsize=128)
+ self.address_terms = _LRU(maxsize=1024)
+
        # Look up housenumbers up to 100 and cache them.
with conn.cursor() as cur:
cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text