+
+ def add_housenumbers(self, conn, hnrs):
+ """ Extract housenumber information from the address.
+ """
+ if len(hnrs) == 1:
+ token = self.cache.get_housenumber(hnrs[0])
+ if token is not None:
+ self.data['hnr_tokens'] = token
+ self.data['hnr'] = hnrs[0]
+ return
+
+ # split numbers if necessary
+ simple_list = []
+ for hnr in hnrs:
+ simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
+
+ if len(simple_list) > 1:
+ simple_list = list(set(simple_list))
+
+ with conn.cursor() as cur:
+ cur.execute("SELECT (create_housenumbers(%s)).* ", (simple_list, ))
+ self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
+
+
+ def add_street(self, conn, street):
+ """ Add addr:street match terms.
+ """
+ def _get_street(name):
+ with conn.cursor() as cur:
+ return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))
+
+ self.data['street'] = self.cache.streets.get(street, _get_street)
+
+
+ def add_place(self, conn, place):
+ """ Add addr:place search and match terms.
+ """
+ def _get_place(name):
+ with conn.cursor() as cur:
+ cur.execute("""SELECT (addr_ids_from_name(%s)
+ || getorcreate_name_id(make_standard_name(%s), ''))::text,
+ word_ids_from_name(%s)::text""",
+ (name, name, name))
+ return cur.fetchone()
+
+ self.data['place_search'], self.data['place_match'] = \
+ self.cache.places.get(place, _get_place)
+
+
+ def add_address_terms(self, conn, terms):
+ """ Add additional address terms.
+ """
+ def _get_address_term(name):
+ with conn.cursor() as cur:
+ cur.execute("""SELECT addr_ids_from_name(%s)::text,
+ word_ids_from_name(%s)::text""",
+ (name, name))
+ return cur.fetchone()
+
+ tokens = {}
+ for key, value in terms:
+ tokens[key] = self.cache.address_terms.get(value, _get_address_term)
+
+ self.data['addr'] = tokens
+
+
+class _LRU:
+ """ Least recently used cache that accepts a generator function to
+ produce the item when there is a cache miss.
+ """
+
+ def __init__(self, maxsize=128, init_data=None):
+ self.data = init_data or OrderedDict()
+ self.maxsize = maxsize
+ if init_data is not None and len(init_data) > maxsize:
+ self.maxsize = len(init_data)
+
+ def get(self, key, generator):
+ """ Get the item with the given key from the cache. If nothing
+ is found in the cache, generate the value through the
+ generator function and store it in the cache.
+ """
+ value = self.data.get(key)
+ if value is not None:
+ self.data.move_to_end(key)
+ else:
+ value = generator(key)
+ if len(self.data) >= self.maxsize:
+ self.data.popitem(last=False)
+ self.data[key] = value
+
+ return value
+
+
+class _TokenCache:
+ """ Cache for token information to avoid repeated database queries.
+
+ This cache is not thread-safe and needs to be instantiated per
+ analyzer.
+ """
+ def __init__(self, conn):
+ # various LRU caches
+ self.streets = _LRU(maxsize=256)
+ self.places = _LRU(maxsize=128)
+ self.address_terms = _LRU(maxsize=1024)
+
+ # Lookup houseunumbers up to 100 and cache them
+ with conn.cursor() as cur:
+ cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
+ FROM generate_series(1, 100) as i""")
+ self._cached_housenumbers = {str(r[0]) : r[1] for r in cur}
+
+ # Get postcodes that are already saved
+ postcodes = OrderedDict()
+ with conn.cursor() as cur:
+ cur.execute("""SELECT word FROM word
+ WHERE class ='place' and type = 'postcode'""")
+ for row in cur:
+ postcodes[row[0]] = None
+ self.postcodes = _LRU(maxsize=32, init_data=postcodes)
+
+ def get_housenumber(self, number):
+ """ Get a housenumber token from the cache.
+ """
+ return self._cached_housenumbers.get(number)