+ assert self.conn is not None
+
+ token_info = _TokenInfo(self._cache)
+
+ names = place.name
+
+ if names:
+ token_info.add_names(self.conn, names)
+
+ if place.is_country():
+ assert place.country_code is not None
+ self.add_country_names(place.country_code, names)
+
+ address = place.address
+ if address:
+ self._process_place_address(token_info, address)
+
+ return token_info.data
+
+
+ def _process_place_address(self, token_info: '_TokenInfo', address: Mapping[str, str]) -> None:
+ assert self.conn is not None
+ hnrs = []
+ addr_terms = []
+
+ for key, value in address.items():
+ if key == 'postcode':
+ # Make sure the normalized postcode is present in the word table.
+ if re.search(r'[:,;]', value) is None:
+ norm_pc = self.normalize_postcode(value)
+ token_info.set_postcode(norm_pc)
+ self._cache.add_postcode(self.conn, norm_pc)
+ elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+ hnrs.append(value)
+ elif key == 'street':
+ token_info.add_street(self.conn, value)
+ elif key == 'place':
+ token_info.add_place(self.conn, value)
+ elif not key.startswith('_') \
+ and key not in ('country', 'full', 'inclusion'):
+ addr_terms.append((key, value))
+
+ if hnrs:
+ token_info.add_housenumbers(self.conn, hnrs)
+
+ if addr_terms:
+ token_info.add_address_terms(self.conn, addr_terms)
+
+
+
+class _TokenInfo:
+ """ Collect token information to be sent back to the database.
+ """
+ def __init__(self, cache: '_TokenCache') -> None:
+ self.cache = cache
+ self.data: Dict[str, Any] = {}
+
+
+ def add_names(self, conn: Connection, names: Mapping[str, str]) -> None:
+ """ Add token information for the names of the place.
+ """
+ with conn.cursor() as cur:
+ # Create the token IDs for all names.
+ self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
+ (names, ))
+
+
+ def add_housenumbers(self, conn: Connection, hnrs: Sequence[str]) -> None:
+ """ Extract housenumber information from the address.
+ """
+ if len(hnrs) == 1:
+ token = self.cache.get_housenumber(hnrs[0])
+ if token is not None:
+ self.data['hnr_tokens'] = token
+ self.data['hnr'] = hnrs[0]
+ return
+
+ # split numbers if necessary
+ simple_list: List[str] = []
+ for hnr in hnrs:
+ simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
+
+ if len(simple_list) > 1:
+ simple_list = list(set(simple_list))
+
+ with conn.cursor() as cur:
+ cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
+ self.data['hnr_tokens'], self.data['hnr'] = \
+ cur.fetchone() # type: ignore[no-untyped-call]
+
+
+ def set_postcode(self, postcode: str) -> None:
+ """ Set or replace the postcode token with the given value.
+ """
+ self.data['postcode'] = postcode
+
+ def add_street(self, conn: Connection, street: str) -> None:
+ """ Add addr:street match terms.
+ """
+ def _get_street(name: str) -> List[int]:
+ with conn.cursor() as cur:
+ return cast(List[int],
+ cur.scalar("SELECT word_ids_from_name(%s)::text", (name, )))
+
+ tokens = self.cache.streets.get(street, _get_street)
+ if tokens:
+ self.data['street'] = tokens
+
+
+ def add_place(self, conn: Connection, place: str) -> None:
+ """ Add addr:place search and match terms.
+ """
+ def _get_place(name: str) -> Tuple[List[int], List[int]]:
+ with conn.cursor() as cur:
+ cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
+ word_ids_from_name(%s)::text""",
+ (name, name))
+ return cast(Tuple[List[int], List[int]],
+ cur.fetchone()) # type: ignore[no-untyped-call]
+
+ self.data['place_search'], self.data['place_match'] = \
+ self.cache.places.get(place, _get_place)
+
+
+ def add_address_terms(self, conn: Connection, terms: Sequence[Tuple[str, str]]) -> None:
+ """ Add additional address terms.
+ """
+ def _get_address_term(name: str) -> Tuple[List[int], List[int]]:
+ with conn.cursor() as cur:
+ cur.execute("""SELECT addr_ids_from_name(%s)::text,
+ word_ids_from_name(%s)::text""",
+ (name, name))
+ return cast(Tuple[List[int], List[int]],
+ cur.fetchone()) # type: ignore[no-untyped-call]
+
+ tokens = {}
+ for key, value in terms:
+ items = self.cache.address_terms.get(value, _get_address_term)
+ if items[0] or items[1]:
+ tokens[key] = items
+
+ if tokens:
+ self.data['addr'] = tokens
+
+
+class _LRU:
+ """ Least recently used cache that accepts a generator function to
+ produce the item when there is a cache miss.
+ """
+
+ def __init__(self, maxsize: int = 128):
+ self.data: 'OrderedDict[str, Any]' = OrderedDict()
+ self.maxsize = maxsize
+
+
+ def get(self, key: str, generator: Callable[[str], Any]) -> Any:
+ """ Get the item with the given key from the cache. If nothing
+ is found in the cache, generate the value through the
+ generator function and store it in the cache.
+ """
+ value = self.data.get(key)
+ if value is not None:
+ self.data.move_to_end(key)
+ else:
+ value = generator(key)
+ if len(self.data) >= self.maxsize:
+ self.data.popitem(last=False)
+ self.data[key] = value
+
+ return value
+
+
+class _TokenCache:
+ """ Cache for token information to avoid repeated database queries.
+
+ This cache is not thread-safe and needs to be instantiated per
+ analyzer.
+ """
+ def __init__(self, conn: Connection):
+ # various LRU caches
+ self.streets = _LRU(maxsize=256)
+ self.places = _LRU(maxsize=128)
+ self.address_terms = _LRU(maxsize=1024)
+
+ # Lookup houseunumbers up to 100 and cache them
+ with conn.cursor() as cur:
+ cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
+ FROM generate_series(1, 100) as i""")
+ self._cached_housenumbers: Dict[str, str] = {str(r[0]): r[1] for r in cur}
+
+ # For postcodes remember the ones that have already been added
+ self.postcodes: Set[str] = set()
+
+ def get_housenumber(self, number: str) -> Optional[str]:
+ """ Get a housenumber token from the cache.
+ """
+ return self._cached_housenumbers.get(number)
+
+
+ def add_postcode(self, conn: Connection, postcode: str) -> None:
+ """ Make sure the given postcode is in the database.
+ """
+ if postcode not in self.postcodes:
+ with conn.cursor() as cur:
+ cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
+ self.postcodes.add(postcode)