From 243725aae1aa35e4cd3bf2dc241828ca9e9d79e5 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 15 Feb 2022 21:20:47 +0100 Subject: [PATCH] icu: move housenumber token computation out of TokenInfo This was the last function to use the cache. There is a cleaner separation of responsibility now. --- nominatim/tokenizer/icu_tokenizer.py | 70 +++++++++++----------------- 1 file changed, 28 insertions(+), 42 deletions(-) diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 05c5a3ea..9600e65b 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -282,13 +282,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): return postcode.strip().upper() - def _make_standard_hnr(self, hnr): - """ Create a normalised version of a housenumber. - - This function takes minor shortcuts on transliteration. - """ - return self._search_normalized(hnr) - def update_postcodes_from_db(self): """ Update postcode tokens in the word table from the location_postcode table. @@ -456,7 +449,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): Returns a JSON-serializable structure that will be handed into the database via the token_info field. 
""" - token_info = _TokenInfo(self._cache) + token_info = _TokenInfo() names, address = self.sanitizer.process_names(place) @@ -475,6 +468,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): def _process_place_address(self, token_info, address): + hnr_tokens = set() hnrs = set() addr_terms = [] streets = [] @@ -482,9 +476,10 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): if item.kind == 'postcode': self._add_postcode(item.name) elif item.kind == 'housenumber': - norm_name = self._make_standard_hnr(item.name) - if norm_name: - hnrs.add(norm_name) + token, hnr = self._compute_housenumber_token(item) + if token is not None: + hnr_tokens.add(token) + hnrs.add(hnr) elif item.kind == 'street': streets.extend(self._retrieve_full_tokens(item.name)) elif item.kind == 'place': @@ -495,7 +490,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): addr_terms.append((item.kind, self._compute_partial_tokens(item.name))) if hnrs: - token_info.add_housenumbers(self.conn, hnrs) + token_info.add_housenumbers(hnr_tokens, hnrs) if addr_terms: token_info.add_address_terms(addr_terms) @@ -504,6 +499,24 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): token_info.add_street(streets) + def _compute_housenumber_token(self, hnr): + """ Normalize the housenumber and return the word token and the + canonical form. + """ + norm_name = self._search_normalized(hnr.name) + if not norm_name: + return None, None + + token = self._cache.housenumbers.get(norm_name) + if token is None: + with self.conn.cursor() as cur: + cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, )) + token = cur.fetchone()[0] + self._cache.housenumbers[norm_name] = token + + return token, norm_name + + def _compute_partial_tokens(self, name): """ Normalize the given term, split it into partial words and return then token list for them. @@ -612,8 +625,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): class _TokenInfo: """ Collect token information to be sent back to the database. 
""" - def __init__(self, cache): - self._cache = cache + def __init__(self): self.data = {} @staticmethod @@ -627,11 +639,11 @@ class _TokenInfo: self.data['names'] = self._mk_array(itertools.chain(fulls, partials)) - def add_housenumbers(self, conn, hnrs): + def add_housenumbers(self, tokens, hnrs): """ Extract housenumber information from a list of normalised housenumbers. """ - self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs)) + self.data['hnr_tokens'] = self._mk_array(tokens) self.data['hnr'] = ';'.join(hnrs) @@ -670,29 +682,3 @@ class _TokenCache: self.fulls = {} self.postcodes = set() self.housenumbers = {} - - - def get_hnr_tokens(self, conn, terms): - """ Get token ids for a list of housenumbers, looking them up in the - database if necessary. `terms` is an iterable of normalized - housenumbers. - """ - tokens = [] - askdb = [] - - for term in terms: - token = self.housenumbers.get(term) - if token is None: - askdb.append(term) - else: - tokens.append(token) - - if askdb: - with conn.cursor() as cur: - cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr", - (askdb, )) - for term, tid in cur: - self.housenumbers[term] = tid - tokens.append(tid) - - return tokens -- 2.39.5