From d48793c22cd2625d5390364dfb0ec04a2cc8d0f9 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sun, 25 Jul 2021 15:30:47 +0200 Subject: [PATCH] fix Python linting errors --- nominatim/tokenizer/legacy_icu_tokenizer.py | 41 ++++++++++++--------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index 1f8096ff..a887ae28 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -79,7 +79,6 @@ class LegacyICUTokenizer: """ Do any required postprocessing to make the tokenizer data ready for use. """ - pass def update_sql_functions(self, config): @@ -156,25 +155,12 @@ class LegacyICUTokenizer: LOG.warning("Precomputing word tokens") # get partial words and their frequencies - words = Counter() - name_proc = ICUNameProcessor(self.naming_rules) - with conn.cursor(name="words") as cur: - cur.execute(""" SELECT v, count(*) FROM - (SELECT svals(name) as v FROM place)x - WHERE length(v) < 75 GROUP BY v""") - - for name, cnt in cur: - terms = set() - for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)): - if ' ' in word: - terms.update(word.split()) - for term in terms: - words[term] += cnt + words = self._count_partial_terms(conn) # copy them back into the word table with CopyBuffer() as copystr: - for k, v in words.items(): - copystr.add('w', k, json.dumps({'count': v})) + for term, cnt in words.items(): + copystr.add('w', term, json.dumps({'count': cnt})) with conn.cursor() as cur: copystr.copy_out(cur, 'word', @@ -184,6 +170,27 @@ class LegacyICUTokenizer: conn.commit() + def _count_partial_terms(self, conn): + """ Count the partial terms from the names in the place table. 
+ """ + words = Counter() + name_proc = ICUNameProcessor(self.naming_rules) + + with conn.cursor(name="words") as cur: + cur.execute(""" SELECT v, count(*) FROM + (SELECT svals(name) as v FROM place)x + WHERE length(v) < 75 GROUP BY v""") + + for name, cnt in cur: + terms = set() + for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)): + if ' ' in word: + terms.update(word.split()) + for term in terms: + words[term] += cnt + + return words + class LegacyICUNameAnalyzer: """ The legacy analyzer uses the ICU library for splitting names. -- 2.39.5