X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/70f154be8b69d3b57eebd25eff225ee29ccc97ba..1e9f37ab82db1758235bedf83c659693f4ca6c3e:/nominatim/tokenizer/legacy_icu_tokenizer.py

diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index 14fa5b60..a887ae28 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -4,6 +4,7 @@ libICU instead of the PostgreSQL module.
 """
 from collections import Counter
 import itertools
+import json
 import logging
 import re
 from textwrap import dedent
@@ -78,7 +79,6 @@ class LegacyICUTokenizer:
         """ Do any required postprocessing to make the tokenizer data ready
             for use.
         """
-        pass


     def update_sql_functions(self, config):
@@ -155,25 +155,12 @@ class LegacyICUTokenizer:
         LOG.warning("Precomputing word tokens")

         # get partial words and their frequencies
-        words = Counter()
-        name_proc = ICUNameProcessor(self.naming_rules)
-        with conn.cursor(name="words") as cur:
-            cur.execute(""" SELECT v, count(*) FROM
-                              (SELECT svals(name) as v FROM place)x
-                            WHERE length(v) < 75 GROUP BY v""")
-
-            for name, cnt in cur:
-                terms = set()
-                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
-                    if ' ' in word:
-                        terms.update(word.split())
-                for term in terms:
-                    words[term] += cnt
+        words = self._count_partial_terms(conn)

         # copy them back into the word table
         with CopyBuffer() as copystr:
-            for k, v in words.items():
-                copystr.add('w', k, {'count': v})
+            for term, cnt in words.items():
+                copystr.add('w', term, json.dumps({'count': cnt}))

             with conn.cursor() as cur:
                 copystr.copy_out(cur, 'word',
@@ -183,6 +170,27 @@ class LegacyICUTokenizer:

         conn.commit()

+    def _count_partial_terms(self, conn):
+        """ Count the partial terms from the names in the place table.
+        """
+        words = Counter()
+        name_proc = ICUNameProcessor(self.naming_rules)
+
+        with conn.cursor(name="words") as cur:
+            cur.execute(""" SELECT v, count(*) FROM
+                              (SELECT svals(name) as v FROM place)x
+                            WHERE length(v) < 75 GROUP BY v""")
+
+            for name, cnt in cur:
+                terms = set()
+                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
+                    if ' ' in word:
+                        terms.update(word.split())
+                for term in terms:
+                    words[term] += cnt
+
+        return words
+

 class LegacyICUNameAnalyzer:
     """ The legacy analyzer uses the ICU library for splitting names.
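
The hunks above pull the partial-word counting out into a _count_partial_terms helper and serialize the COPY payload explicitly with json.dumps instead of passing a Python dict. As a rough, self-contained sketch of what the extracted helper computes (with a hypothetical normalize stand-in for the ICUNameProcessor pipeline, which in the real code also expands spelling variants):

    from collections import Counter

    def count_partial_terms(name_counts, normalize=str.lower):
        """ name_counts: iterable of (name, occurrences) pairs, as produced
            by the SELECT svals(name), count(*) query in the hunk above.
        """
        words = Counter()
        for name, cnt in name_counts:
            variant = normalize(name)   # stand-in for get_variants_ascii()
            terms = set()
            if ' ' in variant:          # only multi-word names yield partials
                terms.update(variant.split())
            # each distinct partial term is counted once per occurrence
            for term in terms:
                words[term] += cnt
        return words

    # "Main Street" seen 3 times yields Counter({'main': 3, 'street': 3});
    # single-word names like "Broadway" produce no partial terms.
    print(count_partial_terms([("Main Street", 3), ("Broadway", 1)]))
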
@@ -235,17 +243,17 @@ class LegacyICUNameAnalyzer:
                 partial_tokens[word] = self.name_processor.get_search_normalized(word)

         with self.conn.cursor() as cur:
-            cur.execute("""(SELECT word_token, word_id
-                            FROM word WHERE word_token = ANY(%s) and type = 'W')
-                           UNION
-                           (SELECT word_token, word_id
-                            FROM word WHERE word_token = ANY(%s) and type = 'w')""",
-                        (list(full_tokens.values()),
-                         list(partial_tokens.values())))
-            ids = {r[0]: r[1] for r in cur}
+            cur.execute("""SELECT word_token, word_id
+                           FROM word WHERE word_token = ANY(%s) and type = 'W'
+                        """, (list(full_tokens.values()),))
+            full_ids = {r[0]: r[1] for r in cur}
+            cur.execute("""SELECT word_token, word_id
+                           FROM word WHERE word_token = ANY(%s) and type = 'w'""",
+                        (list(partial_tokens.values()),))
+            part_ids = {r[0]: r[1] for r in cur}

-        return [(k, v, ids.get(v, None)) for k, v in full_tokens.items()] \
-               + [(k, v, ids.get(v, None)) for k, v in partial_tokens.items()]
+        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
+               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]


     @staticmethod
@@ -277,7 +285,7 @@ class LegacyICUNameAnalyzer:
                             (SELECT pc, word FROM
                               (SELECT distinct(postcode) as pc FROM location_postcode) p
                               FULL JOIN
-                              (SELECT info->>'postcode' as word FROM word WHERE type = 'P') w
+                              (SELECT word FROM word WHERE type = 'P') w
                               ON pc = word) x
                            WHERE pc is null or word is null""")

@@ -287,15 +295,15 @@ class LegacyICUNameAnalyzer:
                         to_delete.append(word)
                     else:
                         copystr.add(self.name_processor.get_search_normalized(postcode),
-                                    'P', {'postcode': postcode})
+                                    'P', postcode)

                 if to_delete:
                     cur.execute("""DELETE FROM WORD
-                                   WHERE type ='P' and info->>'postcode' = any(%s)
+                                   WHERE type ='P' and word = any(%s)
                                 """, (to_delete, ))

                 copystr.copy_out(cur, 'word',
-                                 columns=['word_token', 'type', 'info'])
+                                 columns=['word_token', 'type', 'word'])


     def update_special_phrases(self, phrases, should_replace):
@@ -310,9 +318,9 @@ class LegacyICUNameAnalyzer:
         with self.conn.cursor() as cur:
             # Get the old phrases.
             existing_phrases = set()
-            cur.execute("SELECT info FROM word WHERE type = 'S'")
-            for (info, ) in cur:
-                existing_phrases.add((info['word'], info['class'], info['type'],
+            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
+            for word, info in cur:
+                existing_phrases.add((word, info['class'], info['type'],
                                       info.get('op') or '-'))

             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
@@ -336,13 +344,13 @@ class LegacyICUNameAnalyzer:
             for word, cls, typ, oper in to_add:
                 term = self.name_processor.get_search_normalized(word)
                 if term:
-                    copystr.add(term, 'S',
-                                {'word': word, 'class': cls, 'type': typ,
-                                 'op': oper if oper in ('in', 'near') else None})
+                    copystr.add(term, 'S', word,
+                                json.dumps({'class': cls, 'type': typ,
+                                            'op': oper if oper in ('in', 'near') else None}))
                     added += 1

             copystr.copy_out(cursor, 'word',
-                             columns=['word_token', 'type', 'info'])
+                             columns=['word_token', 'type', 'word', 'info'])

         return added

@@ -357,7 +365,7 @@ class LegacyICUNameAnalyzer:
         if to_delete:
             cursor.execute_values(
                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
-                    WHERE info->>'word' = name
+                    WHERE type = 'S' and word = name
                           and info->>'class' = in_class and info->>'type' = in_type
                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
                 """, to_delete)
@@ -377,14 +385,14 @@ class LegacyICUNameAnalyzer:
         with self.conn.cursor() as cur:
             # Get existing names
             cur.execute("""SELECT word_token FROM word
-                            WHERE type = 'C' and info->>'cc'= %s""",
+                            WHERE type = 'C' and word = %s""",
                         (country_code, ))
             word_tokens.difference_update((t[0] for t in cur))

             # Only add those names that are not yet in the list.
             if word_tokens:
-                cur.execute("""INSERT INTO word (word_token, type, info)
-                               (SELECT token, 'C', json_build_object('cc', %s)
+                cur.execute("""INSERT INTO word (word_token, type, word)
+                               (SELECT token, 'C', %s
                                  FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

@@ -502,12 +510,11 @@ class LegacyICUNameAnalyzer:

             with self.conn.cursor() as cur:
                 # no word_id needed for postcodes
-                cur.execute("""INSERT INTO word (word_token, type, info)
-                               (SELECT %s, 'P', json_build_object('postcode', pc)
-                                  FROM (VALUES (%s)) as v(pc)
+                cur.execute("""INSERT INTO word (word_token, type, word)
+                               (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                 WHERE NOT EXISTS
                                         (SELECT * FROM word
-                                         WHERE type = 'P' and info->>postcode = pc))
+                                         WHERE type = 'P' and word = pc))
                            """, (term, postcode))
             self._cache.postcodes.add(postcode)
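
Taken together, these hunks move the searchable value of each entry out of the JSON info column and into a dedicated word column, for full words ('W'/'w'), postcodes ('P'), country names ('C') and special phrases ('S') alike. A minimal sketch of the resulting row layout (assumed for illustration only; this helper is hypothetical, not the project's code):

    import json

    def make_word_row(word_token, row_type, word=None, info=None):
        """ Build one row for a COPY into the word table: the literal value
            goes into the 'word' column, only auxiliary attributes stay in
            the JSON 'info' column.
        """
        return (word_token, row_type, word,
                json.dumps(info) if info is not None else None)

    # postcode: before the patch the value lived in info->>'postcode',
    # afterwards it is simply the 'word' column.
    print(make_word_row('12345', 'P', word='12345'))

    # special phrase: the phrase moves to 'word'; class/type/op remain in info.
    print(make_word_row('bar', 'S', word='Bar',
                        info={'class': 'amenity', 'type': 'bar', 'op': None}))

This is also what enables the simplified lookups seen above, e.g. WHERE type = 'P' and word = %s in place of WHERE type = 'P' and info->>'postcode' = %s.
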