X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/8413075249e1bb2832df4edd0f66d61f77fb9f99..bc8b2d4ae0dbaef64448ddcb530de9626da9d82d:/nominatim/tokenizer/legacy_icu_tokenizer.py

diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index eb850237..6d3d11c1 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -3,20 +3,15 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
 from collections import Counter
-import functools
-import io
 import itertools
-import json
 import logging
 import re
 from textwrap import dedent
 from pathlib import Path
 
-from icu import Transliterator
-import psycopg2.extras
-
 from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
+from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
@@ -103,9 +98,7 @@ class LegacyICUTokenizer:
         """
         self.init_from_project()
 
-        if self.normalization is None\
-           or self.transliteration is None\
-           or self.abbreviations is None:
+        if self.naming_rules is None:
             return "Configuration for tokenizer 'legacy_icu' are missing."
 
         return None
@@ -128,7 +121,7 @@ class LegacyICUTokenizer:
         """
         return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
 
-
+    # pylint: disable=missing-format-attribute
     def _install_php(self, phpdir):
         """ Install the php script for the tokenizer.
         """
@@ -168,22 +161,28 @@ class LegacyICUTokenizer:
         words = Counter()
         name_proc = ICUNameProcessor(self.naming_rules)
 
         with conn.cursor(name="words") as cur:
-            cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
+            cur.execute(""" SELECT v, count(*) FROM
+                              (SELECT svals(name) as v FROM place)x
+                            WHERE length(v) < 75 GROUP BY v""")
 
             for name, cnt in cur:
+                terms = set()
                 for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
-                    for term in word.split():
-                        words[term] += cnt
+                    if ' ' in word:
+                        terms.update(word.split())
+                for term in terms:
+                    words[term] += cnt
 
         # copy them back into the word table
-        copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
+        with CopyBuffer() as copystr:
+            for args in words.items():
+                copystr.add(*args)
 
-
-        with conn.cursor() as cur:
-            copystr.seek(0)
-            cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
-            cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-                           WHERE word_id is null""")
+            with conn.cursor() as cur:
+                copystr.copy_out(cur, 'word',
+                                 columns=['word_token', 'search_name_count'])
+                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
+                               WHERE word_id is null""")
 
         conn.commit()
@@ -219,7 +218,7 @@ class LegacyICUNameAnalyzer:
             self.conn = None
 
 
-    def get_word_token_info(self, conn, words):
+    def get_word_token_info(self, words):
         """ Return token information for the given list of words.
             If a word starts with # it is assumed to be a full name
             otherwise is a partial name.
@@ -233,11 +232,11 @@ class LegacyICUNameAnalyzer:
         tokens = {}
         for word in words:
             if word.startswith('#'):
-                tokens[word] = ' ' + self.name_processor.get_normalized(word[1:])
+                tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
             else:
-                tokens[word] = self.name_processor.get_normalized(word)
+                tokens[word] = self.name_processor.get_search_normalized(word)
 
-        with conn.cursor() as cur:
+        with self.conn.cursor() as cur:
             cur.execute("""SELECT word_token, word_id
                            FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                            WHERE word_token = t.term
@@ -245,7 +244,7 @@ class LegacyICUNameAnalyzer:
                         (list(tokens.values()), ))
             ids = {r[0]: r[1] for r in cur}
 
-        return [(k, v, ids[v]) for k, v in tokens.items()]
+        return [(k, v, ids.get(v, None)) for k, v in tokens.items()]
 
 
     @staticmethod
@@ -270,7 +269,6 @@ class LegacyICUNameAnalyzer:
             table.
         """
         to_delete = []
-        copystr = io.StringIO()
         with self.conn.cursor() as cur:
             # This finds us the rows in location_postcode and word that are
             # missing in the other table.
@@ -283,32 +281,31 @@ class LegacyICUNameAnalyzer:
                              ON pc = word) x
                           WHERE pc is null or word is null""")
 
-            for postcode, word in cur:
-                if postcode is None:
-                    to_delete.append(word)
-                else:
-                    copystr.write(postcode)
-                    copystr.write('\t ')
-                    copystr.write(self.name_processor.get_search_normalized(postcode))
-                    copystr.write('\tplace\tpostcode\t0\n')
+            with CopyBuffer() as copystr:
+                for postcode, word in cur:
+                    if postcode is None:
+                        to_delete.append(word)
+                    else:
+                        copystr.add(
+                            postcode,
+                            ' ' + self.name_processor.get_search_normalized(postcode),
+                            'place', 'postcode', 0)
 
-            if to_delete:
-                cur.execute("""DELETE FROM WORD
-                               WHERE class ='place' and type = 'postcode'
-                               and word = any(%s)
-                            """, (to_delete, ))
+                if to_delete:
+                    cur.execute("""DELETE FROM WORD
+                                   WHERE class ='place' and type = 'postcode'
+                                   and word = any(%s)
+                                """, (to_delete, ))
 
-            if copystr.getvalue():
-                copystr.seek(0)
-                cur.copy_from(copystr, 'word',
-                              columns=['word', 'word_token', 'class', 'type',
-                                       'search_name_count'])
+                copystr.copy_out(cur, 'word',
+                                 columns=['word', 'word_token', 'class', 'type',
+                                          'search_name_count'])
 
 
     def update_special_phrases(self, phrases, should_replace):
         """ Replace the search index for special phrases with the new phrases.
         """
-        norm_phrases = set(((self.name_processor.get_search_normalized(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                             for p in phrases))
 
         with self.conn.cursor() as cur:
@@ -320,40 +317,53 @@ class LegacyICUNameAnalyzer:
             for label, cls, typ, oper in cur:
                 existing_phrases.add((label, cls, typ, oper or '-'))
 
-            to_add = norm_phrases - existing_phrases
-            to_delete = existing_phrases - norm_phrases
-
-            if to_add:
-                copystr = io.StringIO()
-                for word, cls, typ, oper in to_add:
-                    term = self.name_processor.get_search_normalized(word)
-                    if term:
-                        copystr.write(word)
-                        copystr.write('\t ')
-                        copystr.write(term)
-                        copystr.write('\t')
-                        copystr.write(cls)
-                        copystr.write('\t')
-                        copystr.write(typ)
-                        copystr.write('\t')
-                        copystr.write(oper if oper in ('in', 'near') else '\\N')
-                        copystr.write('\t0\n')
-
-                copystr.seek(0)
-                cur.copy_from(copystr, 'word',
-                              columns=['word', 'word_token', 'class', 'type',
-                                       'operator', 'search_name_count'])
-
-            if to_delete and should_replace:
-                psycopg2.extras.execute_values(
-                    cur,
-                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
-                        WHERE word = name and class = in_class and type = in_type
-                        and ((op = '-' and operator is null) or op = operator)""",
-                    to_delete)
+            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
+            if should_replace:
+                deleted = self._remove_special_phrases(cur, norm_phrases,
+                                                       existing_phrases)
+            else:
+                deleted = 0
 
         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
-                 len(norm_phrases), len(to_add), len(to_delete))
+                 len(norm_phrases), added, deleted)
+
+
+    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
+        """ Add all phrases to the database that are not yet there.
+        """
+        to_add = new_phrases - existing_phrases
+
+        added = 0
+        with CopyBuffer() as copystr:
+            for word, cls, typ, oper in to_add:
+                term = self.name_processor.get_search_normalized(word)
+                if term:
+                    copystr.add(word, ' ' + term, cls, typ,
+                                oper if oper in ('in', 'near') else None, 0)
+                    added += 1
+
+            copystr.copy_out(cursor, 'word',
+                             columns=['word', 'word_token', 'class', 'type',
+                                      'operator', 'search_name_count'])
+
+        return added
+
+
+    @staticmethod
+    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
+        """ Remove all phrases from the databse that are no longer in the
+            new phrase list.
+        """
+        to_delete = existing_phrases - new_phrases
+
+        if to_delete:
+            cursor.execute_values(
+                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
+                    WHERE word = name and class = in_class and type = in_type
+                    and ((op = '-' and operator is null) or op = operator)""",
+                to_delete)
+
+        return len(to_delete)
 
 
     def add_country_names(self, country_code, names):
@@ -373,9 +383,9 @@ class LegacyICUNameAnalyzer:
             if word_tokens:
                 cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                                  search_name_count)
-                               (SELECT nextval('seq_word'), token, '{}', 0
+                               (SELECT nextval('seq_word'), token, %s, 0
                                 FROM unnest(%s) as token)
-                            """.format(country_code), (list(word_tokens),))
+                            """, (country_code, list(word_tokens)))
 
 
     def process_place(self, place):
@@ -398,33 +408,36 @@ class LegacyICUNameAnalyzer:
                 self.add_country_names(country_feature.lower(), names)
 
         address = place.get('address')
-        if address:
-            hnrs = []
-            addr_terms = []
-            for key, value in address.items():
-                if key == 'postcode':
-                    self._add_postcode(value)
-                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                    hnrs.append(value)
-                elif key == 'street':
-                    token_info.add_street(*self._compute_name_tokens({'name': value}))
-                elif key == 'place':
-                    token_info.add_place(*self._compute_name_tokens({'name': value}))
-                elif not key.startswith('_') and \
-                   key not in ('country', 'full'):
-                    addr_terms.append((key, *self._compute_name_tokens({'name': value})))
-
-            if hnrs:
-                hnrs = self._split_housenumbers(hnrs)
-                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
-
-            if addr_terms:
-                token_info.add_address_terms(addr_terms)
+        self._process_place_address(token_info, address)
 
         return token_info.data
 
 
+    def _process_place_address(self, token_info, address):
+        hnrs = []
+        addr_terms = []
+        for key, value in address.items():
+            if key == 'postcode':
+                self._add_postcode(value)
+            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                hnrs.append(value)
+            elif key == 'street':
+                token_info.add_street(*self._compute_name_tokens({'name': value}))
+            elif key == 'place':
+                token_info.add_place(*self._compute_name_tokens({'name': value}))
+            elif not key.startswith('_') and \
+               key not in ('country', 'full'):
+                addr_terms.append((key, *self._compute_name_tokens({'name': value})))
+
+        if hnrs:
+            hnrs = self._split_housenumbers(hnrs)
+            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
+
+        if addr_terms:
+            token_info.add_address_terms(addr_terms)
+
+
     def _compute_name_tokens(self, names):
         """ Computes the full name and partial name tokens for the given
            dictionary of names.
@@ -438,6 +451,9 @@ class LegacyICUNameAnalyzer:
             full, part = self._cache.names.get(norm_name, (None, None))
             if full is None:
                 variants = self.name_processor.get_variants_ascii(norm_name)
+                if not variants:
+                    continue
+
                 with self.conn.cursor() as cur:
                     cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                 (norm_name, variants))
@@ -451,17 +467,19 @@ class LegacyICUNameAnalyzer:
         return full_tokens, partial_tokens
 
 
-    def _compute_full_names(self, names):
+    @staticmethod
+    def _compute_full_names(names):
         """ Return the set of all full name word ids to be used with the
             given dictionary of names.
         """
         full_names = set()
-        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
-            full_names.add(name.strip())
+        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
+            if name:
+                full_names.add(name)
 
-            brace_idx = name.find('(')
-            if brace_idx >= 0:
-                full_names.add(name[:brace_idx].strip())
+                brace_idx = name.find('(')
+                if brace_idx >= 0:
+                    full_names.add(name[:brace_idx].strip())
 
         return full_names
 
@@ -534,7 +552,7 @@ class _TokenInfo:
         self.data['hnr'] = ';'.join(hnrs)
 
 
-    def add_street(self, fulls, partials):
+    def add_street(self, fulls, _):
         """ Add addr:street match terms.
         """
         if fulls: