X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/2e3c5d4c5b39e29af57a9398f20fdf5cad0e9045..70f154be8b69d3b57eebd25eff225ee29ccc97ba:/nominatim/tokenizer/legacy_icu_tokenizer.py

diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index 20932144..14fa5b60 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -3,20 +3,15 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
 from collections import Counter
-import functools
-import io
 import itertools
-import json
 import logging
 import re
 from textwrap import dedent
 from pathlib import Path

-from icu import Transliterator
-import psycopg2.extras
-
 from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
+from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
@@ -79,13 +74,11 @@ class LegacyICUTokenizer:
             self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)


-    def finalize_import(self, config):
+    def finalize_import(self, _):
         """ Do any required postprocessing to make the tokenizer data ready
             for use.
         """
-        with connect(self.dsn) as conn:
-            sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
+        pass


     def update_sql_functions(self, config):
@@ -103,9 +96,7 @@ class LegacyICUTokenizer:
         """
         self.init_from_project()

-        if self.normalization is None\
-           or self.transliteration is None\
-           or self.abbreviations is None:
+        if self.naming_rules is None:
             return "Configuration for tokenizer 'legacy_icu' is missing."

         return None
@@ -133,13 +124,12 @@ class LegacyICUTokenizer:
         """ Install the php script for the tokenizer.
         """
         php_file = self.data_dir / "tokenizer.php"
-        php_file.write_text(dedent("""\
+        php_file.write_text(dedent(f"""\

[... the PHP template body and the start of the update_postcodes_from_db() hunk
did not survive the plain-text export; only the tail of the new SELECT statement
is recoverable ...]

+                                  (SELECT info->>'postcode' as word FROM word
+                                    WHERE type = 'P') w
+                                  ON pc = word) x
+                               WHERE pc is null or word is null""")

-                for postcode, word in cur:
-                    if postcode is None:
-                        to_delete.append(word)
-                    else:
-                        copystr.write(postcode)
-                        copystr.write('\t ')
-                        copystr.write(self.name_processor.get_search_normalized(postcode))
-                        copystr.write('\tplace\tpostcode\t0\n')
+                with CopyBuffer() as copystr:
+                    for postcode, word in cur:
+                        if postcode is None:
+                            to_delete.append(word)
+                        else:
+                            copystr.add(self.name_processor.get_search_normalized(postcode),
+                                        'P', {'postcode': postcode})

-                if to_delete:
-                    cur.execute("""DELETE FROM WORD
-                                   WHERE class ='place' and type = 'postcode'
-                                         and word = any(%s)
-                                """, (to_delete, ))
+                    if to_delete:
+                        cur.execute("""DELETE FROM WORD
+                                       WHERE type ='P' and info->>'postcode' = any(%s)
+                                    """, (to_delete, ))

-                if copystr.getvalue():
-                    copystr.seek(0)
-                    cur.copy_from(copystr, 'word',
-                                  columns=['word', 'word_token', 'class', 'type',
-                                           'search_name_count'])
+                    copystr.copy_out(cur, 'word',
+                                     columns=['word_token', 'type', 'info'])


     def update_special_phrases(self, phrases, should_replace):
         """ Replace the search index for special phrases with the new phrases.
+            If `should_replace` is True, then the previous set of phrases will
+            be completely replaced. Otherwise the phrases are added to the
+            already existing ones.
         """
         norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                             for p in phrases))
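The hand-rolled io.StringIO buffers and cur.copy_from() calls above are replaced
by the new CopyBuffer helper imported from nominatim.db.utils. Its implementation
is not part of this diff, so the following is only a minimal sketch of the
interface the new code relies on; the class name, the JSON handling and the
omitted escaping are all assumptions:

    import io
    import json

    class CopyBufferSketch:
        """ Collect rows in PostgreSQL COPY text format, then stream them out.
            (Illustrative stand-in for nominatim.db.utils.CopyBuffer.)
        """

        def __init__(self):
            self.buffer = io.StringIO()

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            self.buffer.close()

        def add(self, *data):
            # One row per call; dict columns are serialised as JSON so they
            # can land in a jsonb column. (A real implementation must also
            # escape tabs, newlines and backslashes in string values.)
            self.buffer.write('\t'.join(
                '\\N' if col is None                          # COPY notation for NULL
                else json.dumps(col) if isinstance(col, dict)  # jsonb payloads
                else str(col)
                for col in data))
            self.buffer.write('\n')

        def copy_out(self, cur, table, columns=None):
            # Stream the collected rows into the table; no-op when empty.
            if self.buffer.tell() > 0:
                self.buffer.seek(0)
                cur.copy_from(self.buffer, table, columns=columns)

This also suggests why the explicit `if copystr.getvalue():` guard could be
dropped above: the empty-buffer check moves into copy_out().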
""" norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3]) for p in phrases)) @@ -314,46 +310,59 @@ class LegacyICUNameAnalyzer: with self.conn.cursor() as cur: # Get the old phrases. existing_phrases = set() - cur.execute("""SELECT word, class, type, operator FROM word - WHERE class != 'place' - OR (type != 'house' AND type != 'postcode')""") - for label, cls, typ, oper in cur: - existing_phrases.add((label, cls, typ, oper or '-')) - - to_add = norm_phrases - existing_phrases - to_delete = existing_phrases - norm_phrases - - if to_add: - copystr = io.StringIO() - for word, cls, typ, oper in to_add: - term = self.name_processor.get_search_normalized(word) - if term: - copystr.write(word) - copystr.write('\t ') - copystr.write(term) - copystr.write('\t') - copystr.write(cls) - copystr.write('\t') - copystr.write(typ) - copystr.write('\t') - copystr.write(oper if oper in ('in', 'near') else '\\N') - copystr.write('\t0\n') - - copystr.seek(0) - cur.copy_from(copystr, 'word', - columns=['word', 'word_token', 'class', 'type', - 'operator', 'search_name_count']) - - if to_delete and should_replace: - psycopg2.extras.execute_values( - cur, - """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op) - WHERE word = name and class = in_class and type = in_type - and ((op = '-' and operator is null) or op = operator)""", - to_delete) + cur.execute("SELECT info FROM word WHERE type = 'S'") + for (info, ) in cur: + existing_phrases.add((info['word'], info['class'], info['type'], + info.get('op') or '-')) + + added = self._add_special_phrases(cur, norm_phrases, existing_phrases) + if should_replace: + deleted = self._remove_special_phrases(cur, norm_phrases, + existing_phrases) + else: + deleted = 0 LOG.info("Total phrases: %s. Added: %s. Deleted: %s", - len(norm_phrases), len(to_add), len(to_delete)) + len(norm_phrases), added, deleted) + + + def _add_special_phrases(self, cursor, new_phrases, existing_phrases): + """ Add all phrases to the database that are not yet there. + """ + to_add = new_phrases - existing_phrases + + added = 0 + with CopyBuffer() as copystr: + for word, cls, typ, oper in to_add: + term = self.name_processor.get_search_normalized(word) + if term: + copystr.add(term, 'S', + {'word': word, 'class': cls, 'type': typ, + 'op': oper if oper in ('in', 'near') else None}) + added += 1 + + copystr.copy_out(cursor, 'word', + columns=['word_token', 'type', 'info']) + + return added + + + @staticmethod + def _remove_special_phrases(cursor, new_phrases, existing_phrases): + """ Remove all phrases from the databse that are no longer in the + new phrase list. 
@@ -361,21 +370,27 @@ class LegacyICUNameAnalyzer:
         """
         word_tokens = set()
         for name in self._compute_full_names(names):
-            if name:
-                word_tokens.add(' ' + self.name_processor.get_search_normalized(name))
+            norm_name = self.name_processor.get_search_normalized(name)
+            if norm_name:
+                word_tokens.add(norm_name)

         with self.conn.cursor() as cur:
             # Get existing names
-            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
+            cur.execute("""SELECT word_token FROM word
+                            WHERE type = 'C' and info->>'cc'= %s""",
                         (country_code, ))
             word_tokens.difference_update((t[0] for t in cur))

+            # Only add those names that are not yet in the list.
             if word_tokens:
-                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
-                                                 search_name_count)
-                               (SELECT nextval('seq_word'), token, '{}', 0
+                cur.execute("""INSERT INTO word (word_token, type, info)
+                               (SELECT token, 'C', json_build_object('cc', %s)
                                 FROM unnest(%s) as token)
-                            """.format(country_code), (list(word_tokens),))
+                            """, (country_code, list(word_tokens)))
+
+        # No names are deleted at the moment.
+        # If deletion is made possible, then the static names from the
+        # initial 'country_name' table should be kept.


     def process_place(self, place):
@@ -398,33 +413,36 @@ class LegacyICUNameAnalyzer:
                     self.add_country_names(country_feature.lower(), names)

         address = place.get('address')
-        if address:
-            hnrs = []
-            addr_terms = []
-            for key, value in address.items():
-                if key == 'postcode':
-                    self._add_postcode(value)
-                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                    hnrs.append(value)
-                elif key == 'street':
-                    token_info.add_street(*self._compute_name_tokens({'name': value}))
-                elif key == 'place':
-                    token_info.add_place(*self._compute_name_tokens({'name': value}))
-                elif not key.startswith('_') and \
-                     key not in ('country', 'full'):
-                    addr_terms.append((key, *self._compute_name_tokens({'name': value})))
-
-            if hnrs:
-                hnrs = self._split_housenumbers(hnrs)
-                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
-
-            if addr_terms:
-                token_info.add_address_terms(addr_terms)
+        self._process_place_address(token_info, address)

         return token_info.data


+    def _process_place_address(self, token_info, address):
+        hnrs = []
+        addr_terms = []
+        for key, value in address.items():
+            if key == 'postcode':
+                self._add_postcode(value)
+            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                hnrs.append(value)
+            elif key == 'street':
+                token_info.add_street(*self._compute_name_tokens({'name': value}))
+            elif key == 'place':
+                token_info.add_place(*self._compute_name_tokens({'name': value}))
+            elif not key.startswith('_') and \
+                 key not in ('country', 'full'):
+                addr_terms.append((key, *self._compute_name_tokens({'name': value})))
+
+        if hnrs:
+            hnrs = self._split_housenumbers(hnrs)
+            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
+
+        if addr_terms:
+            token_info.add_address_terms(addr_terms)
+
+
     def _compute_name_tokens(self, names):
         """ Computes the full name and partial name tokens for the given
             dictionary of names.
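For orientation, a hypothetical driver for process_place() and the helper split
out above. The name_analyzer() entry point and its use as a context manager are
assumptions about the surrounding tokenizer API, and the place values are
invented:

    place = {
        'name': {'name': 'Hauptstraße'},
        'address': {'street': 'Hauptstraße',
                    'postcode': '12345',
                    'housenumber': '8;10'},
    }

    with tokenizer.name_analyzer() as analyzer:   # assumed entry point
        data = analyzer.process_place(place)

    # 'data' is the dict assembled by _TokenInfo: street and address tokens
    # from _process_place_address() plus an 'hnr' entry with the normalized
    # housenumbers.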
@@ -438,6 +456,9 @@ class LegacyICUNameAnalyzer:
                 full, part = self._cache.names.get(norm_name, (None, None))
                 if full is None:
                     variants = self.name_processor.get_variants_ascii(norm_name)
+                    if not variants:
+                        continue
+
                     with self.conn.cursor() as cur:
                         cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                     (norm_name, variants))
@@ -451,17 +472,19 @@ class LegacyICUNameAnalyzer:

         return full_tokens, partial_tokens


-    def _compute_full_names(self, names):
+    @staticmethod
+    def _compute_full_names(names):
         """ Return the set of all full name word ids to be used with the
             given dictionary of names.
         """
         full_names = set()
-        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
-            full_names.add(name.strip())
+        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
+            if name:
+                full_names.add(name)

-            brace_idx = name.find('(')
-            if brace_idx >= 0:
-                full_names.add(name[:brace_idx].strip())
+                brace_idx = name.find('(')
+                if brace_idx >= 0:
+                    full_names.add(name[:brace_idx].strip())

         return full_names

@@ -479,14 +502,13 @@ class LegacyICUNameAnalyzer:

         with self.conn.cursor() as cur:
             # no word_id needed for postcodes
-            cur.execute("""INSERT INTO word (word, word_token, class, type,
-                                             search_name_count)
-                           (SELECT pc, %s, 'place', 'postcode', 0
+            cur.execute("""INSERT INTO word (word_token, type, info)
+                           (SELECT %s, 'P', json_build_object('postcode', pc)
                              FROM (VALUES (%s)) as v(pc)
                              WHERE NOT EXISTS
                                   (SELECT * FROM word
-                                    WHERE word = pc and class='place' and type='postcode'))
-                        """, (' ' + term, postcode))
+                                    WHERE type = 'P' and info->>'postcode' = pc))
+                        """, (term, postcode))
         self._cache.postcodes.add(postcode)

@@ -534,7 +556,7 @@ class _TokenInfo:
         self.data['hnr'] = ';'.join(hnrs)


-    def add_street(self, fulls, partials):
+    def add_street(self, fulls, _):
         """ Add addr:street match terms.
         """
         if fulls:
@@ -577,7 +599,8 @@ class _TokenCache:

     def get_hnr_tokens(self, conn, terms):
         """ Get token ids for a list of housenumbers, looking them up in the
-            database if necessary.
+            database if necessary. `terms` is an iterable of normalized
+            housenumbers.
         """
         tokens = []
         askdb = []
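To make the behaviour change in _compute_full_names() concrete, here is a
standalone copy of the '+' version together with a worked example (the sample
names are invented):

    import re

    def compute_full_names(names):
        """ Standalone copy of the reworked _compute_full_names() above. """
        full_names = set()
        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
            if name:
                full_names.add(name)

                brace_idx = name.find('(')
                if brace_idx >= 0:
                    full_names.add(name[:brace_idx].strip())

        return full_names

    # Multi-value names are split on ';' and ',', empty parts are now dropped
    # instead of being added as '', and a bracketed suffix also contributes
    # the bare prefix as a name of its own.
    assert compute_full_names({'name': 'Jim Knopf; ,Lukas',
                               'alt_name': 'Molli (Bäderbahn)'}) \
           == {'Jim Knopf', 'Lukas', 'Molli (Bäderbahn)', 'Molli'}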