X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/1ffb6bd5d0e1aea120f953a55d72025f47206242..70f154be8b69d3b57eebd25eff225ee29ccc97ba:/nominatim/tokenizer/legacy_icu_tokenizer.py

diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index 156e99ec..14fa5b60 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -3,26 +3,21 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
 from collections import Counter
-import functools
-import io
 import itertools
-import json
 import logging
 import re
 from textwrap import dedent
 from pathlib import Path
 
-from icu import Transliterator
-import psycopg2.extras
-
 from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
+from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
+from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
 
-DBCFG_NORMALIZATION = "tokenizer_normalization"
 DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
-DBCFG_TRANSLITERATION = "tokenizer_transliteration"
-DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"
+DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
 
 LOG = logging.getLogger()
 
@@ -41,9 +36,9 @@ class LegacyICUTokenizer:
     def __init__(self, dsn, data_dir):
         self.dsn = dsn
         self.data_dir = data_dir
-        self.normalization = None
-        self.transliteration = None
-        self.abbreviations = None
+        self.naming_rules = None
+        self.term_normalization = None
+        self.max_word_frequency = None
 
 
     def init_new_db(self, config, init_db=True):
@@ -55,14 +50,14 @@ class LegacyICUTokenizer:
         if config.TOKENIZER_CONFIG:
             cfgfile = Path(config.TOKENIZER_CONFIG)
         else:
-            cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
+            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
 
-        rules = json.loads(cfgfile.read_text())
-        self.transliteration = ';'.join(rules['normalization']) + ';'
-        self.abbreviations = rules["abbreviations"]
-        self.normalization = config.TERM_NORMALIZATION
+        loader = ICURuleLoader(cfgfile)
+        self.naming_rules = ICUNameProcessorRules(loader=loader)
+        self.term_normalization = config.TERM_NORMALIZATION
+        self.max_word_frequency = config.MAX_WORD_FREQUENCY
 
-        self._install_php(config)
+        self._install_php(config.lib_dir.php)
         self._save_config(config)
 
         if init_db:
@@ -74,18 +69,16 @@ class LegacyICUTokenizer:
         """ Initialise the tokenizer from the project directory.
         """
         with connect(self.dsn) as conn:
-            self.normalization = get_property(conn, DBCFG_NORMALIZATION)
-            self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
-            self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))
+            self.naming_rules = ICUNameProcessorRules(conn=conn)
+            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
+            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
 
 
-    def finalize_import(self, config):
+    def finalize_import(self, _):
         """ Do any required postprocessing to make the tokenizer data ready
             for use.
""" - with connect(self.dsn) as conn: - sqlp = SQLPreprocessor(conn, config) - sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql') + pass def update_sql_functions(self, config): @@ -103,9 +96,7 @@ class LegacyICUTokenizer: """ self.init_from_project() - if self.normalization is None\ - or self.transliteration is None\ - or self.abbreviations is None: + if self.naming_rules is None: return "Configuration for tokenizer 'legacy_icu' are missing." return None @@ -126,26 +117,19 @@ class LegacyICUTokenizer: Analyzers are not thread-safe. You need to instantiate one per thread. """ - norm = Transliterator.createFromRules("normalizer", self.normalization) - trans = Transliterator.createFromRules("trans", self.transliteration) - return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations) + return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules)) - def _install_php(self, config): + def _install_php(self, phpdir): """ Install the php script for the tokenizer. """ - abbr_inverse = list(zip(*self.abbreviations)) php_file = self.data_dir / "tokenizer.php" - php_file.write_text(dedent("""\ + php_file.write_text(dedent(f"""\ >'postcode' as word FROM word WHERE type = 'P') w ON pc = word) x WHERE pc is null or word is null""") - for postcode, word in cur: - if postcode is None: - to_delete.append(word) - else: - copystr.write(postcode) - copystr.write('\t ') - copystr.write(self.transliterator.transliterate(postcode)) - copystr.write('\tplace\tpostcode\t0\n') + with CopyBuffer() as copystr: + for postcode, word in cur: + if postcode is None: + to_delete.append(word) + else: + copystr.add(self.name_processor.get_search_normalized(postcode), + 'P', {'postcode': postcode}) - if to_delete: - cur.execute("""DELETE FROM WORD - WHERE class ='place' and type = 'postcode' - and word = any(%s) - """, (to_delete, )) + if to_delete: + cur.execute("""DELETE FROM WORD + WHERE type ='P' and info->>'postcode' = any(%s) + """, (to_delete, )) - if copystr.getvalue(): - copystr.seek(0) - cur.copy_from(copystr, 'word', - columns=['word', 'word_token', 'class', 'type', - 'search_name_count']) + copystr.copy_out(cur, 'word', + columns=['word_token', 'type', 'info']) def update_special_phrases(self, phrases, should_replace): """ Replace the search index for special phrases with the new phrases. + If `should_replace` is True, then the previous set of will be + completely replaced. Otherwise the phrases are added to the + already existing ones. """ - norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3]) + norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3]) for p in phrases)) with self.conn.cursor() as cur: # Get the old phrases. 
            existing_phrases = set()
-            cur.execute("""SELECT word, class, type, operator FROM word
-                           WHERE class != 'place'
-                                 OR (type != 'house' AND type != 'postcode')""")
-            for label, cls, typ, oper in cur:
-                existing_phrases.add((label, cls, typ, oper or '-'))
-
-            to_add = norm_phrases - existing_phrases
-            to_delete = existing_phrases - norm_phrases
-
-            if to_add:
-                copystr = io.StringIO()
-                for word, cls, typ, oper in to_add:
-                    term = self.make_standard_word(word)
-                    if term:
-                        copystr.write(word)
-                        copystr.write('\t ')
-                        copystr.write(term)
-                        copystr.write('\t')
-                        copystr.write(cls)
-                        copystr.write('\t')
-                        copystr.write(typ)
-                        copystr.write('\t')
-                        copystr.write(oper if oper in ('in', 'near') else '\\N')
-                        copystr.write('\t0\n')
-
-                copystr.seek(0)
-                cur.copy_from(copystr, 'word',
-                              columns=['word', 'word_token', 'class', 'type',
-                                       'operator', 'search_name_count'])
-
-            if to_delete and should_replace:
-                psycopg2.extras.execute_values(
-                    cur,
-                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
-                        WHERE word = name and class = in_class and type = in_type
-                              and ((op = '-' and operator is null) or op = operator)""",
-                    to_delete)
+            cur.execute("SELECT info FROM word WHERE type = 'S'")
+            for (info, ) in cur:
+                existing_phrases.add((info['word'], info['class'], info['type'],
+                                      info.get('op') or '-'))
+
+            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
+            if should_replace:
+                deleted = self._remove_special_phrases(cur, norm_phrases,
+                                                       existing_phrases)
+            else:
+                deleted = 0
 
         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
-                 len(norm_phrases), len(to_add), len(to_delete))
+                 len(norm_phrases), added, deleted)
 
 
-    def add_country_names(self, country_code, names):
-        """ Add names for the given country to the search index.
+    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
+        """ Add all phrases to the database that are not yet there.
+        """
+        to_add = new_phrases - existing_phrases
+
+        added = 0
+        with CopyBuffer() as copystr:
+            for word, cls, typ, oper in to_add:
+                term = self.name_processor.get_search_normalized(word)
+                if term:
+                    copystr.add(term, 'S',
+                                {'word': word, 'class': cls, 'type': typ,
+                                 'op': oper if oper in ('in', 'near') else None})
+                    added += 1
+
+            copystr.copy_out(cursor, 'word',
+                             columns=['word_token', 'type', 'info'])
+
+        return added
+
+
+    @staticmethod
+    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
+        """ Remove all phrases from the database that are no longer in the
+            new phrase list.
         """
-        full_names = set((self.make_standard_word(n) for n in names))
-        full_names.discard('')
-        self._add_normalized_country_names(country_code, full_names)
+        to_delete = existing_phrases - new_phrases
+
+        if to_delete:
+            cursor.execute_values(
+                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
+                    WHERE info->>'word' = name
+                         and info->>'class' = in_class and info->>'type' = in_type
+                         and ((op = '-' and info->>'op' is null) or op = info->>'op')
+                """, to_delete)
+
+        return len(to_delete)
 
 
-    def _add_normalized_country_names(self, country_code, names):
+    def add_country_names(self, country_code, names):
         """ Add names for the given country to the search index.
""" - word_tokens = set((' ' + name for name in names)) + word_tokens = set() + for name in self._compute_full_names(names): + norm_name = self.name_processor.get_search_normalized(name) + if norm_name: + word_tokens.add(norm_name) + with self.conn.cursor() as cur: # Get existing names - cur.execute("SELECT word_token FROM word WHERE country_code = %s", + cur.execute("""SELECT word_token FROM word + WHERE type = 'C' and info->>'cc'= %s""", (country_code, )) word_tokens.difference_update((t[0] for t in cur)) + # Only add those names that are not yet in the list. if word_tokens: - cur.execute("""INSERT INTO word (word_id, word_token, country_code, - search_name_count) - (SELECT nextval('seq_word'), token, '{}', 0 + cur.execute("""INSERT INTO word (word_token, type, info) + (SELECT token, 'C', json_build_object('cc', %s) FROM unnest(%s) as token) - """.format(country_code), (list(word_tokens),)) + """, (country_code, list(word_tokens))) + + # No names are deleted at the moment. + # If deletion is made possible, then the static names from the + # initial 'country_name' table should be kept. def process_place(self, place): @@ -423,44 +404,91 @@ class LegacyICUNameAnalyzer: names = place.get('name') if names: - full_names = set((self.make_standard_word(name) for name in names.values())) - full_names.discard('') + fulls, partials = self._compute_name_tokens(names) - token_info.add_names(self.conn, full_names) + token_info.add_names(fulls, partials) country_feature = place.get('country_feature') if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): - self._add_normalized_country_names(country_feature.lower(), - full_names) + self.add_country_names(country_feature.lower(), names) address = place.get('address') - if address: - hnrs = [] - addr_terms = [] - for key, value in address.items(): - if key == 'postcode': - self._add_postcode(value) - elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'): - hnrs.append(value) - elif key == 'street': - token_info.add_street(self.conn, self.make_standard_word(value)) - elif key == 'place': - token_info.add_place(self.conn, self.make_standard_word(value)) - elif not key.startswith('_') and \ - key not in ('country', 'full'): - addr_terms.append((key, self.make_standard_word(value))) - - if hnrs: - hnrs = self._split_housenumbers(hnrs) - token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs]) - - if addr_terms: - token_info.add_address_terms(self.conn, addr_terms) + self._process_place_address(token_info, address) return token_info.data + def _process_place_address(self, token_info, address): + hnrs = [] + addr_terms = [] + for key, value in address.items(): + if key == 'postcode': + self._add_postcode(value) + elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'): + hnrs.append(value) + elif key == 'street': + token_info.add_street(*self._compute_name_tokens({'name': value})) + elif key == 'place': + token_info.add_place(*self._compute_name_tokens({'name': value})) + elif not key.startswith('_') and \ + key not in ('country', 'full'): + addr_terms.append((key, *self._compute_name_tokens({'name': value}))) + + if hnrs: + hnrs = self._split_housenumbers(hnrs) + token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs]) + + if addr_terms: + token_info.add_address_terms(addr_terms) + + + def _compute_name_tokens(self, names): + """ Computes the full name and partial name tokens for the given + dictionary of names. 
+ """ + full_names = self._compute_full_names(names) + full_tokens = set() + partial_tokens = set() + + for name in full_names: + norm_name = self.name_processor.get_normalized(name) + full, part = self._cache.names.get(norm_name, (None, None)) + if full is None: + variants = self.name_processor.get_variants_ascii(norm_name) + if not variants: + continue + + with self.conn.cursor() as cur: + cur.execute("SELECT (getorcreate_full_word(%s, %s)).*", + (norm_name, variants)) + full, part = cur.fetchone() + + self._cache.names[norm_name] = (full, part) + + full_tokens.add(full) + partial_tokens.update(part) + + return full_tokens, partial_tokens + + + @staticmethod + def _compute_full_names(names): + """ Return the set of all full name word ids to be used with the + given dictionary of names. + """ + full_names = set() + for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)): + if name: + full_names.add(name) + + brace_idx = name.find('(') + if brace_idx >= 0: + full_names.add(name[:brace_idx].strip()) + + return full_names + + def _add_postcode(self, postcode): """ Make sure the normalized postcode is present in the word table. """ @@ -468,22 +496,22 @@ class LegacyICUNameAnalyzer: postcode = self.normalize_postcode(postcode) if postcode not in self._cache.postcodes: - term = self.make_standard_word(postcode) + term = self.name_processor.get_search_normalized(postcode) if not term: return with self.conn.cursor() as cur: # no word_id needed for postcodes - cur.execute("""INSERT INTO word (word, word_token, class, type, - search_name_count) - (SELECT pc, %s, 'place', 'postcode', 0 + cur.execute("""INSERT INTO word (word_token, type, info) + (SELECT %s, 'P', json_build_object('postcode', pc) FROM (VALUES (%s)) as v(pc) WHERE NOT EXISTS (SELECT * FROM word - WHERE word = pc and class='place' and type='postcode')) - """, (' ' + term, postcode)) + WHERE type = 'P' and info->>postcode = pc)) + """, (term, postcode)) self._cache.postcodes.add(postcode) + @staticmethod def _split_housenumbers(hnrs): if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]: @@ -506,7 +534,7 @@ class _TokenInfo: """ Collect token information to be sent back to the database. """ def __init__(self, cache): - self.cache = cache + self._cache = cache self.data = {} @staticmethod @@ -514,88 +542,44 @@ class _TokenInfo: return '{%s}' % ','.join((str(s) for s in tokens)) - def add_names(self, conn, names): + def add_names(self, fulls, partials): """ Adds token information for the normalised names. """ - # Start with all partial names - terms = set((part for ns in names for part in ns.split())) - # Add partials for the full terms (TO BE REMOVED) - terms.update((n for n in names)) - # Add the full names - terms.update((' ' + n for n in names)) - - self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms)) + self.data['names'] = self._mk_array(itertools.chain(fulls, partials)) def add_housenumbers(self, conn, hnrs): """ Extract housenumber information from a list of normalised housenumbers. """ - self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs)) + self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs)) self.data['hnr'] = ';'.join(hnrs) - def add_street(self, conn, street): + def add_street(self, fulls, _): """ Add addr:street match terms. 
""" - if not street: - return - - term = ' ' + street - - tid = self.cache.names.get(term) - - if tid is None: - with conn.cursor() as cur: - cur.execute("""SELECT word_id FROM word - WHERE word_token = %s - and class is null and type is null""", - (term, )) - if cur.rowcount > 0: - tid = cur.fetchone()[0] - self.cache.names[term] = tid + if fulls: + self.data['street'] = self._mk_array(fulls) - if tid is not None: - self.data['street'] = '{%d}' % tid - - def add_place(self, conn, place): + def add_place(self, fulls, partials): """ Add addr:place search and match terms. """ - if not place: - return - - partial_ids = self.cache.get_term_tokens(conn, place.split()) - tid = self.cache.get_term_tokens(conn, [' ' + place]) - - self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid)) - self.data['place_match'] = '{%s}' % tid[0] + if fulls: + self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials)) + self.data['place_match'] = self._mk_array(fulls) - def add_address_terms(self, conn, terms): + def add_address_terms(self, terms): """ Add additional address terms. """ tokens = {} - for key, value in terms: - if not value: - continue - partial_ids = self.cache.get_term_tokens(conn, value.split()) - term = ' ' + value - tid = self.cache.names.get(term) - - if tid is None: - with conn.cursor() as cur: - cur.execute("""SELECT word_id FROM word - WHERE word_token = %s - and class is null and type is null""", - (term, )) - if cur.rowcount > 0: - tid = cur.fetchone()[0] - self.cache.names[term] = tid - - tokens[key] = [self._mk_array(partial_ids), - '{%s}' % ('' if tid is None else str(tid))] + for key, fulls, partials in terms: + if fulls: + tokens[key] = [self._mk_array(itertools.chain(fulls, partials)), + self._mk_array(fulls)] if tokens: self.data['addr'] = tokens @@ -613,35 +597,10 @@ class _TokenCache: self.housenumbers = {} - def get_term_tokens(self, conn, terms): - """ Get token ids for a list of terms, looking them up in the database - if necessary. - """ - tokens = [] - askdb = [] - - for term in terms: - token = self.names.get(term) - if token is None: - askdb.append(term) - elif token != 0: - tokens.append(token) - - if askdb: - with conn.cursor() as cur: - cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term", - (askdb, )) - for term, tid in cur: - self.names[term] = tid - if tid != 0: - tokens.append(tid) - - return tokens - - def get_hnr_tokens(self, conn, terms): """ Get token ids for a list of housenumbers, looking them up in the - database if necessary. + database if necessary. `terms` is an iterable of normalized + housenumbers. """ tokens = [] askdb = []