X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/4f4d15c28a8743c2f3dfb6d3e5b787b94ef66fc5..1e9f37ab82db1758235bedf83c659693f4ca6c3e:/nominatim/tokenizer/legacy_icu_tokenizer.py diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index b4d85356..a887ae28 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -3,8 +3,6 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using libICU instead of the PostgreSQL module. """ from collections import Counter -import functools -import io import itertools import json import logging @@ -12,17 +10,15 @@ import re from textwrap import dedent from pathlib import Path -from icu import Transliterator -import psycopg2.extras - from nominatim.db.connection import connect from nominatim.db.properties import set_property, get_property +from nominatim.db.utils import CopyBuffer from nominatim.db.sql_preprocessor import SQLPreprocessor +from nominatim.tokenizer.icu_rule_loader import ICURuleLoader +from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules -DBCFG_NORMALIZATION = "tokenizer_normalization" DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq" -DBCFG_TRANSLITERATION = "tokenizer_transliteration" -DBCFG_ABBREVIATIONS = "tokenizer_abbreviations" +DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization" LOG = logging.getLogger() @@ -41,9 +37,9 @@ class LegacyICUTokenizer: def __init__(self, dsn, data_dir): self.dsn = dsn self.data_dir = data_dir - self.normalization = None - self.transliteration = None - self.abbreviations = None + self.naming_rules = None + self.term_normalization = None + self.max_word_frequency = None def init_new_db(self, config, init_db=True): @@ -55,14 +51,14 @@ class LegacyICUTokenizer: if config.TOKENIZER_CONFIG: cfgfile = Path(config.TOKENIZER_CONFIG) else: - cfgfile = config.config_dir / 'legacy_icu_tokenizer.json' + cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml' - rules = json.loads(cfgfile.read_text()) - self.transliteration = ';'.join(rules['normalization']) + ';' - self.abbreviations = rules["abbreviations"] - self.normalization = config.TERM_NORMALIZATION + loader = ICURuleLoader(cfgfile) + self.naming_rules = ICUNameProcessorRules(loader=loader) + self.term_normalization = config.TERM_NORMALIZATION + self.max_word_frequency = config.MAX_WORD_FREQUENCY - self._install_php(config) + self._install_php(config.lib_dir.php) self._save_config(config) if init_db: @@ -74,18 +70,15 @@ class LegacyICUTokenizer: """ Initialise the tokenizer from the project directory. """ with connect(self.dsn) as conn: - self.normalization = get_property(conn, DBCFG_NORMALIZATION) - self.transliteration = get_property(conn, DBCFG_TRANSLITERATION) - self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS)) + self.naming_rules = ICUNameProcessorRules(conn=conn) + self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION) + self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ) - def finalize_import(self, config): + def finalize_import(self, _): """ Do any required postprocessing to make the tokenizer data ready for use. """ - with connect(self.dsn) as conn: - sqlp = SQLPreprocessor(conn, config) - sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql') def update_sql_functions(self, config): @@ -103,9 +96,7 @@ class LegacyICUTokenizer: """ self.init_from_project() - if self.normalization is None\ - or self.transliteration is None\ - or self.abbreviations is None: + if self.naming_rules is None: return "Configuration for tokenizer 'legacy_icu' are missing." return None @@ -126,26 +117,19 @@ class LegacyICUTokenizer: Analyzers are not thread-safe. You need to instantiate one per thread. """ - norm = Transliterator.createFromRules("normalizer", self.normalization) - trans = Transliterator.createFromRules("trans", self.transliteration) - return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations) + return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules)) - def _install_php(self, config): + def _install_php(self, phpdir): """ Install the php script for the tokenizer. """ - abbr_inverse = list(zip(*self.abbreviations)) php_file = self.data_dir / "tokenizer.php" - php_file.write_text(dedent("""\ + php_file.write_text(dedent(f"""\ >'class' = in_class and info->>'type' = in_type + and ((op = '-' and info->>'op' is null) or op = info->>'op') + """, to_delete) + + return len(to_delete) + + + def add_country_names(self, country_code, names): """ Add names for the given country to the search index. """ - word_tokens = set((' ' + name for name in names)) + word_tokens = set() + for name in self._compute_full_names(names): + norm_name = self.name_processor.get_search_normalized(name) + if norm_name: + word_tokens.add(norm_name) + with self.conn.cursor() as cur: # Get existing names - cur.execute("SELECT word_token FROM word WHERE country_code = %s", + cur.execute("""SELECT word_token FROM word + WHERE type = 'C' and word = %s""", (country_code, )) word_tokens.difference_update((t[0] for t in cur)) + # Only add those names that are not yet in the list. if word_tokens: - cur.execute("""INSERT INTO word (word_id, word_token, country_code, - search_name_count) - (SELECT nextval('seq_word'), token, '{}', 0 + cur.execute("""INSERT INTO word (word_token, type, word) + (SELECT token, 'C', %s FROM unnest(%s) as token) - """.format(country_code), (list(word_tokens),)) + """, (country_code, list(word_tokens))) + + # No names are deleted at the moment. + # If deletion is made possible, then the static names from the + # initial 'country_name' table should be kept. def process_place(self, place): @@ -423,58 +412,87 @@ class LegacyICUNameAnalyzer: names = place.get('name') if names: - full_names = self._compute_full_names(names) + fulls, partials = self._compute_name_tokens(names) - token_info.add_names(self.conn, full_names) + token_info.add_names(fulls, partials) country_feature = place.get('country_feature') if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): - self._add_normalized_country_names(country_feature.lower(), - full_names) + self.add_country_names(country_feature.lower(), names) address = place.get('address') - if address: - hnrs = [] - addr_terms = [] - for key, value in address.items(): - if key == 'postcode': - self._add_postcode(value) - elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'): - hnrs.append(value) - elif key == 'street': - token_info.add_street(self.conn, self.make_standard_word(value)) - elif key == 'place': - token_info.add_place(self.conn, self.make_standard_word(value)) - elif not key.startswith('_') and \ - key not in ('country', 'full'): - addr_terms.append((key, self.make_standard_word(value))) - - if hnrs: - hnrs = self._split_housenumbers(hnrs) - token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs]) - - if addr_terms: - token_info.add_address_terms(self.conn, addr_terms) + self._process_place_address(token_info, address) return token_info.data - def _compute_full_names(self, names): + def _process_place_address(self, token_info, address): + hnrs = [] + addr_terms = [] + for key, value in address.items(): + if key == 'postcode': + self._add_postcode(value) + elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'): + hnrs.append(value) + elif key == 'street': + token_info.add_street(*self._compute_name_tokens({'name': value})) + elif key == 'place': + token_info.add_place(*self._compute_name_tokens({'name': value})) + elif not key.startswith('_') and \ + key not in ('country', 'full'): + addr_terms.append((key, *self._compute_name_tokens({'name': value}))) + + if hnrs: + hnrs = self._split_housenumbers(hnrs) + token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs]) + + if addr_terms: + token_info.add_address_terms(addr_terms) + + + def _compute_name_tokens(self, names): + """ Computes the full name and partial name tokens for the given + dictionary of names. + """ + full_names = self._compute_full_names(names) + full_tokens = set() + partial_tokens = set() + + for name in full_names: + norm_name = self.name_processor.get_normalized(name) + full, part = self._cache.names.get(norm_name, (None, None)) + if full is None: + variants = self.name_processor.get_variants_ascii(norm_name) + if not variants: + continue + + with self.conn.cursor() as cur: + cur.execute("SELECT (getorcreate_full_word(%s, %s)).*", + (norm_name, variants)) + full, part = cur.fetchone() + + self._cache.names[norm_name] = (full, part) + + full_tokens.add(full) + partial_tokens.update(part) + + return full_tokens, partial_tokens + + + @staticmethod + def _compute_full_names(names): """ Return the set of all full name word ids to be used with the given dictionary of names. """ full_names = set() - for name in (n for ns in names.values() for n in re.split('[;,]', ns)): - word = self.make_standard_word(name) - if word: - full_names.add(word) + for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)): + if name: + full_names.add(name) - brace_split = name.split('(', 2) - if len(brace_split) > 1: - word = self.make_standard_word(brace_split[0]) - if word: - full_names.add(word) + brace_idx = name.find('(') + if brace_idx >= 0: + full_names.add(name[:brace_idx].strip()) return full_names @@ -486,22 +504,21 @@ class LegacyICUNameAnalyzer: postcode = self.normalize_postcode(postcode) if postcode not in self._cache.postcodes: - term = self.make_standard_word(postcode) + term = self.name_processor.get_search_normalized(postcode) if not term: return with self.conn.cursor() as cur: # no word_id needed for postcodes - cur.execute("""INSERT INTO word (word, word_token, class, type, - search_name_count) - (SELECT pc, %s, 'place', 'postcode', 0 - FROM (VALUES (%s)) as v(pc) + cur.execute("""INSERT INTO word (word_token, type, word) + (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc) WHERE NOT EXISTS (SELECT * FROM word - WHERE word = pc and class='place' and type='postcode')) - """, (' ' + term, postcode)) + WHERE type = 'P' and word = pc)) + """, (term, postcode)) self._cache.postcodes.add(postcode) + @staticmethod def _split_housenumbers(hnrs): if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]: @@ -524,7 +541,7 @@ class _TokenInfo: """ Collect token information to be sent back to the database. """ def __init__(self, cache): - self.cache = cache + self._cache = cache self.data = {} @staticmethod @@ -532,88 +549,44 @@ class _TokenInfo: return '{%s}' % ','.join((str(s) for s in tokens)) - def add_names(self, conn, names): + def add_names(self, fulls, partials): """ Adds token information for the normalised names. """ - # Start with all partial names - terms = set((part for ns in names for part in ns.split())) - # Add partials for the full terms (TO BE REMOVED) - terms.update((n for n in names)) - # Add the full names - terms.update((' ' + n for n in names)) - - self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms)) + self.data['names'] = self._mk_array(itertools.chain(fulls, partials)) def add_housenumbers(self, conn, hnrs): """ Extract housenumber information from a list of normalised housenumbers. """ - self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs)) + self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs)) self.data['hnr'] = ';'.join(hnrs) - def add_street(self, conn, street): + def add_street(self, fulls, _): """ Add addr:street match terms. """ - if not street: - return - - term = ' ' + street - - tid = self.cache.names.get(term) - - if tid is None: - with conn.cursor() as cur: - cur.execute("""SELECT word_id FROM word - WHERE word_token = %s - and class is null and type is null""", - (term, )) - if cur.rowcount > 0: - tid = cur.fetchone()[0] - self.cache.names[term] = tid + if fulls: + self.data['street'] = self._mk_array(fulls) - if tid is not None: - self.data['street'] = '{%d}' % tid - - def add_place(self, conn, place): + def add_place(self, fulls, partials): """ Add addr:place search and match terms. """ - if not place: - return - - partial_ids = self.cache.get_term_tokens(conn, place.split()) - tid = self.cache.get_term_tokens(conn, [' ' + place]) - - self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid)) - self.data['place_match'] = '{%s}' % tid[0] + if fulls: + self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials)) + self.data['place_match'] = self._mk_array(fulls) - def add_address_terms(self, conn, terms): + def add_address_terms(self, terms): """ Add additional address terms. """ tokens = {} - for key, value in terms: - if not value: - continue - partial_ids = self.cache.get_term_tokens(conn, value.split()) - term = ' ' + value - tid = self.cache.names.get(term) - - if tid is None: - with conn.cursor() as cur: - cur.execute("""SELECT word_id FROM word - WHERE word_token = %s - and class is null and type is null""", - (term, )) - if cur.rowcount > 0: - tid = cur.fetchone()[0] - self.cache.names[term] = tid - - tokens[key] = [self._mk_array(partial_ids), - '{%s}' % ('' if tid is None else str(tid))] + for key, fulls, partials in terms: + if fulls: + tokens[key] = [self._mk_array(itertools.chain(fulls, partials)), + self._mk_array(fulls)] if tokens: self.data['addr'] = tokens @@ -631,35 +604,10 @@ class _TokenCache: self.housenumbers = {} - def get_term_tokens(self, conn, terms): - """ Get token ids for a list of terms, looking them up in the database - if necessary. - """ - tokens = [] - askdb = [] - - for term in terms: - token = self.names.get(term) - if token is None: - askdb.append(term) - elif token != 0: - tokens.append(token) - - if askdb: - with conn.cursor() as cur: - cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term", - (askdb, )) - for term, tid in cur: - self.names[term] = tid - if tid != 0: - tokens.append(tid) - - return tokens - - def get_hnr_tokens(self, conn, terms): """ Get token ids for a list of housenumbers, looking them up in the - database if necessary. + database if necessary. `terms` is an iterable of normalized + housenumbers. """ tokens = [] askdb = []