X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/6ba00e6aee2645ad958857a0df170915c8e59cdb..4c52777ef03738803845f9ee58d269d93bbb9c3d:/nominatim/tokenizer/legacy_icu_tokenizer.py diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index 689318d7..c585c5af 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -3,26 +3,23 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using libICU instead of the PostgreSQL module. """ from collections import Counter -import functools -import io import itertools -import json import logging import re from textwrap import dedent from pathlib import Path -from icu import Transliterator import psycopg2.extras from nominatim.db.connection import connect from nominatim.db.properties import set_property, get_property +from nominatim.db.utils import CopyBuffer from nominatim.db.sql_preprocessor import SQLPreprocessor +from nominatim.tokenizer.icu_rule_loader import ICURuleLoader +from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules -DBCFG_NORMALIZATION = "tokenizer_normalization" DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq" -DBCFG_TRANSLITERATION = "tokenizer_transliteration" -DBCFG_ABBREVIATIONS = "tokenizer_abbreviations" +DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization" LOG = logging.getLogger() @@ -41,9 +38,9 @@ class LegacyICUTokenizer: def __init__(self, dsn, data_dir): self.dsn = dsn self.data_dir = data_dir - self.normalization = None - self.transliteration = None - self.abbreviations = None + self.naming_rules = None + self.term_normalization = None + self.max_word_frequency = None def init_new_db(self, config, init_db=True): @@ -55,14 +52,14 @@ class LegacyICUTokenizer: if config.TOKENIZER_CONFIG: cfgfile = Path(config.TOKENIZER_CONFIG) else: - cfgfile = config.config_dir / 'legacy_icu_tokenizer.json' + cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml' - rules = json.loads(cfgfile.read_text()) - self._load_transliteration(rules['normalization'], cfgfile.parent) - self.abbreviations = rules["abbreviations"] - self.normalization = config.TERM_NORMALIZATION + loader = ICURuleLoader(cfgfile) + self.naming_rules = ICUNameProcessorRules(loader=loader) + self.term_normalization = config.TERM_NORMALIZATION + self.max_word_frequency = config.MAX_WORD_FREQUENCY - self._install_php(config) + self._install_php(config.lib_dir.php) self._save_config(config) if init_db: @@ -70,19 +67,13 @@ class LegacyICUTokenizer: self._init_db_tables(config) - def _load_transliteration(self, rules, cfg_path): - if isinstance(rules, str): - self.transliteration = (cfg_path / rules).read_text().replace('\n', ' ') - else: - self.transliteration = ';'.join(rules) + ';' - def init_from_project(self): """ Initialise the tokenizer from the project directory. """ with connect(self.dsn) as conn: - self.normalization = get_property(conn, DBCFG_NORMALIZATION) - self.transliteration = get_property(conn, DBCFG_TRANSLITERATION) - self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS)) + self.naming_rules = ICUNameProcessorRules(conn=conn) + self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION) + self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ) def finalize_import(self, config): @@ -109,9 +100,7 @@ class LegacyICUTokenizer: """ self.init_from_project() - if self.normalization is None\ - or self.transliteration is None\ - or self.abbreviations is None: + if self.naming_rules is None: return "Configuration for tokenizer 'legacy_icu' are missing." return None @@ -132,26 +121,20 @@ class LegacyICUTokenizer: Analyzers are not thread-safe. You need to instantiate one per thread. """ - norm = Transliterator.createFromRules("normalizer", self.normalization) - trans = Transliterator.createFromRules("trans", self.transliteration) - return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations) - + return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules)) - def _install_php(self, config): + # pylint: disable=missing-format-attribute + def _install_php(self, phpdir): """ Install the php script for the tokenizer. """ - abbr_inverse = list(zip(*self.abbreviations)) php_file = self.data_dir / "tokenizer.php" php_file.write_text(dedent("""\ 1: - word = self.make_standard_word(brace_split[0]) - if word: - full_names.add(word) + brace_idx = name.find('(') + if brace_idx >= 0: + full_names.add(name[:brace_idx].strip()) return full_names @@ -492,7 +491,7 @@ class LegacyICUNameAnalyzer: postcode = self.normalize_postcode(postcode) if postcode not in self._cache.postcodes: - term = self.make_standard_word(postcode) + term = self.name_processor.get_search_normalized(postcode) if not term: return @@ -508,6 +507,7 @@ class LegacyICUNameAnalyzer: """, (' ' + term, postcode)) self._cache.postcodes.add(postcode) + @staticmethod def _split_housenumbers(hnrs): if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]: @@ -530,7 +530,7 @@ class _TokenInfo: """ Collect token information to be sent back to the database. """ def __init__(self, cache): - self.cache = cache + self._cache = cache self.data = {} @staticmethod @@ -538,86 +538,44 @@ class _TokenInfo: return '{%s}' % ','.join((str(s) for s in tokens)) - def add_names(self, conn, names): + def add_names(self, fulls, partials): """ Adds token information for the normalised names. """ - # Start with all partial names - terms = set((part for ns in names for part in ns.split())) - # Add the full names - terms.update((' ' + n for n in names)) - - self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms)) + self.data['names'] = self._mk_array(itertools.chain(fulls, partials)) def add_housenumbers(self, conn, hnrs): """ Extract housenumber information from a list of normalised housenumbers. """ - self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs)) + self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs)) self.data['hnr'] = ';'.join(hnrs) - def add_street(self, conn, street): + def add_street(self, fulls, _): """ Add addr:street match terms. """ - if not street: - return + if fulls: + self.data['street'] = self._mk_array(fulls) - term = ' ' + street - tid = self.cache.names.get(term) - - if tid is None: - with conn.cursor() as cur: - cur.execute("""SELECT word_id FROM word - WHERE word_token = %s - and class is null and type is null""", - (term, )) - if cur.rowcount > 0: - tid = cur.fetchone()[0] - self.cache.names[term] = tid - - if tid is not None: - self.data['street'] = '{%d}' % tid - - - def add_place(self, conn, place): + def add_place(self, fulls, partials): """ Add addr:place search and match terms. """ - if not place: - return - - partial_ids = self.cache.get_term_tokens(conn, place.split()) - tid = self.cache.get_term_tokens(conn, [' ' + place]) + if fulls: + self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials)) + self.data['place_match'] = self._mk_array(fulls) - self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid)) - self.data['place_match'] = '{%s}' % tid[0] - - def add_address_terms(self, conn, terms): + def add_address_terms(self, terms): """ Add additional address terms. """ tokens = {} - for key, value in terms: - if not value: - continue - partial_ids = self.cache.get_term_tokens(conn, value.split()) - term = ' ' + value - tid = self.cache.names.get(term) - - if tid is None: - with conn.cursor() as cur: - cur.execute("""SELECT word_id FROM word - WHERE word_token = %s - and class is null and type is null""", - (term, )) - if cur.rowcount > 0: - tid = cur.fetchone()[0] - self.cache.names[term] = tid - - tokens[key] = [self._mk_array(partial_ids), - '{%s}' % ('' if tid is None else str(tid))] + for key, fulls, partials in terms: + if fulls: + tokens[key] = [self._mk_array(itertools.chain(fulls, partials)), + self._mk_array(fulls)] if tokens: self.data['addr'] = tokens @@ -635,32 +593,6 @@ class _TokenCache: self.housenumbers = {} - def get_term_tokens(self, conn, terms): - """ Get token ids for a list of terms, looking them up in the database - if necessary. - """ - tokens = [] - askdb = [] - - for term in terms: - token = self.names.get(term) - if token is None: - askdb.append(term) - elif token != 0: - tokens.append(token) - - if askdb: - with conn.cursor() as cur: - cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term", - (askdb, )) - for term, tid in cur: - self.names[term] = tid - if tid != 0: - tokens.append(tid) - - return tokens - - def get_hnr_tokens(self, conn, terms): """ Get token ids for a list of housenumbers, looking them up in the database if necessary.