# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
import itertools
import json
import logging
import re
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()
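
# A minimal usage sketch for illustration only (dsn, project_dir, config and
# place are assumed to be provided by the caller, they are not defined here):
#
#   tokenizer = create(dsn, project_dir / 'tokenizer')
#   tokenizer.init_from_project(config)
#   with tokenizer.name_analyzer() as analyzer:
#       token_info = analyzer.process_place(place)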
def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)

class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)
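
    # update_statistics() below rebuilds the per-word occurrence counts: it
    # collects the frequency of every token id referenced in search_name into
    # a temporary table and merges the result into word.info as 'count'.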
    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.process_place(place)
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())

    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))

    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()
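
# Token types used in the 'word' table by the analyzer below, as can be seen
# from the queries it issues: 'W' full names, 'w' partial names, 'P' postcodes,
# 'S' special phrases and 'C' country names.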
class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()
181 """ Free all resources used by the analyzer.

    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()

    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()
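
    # Example with hypothetical values: get_word_token_info(['#Main Street', 'main'])
    # returns tuples like ('#Main Street', 'main street', 123) for the full name
    # and ('main', 'main', 456) for the partial name; the id is None when the
    # token is not present in the word table.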
    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with '#', it is assumed to be a full name,
            otherwise it is treated as a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
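
    # normalize_postcode(' ab1 2cd ') returns 'AB1 2CD': surrounding whitespace
    # is stripped and the postcode is upper-cased.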
    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()

    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self._search_normalized(hnr)
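
    # update_postcodes_from_db() keeps the postcode rows (type 'P') in the word
    # table in sync with location_postcode: postcodes missing from the word
    # table are added, word entries without a matching postcode are deleted.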
    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])
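
    # Special phrases are handed in as 4-tuples of (phrase, class, type,
    # operator) and stored as type 'S' rows with class, type and operator
    # encoded in the JSON info column.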
    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases will be
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added

    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0])

    def _add_country_full_names(self, country_code, names):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get the names that are already in the word table.
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names, address = self.sanitizer.process_names(place)

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.data
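
    # Address items are dispatched by kind: postcodes go straight into the word
    # table, housenumber-like kinds are collected for housenumber tokens,
    # 'street' and 'place' get their own token lists, and all remaining kinds
    # (except internal '_' kinds, 'country' and 'full') become generic address
    # terms.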
    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        streets = []
        for item in address:
            if item.kind == 'postcode':
                self._add_postcode(item.name)
            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(item.name)
            elif item.kind == 'street':
                streets.extend(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full'):
                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)

        if streets:
            token_info.add_street(streets)

    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens

    def _retrieve_full_tokens(self, name):
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full
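
    # Full-name tokens are cached per normalized name; when a name carries an
    # 'analyzer' attribute, the cache key becomes '<name>@<analyzer id>' so
    # that variants produced by different analysis modules do not clash.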
    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            list of names.
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            norm_name = self._normalized(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens
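
    # Postcodes containing ':', ',' or ';' are skipped here because they are
    # not single postcode values; everything else is normalized and inserted
    # into the word table once (type 'P'), guarded by the postcode cache.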
    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self._search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)
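
    # _split_housenumbers(['3;5', '7']) returns the individual numbers
    # ['3', '5', '7'] (order not guaranteed because duplicates are removed
    # via a set).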
    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
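
# _TokenInfo assembles the JSON structure stored in the token_info column;
# the keys it may set are 'names', 'hnr_tokens', 'hnr', 'street', 'place'
# and 'addr'.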
567 """ Collect token information to be sent back to the database.
569 def __init__(self, cache):
574 def _mk_array(tokens):
575 return '{%s}' % ','.join((str(s) for s in tokens))

    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        self.data['street'] = self._mk_array(tokens)

    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        if tokens:
            self.data['place'] = self._mk_array(tokens)

    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {key: self._mk_array(partials)
                  for key, partials in terms if partials}

        if tokens:
            self.data['addr'] = tokens
616 """ Cache for token information to avoid repeated database queries.
618 This cache is not thread-safe and needs to be instantiated per
625 self.postcodes = set()
626 self.housenumbers = {}

    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens