nominatim/tokenizer/icu_tokenizer.py

   1 # SPDX-License-Identifier: GPL-2.0-only
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2022 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Tokenizer implementing normalisation as used before Nominatim 4 but using
   9 libICU instead of the PostgreSQL module.
  10 """
  11 import itertools
  12 import json
  13 import logging
  14 import re
  15 from textwrap import dedent
  16
  17 from nominatim.db.connection import connect
  18 from nominatim.db.utils import CopyBuffer
  19 from nominatim.db.sql_preprocessor import SQLPreprocessor
  20 from nominatim.indexer.place_info import PlaceInfo
  21 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  22 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  23
# Name of a database property. Judging by the identifier it is the key for
# the term normalization rules; it is not referenced in the visible part of
# this module -- TODO confirm where it is still used.
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

# Logger for all messages of this module. getLogger() is called without a
# name and therefore hands out the root logger.
LOG = logging.getLogger()
  27
  28 def create(dsn, data_dir):
  29     """ Create a new instance of the tokenizer provided by this module.
  30     """
  31     return LegacyICUTokenizer(dsn, data_dir)
  32
  33
  34 class LegacyICUTokenizer(AbstractTokenizer):
  35     """ This tokenizer uses libICU to covert names and queries to ASCII.
  36         Otherwise it uses the same algorithms and data structures as the
  37         normalization routines in Nominatim 3.
  38     """
  39
  40     def __init__(self, dsn, data_dir):
  41         self.dsn = dsn
  42         self.data_dir = data_dir
  43         self.loader = None
  44
  45
  46     def init_new_db(self, config, init_db=True):
  47         """ Set up a new tokenizer for the database.
  48
  49             This copies all necessary data in the project directory to make
  50             sure the tokenizer remains stable even over updates.
  51         """
  52         self.loader = ICURuleLoader(config)
  53
  54         self._install_php(config.lib_dir.php)
  55         self._save_config()
  56
  57         if init_db:
  58             self.update_sql_functions(config)
  59             self._init_db_tables(config)
  60
  61
  62     def init_from_project(self, config):
  63         """ Initialise the tokenizer from the project directory.
  64         """
  65         self.loader = ICURuleLoader(config)
  66
  67         with connect(self.dsn) as conn:
  68             self.loader.load_config_from_db(conn)
  69
  70
  71     def finalize_import(self, config):
  72         """ Do any required postprocessing to make the tokenizer data ready
  73             for use.
  74         """
  75         with connect(self.dsn) as conn:
  76             sqlp = SQLPreprocessor(conn, config)
  77             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
  78
  79
  80     def update_sql_functions(self, config):
  81         """ Reimport the SQL functions for this tokenizer.
  82         """
  83         with connect(self.dsn) as conn:
  84             sqlp = SQLPreprocessor(conn, config)
  85             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  86
  87
  88     def check_database(self, config):
  89         """ Check that the tokenizer is set up correctly.
  90         """
  91         # Will throw an error if there is an issue.
  92         self.init_from_project(config)
  93
  94
  95     def update_statistics(self):
  96         """ Recompute frequencies for all name words.
  97         """
  98         with connect(self.dsn) as conn:
  99             if conn.table_exists('search_name'):
 100                 with conn.cursor() as cur:
 101                     cur.drop_table("word_frequencies")
 102                     LOG.info("Computing word frequencies")
 103                     cur.execute("""CREATE TEMP TABLE word_frequencies AS
 104                                      SELECT unnest(name_vector) as id, count(*)
 105                                      FROM search_name GROUP BY id""")
 106                     cur.execute("CREATE INDEX ON word_frequencies(id)")
 107                     LOG.info("Update word table with recomputed frequencies")
 108                     cur.execute("""UPDATE word
 109                                    SET info = info || jsonb_build_object('count', count)
 110                                    FROM word_frequencies WHERE word_id = id""")
 111                     cur.drop_table("word_frequencies")
 112             conn.commit()
 113
 114
 115     def _cleanup_housenumbers(self):
 116         """ Remove unused house numbers.
 117         """
 118         with connect(self.dsn) as conn:
 119             if not conn.table_exists('search_name'):
 120                 return
 121             with conn.cursor(name="hnr_counter") as cur:
 122                 cur.execute("""SELECT word_id, word_token FROM word
 123                                WHERE type = 'H'
 124                                  AND NOT EXISTS(SELECT * FROM search_name
 125                                                 WHERE ARRAY[word.word_id] && name_vector)
 126                                  AND (char_length(word_token) > 6
 127                                       OR word_token not similar to '\\d+')
 128                             """)
 129                 candidates = {token: wid for wid, token in cur}
 130             with conn.cursor(name="hnr_counter") as cur:
 131                 cur.execute("""SELECT housenumber FROM placex
 132                                WHERE housenumber is not null
 133                                      AND (char_length(housenumber) > 6
 134                                           OR housenumber not similar to '\\d+')
 135                             """)
 136                 for row in cur:
 137                     for hnr in row[0].split(';'):
 138                         candidates.pop(hnr, None)
 139             LOG.info("There are %s outdated housenumbers.", len(candidates))
 140             if candidates:
 141                 with conn.cursor() as cur:
 142                     cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
 143                                 (list(candidates.values()), ))
 144                 conn.commit()
 145
 146
 147
 148     def update_word_tokens(self):
 149         """ Remove unused tokens.
 150         """
 151         LOG.warning("Cleaning up housenumber tokens.")
 152         self._cleanup_housenumbers()
 153         LOG.warning("Tokenizer house-keeping done.")
 154
 155
 156     def name_analyzer(self):
 157         """ Create a new analyzer for tokenizing names and queries
 158             using this tokinzer. Analyzers are context managers and should
 159             be used accordingly:
 160
 161             ```
 162             with tokenizer.name_analyzer() as analyzer:
 163                 analyser.tokenize()
 164             ```
 165
 166             When used outside the with construct, the caller must ensure to
 167             call the close() function before destructing the analyzer.
 168
 169             Analyzers are not thread-safe. You need to instantiate one per thread.
 170         """
 171         return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
 172                                      self.loader.make_token_analysis())
 173
 174
 175     def _install_php(self, phpdir):
 176         """ Install the php script for the tokenizer.
 177         """
 178         php_file = self.data_dir / "tokenizer.php"
 179         php_file.write_text(dedent(f"""\
 180             <?php
 181             @define('CONST_Max_Word_Frequency', 10000000);
 182             @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
 183             @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
 184             require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
 185
 186
 187     def _save_config(self):
 188         """ Save the configuration that needs to remain stable for the given
 189             database as database properties.
 190         """
 191         with connect(self.dsn) as conn:
 192             self.loader.save_config_to_db(conn)
 193
 194
 195     def _init_db_tables(self, config):
 196         """ Set up the word table and fill it with pre-computed word
 197             frequencies.
 198         """
 199         with connect(self.dsn) as conn:
 200             sqlp = SQLPreprocessor(conn, config)
 201             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
 202             conn.commit()
 203
 204
 205 class LegacyICUNameAnalyzer(AbstractAnalyzer):
 206     """ The legacy analyzer uses the ICU library for splitting names.
 207
 208         Each instance opens a connection to the database to request the
 209         normalization.
 210     """
 211
 212     def __init__(self, dsn, sanitizer, token_analysis):
 213         self.conn = connect(dsn).connection
 214         self.conn.autocommit = True
 215         self.sanitizer = sanitizer
 216         self.token_analysis = token_analysis
 217
 218         self._cache = _TokenCache()
 219
 220
 221     def close(self):
 222         """ Free all resources used by the analyzer.
 223         """
 224         if self.conn:
 225             self.conn.close()
 226             self.conn = None
 227
 228
 229     def _search_normalized(self, name):
 230         """ Return the search token transliteration of the given name.
 231         """
 232         return self.token_analysis.search.transliterate(name).strip()
 233
 234
 235     def _normalized(self, name):
 236         """ Return the normalized version of the given name with all
 237             non-relevant information removed.
 238         """
 239         return self.token_analysis.normalizer.transliterate(name).strip()
 240
 241
 242     def get_word_token_info(self, words):
 243         """ Return token information for the given list of words.
 244             If a word starts with # it is assumed to be a full name
 245             otherwise is a partial name.
 246
 247             The function returns a list of tuples with
 248             (original word, word token, word id).
 249
 250             The function is used for testing and debugging only
 251             and not necessarily efficient.
 252         """
 253         full_tokens = {}
 254         partial_tokens = {}
 255         for word in words:
 256             if word.startswith('#'):
 257                 full_tokens[word] = self._search_normalized(word[1:])
 258             else:
 259                 partial_tokens[word] = self._search_normalized(word)
 260
 261         with self.conn.cursor() as cur:
 262             cur.execute("""SELECT word_token, word_id
 263                             FROM word WHERE word_token = ANY(%s) and type = 'W'
 264                         """, (list(full_tokens.values()),))
 265             full_ids = {r[0]: r[1] for r in cur}
 266             cur.execute("""SELECT word_token, word_id
 267                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
 268                         (list(partial_tokens.values()),))
 269             part_ids = {r[0]: r[1] for r in cur}
 270
 271         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
 272                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
 273
 274
 275     @staticmethod
 276     def normalize_postcode(postcode):
 277         """ Convert the postcode to a standardized form.
 278
 279             This function must yield exactly the same result as the SQL function
 280             'token_normalized_postcode()'.
 281         """
 282         return postcode.strip().upper()
 283
 284
 285     def _make_standard_hnr(self, hnr):
 286         """ Create a normalised version of a housenumber.
 287
 288             This function takes minor shortcuts on transliteration.
 289         """
 290         return self._search_normalized(hnr)
 291
 292     def update_postcodes_from_db(self):
 293         """ Update postcode tokens in the word table from the location_postcode
 294             table.
 295         """
 296         to_delete = []
 297         with self.conn.cursor() as cur:
 298             # This finds us the rows in location_postcode and word that are
 299             # missing in the other table.
 300             cur.execute("""SELECT * FROM
 301                             (SELECT pc, word FROM
 302                               (SELECT distinct(postcode) as pc FROM location_postcode) p
 303                               FULL JOIN
 304                               (SELECT word FROM word WHERE type = 'P') w
 305                               ON pc = word) x
 306                            WHERE pc is null or word is null""")
 307
 308             with CopyBuffer() as copystr:
 309                 for postcode, word in cur:
 310                     if postcode is None:
 311                         to_delete.append(word)
 312                     else:
 313                         copystr.add(self._search_normalized(postcode),
 314                                     'P', postcode)
 315
 316                 if to_delete:
 317                     cur.execute("""DELETE FROM WORD
 318                                    WHERE type ='P' and word = any(%s)
 319                                 """, (to_delete, ))
 320
 321                 copystr.copy_out(cur, 'word',
 322                                  columns=['word_token', 'type', 'word'])
 323
 324
 325     def update_special_phrases(self, phrases, should_replace):
 326         """ Replace the search index for special phrases with the new phrases.
 327             If `should_replace` is True, then the previous set of will be
 328             completely replaced. Otherwise the phrases are added to the
 329             already existing ones.
 330         """
 331         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
 332                             for p in phrases))
 333
 334         with self.conn.cursor() as cur:
 335             # Get the old phrases.
 336             existing_phrases = set()
 337             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
 338             for word, info in cur:
 339                 existing_phrases.add((word, info['class'], info['type'],
 340                                       info.get('op') or '-'))
 341
 342             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
 343             if should_replace:
 344                 deleted = self._remove_special_phrases(cur, norm_phrases,
 345                                                        existing_phrases)
 346             else:
 347                 deleted = 0
 348
 349         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 350                  len(norm_phrases), added, deleted)
 351
 352
 353     def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
 354         """ Add all phrases to the database that are not yet there.
 355         """
 356         to_add = new_phrases - existing_phrases
 357
 358         added = 0
 359         with CopyBuffer() as copystr:
 360             for word, cls, typ, oper in to_add:
 361                 term = self._search_normalized(word)
 362                 if term:
 363                     copystr.add(term, 'S', word,
 364                                 json.dumps({'class': cls, 'type': typ,
 365                                             'op': oper if oper in ('in', 'near') else None}))
 366                     added += 1
 367
 368             copystr.copy_out(cursor, 'word',
 369                              columns=['word_token', 'type', 'word', 'info'])
 370
 371         return added
 372
 373
 374     @staticmethod
 375     def _remove_special_phrases(cursor, new_phrases, existing_phrases):
 376         """ Remove all phrases from the databse that are no longer in the
 377             new phrase list.
 378         """
 379         to_delete = existing_phrases - new_phrases
 380
 381         if to_delete:
 382             cursor.execute_values(
 383                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 384                     WHERE type = 'S' and word = name
 385                           and info->>'class' = in_class and info->>'type' = in_type
 386                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
 387                 """, to_delete)
 388
 389         return len(to_delete)
 390
 391
 392     def add_country_names(self, country_code, names):
 393         """ Add names for the given country to the search index.
 394         """
 395         # Make sure any name preprocessing for country names applies.
 396         info = PlaceInfo({'name': names, 'country_code': country_code,
 397                           'rank_address': 4, 'class': 'boundary',
 398                           'type': 'administrative'})
 399         self._add_country_full_names(country_code,
 400                                      self.sanitizer.process_names(info)[0])
 401
 402
 403     def _add_country_full_names(self, country_code, names):
 404         """ Add names for the given country from an already sanitized
 405             name list.
 406         """
 407         word_tokens = set()
 408         for name in names:
 409             norm_name = self._search_normalized(name.name)
 410             if norm_name:
 411                 word_tokens.add(norm_name)
 412
 413         with self.conn.cursor() as cur:
 414             # Get existing names
 415             cur.execute("""SELECT word_token FROM word
 416                             WHERE type = 'C' and word = %s""",
 417                         (country_code, ))
 418             existing_tokens = {t[0] for t in cur}
 419
 420             # Only add those names that are not yet in the list.
 421             new_tokens = word_tokens - existing_tokens
 422             if new_tokens:
 423                 cur.execute("""INSERT INTO word (word_token, type, word)
 424                                (SELECT token, 'C', %s
 425                                 FROM unnest(%s) as token)
 426                             """, (country_code, list(new_tokens)))
 427
 428             # Delete names that no longer exist.
 429             gone_tokens = existing_tokens - word_tokens
 430             if gone_tokens:
 431                 cur.execute("""DELETE FROM word
 432                                USING unnest(%s) as token
 433                                WHERE type = 'C' and word = %s
 434                                      and word_token = token""",
 435                             (list(gone_tokens), country_code))
 436
 437
 438     def process_place(self, place):
 439         """ Determine tokenizer information about the given place.
 440
 441             Returns a JSON-serializable structure that will be handed into
 442             the database via the token_info field.
 443         """
 444         token_info = _TokenInfo(self._cache)
 445
 446         names, address = self.sanitizer.process_names(place)
 447
 448         if names:
 449             fulls, partials = self._compute_name_tokens(names)
 450
 451             token_info.add_names(fulls, partials)
 452
 453             if place.is_country():
 454                 self._add_country_full_names(place.country_code, names)
 455
 456         if address:
 457             self._process_place_address(token_info, address)
 458
 459         return token_info.data
 460
 461
 462     def _process_place_address(self, token_info, address):
 463         hnrs = set()
 464         addr_terms = []
 465         streets = []
 466         for item in address:
 467             if item.kind == 'postcode':
 468                 self._add_postcode(item.name)
 469             elif item.kind == 'housenumber':
 470                 norm_name = self._make_standard_hnr(item.name)
 471                 if norm_name:
 472                     hnrs.add(norm_name)
 473             elif item.kind == 'street':
 474                 streets.extend(self._retrieve_full_tokens(item.name))
 475             elif item.kind == 'place':
 476                 if not item.suffix:
 477                     token_info.add_place(self._compute_partial_tokens(item.name))
 478             elif not item.kind.startswith('_') and not item.suffix and \
 479                  item.kind not in ('country', 'full'):
 480                 addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
 481
 482         if hnrs:
 483             token_info.add_housenumbers(self.conn, hnrs)
 484
 485         if addr_terms:
 486             token_info.add_address_terms(addr_terms)
 487
 488         if streets:
 489             token_info.add_street(streets)
 490
 491
 492     def _compute_partial_tokens(self, name):
 493         """ Normalize the given term, split it into partial words and return
 494             then token list for them.
 495         """
 496         norm_name = self._search_normalized(name)
 497
 498         tokens = []
 499         need_lookup = []
 500         for partial in norm_name.split():
 501             token = self._cache.partials.get(partial)
 502             if token:
 503                 tokens.append(token)
 504             else:
 505                 need_lookup.append(partial)
 506
 507         if need_lookup:
 508             with self.conn.cursor() as cur:
 509                 cur.execute("""SELECT word, getorcreate_partial_word(word)
 510                                FROM unnest(%s) word""",
 511                             (need_lookup, ))
 512
 513                 for partial, token in cur:
 514                     tokens.append(token)
 515                     self._cache.partials[partial] = token
 516
 517         return tokens
 518
 519
 520     def _retrieve_full_tokens(self, name):
 521         """ Get the full name token for the given name, if it exists.
 522             The name is only retrived for the standard analyser.
 523         """
 524         norm_name = self._search_normalized(name)
 525
 526         # return cached if possible
 527         if norm_name in self._cache.fulls:
 528             return self._cache.fulls[norm_name]
 529
 530         with self.conn.cursor() as cur:
 531             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
 532                         (norm_name, ))
 533             full = [row[0] for row in cur]
 534
 535         self._cache.fulls[norm_name] = full
 536
 537         return full
 538
 539
 540     def _compute_name_tokens(self, names):
 541         """ Computes the full name and partial name tokens for the given
 542             dictionary of names.
 543         """
 544         full_tokens = set()
 545         partial_tokens = set()
 546
 547         for name in names:
 548             analyzer_id = name.get_attr('analyzer')
 549             norm_name = self._normalized(name.name)
 550             if analyzer_id is None:
 551                 token_id = norm_name
 552             else:
 553                 token_id = f'{norm_name}@{analyzer_id}'
 554
 555             full, part = self._cache.names.get(token_id, (None, None))
 556             if full is None:
 557                 variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
 558                 if not variants:
 559                     continue
 560
 561                 with self.conn.cursor() as cur:
 562                     cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
 563                                 (token_id, variants))
 564                     full, part = cur.fetchone()
 565
 566                 self._cache.names[token_id] = (full, part)
 567
 568             full_tokens.add(full)
 569             partial_tokens.update(part)
 570
 571         return full_tokens, partial_tokens
 572
 573
 574     def _add_postcode(self, postcode):
 575         """ Make sure the normalized postcode is present in the word table.
 576         """
 577         if re.search(r'[:,;]', postcode) is None:
 578             postcode = self.normalize_postcode(postcode)
 579
 580             if postcode not in self._cache.postcodes:
 581                 term = self._search_normalized(postcode)
 582                 if not term:
 583                     return
 584
 585                 with self.conn.cursor() as cur:
 586                     # no word_id needed for postcodes
 587                     cur.execute("""INSERT INTO word (word_token, type, word)
 588                                    (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
 589                                     WHERE NOT EXISTS
 590                                      (SELECT * FROM word
 591                                       WHERE type = 'P' and word = pc))
 592                                 """, (term, postcode))
 593                 self._cache.postcodes.add(postcode)
 594
 595
 596 class _TokenInfo:
 597     """ Collect token information to be sent back to the database.
 598     """
 599     def __init__(self, cache):
 600         self._cache = cache
 601         self.data = {}
 602
 603     @staticmethod
 604     def _mk_array(tokens):
 605         return '{%s}' % ','.join((str(s) for s in tokens))
 606
 607
 608     def add_names(self, fulls, partials):
 609         """ Adds token information for the normalised names.
 610         """
 611         self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
 612
 613
 614     def add_housenumbers(self, conn, hnrs):
 615         """ Extract housenumber information from a list of normalised
 616             housenumbers.
 617         """
 618         self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
 619         self.data['hnr'] = ';'.join(hnrs)
 620
 621
 622     def add_street(self, tokens):
 623         """ Add addr:street match terms.
 624         """
 625         self.data['street'] = self._mk_array(tokens)
 626
 627
 628     def add_place(self, tokens):
 629         """ Add addr:place search and match terms.
 630         """
 631         if tokens:
 632             self.data['place'] = self._mk_array(tokens)
 633
 634
 635     def add_address_terms(self, terms):
 636         """ Add additional address terms.
 637         """
 638         tokens = {key: self._mk_array(partials)
 639                   for key, partials in terms if partials}
 640
 641         if tokens:
 642             self.data['addr'] = tokens
 643
 644
 645 class _TokenCache:
 646     """ Cache for token information to avoid repeated database queries.
 647
 648         This cache is not thread-safe and needs to be instantiated per
 649         analyzer.
 650     """
 651     def __init__(self):
 652         self.names = {}
 653         self.partials = {}
 654         self.fulls = {}
 655         self.postcodes = set()
 656         self.housenumbers = {}
 657
 658
 659     def get_hnr_tokens(self, conn, terms):
 660         """ Get token ids for a list of housenumbers, looking them up in the
 661             database if necessary. `terms` is an iterable of normalized
 662             housenumbers.
 663         """
 664         tokens = []
 665         askdb = []
 666
 667         for term in terms:
 668             token = self.housenumbers.get(term)
 669             if token is None:
 670                 askdb.append(term)
 671             else:
 672                 tokens.append(token)
 673
 674         if askdb:
 675             with conn.cursor() as cur:
 676                 cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
 677                             (askdb, ))
 678                 for term, tid in cur:
 679                     self.housenumbers[term] = tid
 680                     tokens.append(tid)
 681
 682         return tokens