nominatim/tokenizer/icu_tokenizer.py

   1 # SPDX-License-Identifier: GPL-2.0-only
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2022 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Tokenizer implementing normalisation as used before Nominatim 4 but using
   9 libICU instead of the PostgreSQL module.
  10 """
  11 import itertools
  12 import json
  13 import logging
  14 import re
  15 from textwrap import dedent
  16
  17 from nominatim.db.connection import connect
  18 from nominatim.db.utils import CopyBuffer
  19 from nominatim.db.sql_preprocessor import SQLPreprocessor
  20 from nominatim.indexer.place_info import PlaceInfo
  21 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  22 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  23
# NOTE(review): presumably the name of the database property that stores the
# term normalization rules; it is not referenced in this part of the file,
# so confirm against its users before relying on that.
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

# getLogger() without a name returns the root logger.
LOG = logging.getLogger()
  27
  28 def create(dsn, data_dir):
  29     """ Create a new instance of the tokenizer provided by this module.
  30     """
  31     return LegacyICUTokenizer(dsn, data_dir)
  32
  33
  34 class LegacyICUTokenizer(AbstractTokenizer):
  35     """ This tokenizer uses libICU to covert names and queries to ASCII.
  36         Otherwise it uses the same algorithms and data structures as the
  37         normalization routines in Nominatim 3.
  38     """
  39
  40     def __init__(self, dsn, data_dir):
  41         self.dsn = dsn
  42         self.data_dir = data_dir
  43         self.loader = None
  44
  45
  46     def init_new_db(self, config, init_db=True):
  47         """ Set up a new tokenizer for the database.
  48
  49             This copies all necessary data in the project directory to make
  50             sure the tokenizer remains stable even over updates.
  51         """
  52         self.loader = ICURuleLoader(config)
  53
  54         self._install_php(config.lib_dir.php)
  55         self._save_config()
  56
  57         if init_db:
  58             self.update_sql_functions(config)
  59             self._init_db_tables(config)
  60
  61
  62     def init_from_project(self, config):
  63         """ Initialise the tokenizer from the project directory.
  64         """
  65         self.loader = ICURuleLoader(config)
  66
  67         with connect(self.dsn) as conn:
  68             self.loader.load_config_from_db(conn)
  69
  70
  71     def finalize_import(self, config):
  72         """ Do any required postprocessing to make the tokenizer data ready
  73             for use.
  74         """
  75         with connect(self.dsn) as conn:
  76             sqlp = SQLPreprocessor(conn, config)
  77             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
  78
  79
  80     def update_sql_functions(self, config):
  81         """ Reimport the SQL functions for this tokenizer.
  82         """
  83         with connect(self.dsn) as conn:
  84             sqlp = SQLPreprocessor(conn, config)
  85             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  86
  87
  88     def check_database(self, config):
  89         """ Check that the tokenizer is set up correctly.
  90         """
  91         # Will throw an error if there is an issue.
  92         self.init_from_project(config)
  93
  94
  95     def update_statistics(self):
  96         """ Recompute frequencies for all name words.
  97         """
  98         with connect(self.dsn) as conn:
  99             if conn.table_exists('search_name'):
 100                 with conn.cursor() as cur:
 101                     cur.drop_table("word_frequencies")
 102                     LOG.info("Computing word frequencies")
 103                     cur.execute("""CREATE TEMP TABLE word_frequencies AS
 104                                      SELECT unnest(name_vector) as id, count(*)
 105                                      FROM search_name GROUP BY id""")
 106                     cur.execute("CREATE INDEX ON word_frequencies(id)")
 107                     LOG.info("Update word table with recomputed frequencies")
 108                     cur.execute("""UPDATE word
 109                                    SET info = info || jsonb_build_object('count', count)
 110                                    FROM word_frequencies WHERE word_id = id""")
 111                     cur.drop_table("word_frequencies")
 112             conn.commit()
 113
 114
 115     def _cleanup_housenumbers(self):
 116         """ Remove unused house numbers.
 117         """
 118         with connect(self.dsn) as conn:
 119             with conn.cursor(name="hnr_counter") as cur:
 120                 cur.execute("""SELECT word_id, word_token FROM word
 121                                WHERE type = 'H'
 122                                  AND NOT EXISTS(SELECT * FROM search_name
 123                                                 WHERE ARRAY[word.word_id] && name_vector)
 124                                  AND (char_length(word_token) > 6
 125                                       OR word_token not similar to '\d+')
 126                             """)
 127                 candidates = {token: wid for wid, token in cur}
 128             with conn.cursor(name="hnr_counter") as cur:
 129                 cur.execute("""SELECT housenumber FROM placex
 130                                WHERE housenumber is not null
 131                                      AND (char_length(housenumber) > 6
 132                                           OR housenumber not similar to '\d+')
 133                             """)
 134                 for row in cur:
 135                     for hnr in row[0].split(';'):
 136                         candidates.pop(hnr, None)
 137         LOG.info("There are %s outdated housenumbers.", len(candidates))
 138
 139
 140     def update_word_tokens(self):
 141         """ Remove unused tokens.
 142         """
 143         LOG.info("Cleaning up housenumber tokens.")
 144         self._cleanup_housenumbers()
 145         LOG.info("Tokenizer house-keeping done.")
 146
 147
 148     def name_analyzer(self):
 149         """ Create a new analyzer for tokenizing names and queries
 150             using this tokinzer. Analyzers are context managers and should
 151             be used accordingly:
 152
 153             ```
 154             with tokenizer.name_analyzer() as analyzer:
 155                 analyser.tokenize()
 156             ```
 157
 158             When used outside the with construct, the caller must ensure to
 159             call the close() function before destructing the analyzer.
 160
 161             Analyzers are not thread-safe. You need to instantiate one per thread.
 162         """
 163         return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
 164                                      self.loader.make_token_analysis())
 165
 166
 167     def _install_php(self, phpdir):
 168         """ Install the php script for the tokenizer.
 169         """
 170         php_file = self.data_dir / "tokenizer.php"
 171         php_file.write_text(dedent(f"""\
 172             <?php
 173             @define('CONST_Max_Word_Frequency', 10000000);
 174             @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
 175             @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
 176             require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
 177
 178
 179     def _save_config(self):
 180         """ Save the configuration that needs to remain stable for the given
 181             database as database properties.
 182         """
 183         with connect(self.dsn) as conn:
 184             self.loader.save_config_to_db(conn)
 185
 186
 187     def _init_db_tables(self, config):
 188         """ Set up the word table and fill it with pre-computed word
 189             frequencies.
 190         """
 191         with connect(self.dsn) as conn:
 192             sqlp = SQLPreprocessor(conn, config)
 193             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
 194             conn.commit()
 195
 196
 197 class LegacyICUNameAnalyzer(AbstractAnalyzer):
 198     """ The legacy analyzer uses the ICU library for splitting names.
 199
 200         Each instance opens a connection to the database to request the
 201         normalization.
 202     """
 203
 204     def __init__(self, dsn, sanitizer, token_analysis):
 205         self.conn = connect(dsn).connection
 206         self.conn.autocommit = True
 207         self.sanitizer = sanitizer
 208         self.token_analysis = token_analysis
 209
 210         self._cache = _TokenCache()
 211
 212
 213     def close(self):
 214         """ Free all resources used by the analyzer.
 215         """
 216         if self.conn:
 217             self.conn.close()
 218             self.conn = None
 219
 220
 221     def _search_normalized(self, name):
 222         """ Return the search token transliteration of the given name.
 223         """
 224         return self.token_analysis.search.transliterate(name).strip()
 225
 226
 227     def _normalized(self, name):
 228         """ Return the normalized version of the given name with all
 229             non-relevant information removed.
 230         """
 231         return self.token_analysis.normalizer.transliterate(name).strip()
 232
 233
 234     def get_word_token_info(self, words):
 235         """ Return token information for the given list of words.
 236             If a word starts with # it is assumed to be a full name
 237             otherwise is a partial name.
 238
 239             The function returns a list of tuples with
 240             (original word, word token, word id).
 241
 242             The function is used for testing and debugging only
 243             and not necessarily efficient.
 244         """
 245         full_tokens = {}
 246         partial_tokens = {}
 247         for word in words:
 248             if word.startswith('#'):
 249                 full_tokens[word] = self._search_normalized(word[1:])
 250             else:
 251                 partial_tokens[word] = self._search_normalized(word)
 252
 253         with self.conn.cursor() as cur:
 254             cur.execute("""SELECT word_token, word_id
 255                             FROM word WHERE word_token = ANY(%s) and type = 'W'
 256                         """, (list(full_tokens.values()),))
 257             full_ids = {r[0]: r[1] for r in cur}
 258             cur.execute("""SELECT word_token, word_id
 259                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
 260                         (list(partial_tokens.values()),))
 261             part_ids = {r[0]: r[1] for r in cur}
 262
 263         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
 264                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
 265
 266
 267     @staticmethod
 268     def normalize_postcode(postcode):
 269         """ Convert the postcode to a standardized form.
 270
 271             This function must yield exactly the same result as the SQL function
 272             'token_normalized_postcode()'.
 273         """
 274         return postcode.strip().upper()
 275
 276
 277     def _make_standard_hnr(self, hnr):
 278         """ Create a normalised version of a housenumber.
 279
 280             This function takes minor shortcuts on transliteration.
 281         """
 282         return self._search_normalized(hnr)
 283
 284     def update_postcodes_from_db(self):
 285         """ Update postcode tokens in the word table from the location_postcode
 286             table.
 287         """
 288         to_delete = []
 289         with self.conn.cursor() as cur:
 290             # This finds us the rows in location_postcode and word that are
 291             # missing in the other table.
 292             cur.execute("""SELECT * FROM
 293                             (SELECT pc, word FROM
 294                               (SELECT distinct(postcode) as pc FROM location_postcode) p
 295                               FULL JOIN
 296                               (SELECT word FROM word WHERE type = 'P') w
 297                               ON pc = word) x
 298                            WHERE pc is null or word is null""")
 299
 300             with CopyBuffer() as copystr:
 301                 for postcode, word in cur:
 302                     if postcode is None:
 303                         to_delete.append(word)
 304                     else:
 305                         copystr.add(self._search_normalized(postcode),
 306                                     'P', postcode)
 307
 308                 if to_delete:
 309                     cur.execute("""DELETE FROM WORD
 310                                    WHERE type ='P' and word = any(%s)
 311                                 """, (to_delete, ))
 312
 313                 copystr.copy_out(cur, 'word',
 314                                  columns=['word_token', 'type', 'word'])
 315
 316
 317     def update_special_phrases(self, phrases, should_replace):
 318         """ Replace the search index for special phrases with the new phrases.
 319             If `should_replace` is True, then the previous set of will be
 320             completely replaced. Otherwise the phrases are added to the
 321             already existing ones.
 322         """
 323         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
 324                             for p in phrases))
 325
 326         with self.conn.cursor() as cur:
 327             # Get the old phrases.
 328             existing_phrases = set()
 329             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
 330             for word, info in cur:
 331                 existing_phrases.add((word, info['class'], info['type'],
 332                                       info.get('op') or '-'))
 333
 334             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
 335             if should_replace:
 336                 deleted = self._remove_special_phrases(cur, norm_phrases,
 337                                                        existing_phrases)
 338             else:
 339                 deleted = 0
 340
 341         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 342                  len(norm_phrases), added, deleted)
 343
 344
 345     def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
 346         """ Add all phrases to the database that are not yet there.
 347         """
 348         to_add = new_phrases - existing_phrases
 349
 350         added = 0
 351         with CopyBuffer() as copystr:
 352             for word, cls, typ, oper in to_add:
 353                 term = self._search_normalized(word)
 354                 if term:
 355                     copystr.add(term, 'S', word,
 356                                 json.dumps({'class': cls, 'type': typ,
 357                                             'op': oper if oper in ('in', 'near') else None}))
 358                     added += 1
 359
 360             copystr.copy_out(cursor, 'word',
 361                              columns=['word_token', 'type', 'word', 'info'])
 362
 363         return added
 364
 365
 366     @staticmethod
 367     def _remove_special_phrases(cursor, new_phrases, existing_phrases):
 368         """ Remove all phrases from the databse that are no longer in the
 369             new phrase list.
 370         """
 371         to_delete = existing_phrases - new_phrases
 372
 373         if to_delete:
 374             cursor.execute_values(
 375                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 376                     WHERE type = 'S' and word = name
 377                           and info->>'class' = in_class and info->>'type' = in_type
 378                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
 379                 """, to_delete)
 380
 381         return len(to_delete)
 382
 383
 384     def add_country_names(self, country_code, names):
 385         """ Add names for the given country to the search index.
 386         """
 387         # Make sure any name preprocessing for country names applies.
 388         info = PlaceInfo({'name': names, 'country_code': country_code,
 389                           'rank_address': 4, 'class': 'boundary',
 390                           'type': 'administrative'})
 391         self._add_country_full_names(country_code,
 392                                      self.sanitizer.process_names(info)[0])
 393
 394
 395     def _add_country_full_names(self, country_code, names):
 396         """ Add names for the given country from an already sanitized
 397             name list.
 398         """
 399         word_tokens = set()
 400         for name in names:
 401             norm_name = self._search_normalized(name.name)
 402             if norm_name:
 403                 word_tokens.add(norm_name)
 404
 405         with self.conn.cursor() as cur:
 406             # Get existing names
 407             cur.execute("""SELECT word_token FROM word
 408                             WHERE type = 'C' and word = %s""",
 409                         (country_code, ))
 410             word_tokens.difference_update((t[0] for t in cur))
 411
 412             # Only add those names that are not yet in the list.
 413             if word_tokens:
 414                 cur.execute("""INSERT INTO word (word_token, type, word)
 415                                (SELECT token, 'C', %s
 416                                 FROM unnest(%s) as token)
 417                             """, (country_code, list(word_tokens)))
 418
 419             # No names are deleted at the moment.
 420             # If deletion is made possible, then the static names from the
 421             # initial 'country_name' table should be kept.
 422
 423
 424     def process_place(self, place):
 425         """ Determine tokenizer information about the given place.
 426
 427             Returns a JSON-serializable structure that will be handed into
 428             the database via the token_info field.
 429         """
 430         token_info = _TokenInfo(self._cache)
 431
 432         names, address = self.sanitizer.process_names(place)
 433
 434         if names:
 435             fulls, partials = self._compute_name_tokens(names)
 436
 437             token_info.add_names(fulls, partials)
 438
 439             if place.is_country():
 440                 self._add_country_full_names(place.country_code, names)
 441
 442         if address:
 443             self._process_place_address(token_info, address)
 444
 445         return token_info.data
 446
 447
 448     def _process_place_address(self, token_info, address):
 449         hnrs = set()
 450         addr_terms = []
 451         streets = []
 452         for item in address:
 453             if item.kind == 'postcode':
 454                 self._add_postcode(item.name)
 455             elif item.kind == 'housenumber':
 456                 norm_name = self._make_standard_hnr(item.name)
 457                 if norm_name:
 458                     hnrs.add(norm_name)
 459             elif item.kind == 'street':
 460                 streets.extend(self._retrieve_full_tokens(item.name))
 461             elif item.kind == 'place':
 462                 if not item.suffix:
 463                     token_info.add_place(self._compute_partial_tokens(item.name))
 464             elif not item.kind.startswith('_') and not item.suffix and \
 465                  item.kind not in ('country', 'full'):
 466                 addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
 467
 468         if hnrs:
 469             token_info.add_housenumbers(self.conn, hnrs)
 470
 471         if addr_terms:
 472             token_info.add_address_terms(addr_terms)
 473
 474         if streets:
 475             token_info.add_street(streets)
 476
 477
 478     def _compute_partial_tokens(self, name):
 479         """ Normalize the given term, split it into partial words and return
 480             then token list for them.
 481         """
 482         norm_name = self._search_normalized(name)
 483
 484         tokens = []
 485         need_lookup = []
 486         for partial in norm_name.split():
 487             token = self._cache.partials.get(partial)
 488             if token:
 489                 tokens.append(token)
 490             else:
 491                 need_lookup.append(partial)
 492
 493         if need_lookup:
 494             with self.conn.cursor() as cur:
 495                 cur.execute("""SELECT word, getorcreate_partial_word(word)
 496                                FROM unnest(%s) word""",
 497                             (need_lookup, ))
 498
 499                 for partial, token in cur:
 500                     tokens.append(token)
 501                     self._cache.partials[partial] = token
 502
 503         return tokens
 504
 505
 506     def _retrieve_full_tokens(self, name):
 507         """ Get the full name token for the given name, if it exists.
 508             The name is only retrived for the standard analyser.
 509         """
 510         norm_name = self._search_normalized(name)
 511
 512         # return cached if possible
 513         if norm_name in self._cache.fulls:
 514             return self._cache.fulls[norm_name]
 515
 516         with self.conn.cursor() as cur:
 517             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
 518                         (norm_name, ))
 519             full = [row[0] for row in cur]
 520
 521         self._cache.fulls[norm_name] = full
 522
 523         return full
 524
 525
 526     def _compute_name_tokens(self, names):
 527         """ Computes the full name and partial name tokens for the given
 528             dictionary of names.
 529         """
 530         full_tokens = set()
 531         partial_tokens = set()
 532
 533         for name in names:
 534             analyzer_id = name.get_attr('analyzer')
 535             norm_name = self._normalized(name.name)
 536             if analyzer_id is None:
 537                 token_id = norm_name
 538             else:
 539                 token_id = f'{norm_name}@{analyzer_id}'
 540
 541             full, part = self._cache.names.get(token_id, (None, None))
 542             if full is None:
 543                 variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
 544                 if not variants:
 545                     continue
 546
 547                 with self.conn.cursor() as cur:
 548                     cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
 549                                 (token_id, variants))
 550                     full, part = cur.fetchone()
 551
 552                 self._cache.names[token_id] = (full, part)
 553
 554             full_tokens.add(full)
 555             partial_tokens.update(part)
 556
 557         return full_tokens, partial_tokens
 558
 559
 560     def _add_postcode(self, postcode):
 561         """ Make sure the normalized postcode is present in the word table.
 562         """
 563         if re.search(r'[:,;]', postcode) is None:
 564             postcode = self.normalize_postcode(postcode)
 565
 566             if postcode not in self._cache.postcodes:
 567                 term = self._search_normalized(postcode)
 568                 if not term:
 569                     return
 570
 571                 with self.conn.cursor() as cur:
 572                     # no word_id needed for postcodes
 573                     cur.execute("""INSERT INTO word (word_token, type, word)
 574                                    (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
 575                                     WHERE NOT EXISTS
 576                                      (SELECT * FROM word
 577                                       WHERE type = 'P' and word = pc))
 578                                 """, (term, postcode))
 579                 self._cache.postcodes.add(postcode)
 580
 581
 582 class _TokenInfo:
 583     """ Collect token information to be sent back to the database.
 584     """
 585     def __init__(self, cache):
 586         self._cache = cache
 587         self.data = {}
 588
 589     @staticmethod
 590     def _mk_array(tokens):
 591         return '{%s}' % ','.join((str(s) for s in tokens))
 592
 593
 594     def add_names(self, fulls, partials):
 595         """ Adds token information for the normalised names.
 596         """
 597         self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
 598
 599
 600     def add_housenumbers(self, conn, hnrs):
 601         """ Extract housenumber information from a list of normalised
 602             housenumbers.
 603         """
 604         self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
 605         self.data['hnr'] = ';'.join(hnrs)
 606
 607
 608     def add_street(self, tokens):
 609         """ Add addr:street match terms.
 610         """
 611         self.data['street'] = self._mk_array(tokens)
 612
 613
 614     def add_place(self, tokens):
 615         """ Add addr:place search and match terms.
 616         """
 617         if tokens:
 618             self.data['place'] = self._mk_array(tokens)
 619
 620
 621     def add_address_terms(self, terms):
 622         """ Add additional address terms.
 623         """
 624         tokens = {key: self._mk_array(partials)
 625                   for key, partials in terms if partials}
 626
 627         if tokens:
 628             self.data['addr'] = tokens
 629
 630
 631 class _TokenCache:
 632     """ Cache for token information to avoid repeated database queries.
 633
 634         This cache is not thread-safe and needs to be instantiated per
 635         analyzer.
 636     """
 637     def __init__(self):
 638         self.names = {}
 639         self.partials = {}
 640         self.fulls = {}
 641         self.postcodes = set()
 642         self.housenumbers = {}
 643
 644
 645     def get_hnr_tokens(self, conn, terms):
 646         """ Get token ids for a list of housenumbers, looking them up in the
 647             database if necessary. `terms` is an iterable of normalized
 648             housenumbers.
 649         """
 650         tokens = []
 651         askdb = []
 652
 653         for term in terms:
 654             token = self.housenumbers.get(term)
 655             if token is None:
 656                 askdb.append(term)
 657             else:
 658                 tokens.append(token)
 659
 660         if askdb:
 661             with conn.cursor() as cur:
 662                 cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
 663                             (askdb, ))
 664                 for term, tid in cur:
 665                     self.housenumbers[term] = tid
 666                     tokens.append(tid)
 667
 668         return tokens