"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
import itertools
import json
import logging
import re
from textwrap import dedent
from nominatim.db.connection import connect
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)
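
# Typical use of the factory above (illustrative sketch only; the DSN and the
# directory are made-up placeholders, not values from this module):
#
#   tokenizer = create('dbname=nominatim', project_dir / 'tokenizer')
#   tokenizer.init_from_project(config)   # attach to an already set-up project
#
# A fresh import would call init_new_db() instead of init_from_project().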


class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)

    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()
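
    # Illustration of the UPDATE above (numbers invented): a word row whose
    # info column is '{}' and whose id appears 42 times in search_name ends
    # up with info = '{"count": 42}', because `info || jsonb_build_object(...)`
    # merges the freshly computed count into the existing JSONB object.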

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    ...

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())

    def _install_php(self, phpdir):
        """ Install the PHP script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))

    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()

    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()

    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.

            If a word starts with '#' it is assumed to be a full name,
            otherwise it is treated as a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
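
    # Example (purely illustrative, tokens and ids invented): '#Main Street'
    # is looked up as a full name (type 'W'), 'Main' as a partial (type 'w'):
    #
    #   analyzer.get_word_token_info(['#Main Street', 'Main'])
    #   -> [('#Main Street', 'main street', 123), ('Main', 'main', 456)]
    #
    # The second element is the transliterated search token, the third the
    # word id from the word table (None if the token is unknown).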

    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
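
    # For example, '  ab1 2cd ' becomes 'AB1 2CD': surrounding whitespace is
    # stripped and letters are upper-cased; no country-specific reformatting
    # happens at this stage.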

    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self._search_normalized(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM word
                                   WHERE type = 'P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)
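
    # Each incoming phrase is a 4-tuple of (label, class, type, operator),
    # e.g. ('Restaurants', 'amenity', 'restaurant', '-') (example values).
    # Only the operators 'in' and 'near' are kept when a phrase is written
    # to the word table; anything else is stored as null (see below).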

    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added

    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0])
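
    # Example call (illustrative): `names` is an OSM-style name dictionary,
    # e.g.
    #
    #   analyzer.add_country_names('de', {'name': 'Deutschland',
    #                                     'name:en': 'Germany'})
    #
    # The dictionary is wrapped in a PlaceInfo and run through the sanitizers
    # first, so country names get the same preprocessing as ordinary places.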

    def _add_country_full_names(self, country_code, names):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names.
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names, address = self.sanitizer.process_names(place)

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.data
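
    # Sketch of the returned structure (token ids invented); the individual
    # keys are filled by the _TokenInfo helper further below:
    #
    #   {'names': '{1,2,3}',                 # full and partial name tokens
    #    'hnr': '25', 'hnr_tokens': '{4}',   # housenumbers and their tokens
    #    'street': '{5}',                    # addr:street match tokens
    #    'addr': {'city': '{6,7}'}}          # other address parts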

    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        streets = []
        for item in address:
            if item.kind == 'postcode':
                self._add_postcode(item.name)
            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(item.name)
            elif item.kind == 'street':
                streets.extend(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full'):
                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)

        if streets:
            token_info.add_street(streets)

    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens

    def _retrieve_full_tokens(self, name):
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        norm_name = self._search_normalized(name)

        # Return the cached result if possible.
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full

    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            list of names.
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            norm_name = self._normalized(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens
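
    # Cache keys combine the normalized name with the analyzer that produced
    # the variants: the plain normalized name when no 'analyzer' attribute is
    # set, otherwise '<normalized name>@<analyzer id>'. For example
    # (illustrative), a name using analyzer 'ja' that normalizes to 'tokyo'
    # is cached as 'tokyo@ja', separate from the plain 'tokyo' entry.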

    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self._search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)

    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
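
    # For example, ['1;2b', '3'] is split into ['1', '2b', '3'] (duplicates
    # removed, order not guaranteed), while a single plain value like ['12']
    # is returned unchanged.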


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}

    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))
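
    # _mk_array() renders token ids as a PostgreSQL array literal, e.g. the
    # tokens [12, 34, 56] become the string '{12,34,56}'.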

    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        self.data['street'] = self._mk_array(tokens)

    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        if tokens:
            self.data['place'] = self._mk_array(tokens)

    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {key: self._mk_array(partials)
                  for key, partials in terms if partials}

        if tokens:
            self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.partials = {}
        self.fulls = {}
        self.postcodes = set()
        self.housenumbers = {}

    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens