nominatim/tokenizer/icu_tokenizer.py

   1 """
   2 Tokenizer implementing normalisation as used before Nominatim 4 but using
   3 libICU instead of the PostgreSQL module.
   4 """
   5 import itertools
   6 import json
   7 import logging
   8 import re
   9 from textwrap import dedent
  10
  11 from nominatim.db.connection import connect
  12 from nominatim.db.properties import set_property, get_property
  13 from nominatim.db.utils import CopyBuffer
  14 from nominatim.db.sql_preprocessor import SQLPreprocessor
  15 from nominatim.indexer.place_info import PlaceInfo
  16 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  17 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  18
# Name of the database property under which the term normalization rules
# are saved (used with set_property()/get_property() in the tokenizer class).
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

# getLogger() without a name returns the root logger.
LOG = logging.getLogger()
  22
  23 def create(dsn, data_dir):
  24     """ Create a new instance of the tokenizer provided by this module.
  25     """
  26     return LegacyICUTokenizer(dsn, data_dir)
  27
  28
  29 class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
  31         Otherwise it uses the same algorithms and data structures as the
  32         normalization routines in Nominatim 3.
  33     """
  34
  35     def __init__(self, dsn, data_dir):
  36         self.dsn = dsn
  37         self.data_dir = data_dir
  38         self.loader = None
  39         self.term_normalization = None
  40
  41
  42     def init_new_db(self, config, init_db=True):
  43         """ Set up a new tokenizer for the database.
  44
  45             This copies all necessary data in the project directory to make
  46             sure the tokenizer remains stable even over updates.
  47         """
  48         self.loader = ICURuleLoader(config)
  49
  50         self.term_normalization = config.TERM_NORMALIZATION
  51
  52         self._install_php(config.lib_dir.php)
  53         self._save_config()
  54
  55         if init_db:
  56             self.update_sql_functions(config)
  57             self._init_db_tables(config)
  58
  59
  60     def init_from_project(self, config):
  61         """ Initialise the tokenizer from the project directory.
  62         """
  63         self.loader = ICURuleLoader(config)
  64
  65         with connect(self.dsn) as conn:
  66             self.loader.load_config_from_db(conn)
  67             self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
  68
  69
  70     def finalize_import(self, config):
  71         """ Do any required postprocessing to make the tokenizer data ready
  72             for use.
  73         """
  74         with connect(self.dsn) as conn:
  75             sqlp = SQLPreprocessor(conn, config)
  76             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
  77
  78
  79     def update_sql_functions(self, config):
  80         """ Reimport the SQL functions for this tokenizer.
  81         """
  82         with connect(self.dsn) as conn:
  83             sqlp = SQLPreprocessor(conn, config)
  84             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  85
  86
  87     def check_database(self, config):
  88         """ Check that the tokenizer is set up correctly.
  89         """
  90         self.init_from_project(config)
  91
  92         if self.term_normalization is None:
  93             return "Configuration for tokenizer 'icu' are missing."
  94
  95         return None
  96
  97
    def update_statistics(self):
        """ Recompute frequencies for all name words.

            The counts are computed from the name vectors in the search_name
            table and saved under the key 'count' in the info column of the
            word table. Nothing is computed when the search_name table does
            not exist.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    # Drop the helper table first, then recreate it below.
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    # One row per word id with the number of search_name rows
                    # that have the id in their name vector.
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    # Index on the column used in the join of the update below.
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            # The commit is issued whether or not frequencies were computed.
            conn.commit()
 116
 117
 118     def name_analyzer(self):
 119         """ Create a new analyzer for tokenizing names and queries
 120             using this tokinzer. Analyzers are context managers and should
 121             be used accordingly:
 122
 123             ```
 124             with tokenizer.name_analyzer() as analyzer:
 125                 analyser.tokenize()
 126             ```
 127
 128             When used outside the with construct, the caller must ensure to
 129             call the close() function before destructing the analyzer.
 130
 131             Analyzers are not thread-safe. You need to instantiate one per thread.
 132         """
 133         return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
 134                                      self.loader.make_token_analysis())
 135
 136
 137     def _install_php(self, phpdir):
 138         """ Install the php script for the tokenizer.
 139         """
 140         php_file = self.data_dir / "tokenizer.php"
 141         php_file.write_text(dedent(f"""\
 142             <?php
 143             @define('CONST_Max_Word_Frequency', 10000000);
 144             @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
 145             @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
 146             require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
 147
 148
 149     def _save_config(self):
 150         """ Save the configuration that needs to remain stable for the given
 151             database as database properties.
 152         """
 153         with connect(self.dsn) as conn:
 154             self.loader.save_config_to_db(conn)
 155             set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
 156
 157
 158     def _init_db_tables(self, config):
 159         """ Set up the word table and fill it with pre-computed word
 160             frequencies.
 161         """
 162         with connect(self.dsn) as conn:
 163             sqlp = SQLPreprocessor(conn, config)
 164             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
 165             conn.commit()
 166
 167
 168 class LegacyICUNameAnalyzer(AbstractAnalyzer):
 169     """ The legacy analyzer uses the ICU library for splitting names.
 170
 171         Each instance opens a connection to the database to request the
 172         normalization.
 173     """
 174
 175     def __init__(self, dsn, sanitizer, token_analysis):
 176         self.conn = connect(dsn).connection
 177         self.conn.autocommit = True
 178         self.sanitizer = sanitizer
 179         self.token_analysis = token_analysis
 180
 181         self._cache = _TokenCache()
 182
 183
 184     def close(self):
 185         """ Free all resources used by the analyzer.
 186         """
 187         if self.conn:
 188             self.conn.close()
 189             self.conn = None
 190
 191
 192     def _search_normalized(self, name):
 193         """ Return the search token transliteration of the given name.
 194         """
 195         return self.token_analysis.search.transliterate(name).strip()
 196
 197
 198     def _normalized(self, name):
 199         """ Return the normalized version of the given name with all
 200             non-relevant information removed.
 201         """
 202         return self.token_analysis.normalizer.transliterate(name).strip()
 203
 204
 205     def get_word_token_info(self, words):
 206         """ Return token information for the given list of words.
 207             If a word starts with # it is assumed to be a full name
 208             otherwise is a partial name.
 209
 210             The function returns a list of tuples with
 211             (original word, word token, word id).
 212
 213             The function is used for testing and debugging only
 214             and not necessarily efficient.
 215         """
 216         full_tokens = {}
 217         partial_tokens = {}
 218         for word in words:
 219             if word.startswith('#'):
 220                 full_tokens[word] = self._search_normalized(word[1:])
 221             else:
 222                 partial_tokens[word] = self._search_normalized(word)
 223
 224         with self.conn.cursor() as cur:
 225             cur.execute("""SELECT word_token, word_id
 226                             FROM word WHERE word_token = ANY(%s) and type = 'W'
 227                         """, (list(full_tokens.values()),))
 228             full_ids = {r[0]: r[1] for r in cur}
 229             cur.execute("""SELECT word_token, word_id
 230                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
 231                         (list(partial_tokens.values()),))
 232             part_ids = {r[0]: r[1] for r in cur}
 233
 234         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
 235                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
 236
 237
 238     @staticmethod
 239     def normalize_postcode(postcode):
 240         """ Convert the postcode to a standardized form.
 241
 242             This function must yield exactly the same result as the SQL function
 243             'token_normalized_postcode()'.
 244         """
 245         return postcode.strip().upper()
 246
 247
 248     def _make_standard_hnr(self, hnr):
 249         """ Create a normalised version of a housenumber.
 250
 251             This function takes minor shortcuts on transliteration.
 252         """
 253         return self._search_normalized(hnr)
 254
    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.

            Postcodes that exist only in location_postcode are added to the
            word table (type 'P'), postcode words that no longer have a
            matching row in location_postcode are deleted.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        # Only in the word table: schedule for deletion.
                        to_delete.append(word)
                    else:
                        # Only in location_postcode: collect as new word row.
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                # The result set has been read completely at this point,
                # so the cursor can be reused for the changes.
                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])
 286
 287
 288     def update_special_phrases(self, phrases, should_replace):
 289         """ Replace the search index for special phrases with the new phrases.
 290             If `should_replace` is True, then the previous set of will be
 291             completely replaced. Otherwise the phrases are added to the
 292             already existing ones.
 293         """
 294         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
 295                             for p in phrases))
 296
 297         with self.conn.cursor() as cur:
 298             # Get the old phrases.
 299             existing_phrases = set()
 300             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
 301             for word, info in cur:
 302                 existing_phrases.add((word, info['class'], info['type'],
 303                                       info.get('op') or '-'))
 304
 305             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
 306             if should_replace:
 307                 deleted = self._remove_special_phrases(cur, norm_phrases,
 308                                                        existing_phrases)
 309             else:
 310                 deleted = 0
 311
 312         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 313                  len(norm_phrases), added, deleted)
 314
 315
 316     def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
 317         """ Add all phrases to the database that are not yet there.
 318         """
 319         to_add = new_phrases - existing_phrases
 320
 321         added = 0
 322         with CopyBuffer() as copystr:
 323             for word, cls, typ, oper in to_add:
 324                 term = self._search_normalized(word)
 325                 if term:
 326                     copystr.add(term, 'S', word,
 327                                 json.dumps({'class': cls, 'type': typ,
 328                                             'op': oper if oper in ('in', 'near') else None}))
 329                     added += 1
 330
 331             copystr.copy_out(cursor, 'word',
 332                              columns=['word_token', 'type', 'word', 'info'])
 333
 334         return added
 335
 336
 337     @staticmethod
 338     def _remove_special_phrases(cursor, new_phrases, existing_phrases):
 339         """ Remove all phrases from the databse that are no longer in the
 340             new phrase list.
 341         """
 342         to_delete = existing_phrases - new_phrases
 343
 344         if to_delete:
 345             cursor.execute_values(
 346                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 347                     WHERE type = 'S' and word = name
 348                           and info->>'class' = in_class and info->>'type' = in_type
 349                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
 350                 """, to_delete)
 351
 352         return len(to_delete)
 353
 354
 355     def add_country_names(self, country_code, names):
 356         """ Add names for the given country to the search index.
 357         """
 358         # Make sure any name preprocessing for country names applies.
 359         info = PlaceInfo({'name': names, 'country_code': country_code,
 360                           'rank_address': 4, 'class': 'boundary',
 361                           'type': 'administrative'})
 362         self._add_country_full_names(country_code,
 363                                      self.sanitizer.process_names(info)[0])
 364
 365
 366     def _add_country_full_names(self, country_code, names):
 367         """ Add names for the given country from an already sanitized
 368             name list.
 369         """
 370         word_tokens = set()
 371         for name in names:
 372             norm_name = self._search_normalized(name.name)
 373             if norm_name:
 374                 word_tokens.add(norm_name)
 375
 376         with self.conn.cursor() as cur:
 377             # Get existing names
 378             cur.execute("""SELECT word_token FROM word
 379                             WHERE type = 'C' and word = %s""",
 380                         (country_code, ))
 381             word_tokens.difference_update((t[0] for t in cur))
 382
 383             # Only add those names that are not yet in the list.
 384             if word_tokens:
 385                 cur.execute("""INSERT INTO word (word_token, type, word)
 386                                (SELECT token, 'C', %s
 387                                 FROM unnest(%s) as token)
 388                             """, (country_code, list(word_tokens)))
 389
 390             # No names are deleted at the moment.
 391             # If deletion is made possible, then the static names from the
 392             # initial 'country_name' table should be kept.
 393
 394
 395     def process_place(self, place):
 396         """ Determine tokenizer information about the given place.
 397
 398             Returns a JSON-serializable structure that will be handed into
 399             the database via the token_info field.
 400         """
 401         token_info = _TokenInfo(self._cache)
 402
 403         names, address = self.sanitizer.process_names(place)
 404
 405         if names:
 406             fulls, partials = self._compute_name_tokens(names)
 407
 408             token_info.add_names(fulls, partials)
 409
 410             if place.is_country():
 411                 self._add_country_full_names(place.country_code, names)
 412
 413         if address:
 414             self._process_place_address(token_info, address)
 415
 416         return token_info.data
 417
 418
 419     def _process_place_address(self, token_info, address):
 420         hnrs = []
 421         addr_terms = []
 422         for item in address:
 423             if item.kind == 'postcode':
 424                 self._add_postcode(item.name)
 425             elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
 426                 hnrs.append(item.name)
 427             elif item.kind == 'street':
 428                 token_info.add_street(self._compute_partial_tokens(item.name))
 429             elif item.kind == 'place':
 430                 token_info.add_place(self._compute_partial_tokens(item.name))
 431             elif not item.kind.startswith('_') and \
 432                  item.kind not in ('country', 'full'):
 433                 addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
 434
 435         if hnrs:
 436             hnrs = self._split_housenumbers(hnrs)
 437             token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
 438
 439         if addr_terms:
 440             token_info.add_address_terms(addr_terms)
 441
 442
 443     def _compute_partial_tokens(self, name):
 444         """ Normalize the given term, split it into partial words and return
 445             then token list for them.
 446         """
 447         norm_name = self._search_normalized(name)
 448
 449         tokens = []
 450         need_lookup = []
 451         for partial in norm_name.split():
 452             token = self._cache.partials.get(partial)
 453             if token:
 454                 tokens.append(token)
 455             else:
 456                 need_lookup.append(partial)
 457
 458         if need_lookup:
 459             with self.conn.cursor() as cur:
 460                 cur.execute("""SELECT word, getorcreate_partial_word(word)
 461                                FROM unnest(%s) word""",
 462                             (need_lookup, ))
 463
 464                 for partial, token in cur:
 465                     tokens.append(token)
 466                     self._cache.partials[partial] = token
 467
 468         return tokens
 469
 470
 471     def _compute_name_tokens(self, names):
 472         """ Computes the full name and partial name tokens for the given
 473             dictionary of names.
 474         """
 475         full_tokens = set()
 476         partial_tokens = set()
 477
 478         for name in names:
 479             analyzer_id = name.get_attr('analyzer')
 480             norm_name = self._normalized(name.name)
 481             if analyzer_id is None:
 482                 token_id = norm_name
 483             else:
 484                 token_id = f'{norm_name}@{analyzer_id}'
 485
 486             full, part = self._cache.names.get(token_id, (None, None))
 487             if full is None:
 488                 variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
 489                 if not variants:
 490                     continue
 491
 492                 with self.conn.cursor() as cur:
 493                     cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
 494                                 (token_id, variants))
 495                     full, part = cur.fetchone()
 496
 497                 self._cache.names[token_id] = (full, part)
 498
 499             full_tokens.add(full)
 500             partial_tokens.update(part)
 501
 502         return full_tokens, partial_tokens
 503
 504
 505     def _add_postcode(self, postcode):
 506         """ Make sure the normalized postcode is present in the word table.
 507         """
 508         if re.search(r'[:,;]', postcode) is None:
 509             postcode = self.normalize_postcode(postcode)
 510
 511             if postcode not in self._cache.postcodes:
 512                 term = self._search_normalized(postcode)
 513                 if not term:
 514                     return
 515
 516                 with self.conn.cursor() as cur:
 517                     # no word_id needed for postcodes
 518                     cur.execute("""INSERT INTO word (word_token, type, word)
 519                                    (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
 520                                     WHERE NOT EXISTS
 521                                      (SELECT * FROM word
 522                                       WHERE type = 'P' and word = pc))
 523                                 """, (term, postcode))
 524                 self._cache.postcodes.add(postcode)
 525
 526
 527     @staticmethod
 528     def _split_housenumbers(hnrs):
 529         if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
 530             # split numbers if necessary
 531             simple_list = []
 532             for hnr in hnrs:
 533                 simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
 534
 535             if len(simple_list) > 1:
 536                 hnrs = list(set(simple_list))
 537             else:
 538                 hnrs = simple_list
 539
 540         return hnrs
 541
 542
 543
 544
 545 class _TokenInfo:
 546     """ Collect token information to be sent back to the database.
 547     """
 548     def __init__(self, cache):
 549         self._cache = cache
 550         self.data = {}
 551
 552     @staticmethod
 553     def _mk_array(tokens):
 554         return '{%s}' % ','.join((str(s) for s in tokens))
 555
 556
 557     def add_names(self, fulls, partials):
 558         """ Adds token information for the normalised names.
 559         """
 560         self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
 561
 562
 563     def add_housenumbers(self, conn, hnrs):
 564         """ Extract housenumber information from a list of normalised
 565             housenumbers.
 566         """
 567         self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
 568         self.data['hnr'] = ';'.join(hnrs)
 569
 570
 571     def add_street(self, tokens):
 572         """ Add addr:street match terms.
 573         """
 574         if tokens:
 575             self.data['street'] = self._mk_array(tokens)
 576
 577
 578     def add_place(self, tokens):
 579         """ Add addr:place search and match terms.
 580         """
 581         if tokens:
 582             self.data['place'] = self._mk_array(tokens)
 583
 584
 585     def add_address_terms(self, terms):
 586         """ Add additional address terms.
 587         """
 588         tokens = {key: self._mk_array(partials)
 589                   for key, partials in terms if partials}
 590
 591         if tokens:
 592             self.data['addr'] = tokens
 593
 594
 595 class _TokenCache:
 596     """ Cache for token information to avoid repeated database queries.
 597
 598         This cache is not thread-safe and needs to be instantiated per
 599         analyzer.
 600     """
 601     def __init__(self):
 602         self.names = {}
 603         self.partials = {}
 604         self.postcodes = set()
 605         self.housenumbers = {}
 606
 607
 608     def get_hnr_tokens(self, conn, terms):
 609         """ Get token ids for a list of housenumbers, looking them up in the
 610             database if necessary. `terms` is an iterable of normalized
 611             housenumbers.
 612         """
 613         tokens = []
 614         askdb = []
 615
 616         for term in terms:
 617             token = self.housenumbers.get(term)
 618             if token is None:
 619                 askdb.append(term)
 620             else:
 621                 tokens.append(token)
 622
 623         if askdb:
 624             with conn.cursor() as cur:
 625                 cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
 626                             (askdb, ))
 627                 for term, tid in cur:
 628                     self.housenumbers[term] = tid
 629                     tokens.append(tid)
 630
 631         return tokens