nominatim/tokenizer/icu_tokenizer.py

   1 """
   2 Tokenizer implementing normalisation as used before Nominatim 4 but using
   3 libICU instead of the PostgreSQL module.
   4 """
   5 from collections import Counter
   6 import itertools
   7 import json
   8 import logging
   9 import re
  10 from textwrap import dedent
  11
  12 from nominatim.db.connection import connect
  13 from nominatim.db.properties import set_property, get_property
  14 from nominatim.db.utils import CopyBuffer
  15 from nominatim.db.sql_preprocessor import SQLPreprocessor
  16 from nominatim.indexer.place_info import PlaceInfo
  17 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  18 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  19
  20 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
  21
  22 LOG = logging.getLogger()
  23
  24 def create(dsn, data_dir):
  25     """ Create a new instance of the tokenizer provided by this module.
  26     """
  27     return LegacyICUTokenizer(dsn, data_dir)
  28
  29
  30 class LegacyICUTokenizer(AbstractTokenizer):
  31     """ This tokenizer uses libICU to covert names and queries to ASCII.
  32         Otherwise it uses the same algorithms and data structures as the
  33         normalization routines in Nominatim 3.
  34     """
  35
  36     def __init__(self, dsn, data_dir):
  37         self.dsn = dsn
  38         self.data_dir = data_dir
  39         self.loader = None
  40         self.term_normalization = None
  41
  42
  43     def init_new_db(self, config, init_db=True):
  44         """ Set up a new tokenizer for the database.
  45
  46             This copies all necessary data in the project directory to make
  47             sure the tokenizer remains stable even over updates.
  48         """
  49         self.loader = ICURuleLoader(config)
  50
  51         self.term_normalization = config.TERM_NORMALIZATION
  52
  53         self._install_php(config.lib_dir.php)
  54         self._save_config()
  55
  56         if init_db:
  57             self.update_sql_functions(config)
  58             self._init_db_tables(config)
  59
  60
  61     def init_from_project(self, config):
  62         """ Initialise the tokenizer from the project directory.
  63         """
  64         self.loader = ICURuleLoader(config)
  65
  66         with connect(self.dsn) as conn:
  67             self.loader.load_config_from_db(conn)
  68             self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
  69
  70
  71     def finalize_import(self, _):
  72         """ Do any required postprocessing to make the tokenizer data ready
  73             for use.
  74         """
  75
  76
  77     def update_sql_functions(self, config):
  78         """ Reimport the SQL functions for this tokenizer.
  79         """
  80         with connect(self.dsn) as conn:
  81             sqlp = SQLPreprocessor(conn, config)
  82             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  83
  84
  85     def check_database(self, config):
  86         """ Check that the tokenizer is set up correctly.
  87         """
  88         self.init_from_project(config)
  89
  90         if self.term_normalization is None:
  91             return "Configuration for tokenizer 'icu' are missing."
  92
  93         return None
  94
  95
  96     def update_statistics(self):
  97         """ Recompute frequencies for all name words.
  98         """
  99         with connect(self.dsn) as conn:
 100             with conn.cursor() as cur:
 101                 cur.drop_table("word_frequencies")
 102                 LOG.info("Computing word frequencies")
 103                 cur.execute("""CREATE TEMP TABLE word_frequencies AS
 104                                  SELECT unnest(name_vector) as id, count(*)
 105                                  FROM search_name GROUP BY id""")
 106                 cur.execute("CREATE INDEX ON word_frequencies(id)")
 107                 LOG.info("Update word table with recomputed frequencies")
 108                 cur.execute("""UPDATE word
 109                                SET info = info || jsonb_build_object('count', count)
 110                                FROM word_frequencies WHERE word_id = id""")
 111                 cur.drop_table("word_frequencies")
 112             conn.commit()
 113
 114
 115     def name_analyzer(self):
 116         """ Create a new analyzer for tokenizing names and queries
 117             using this tokinzer. Analyzers are context managers and should
 118             be used accordingly:
 119
 120             ```
 121             with tokenizer.name_analyzer() as analyzer:
 122                 analyser.tokenize()
 123             ```
 124
 125             When used outside the with construct, the caller must ensure to
 126             call the close() function before destructing the analyzer.
 127
 128             Analyzers are not thread-safe. You need to instantiate one per thread.
 129         """
 130         return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
 131                                      self.loader.make_token_analysis())
 132
 133
 134     def _install_php(self, phpdir):
 135         """ Install the php script for the tokenizer.
 136         """
 137         php_file = self.data_dir / "tokenizer.php"
 138         php_file.write_text(dedent(f"""\
 139             <?php
 140             @define('CONST_Max_Word_Frequency', 10000000);
 141             @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
 142             @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
 143             require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
 144
 145
 146     def _save_config(self):
 147         """ Save the configuration that needs to remain stable for the given
 148             database as database properties.
 149         """
 150         with connect(self.dsn) as conn:
 151             self.loader.save_config_to_db(conn)
 152             set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
 153
 154
 155     def _init_db_tables(self, config):
 156         """ Set up the word table and fill it with pre-computed word
 157             frequencies.
 158         """
 159         with connect(self.dsn) as conn:
 160             sqlp = SQLPreprocessor(conn, config)
 161             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
 162             conn.commit()
 163
 164             LOG.warning("Precomputing word tokens")
 165
 166             # get partial words and their frequencies
 167             words = self._count_partial_terms(conn)
 168
 169             # copy them back into the word table
 170             with CopyBuffer() as copystr:
 171                 for term, cnt in words.items():
 172                     copystr.add('w', term, json.dumps({'count': cnt}))
 173
 174                 with conn.cursor() as cur:
 175                     copystr.copy_out(cur, 'word',
 176                                      columns=['type', 'word_token', 'info'])
 177                     cur.execute("""UPDATE word SET word_id = nextval('seq_word')
 178                                    WHERE word_id is null and type = 'w'""")
 179
 180             conn.commit()
 181
 182     def _count_partial_terms(self, conn):
 183         """ Count the partial terms from the names in the place table.
 184         """
 185         words = Counter()
 186         analysis = self.loader.make_token_analysis()
 187
 188         with conn.cursor(name="words") as cur:
 189             cur.execute(""" SELECT v, count(*) FROM
 190                               (SELECT svals(name) as v FROM place)x
 191                             WHERE length(v) < 75 GROUP BY v""")
 192
 193             for name, cnt in cur:
 194                 word = analysis.search.transliterate(name)
 195                 if word and ' ' in word:
 196                     for term in set(word.split()):
 197                         words[term] += cnt
 198
 199         return words
 200
 201
 202 class LegacyICUNameAnalyzer(AbstractAnalyzer):
 203     """ The legacy analyzer uses the ICU library for splitting names.
 204
 205         Each instance opens a connection to the database to request the
 206         normalization.
 207     """
 208
 209     def __init__(self, dsn, sanitizer, token_analysis):
 210         self.conn = connect(dsn).connection
 211         self.conn.autocommit = True
 212         self.sanitizer = sanitizer
 213         self.token_analysis = token_analysis
 214
 215         self._cache = _TokenCache()
 216
 217
 218     def close(self):
 219         """ Free all resources used by the analyzer.
 220         """
 221         if self.conn:
 222             self.conn.close()
 223             self.conn = None
 224
 225
 226     def _search_normalized(self, name):
 227         """ Return the search token transliteration of the given name.
 228         """
 229         return self.token_analysis.search.transliterate(name).strip()
 230
 231
 232     def _normalized(self, name):
 233         """ Return the normalized version of the given name with all
 234             non-relevant information removed.
 235         """
 236         return self.token_analysis.normalizer.transliterate(name).strip()
 237
 238
 239     def get_word_token_info(self, words):
 240         """ Return token information for the given list of words.
 241             If a word starts with # it is assumed to be a full name
 242             otherwise is a partial name.
 243
 244             The function returns a list of tuples with
 245             (original word, word token, word id).
 246
 247             The function is used for testing and debugging only
 248             and not necessarily efficient.
 249         """
 250         full_tokens = {}
 251         partial_tokens = {}
 252         for word in words:
 253             if word.startswith('#'):
 254                 full_tokens[word] = self._search_normalized(word[1:])
 255             else:
 256                 partial_tokens[word] = self._search_normalized(word)
 257
 258         with self.conn.cursor() as cur:
 259             cur.execute("""SELECT word_token, word_id
 260                             FROM word WHERE word_token = ANY(%s) and type = 'W'
 261                         """, (list(full_tokens.values()),))
 262             full_ids = {r[0]: r[1] for r in cur}
 263             cur.execute("""SELECT word_token, word_id
 264                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
 265                         (list(partial_tokens.values()),))
 266             part_ids = {r[0]: r[1] for r in cur}
 267
 268         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
 269                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
 270
 271
 272     @staticmethod
 273     def normalize_postcode(postcode):
 274         """ Convert the postcode to a standardized form.
 275
 276             This function must yield exactly the same result as the SQL function
 277             'token_normalized_postcode()'.
 278         """
 279         return postcode.strip().upper()
 280
 281
 282     def _make_standard_hnr(self, hnr):
 283         """ Create a normalised version of a housenumber.
 284
 285             This function takes minor shortcuts on transliteration.
 286         """
 287         return self._search_normalized(hnr)
 288
 289     def update_postcodes_from_db(self):
 290         """ Update postcode tokens in the word table from the location_postcode
 291             table.
 292         """
 293         to_delete = []
 294         with self.conn.cursor() as cur:
 295             # This finds us the rows in location_postcode and word that are
 296             # missing in the other table.
 297             cur.execute("""SELECT * FROM
 298                             (SELECT pc, word FROM
 299                               (SELECT distinct(postcode) as pc FROM location_postcode) p
 300                               FULL JOIN
 301                               (SELECT word FROM word WHERE type = 'P') w
 302                               ON pc = word) x
 303                            WHERE pc is null or word is null""")
 304
 305             with CopyBuffer() as copystr:
 306                 for postcode, word in cur:
 307                     if postcode is None:
 308                         to_delete.append(word)
 309                     else:
 310                         copystr.add(self._search_normalized(postcode),
 311                                     'P', postcode)
 312
 313                 if to_delete:
 314                     cur.execute("""DELETE FROM WORD
 315                                    WHERE type ='P' and word = any(%s)
 316                                 """, (to_delete, ))
 317
 318                 copystr.copy_out(cur, 'word',
 319                                  columns=['word_token', 'type', 'word'])
 320
 321
 322     def update_special_phrases(self, phrases, should_replace):
 323         """ Replace the search index for special phrases with the new phrases.
 324             If `should_replace` is True, then the previous set of will be
 325             completely replaced. Otherwise the phrases are added to the
 326             already existing ones.
 327         """
 328         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
 329                             for p in phrases))
 330
 331         with self.conn.cursor() as cur:
 332             # Get the old phrases.
 333             existing_phrases = set()
 334             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
 335             for word, info in cur:
 336                 existing_phrases.add((word, info['class'], info['type'],
 337                                       info.get('op') or '-'))
 338
 339             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
 340             if should_replace:
 341                 deleted = self._remove_special_phrases(cur, norm_phrases,
 342                                                        existing_phrases)
 343             else:
 344                 deleted = 0
 345
 346         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 347                  len(norm_phrases), added, deleted)
 348
 349
 350     def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
 351         """ Add all phrases to the database that are not yet there.
 352         """
 353         to_add = new_phrases - existing_phrases
 354
 355         added = 0
 356         with CopyBuffer() as copystr:
 357             for word, cls, typ, oper in to_add:
 358                 term = self._search_normalized(word)
 359                 if term:
 360                     copystr.add(term, 'S', word,
 361                                 json.dumps({'class': cls, 'type': typ,
 362                                             'op': oper if oper in ('in', 'near') else None}))
 363                     added += 1
 364
 365             copystr.copy_out(cursor, 'word',
 366                              columns=['word_token', 'type', 'word', 'info'])
 367
 368         return added
 369
 370
 371     @staticmethod
 372     def _remove_special_phrases(cursor, new_phrases, existing_phrases):
 373         """ Remove all phrases from the databse that are no longer in the
 374             new phrase list.
 375         """
 376         to_delete = existing_phrases - new_phrases
 377
 378         if to_delete:
 379             cursor.execute_values(
 380                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 381                     WHERE type = 'S' and word = name
 382                           and info->>'class' = in_class and info->>'type' = in_type
 383                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
 384                 """, to_delete)
 385
 386         return len(to_delete)
 387
 388
 389     def add_country_names(self, country_code, names):
 390         """ Add names for the given country to the search index.
 391         """
 392         # Make sure any name preprocessing for country names applies.
 393         info = PlaceInfo({'name': names, 'country_code': country_code,
 394                           'rank_address': 4, 'class': 'boundary',
 395                           'type': 'administrative'})
 396         self._add_country_full_names(country_code,
 397                                      self.sanitizer.process_names(info)[0])
 398
 399
 400     def _add_country_full_names(self, country_code, names):
 401         """ Add names for the given country from an already sanitized
 402             name list.
 403         """
 404         word_tokens = set()
 405         for name in names:
 406             norm_name = self._search_normalized(name.name)
 407             if norm_name:
 408                 word_tokens.add(norm_name)
 409
 410         with self.conn.cursor() as cur:
 411             # Get existing names
 412             cur.execute("""SELECT word_token FROM word
 413                             WHERE type = 'C' and word = %s""",
 414                         (country_code, ))
 415             word_tokens.difference_update((t[0] for t in cur))
 416
 417             # Only add those names that are not yet in the list.
 418             if word_tokens:
 419                 cur.execute("""INSERT INTO word (word_token, type, word)
 420                                (SELECT token, 'C', %s
 421                                 FROM unnest(%s) as token)
 422                             """, (country_code, list(word_tokens)))
 423
 424             # No names are deleted at the moment.
 425             # If deletion is made possible, then the static names from the
 426             # initial 'country_name' table should be kept.
 427
 428
 429     def process_place(self, place):
 430         """ Determine tokenizer information about the given place.
 431
 432             Returns a JSON-serializable structure that will be handed into
 433             the database via the token_info field.
 434         """
 435         token_info = _TokenInfo(self._cache)
 436
 437         names, address = self.sanitizer.process_names(place)
 438
 439         if names:
 440             fulls, partials = self._compute_name_tokens(names)
 441
 442             token_info.add_names(fulls, partials)
 443
 444             if place.is_country():
 445                 self._add_country_full_names(place.country_code, names)
 446
 447         if address:
 448             self._process_place_address(token_info, address)
 449
 450         return token_info.data
 451
 452
 453     def _process_place_address(self, token_info, address):
 454         hnrs = []
 455         addr_terms = []
 456         for item in address:
 457             if item.kind == 'postcode':
 458                 self._add_postcode(item.name)
 459             elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
 460                 hnrs.append(item.name)
 461             elif item.kind == 'street':
 462                 token_info.add_street(self._compute_partial_tokens(item.name))
 463             elif item.kind == 'place':
 464                 token_info.add_place(self._compute_partial_tokens(item.name))
 465             elif not item.kind.startswith('_') and \
 466                  item.kind not in ('country', 'full'):
 467                 addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
 468
 469         if hnrs:
 470             hnrs = self._split_housenumbers(hnrs)
 471             token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
 472
 473         if addr_terms:
 474             token_info.add_address_terms(addr_terms)
 475
 476
 477     def _compute_partial_tokens(self, name):
 478         """ Normalize the given term, split it into partial words and return
 479             then token list for them.
 480         """
 481         norm_name = self._search_normalized(name)
 482
 483         tokens = []
 484         need_lookup = []
 485         for partial in norm_name.split():
 486             token = self._cache.partials.get(partial)
 487             if token:
 488                 tokens.append(token)
 489             else:
 490                 need_lookup.append(partial)
 491
 492         if need_lookup:
 493             with self.conn.cursor() as cur:
 494                 cur.execute("""SELECT word, getorcreate_partial_word(word)
 495                                FROM unnest(%s) word""",
 496                             (need_lookup, ))
 497
 498                 for partial, token in cur:
 499                     tokens.append(token)
 500                     self._cache.partials[partial] = token
 501
 502         return tokens
 503
 504
 505     def _compute_name_tokens(self, names):
 506         """ Computes the full name and partial name tokens for the given
 507             dictionary of names.
 508         """
 509         full_tokens = set()
 510         partial_tokens = set()
 511
 512         for name in names:
 513             analyzer_id = name.get_attr('analyzer')
 514             norm_name = self._normalized(name.name)
 515             if analyzer_id is None:
 516                 token_id = norm_name
 517             else:
 518                 token_id = f'{norm_name}@{analyzer_id}'
 519
 520             full, part = self._cache.names.get(token_id, (None, None))
 521             if full is None:
 522                 variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
 523                 if not variants:
 524                     continue
 525
 526                 with self.conn.cursor() as cur:
 527                     cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
 528                                 (token_id, variants))
 529                     full, part = cur.fetchone()
 530
 531                 self._cache.names[token_id] = (full, part)
 532
 533             full_tokens.add(full)
 534             partial_tokens.update(part)
 535
 536         return full_tokens, partial_tokens
 537
 538
 539     def _add_postcode(self, postcode):
 540         """ Make sure the normalized postcode is present in the word table.
 541         """
 542         if re.search(r'[:,;]', postcode) is None:
 543             postcode = self.normalize_postcode(postcode)
 544
 545             if postcode not in self._cache.postcodes:
 546                 term = self._search_normalized(postcode)
 547                 if not term:
 548                     return
 549
 550                 with self.conn.cursor() as cur:
 551                     # no word_id needed for postcodes
 552                     cur.execute("""INSERT INTO word (word_token, type, word)
 553                                    (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
 554                                     WHERE NOT EXISTS
 555                                      (SELECT * FROM word
 556                                       WHERE type = 'P' and word = pc))
 557                                 """, (term, postcode))
 558                 self._cache.postcodes.add(postcode)
 559
 560
 561     @staticmethod
 562     def _split_housenumbers(hnrs):
 563         if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
 564             # split numbers if necessary
 565             simple_list = []
 566             for hnr in hnrs:
 567                 simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
 568
 569             if len(simple_list) > 1:
 570                 hnrs = list(set(simple_list))
 571             else:
 572                 hnrs = simple_list
 573
 574         return hnrs
 575
 576
 577
 578
 579 class _TokenInfo:
 580     """ Collect token information to be sent back to the database.
 581     """
 582     def __init__(self, cache):
 583         self._cache = cache
 584         self.data = {}
 585
 586     @staticmethod
 587     def _mk_array(tokens):
 588         return '{%s}' % ','.join((str(s) for s in tokens))
 589
 590
 591     def add_names(self, fulls, partials):
 592         """ Adds token information for the normalised names.
 593         """
 594         self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
 595
 596
 597     def add_housenumbers(self, conn, hnrs):
 598         """ Extract housenumber information from a list of normalised
 599             housenumbers.
 600         """
 601         self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
 602         self.data['hnr'] = ';'.join(hnrs)
 603
 604
 605     def add_street(self, tokens):
 606         """ Add addr:street match terms.
 607         """
 608         if tokens:
 609             self.data['street'] = self._mk_array(tokens)
 610
 611
 612     def add_place(self, tokens):
 613         """ Add addr:place search and match terms.
 614         """
 615         if tokens:
 616             self.data['place'] = self._mk_array(tokens)
 617
 618
 619     def add_address_terms(self, terms):
 620         """ Add additional address terms.
 621         """
 622         tokens = {key: self._mk_array(partials)
 623                   for key, partials in terms if partials}
 624
 625         if tokens:
 626             self.data['addr'] = tokens
 627
 628
 629 class _TokenCache:
 630     """ Cache for token information to avoid repeated database queries.
 631
 632         This cache is not thread-safe and needs to be instantiated per
 633         analyzer.
 634     """
 635     def __init__(self):
 636         self.names = {}
 637         self.partials = {}
 638         self.postcodes = set()
 639         self.housenumbers = {}
 640
 641
 642     def get_hnr_tokens(self, conn, terms):
 643         """ Get token ids for a list of housenumbers, looking them up in the
 644             database if necessary. `terms` is an iterable of normalized
 645             housenumbers.
 646         """
 647         tokens = []
 648         askdb = []
 649
 650         for term in terms:
 651             token = self.housenumbers.get(term)
 652             if token is None:
 653                 askdb.append(term)
 654             else:
 655                 tokens.append(token)
 656
 657         if askdb:
 658             with conn.cursor() as cur:
 659                 cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
 660                             (askdb, ))
 661                 for term, tid in cur:
 662                     self.housenumbers[term] = tid
 663                     tokens.append(tid)
 664
 665         return tokens