"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import json
import logging
import re
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()
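
# Token types used in the 'word' table by this tokenizer:
#   'W' - full name, 'w' - partial word, 'P' - postcode,
#   'S' - special phrase, 'C' - country name.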

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None
        self.term_normalization = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self.term_normalization = config.TERM_NORMALIZATION

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)

    def finalize_import(self, _):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project(config)

        if self.term_normalization is None:
            return "Configuration for tokenizer 'icu' is missing."

        return None

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.process_place(...)
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())

    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))

    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = self._count_partial_terms(conn)

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for term, cnt in words.items():
                    copystr.add('w', term, json.dumps({'count': cnt}))

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['type', 'word_token', 'info'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null and type = 'w'""")

            conn.commit()

    def _count_partial_terms(self, conn):
        """ Count the partial terms from the names in the place table.
        """
        words = Counter()
        name_proc = self.loader.make_token_analysis()

        with conn.cursor(name="words") as cur:
            cur.execute(""" SELECT v, count(*) FROM
                              (SELECT svals(name) as v FROM place)x
                            WHERE length(v) < 75 GROUP BY v""")

            for name, cnt in cur:
                terms = set()
                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                    if ' ' in word:
                        terms.update(word.split())
                for term in terms:
                    words[term] += cnt

        return words


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()
202 """ Free all resources used by the analyzer.

    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.get_search_normalized(name)

    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.get_normalized(name)

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
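        # Sort the words into full names (prefixed with '#') and partial
        # words and compute their search-normalized tokens.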
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]

    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
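        # Trim surrounding whitespace and upper-case,
        # e.g. ' ab1 2cd ' becomes 'AB1 2CD'.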
        return postcode.strip().upper()

    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self._search_normalized(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")
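
            # A row with pc null refers to a word entry whose postcode has
            # disappeared from location_postcode; a row with word null is a
            # postcode that has no word entry yet.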

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases will be
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
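            # A phrase without an explicit operator is recorded as '-' so it
            # can be compared against the normalised input phrases.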
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added

    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase set.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0])

    def _add_country_full_names(self, country_code, names):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names.
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)
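
        # The sanitizer splits the place's tags into name and address items
        # before any tokens are computed.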
        names, address = self.sanitizer.process_names(place)

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.data

    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        for item in address:
            if item.kind == 'postcode':
                self._add_postcode(item.name)
            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(item.name)
            elif item.kind == 'street':
                token_info.add_street(self._compute_partial_tokens(item.name))
            elif item.kind == 'place':
                token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and \
                 item.kind not in ('country', 'full'):
                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)

    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
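        # The term is transliterated and split into words; each word is
        # looked up in the local cache first and only unknown words are
        # created in the database.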
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens

    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            list of names.
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            norm_name = self._normalized(name.name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
                variants = self.token_analysis.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self._search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)

    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
556 """ Collect token information to be sent back to the database.
558 def __init__(self, cache):

    @staticmethod
    def _mk_array(tokens):
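        # Render the tokens as a PostgreSQL array literal, e.g. '{1,2,3}'.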
        return '{%s}' % ','.join((str(s) for s in tokens))

    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        if tokens:
            self.data['street'] = self._mk_array(tokens)

    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        if tokens:
            self.data['place'] = self._mk_array(tokens)

    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {key: self._mk_array(partials)
                  for key, partials in terms if partials}

        if tokens:
            self.data['addr'] = tokens
606 """ Cache for token information to avoid repeated database queries.
608 This cache is not thread-safe and needs to be instantiated per
614 self.postcodes = set()
615 self.housenumbers = {}

    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
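        # Serve known housenumbers from the cache; only unknown ones are
        # looked up (and created) in the database.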
        tokens = []
        askdb = []
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens