"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import json
import logging
import re
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)

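# A minimal usage sketch, not executed as part of the module. 'dsn', 'config'
# and 'place' are assumed to stand for a libpq connection string, a loaded
# Nominatim configuration and a PlaceInfo object respectively:
#
#   tokenizer = create(dsn, config.project_dir / 'tokenizer')
#   tokenizer.init_from_project(config)
#   with tokenizer.name_analyzer() as analyzer:
#       token_info = analyzer.process_place(place)
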
class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None
        self.term_normalization = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self.term_normalization = config.TERM_NORMALIZATION

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)


    def finalize_import(self, _):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project(config)

        if self.term_normalization is None:
            return "Configuration for tokenizer 'icu' is missing."

        return None


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())


    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))


    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = self._count_partial_terms(conn)

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for term, cnt in words.items():
                    copystr.add('w', term, json.dumps({'count': cnt}))

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['type', 'word_token', 'info'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null and type = 'w'""")

            conn.commit()


    def _count_partial_terms(self, conn):
        """ Count the partial terms from the names in the place table.
        """
        words = Counter()
        analysis = self.loader.make_token_analysis()

        with conn.cursor(name="words") as cur:
            cur.execute(""" SELECT v, count(*) FROM
                              (SELECT svals(name) as v FROM place)x
                            WHERE length(v) < 75 GROUP BY v""")

            for name, cnt in cur:
                word = analysis.search.transliterate(name)
                if word and ' ' in word:
                    for term in set(word.split()):
                        words[term] += cnt

        return words


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()


    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                           FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]

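    # Illustrative call and result shape (the word ids below are invented):
    #
    #   analyzer.get_word_token_info(['#Main Street', 'main'])
    #   -> [('#Main Street', 'main street', 123), ('main', 'main', 456)]
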
    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()

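    # Example: normalize_postcode(' ab1 2cd ') -> 'AB1 2CD'; only surrounding
    # whitespace and letter case are adjusted here.
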
    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self._search_normalized(hnr)


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

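    # Each phrase is handled as a 4-tuple of (label, class, type, operator),
    # e.g. a hypothetical ('restaurant', 'amenity', 'restaurant', '-'); what to
    # add and delete is computed as a set difference against the phrases that
    # are already stored in the word table.
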
    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added


    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0])


    def _add_country_full_names(self, country_code, names):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get the names that are already in the word table.
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names, address = self.sanitizer.process_names(place)

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.data

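    # The returned structure ends up in the token_info column. An illustrative
    # value (all token ids invented):
    #
    #   {'names': '{1,2,3}', 'hnr_tokens': '{4}', 'hnr': '12',
    #    'street': '{5,6}', 'addr': {'city': '{7}'}}
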
    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        for item in address:
            if item.kind == 'postcode':
                self._add_postcode(item.name)
            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(item.name)
            elif item.kind == 'street':
                token_info.add_street(self._compute_partial_tokens(item.name))
            elif item.kind == 'place':
                token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and \
                 item.kind not in ('country', 'full'):
                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)


    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens


    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            list of names.
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            norm_name = self._normalized(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

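    # Cache keys are either the normalized name itself or, when a name carries
    # an analyzer attribute, a combined key such as a hypothetical
    # 'hauptstrasse@de', so results from different analysis modules do not collide.
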
    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self._search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)


    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs

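    # Example: _split_housenumbers(['4;6', '8']) yields the housenumbers
    # ['4', '6', '8'] (in arbitrary order, since duplicates are removed via a set).
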
class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}

    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))

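    # _mk_array([1, 2, 3]) returns '{1,2,3}', the literal form expected for
    # PostgreSQL array columns.
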
    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)


    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        if tokens:
            self.data['street'] = self._mk_array(tokens)


    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        if tokens:
            self.data['place'] = self._mk_array(tokens)


    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {key: self._mk_array(partials)
                  for key, partials in terms if partials}

        if tokens:
            self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.partials = {}
        self.postcodes = set()
        self.housenumbers = {}


    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []

        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens