""" Tokenizer implementing normalisation as used before Nominatim 4 but using
    libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import json
import logging
import re
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()
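
# Token types used in the word table throughout this module:
#   'W' - full names, 'w' - partial name words, 'P' - postcodes,
#   'S' - special phrases, 'C' - country names.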


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None
        self.term_normalization = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)
        self.term_normalization = config.TERM_NORMALIZATION

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)
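
    # Sketch of the intended call sequence (variable names are illustrative):
    #
    #   tokenizer = create(dsn, data_dir)
    #   tokenizer.init_new_db(config)         # fresh import: save config, set up tables
    #   ...
    #   tokenizer = create(dsn, data_dir)
    #   tokenizer.init_from_project(config)   # later runs: reload the saved config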

    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)

    def finalize_import(self, _):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project(config)

        if self.term_normalization is None:
            return "Configuration for tokenizer 'icu' is missing."

        return None

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis())

    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
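
    # The constants written above are read by the PHP side of Nominatim; the
    # normalization and transliteration rules handed over here are meant to
    # match the rules saved to the database so that PHP and Python tokenize
    # queries the same way.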

    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = self._count_partial_terms(conn)

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for term, cnt in words.items():
                    copystr.add('w', term, json.dumps({'count': cnt}))

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['type', 'word_token', 'info'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null and type = 'w'""")

            conn.commit()
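
    # Note on _init_db_tables(): the COPY above fills only type/word_token/info,
    # so word_id is assigned afterwards from the seq_word sequence for the
    # freshly inserted partial ('w') rows.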

    def _count_partial_terms(self, conn):
        """ Count the partial terms from the names in the place table.
        """
        words = Counter()
        name_proc = self.loader.make_token_analysis()

        with conn.cursor(name="words") as cur:
            cur.execute(""" SELECT v, count(*) FROM
                              (SELECT svals(name) as v FROM place)x
                            WHERE length(v) < 75 GROUP BY v""")

            for name, cnt in cur:
                terms = set()
                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                    if ' ' in word:
                        terms.update(word.split())
                for term in terms:
                    words[term] += cnt

        return words
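
    # Only name variants that contain a space contribute partial words above;
    # single-word names are already complete tokens and are (presumably for
    # that reason) not counted here.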


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """
    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.name_processor = name_proc
        self._cache = _TokenCache()

    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
            else:
                partial_tokens[word] = self.name_processor.get_search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                           FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
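
    # Usage sketch (the exact word tokens depend on the configured ICU rules):
    #
    #   analyzer.get_word_token_info(['#Main Street', 'main'])
    #   -> [('#Main Street', <token>, <word_id or None>),
    #       ('main', <token>, <word_id or None>)]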

    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()

    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self.name_processor.get_search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])
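
    # The diff query above returns postcodes that are missing from the word
    # table (word is null) and word entries with no matching postcode
    # (pc is null); the former are copied in, the latter deleted.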

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases
            will be completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self.name_processor.get_search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added

    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase set.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)
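
    # The '-' placeholder mirrors how existing phrases are read in
    # update_special_phrases(): a missing operator is stored as '-' so the
    # tuples compare equal, and the DELETE above maps '-' back to a NULL op.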

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            norm_name = self.name_processor.get_search_normalized(name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names.
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.name
        if names:
            fulls, partials = self._compute_name_tokens(names)
            token_info.add_names(fulls, partials)

            if place.is_country():
                self.add_country_names(place.country_code, names)

        address = place.address
        if address:
            self._process_place_address(token_info, address)

        return token_info.data
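
    # The returned dict is what goes into the token_info field. Depending on
    # the place it may contain the keys set by _TokenInfo below: 'names',
    # 'hnr_tokens', 'hnr', 'street', 'place' and 'addr'.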

    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        for key, value in address.items():
            if key == 'postcode':
                self._add_postcode(value)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(self._compute_partial_tokens(value))
            elif key == 'place':
                token_info.add_place(self._compute_partial_tokens(value))
            elif not key.startswith('_') and \
                 key not in ('country', 'full'):
                addr_terms.append((key, self._compute_partial_tokens(value)))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)

    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self.name_processor.get_search_normalized(name)
        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))
                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens
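
    # Partial-word tokens are cached on the analyzer; only words not seen
    # before go through the getorcreate_partial_word() SQL function, which is
    # expected to create missing word table entries on the fly.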

    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens
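
    # getorcreate_full_word() is expected to return a pair of (full name token,
    # array of partial tokens) for the normalised name and its transliterated
    # variants; the pair is cached so repeated names hit the database only once.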

    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full names to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
            if name:
                full_names.add(name)

                brace_idx = name.find('(')
                if brace_idx >= 0:
                    full_names.add(name[:brace_idx].strip())

        return full_names

    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.name_processor.get_search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)
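
    # Postcodes containing ':', ',' or ';' are skipped above; such values are
    # most likely lists or otherwise malformed and do not get a word table
    # entry.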

    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}

    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))

    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        if tokens:
            self.data['street'] = self._mk_array(tokens)

    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        if tokens:
            self.data['place'] = self._mk_array(tokens)

    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {key: self._mk_array(partials)
                  for key, partials in terms if partials}
        if tokens:
            self.data['addr'] = tokens
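
# Note: _TokenInfo._mk_array() produces a PostgreSQL array literal, e.g. the
# tokens 1, 2 and 3 become the string '{1,2,3}'.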


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.partials = {}
        self.postcodes = set()
        self.housenumbers = {}

    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens