"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import logging
import re
from textwrap import dedent
from pathlib import Path

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()
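
# Word table 'type' codes used throughout this module (a summary of the queries
# below, not an exhaustive schema reference):
#   'w' - partial word        'W' - full word
#   'P' - postcode            'S' - special phrase
#   'C' - country name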

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.naming_rules = None
        self.term_normalization = None
        self.max_word_frequency = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'

        loader = ICURuleLoader(cfgfile)
        self.naming_rules = ICUNameProcessorRules(loader=loader)
        self.term_normalization = config.TERM_NORMALIZATION
        self.max_word_frequency = config.MAX_WORD_FREQUENCY

        self._install_php(config.lib_dir.php)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.naming_rules = ICUNameProcessorRules(conn=conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)

    def finalize_import(self, _):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        pass

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)

    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.naming_rules is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

        return None

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
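
    # Illustrative sketch of how the tokenizer is meant to be driven (assuming
    # `dsn`, `data_dir` and a `place` dict are supplied by the caller; this is
    # not part of the API itself):
    #
    #   tokenizer = create(dsn, data_dir)
    #   tokenizer.init_from_project()
    #   with tokenizer.name_analyzer() as analyzer:
    #       token_info = analyzer.process_place(place)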

    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
            require_once('{phpdir}/tokenizer/legacy_icu_tokenizer.php');"""))

    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.naming_rules.save_rules(conn)

            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = Counter()
            name_proc = ICUNameProcessor(self.naming_rules)
            with conn.cursor(name="words") as cur:
                cur.execute(""" SELECT v, count(*) FROM
                                  (SELECT svals(name) as v FROM place)x
                                WHERE length(v) < 75 GROUP BY v""")

                for name, cnt in cur:
                    terms = set()
                    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                        if ' ' in word:
                            terms.update(word.split())
                    for term in terms:
                        words[term] += cnt

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for k, v in words.items():
                    copystr.add('w', k, {'count': v})

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['type', 'word_token', 'info'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null and type = 'w'""")

            conn.commit()


class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.name_processor = name_proc

        self._cache = _TokenCache()

    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
            else:
                partial_tokens[word] = self.name_processor.get_search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""(SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W')
                           UNION
                           (SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w')""",
                        (list(full_tokens.values()),
                         list(partial_tokens.values())))
            ids = {r[0]: r[1] for r in cur}

        return [(k, v, ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, ids.get(v, None)) for k, v in partial_tokens.items()]
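
    # Example for get_word_token_info() (illustrative values only): querying
    # ['#Main Street', 'main'] may yield [('#Main Street', 'main street', 123),
    # ('main', 'main', 456)]; the id is None when the token is not in the
    # word table.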

    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
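
    # e.g. normalize_postcode(' ab1 2cd ') returns 'AB1 2CD'.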

    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT info->>'postcode' as word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self.name_processor.get_search_normalized(postcode),
                                    'P', {'postcode': postcode})

                if to_delete:
                    cur.execute("""DELETE FROM word
                                   WHERE type = 'P' and info->>'postcode' = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'info'])
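
    # In other words: postcodes present in location_postcode but missing from
    # the word table are added, and 'P' entries without a matching postcode are
    # deleted.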

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases will
            be completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT info FROM word WHERE type = 'S'")
            for (info, ) in cur:
                existing_phrases.add((info['word'], info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)
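
    # Phrases are expected as 4-tuples of (label, class, type, operator), e.g.
    # ('Restaurant', 'amenity', 'restaurant', '-'). Only the operators 'in' and
    # 'near' are stored; anything else is recorded as a null operator.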

    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self.name_processor.get_search_normalized(word)
                if term:
                    copystr.add(term, 'S',
                                {'word': word, 'class': cls, 'type': typ,
                                 'op': oper if oper in ('in', 'near') else None})
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'info'])

        return added

    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE info->>'word' = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            norm_name = self.name_processor.get_search_normalized(name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and info->>'cc'= %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, info)
                               (SELECT token, 'C', json_build_object('cc', %s)
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), names)

        address = place.get('address')
        if address:
            self._process_place_address(token_info, address)

        return token_info.data
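
    # The dict returned by process_place() is assembled by _TokenInfo below and
    # may contain the keys 'names', 'hnr_tokens', 'hnr', 'street',
    # 'place_search', 'place_match' and 'addr', depending on the place.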

    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        for key, value in address.items():
            if key == 'postcode':
                self._add_postcode(value)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(*self._compute_name_tokens({'name': value}))
            elif key == 'place':
                token_info.add_place(*self._compute_name_tokens({'name': value}))
            elif not key.startswith('_') and key not in ('country', 'full'):
                addr_terms.append((key, *self._compute_name_tokens({'name': value})))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)

    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens
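
    # getorcreate_full_word() is expected to return a pair of (full word id,
    # list of partial word ids); both are cached per normalised name so that
    # repeated names do not require another database round trip.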

    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full names to be tokenized for the
            given dictionary of names.
        """
        full_names = set()
        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
            if name:
                full_names.add(name)

                brace_idx = name.find('(')
                if brace_idx >= 0:
                    full_names.add(name[:brace_idx].strip())

        return full_names

    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.name_processor.get_search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, info)
                                   (SELECT %s, 'P', json_build_object('postcode', pc)
                                    FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                      (SELECT * FROM word
                                       WHERE type = 'P' and info->>'postcode' = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)

    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
534 """ Collect token information to be sent back to the database.
536 def __init__(self, cache):
541 def _mk_array(tokens):
542 return '{%s}' % ','.join((str(s) for s in tokens))

    def add_names(self, fulls, partials):
        """ Add token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, fulls, _):
        """ Add addr:street match terms.
        """
        if fulls:
            self.data['street'] = self._mk_array(fulls)

    def add_place(self, fulls, partials):
        """ Add addr:place search and match terms.
        """
        if fulls:
            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
            self.data['place_match'] = self._mk_array(fulls)

    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, fulls, partials in terms:
            if fulls:
                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
                               self._mk_array(fulls)]

        if tokens:
            self.data['addr'] = tokens
589 """ Cache for token information to avoid repeated database queries.
591 This cache is not thread-safe and needs to be instantiated per
596 self.postcodes = set()
597 self.housenumbers = {}

    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []

        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens
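
    # getorcreate_hnr_id() is assumed to return one token id per housenumber;
    # housenumbers seen for the first time are added to the cache above.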