"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import json
import logging
import re
from textwrap import dedent
from pathlib import Path

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules

DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.naming_rules = None
        self.term_normalization = None
        self.max_word_frequency = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'

        loader = ICURuleLoader(cfgfile)
        self.naming_rules = ICUNameProcessorRules(loader=loader)
        self.term_normalization = config.TERM_NORMALIZATION
        self.max_word_frequency = config.MAX_WORD_FREQUENCY

        self._install_php(config.lib_dir.php)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.naming_rules = ICUNameProcessorRules(conn=conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)


    def finalize_import(self, _):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)


    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.naming_rules is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

        return None


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))


    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
            require_once('{phpdir}/tokenizer/legacy_icu_tokenizer.php');"""))


    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.naming_rules.save_rules(conn)

            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = self._count_partial_terms(conn)

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for term, cnt in words.items():
                    copystr.add('w', term, json.dumps({'count': cnt}))

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['type', 'word_token', 'info'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null and type = 'w'""")

            conn.commit()


    def _count_partial_terms(self, conn):
        """ Count the partial terms from the names in the place table.
        """
        words = Counter()
        name_proc = ICUNameProcessor(self.naming_rules)

        with conn.cursor(name="words") as cur:
            cur.execute(""" SELECT v, count(*) FROM
                              (SELECT svals(name) as v FROM place) x
                            WHERE length(v) < 75 GROUP BY v""")

            for name, cnt in cur:
                terms = set()
                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                    if ' ' in word:
                        terms.update(word.split())
                for term in terms:
                    words[term] += cnt

        return words
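
    # Illustrative sketch of the counting above: if the place table contains the
    # name 'Rue de la Paix' 42 times, each partial word derived from its variants
    # ('rue', 'de', 'la', 'paix' after normalisation) has 42 added to its count.
    # The exact tokens depend on the configured ICU transliteration rules.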


class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.name_processor = name_proc

        self._cache = _TokenCache()


    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
            else:
                partial_tokens[word] = self.name_processor.get_search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                             FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]


    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
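
    # For example, ' se1 9gf ' normalises to 'SE1 9GF': surrounding whitespace is
    # stripped and letters are upper-cased, nothing else is changed.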


    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                               (SELECT distinct(postcode) as pc FROM location_postcode) p
                               FULL JOIN
                               (SELECT word FROM word WHERE type = 'P') w
                               ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        # postcode has gone from location_postcode: remove the token
                        to_delete.append(word)
                    else:
                        # postcode is missing from the word table: add it
                        copystr.add(self.name_processor.get_search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)


    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self.name_processor.get_search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added


    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            norm_name = self.name_processor.get_search_normalized(name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # get the names that are already in the word table
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), names)

        address = place.get('address')
        if address:
            self._process_place_address(token_info, address)

        return token_info.data
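
    # The returned dictionary is assembled by _TokenInfo below. Depending on the
    # name and address tags of the place it may contain keys such as 'names',
    # 'hnr_tokens', 'hnr', 'street', 'place_search', 'place_match' and 'addr';
    # token id lists are encoded as PostgreSQL array literals ('{1,2,3}'), while
    # 'hnr' holds the housenumbers themselves joined by ';'.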


    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        for key, value in address.items():
            if key == 'postcode':
                self._add_postcode(value)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(*self._compute_name_tokens({'name': value}))
            elif key == 'place':
                token_info.add_place(*self._compute_name_tokens({'name': value}))
            elif not key.startswith('_') and key not in ('country', 'full'):
                addr_terms.append((key, *self._compute_name_tokens({'name': value})))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)


    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens
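
    # getorcreate_full_word() is a SQL function, presumably installed from the
    # tokenizer's SQL file above; judging from the call it returns one full-word
    # token id together with a list of partial-word token ids, which are cached
    # per normalised name in self._cache.names.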


    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full names to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
            if name:
                full_names.add(name)

                brace_idx = name.find('(')
                if brace_idx >= 0:
                    full_names.add(name[:brace_idx].strip())

        return full_names
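
    # For example, {'name': 'Brixton Hill (Lambeth)', 'alt_name': 'Brixton'} yields
    # {'Brixton Hill (Lambeth)', 'Brixton Hill', 'Brixton'}: values are split on
    # ';' and ',', and for names with a bracketed suffix the prefix is added too.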


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.name_processor.get_search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                      (SELECT * FROM word
                                       WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)


    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
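
    # For example, ['1;2b', '3'] becomes ['1', '2b', '3']; because duplicates are
    # removed via a set, the order of the returned housenumbers is not guaranteed.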


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}

    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))
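
    # _mk_array([23, 42]) returns '{23,42}', i.e. the textual form of a PostgreSQL
    # array literal, which is how token ids are handed over to the database.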


    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)


    def add_street(self, fulls, _):
        """ Add addr:street match terms.
        """
        if fulls:
            self.data['street'] = self._mk_array(fulls)


    def add_place(self, fulls, partials):
        """ Add addr:place search and match terms.
        """
        if fulls:
            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
            self.data['place_match'] = self._mk_array(fulls)


    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, fulls, partials in terms:
            if fulls:
                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
                               self._mk_array(fulls)]

        if tokens:
            self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.postcodes = set()
        self.housenumbers = {}


    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []

        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens
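

# A minimal usage sketch of this module; the DSN and project path are
# placeholders and assume an already imported Nominatim database whose
# tokenizer configuration has been saved with _save_config():
#
#     tokenizer = create('dbname=nominatim', Path('/path/to/project'))
#     tokenizer.init_from_project()
#     with tokenizer.name_analyzer() as analyzer:
#         info = analyzer.process_place({'name': {'name': 'Main Street'}})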