"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import io
import itertools
import logging
import re
from textwrap import dedent
from pathlib import Path

import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules

DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)
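
# Example use of the factory (a sketch; the DSN and directory are placeholders):
#
#   tokenizer = create('dbname=nominatim', Path('project-dir/tokenizer'))
#   tokenizer.init_from_project()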


class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.naming_rules = None
        self.term_normalization = None
        self.max_word_frequency = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'

        loader = ICURuleLoader(cfgfile)
        self.naming_rules = ICUNameProcessorRules(loader=loader)
        self.term_normalization = config.TERM_NORMALIZATION
        self.max_word_frequency = config.MAX_WORD_FREQUENCY

        self._install_php(config.lib_dir.php)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.naming_rules = ICUNameProcessorRules(conn=conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
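
    # Sketch of the two initialisation paths (dsn, config and paths are
    # placeholders): a fresh import reads the ICU rules from the configuration,
    # later runs restore rules and properties from the database.
    #
    #   tok = create(dsn, project_dir / 'tokenizer')
    #   tok.init_new_db(config)        # import time: loads rules from config
    #
    #   tok = create(dsn, project_dir / 'tokenizer')
    #   tok.init_from_project()        # afterwards: loads rules from the DB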


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)


    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.naming_rules is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

        return None


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))


    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
            @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
            @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
            require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
            """.format(self, phpdir)))


    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.naming_rules.save_rules(conn)

            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = Counter()
            name_proc = ICUNameProcessor(self.naming_rules)
            with conn.cursor(name="words") as cur:
                cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")

                for name, cnt in cur:
                    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                        for term in word.split():
                            words[term] += cnt

            # copy them back into the word table
            copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))

            with conn.cursor() as cur:
                cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                               WHERE word_id is null""")

            conn.commit()
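
    # The COPY payload assembled above is plain tab-separated text with one
    # word token and its frequency per line, for example (made-up counts):
    #
    #   hauptstrasse\t421
    #   berlin\t118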


class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.name_processor = name_proc

        self._cache = _TokenCache()


    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.

            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not guaranteed to be efficient.
        """
        tokens = {}
        for word in words:
            if word.startswith('#'):
                tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
            else:
                tokens[word] = self.name_processor.get_search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = t.term
                                 and class is null and country_code is null""",
                        (list(tokens.values()), ))
            ids = {r[0]: r[1] for r in cur}

        return [(k, v, ids.get(v, None)) for k, v in tokens.items()]
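
    # Example call (a sketch; the returned tokens and ids depend on the
    # database content):
    #
    #   analyzer.get_word_token_info(['#Main Street', 'main'])
    #   # -> [('#Main Street', ' main street', 1234), ('main', 'main', 5678)]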


    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
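
    # A quick illustration of the normalisation above:
    #
    #   normalize_postcode(' ab1 2cd ')   # -> 'AB1 2CD'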


    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        copystr = io.StringIO()
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            for postcode, word in cur:
                if postcode is None:
                    to_delete.append(word)
                else:
                    copystr.write(postcode)
                    copystr.write('\t ')
                    copystr.write(self.name_processor.get_search_normalized(postcode))
                    copystr.write('\tplace\tpostcode\t0\n')

            if to_delete:
                cur.execute("""DELETE FROM WORD
                               WHERE class ='place' and type = 'postcode'
                                     and word = any(%s)
                            """, (to_delete, ))

            if copystr.getvalue():
                copystr.seek(0)
                cur.copy_from(copystr, 'word',
                              columns=['word', 'word_token', 'class', 'type',
                                       'search_name_count'])
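
    # Each line written to copystr above is one tab-separated row matching the
    # copy_from() column list: word, word_token, class ('place'),
    # type ('postcode') and search_name_count (0).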


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)
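
    # The phrases argument is an iterable of 4-tuples of
    # (label, class, type, operator). A made-up example:
    #
    #   analyzer.update_special_phrases(
    #       [('restaurant', 'amenity', 'restaurant', '-'),
    #        ('bar in', 'amenity', 'bar', 'in')],
    #       should_replace=True)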


    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        copystr = io.StringIO()
        for word, cls, typ, oper in to_add:
            term = self.name_processor.get_search_normalized(word)
            if term:
                # word, word_token (with leading space), class and type, tab-separated
                copystr.write('\t'.join((word, ' ' + term, cls, typ)))
                copystr.write('\t')
                copystr.write(oper if oper in ('in', 'near') else '\\N')
                copystr.write('\t0\n')
                added += 1

        if copystr.tell() > 0:
            copystr.seek(0)
            cursor.copy_from(copystr, 'word',
                             columns=['word', 'word_token', 'class', 'type',
                                      'operator', 'search_name_count'])

        return added


    def _remove_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            psycopg2.extras.execute_values(
                cursor,
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE word = name and class = in_class and type = in_type
                          and ((op = '-' and operator is null) or op = operator)""",
                to_delete)

        return len(to_delete)


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            if name:
                word_tokens.add(' ' + self.name_processor.get_search_normalized(name))

        with self.conn.cursor() as cur:
            # get existing names
            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            if word_tokens:
                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                                 search_name_count)
                               (SELECT nextval('seq_word'), token, '{}', 0
                                FROM unnest(%s) as token)
                            """.format(country_code), (list(word_tokens),))
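
    # Example (a sketch; the names dictionary is made up): this adds one token
    # per distinct full name that is not yet stored for the country code.
    #
    #   analyzer.add_country_names('de', {'name': 'Deutschland', 'name:en': 'Germany'})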


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), names)

        address = place.get('address')

        if address:
            hnrs = []
            addr_terms = []
            for key, value in address.items():
                if key == 'postcode':
                    self._add_postcode(value)
                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                    hnrs.append(value)
                elif key == 'street':
                    token_info.add_street(*self._compute_name_tokens({'name': value}))
                elif key == 'place':
                    token_info.add_place(*self._compute_name_tokens({'name': value}))
                elif not key.startswith('_') and \
                     key not in ('country', 'full'):
                    addr_terms.append((key, *self._compute_name_tokens({'name': value})))

            if hnrs:
                hnrs = self._split_housenumbers(hnrs)
                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

            if addr_terms:
                token_info.add_address_terms(addr_terms)

        return token_info.data
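
    # Sketch of the returned structure (token ids are invented for illustration):
    #
    #   {'names': '{1,2,3}', 'hnr_tokens': '{4}', 'hnr': '25a',
    #    'street': '{5}', 'addr': {'city': ['{6,7}', '{6}']}}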


    def _compute_name_tokens(self, names):
        """ Compute the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)
                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full names to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
            full_names.add(name.strip())

            brace_idx = name.find('(')
            if brace_idx >= 0:
                full_names.add(name[:brace_idx].strip())

        return full_names
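
    # Example: {'name': 'Main Street;Hwy 1 (old)'} yields
    # {'Main Street', 'Hwy 1 (old)', 'Hwy 1'}.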


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.name_processor.get_search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word, word_token, class, type,
                                                     search_name_count)
                                   (SELECT pc, %s, 'place', 'postcode', 0
                                    FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                          (SELECT * FROM word
                                           WHERE word = pc and class='place' and type='postcode'))
                                """, (' ' + term, postcode))
                self._cache.postcodes.add(postcode)


    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
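
    # Example: _split_housenumbers(['1;2b', '3']) returns ['1', '2b', '3']
    # (duplicates removed, order not guaranteed).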


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}


    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))
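
    # _mk_array renders token ids as a PostgreSQL array literal, e.g.
    # _mk_array([1, 2, 3]) == '{1,2,3}'.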


    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)


    def add_street(self, fulls, _):
        """ Add addr:street match terms.
        """
        if fulls:
            self.data['street'] = self._mk_array(fulls)


    def add_place(self, fulls, partials):
        """ Add addr:place search and match terms.
        """
        if fulls:
            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
            self.data['place_match'] = self._mk_array(fulls)


    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, fulls, partials in terms:
            if fulls:
                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
                               self._mk_array(fulls)]

        self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.postcodes = set()
        self.housenumbers = {}


    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary.
        """
        tokens = []
        askdb = []
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens