Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
from collections import Counter
import itertools
import logging
import re
from textwrap import dedent
from pathlib import Path

import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)
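
# A minimal usage sketch (not part of the original module; the place dict and
# connection settings are assumed): the factory above is the entry point used
# by the Nominatim framework, analyzers are then obtained from the tokenizer.
#
#   tokenizer = create(dsn, data_dir)
#   tokenizer.init_from_project()
#   with tokenizer.name_analyzer() as analyzer:
#       token_info = analyzer.process_place(place)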

class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.naming_rules = None
        self.term_normalization = None
        self.max_word_frequency = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'

        loader = ICURuleLoader(cfgfile)
        self.naming_rules = ICUNameProcessorRules(loader=loader)
        self.term_normalization = config.TERM_NORMALIZATION
        self.max_word_frequency = config.MAX_WORD_FREQUENCY

        self._install_php(config.lib_dir.php)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.naming_rules = ICUNameProcessorRules(conn=conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)

    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)

    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.naming_rules is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

        return None

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

              with tokenizer.name_analyzer() as analyzer:
                  ...

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))

    # pylint: disable=missing-format-attribute
    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
            @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
            @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
            require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
            """.format(self, phpdir)))

    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.naming_rules.save_rules(conn)

            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = Counter()
            name_proc = ICUNameProcessor(self.naming_rules)
            with conn.cursor(name="words") as cur:
                cur.execute(""" SELECT v, count(*) FROM
                                  (SELECT svals(name) as v FROM place)x
                                WHERE length(v) < 75 GROUP BY v""")

                for name, cnt in cur:
                    terms = set()
                    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                        if ' ' in word:
                            terms.update(word.split())
                    for term in terms:
                        words[term] += cnt

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for args in words.items():
                    copystr.add(*args)

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['word_token', 'search_name_count'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null""")

            conn.commit()
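
    # Rough illustration of the precomputation above (assuming the ICU rules
    # leave these names essentially unchanged): if the place table contains
    # 'Main Street' twice and 'Main Road' once, the rows copied into the word
    # table would be approximately
    #
    #   word_token | search_name_count
    #   -----------+------------------
    #   main       | 3
    #   street     | 2
    #   road       | 1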

class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.name_processor = name_proc

        self._cache = _TokenCache()

    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name
            otherwise a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        tokens = {}
        for word in words:
            if word.startswith('#'):
                tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
            else:
                tokens[word] = self.name_processor.get_search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = t.term
                                 and class is null and country_code is null""",
                        (list(tokens.values()), ))
            ids = {r[0]: r[1] for r in cur}

        return [(k, v, ids.get(v, None)) for k, v in tokens.items()]
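
    # Example of the shape returned by get_word_token_info() (the tokens and
    # ids below are illustrative; actual values depend on the configured ICU
    # rules and the database content):
    #
    #   analyzer.get_word_token_info(['#Main Street', 'main'])
    #   -> [('#Main Street', ' main street', 1234), ('main', 'main', 5678)]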

    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
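
    # Worked example of the rule above: surrounding whitespace is removed and
    # letters are upper-cased, so normalize_postcode(' ab1 2cd ') yields
    # 'AB1 2CD'; internal spacing is left untouched.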

    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(
                            postcode,
                            ' ' + self.name_processor.get_search_normalized(postcode),
                            'place', 'postcode', 0)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE class ='place' and type = 'postcode'
                                         and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word', 'word_token', 'class', 'type',
                                          'search_name_count'])

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self.name_processor.get_search_normalized(word)
                if term:
                    copystr.add(word, ' ' + term, cls, typ,
                                oper if oper in ('in', 'near') else None, 0)
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word', 'word_token', 'class', 'type',
                                      'operator', 'search_name_count'])

        return added

    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase set.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            psycopg2.extras.execute_values(
                cursor,
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE word = name and class = in_class and type = in_type
                          and ((op = '-' and operator is null) or op = operator)""",
                to_delete)

        return len(to_delete)

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            if name:
                word_tokens.add(' ' + self.name_processor.get_search_normalized(name))

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            if word_tokens:
                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                                 search_name_count)
                               (SELECT nextval('seq_word'), token, '{}', 0
                                FROM unnest(%s) as token)
                            """.format(country_code), (list(word_tokens),))

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), names)

        address = place.get('address')

        if address:
            self._process_place_address(token_info, address)

        return token_info.data
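
    # Illustrative sketch of the data flow through process_place(). The token
    # ids are invented and the keys come from _TokenInfo below; the postcode
    # is written to the word table as a side effect rather than returned:
    #
    #   place = {'name': {'name': 'Main Street'},
    #            'address': {'housenumber': '3', 'postcode': '12345'}}
    #   analyzer.process_place(place)
    #   -> {'names': '{1,2,3}', 'hnr_tokens': '{42}', 'hnr': '3'}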

    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        for key, value in address.items():
            if key == 'postcode':
                self._add_postcode(value)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(*self._compute_name_tokens({'name': value}))
            elif key == 'place':
                token_info.add_place(*self._compute_name_tokens({'name': value}))
            elif not key.startswith('_') and \
                 key not in ('country', 'full'):
                addr_terms.append((key, *self._compute_name_tokens({'name': value})))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)

    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full names to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
            if name:
                full_names.add(name)

                brace_idx = name.find('(')
                if brace_idx >= 0:
                    full_names.add(name[:brace_idx].strip())

        return full_names
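
    # For example, given the splitting on ';' and ',' and the bracket handling
    # above, {'name': 'Main Street;Broadway (old)'} yields the set
    # {'Main Street', 'Broadway (old)', 'Broadway'}.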

    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.name_processor.get_search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word, word_token, class, type,
                                                     search_name_count)
                                   (SELECT pc, %s, 'place', 'postcode', 0
                                    FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE word = pc and class='place' and type='postcode'))
                                """, (' ' + term, postcode))
                self._cache.postcodes.add(postcode)

    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
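
    # For example, ['3;5a', '7'] is flattened to ['3', '5a', '7'] (the set
    # used for de-duplication means the order of the result is not defined).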
533 """ Collect token information to be sent back to the database.
535 def __init__(self, cache):
540 def _mk_array(tokens):
541 return '{%s}' % ','.join((str(s) for s in tokens))
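
    # _mk_array() renders token ids as a PostgreSQL array literal, e.g.
    # _mk_array([1, 2, 3]) returns '{1,2,3}'.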

    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, fulls, _):
        """ Add addr:street match terms.
        """
        if fulls:
            self.data['street'] = self._mk_array(fulls)

    def add_place(self, fulls, partials):
        """ Add addr:place search and match terms.
        """
        if fulls:
            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
            self.data['place_match'] = self._mk_array(fulls)

    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, fulls, partials in terms:
            if fulls:
                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
                               self._mk_array(fulls)]

        if tokens:
            self.data['addr'] = tokens
588 """ Cache for token information to avoid repeated database queries.
590 This cache is not thread-safe and needs to be instantiated per
595 self.postcodes = set()
596 self.housenumbers = {}

    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary.
        """
        tokens = []
        askdb = []

        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens