2 Tokenizer implementing normalisation as used before Nominatim 4 but using
3 libICU instead of the PostgreSQL module.
5 from collections import Counter
9 from textwrap import dedent
10 from pathlib import Path
12 import psycopg2.extras
14 from nominatim.db.connection import connect
15 from nominatim.db.properties import set_property, get_property
16 from nominatim.db.utils import CopyBuffer
17 from nominatim.db.sql_preprocessor import SQLPreprocessor
18 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
19 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
# Keys under which tokenizer settings are persisted in the database
# property table (written by _save_config, read by init_from_project).
21 DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
22 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
# Module logger; logging.getLogger() with no name returns the root logger.
24 LOG = logging.getLogger()
# Plugin entry point: returns the tokenizer instance for the given database
# DSN and project data directory.
# NOTE(review): this is an elided listing — the docstring's closing quotes
# (original line 28) are not visible here.
26 def create(dsn, data_dir):
27 """ Create a new instance of the tokenizer provided by this module.
29 return LegacyICUTokenizer(dsn, data_dir)
# NOTE(review): elided, line-numbered listing — gaps in the embedded
# numbering mark missing original lines (docstring closers, some statements).
# Comments below state only what the visible lines demonstrate.
32 class LegacyICUTokenizer:
33 """ This tokenizer uses libICU to convert names and queries to ASCII.
34 Otherwise it uses the same algorithms and data structures as the
35 normalization routines in Nominatim 3.
# Constructor stores connection info; the rule/config attributes stay None
# until init_new_db() or init_from_project() fills them.
# (Line 39 — presumably `self.dsn = dsn`, since self.dsn is used below —
# is elided; TODO confirm against the full file.)
38 def __init__(self, dsn, data_dir):
40 self.data_dir = data_dir
41 self.naming_rules = None
42 self.term_normalization = None
43 self.max_word_frequency = None
# Set up a brand-new tokenizer: load ICU rules from the configured YAML file
# (or the shipped default), record normalization settings, install the PHP
# stub and save the config; the init_db branch (guard elided between lines
# 63 and 66) also reinstalls SQL functions and creates the word table.
46 def init_new_db(self, config, init_db=True):
47 """ Set up a new tokenizer for the database.
49 This copies all necessary data in the project directory to make
50 sure the tokenizer remains stable even over updates.
52 if config.TOKENIZER_CONFIG:
53 cfgfile = Path(config.TOKENIZER_CONFIG)
55 cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
57 loader = ICURuleLoader(cfgfile)
58 self.naming_rules = ICUNameProcessorRules(loader=loader)
59 self.term_normalization = config.TERM_NORMALIZATION
60 self.max_word_frequency = config.MAX_WORD_FREQUENCY
62 self._install_php(config.lib_dir.php)
63 self._save_config(config)
66 self.update_sql_functions(config)
67 self._init_db_tables(config)
# Restore a previously initialised tokenizer from the properties stored in
# the database (rules, term normalization, max word frequency).
70 def init_from_project(self):
71 """ Initialise the tokenizer from the project directory.
73 with connect(self.dsn) as conn:
74 self.naming_rules = ICUNameProcessorRules(conn=conn)
75 self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
76 self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
# Post-import step: create the tokenizer's database indices.
79 def finalize_import(self, config):
80 """ Do any required postprocessing to make the tokenizer data ready
83 with connect(self.dsn) as conn:
84 sqlp = SQLPreprocessor(conn, config)
85 sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
# Reinstall the tokenizer's SQL functions, templated with the max word
# frequency read back from the database properties.
88 def update_sql_functions(self, config):
89 """ Reimport the SQL functions for this tokenizer.
91 with connect(self.dsn) as conn:
92 max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
93 sqlp = SQLPreprocessor(conn, config)
94 sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
95 max_word_freq=max_word_freq)
# Sanity check: reload the project config and return an error string when
# the naming rules are missing; the success path (lines 105 ff.) is elided —
# presumably returns None, TODO confirm.
98 def check_database(self):
99 """ Check that the tokenizer is set up correctly.
101 self.init_from_project()
103 if self.naming_rules is None:
104 return "Configuration for tokenizer 'legacy_icu' are missing."
# Factory for analyzers; each analyzer gets its own ICUNameProcessor and
# (per the analyzer's __init__) its own database connection.
109 def name_analyzer(self):
110 """ Create a new analyzer for tokenizing names and queries
111 using this tokenizer. Analyzers are context managers and should
115 with tokenizer.name_analyzer() as analyzer:
119 When used outside the with construct, the caller must ensure to
120 call the close() function before destructing the analyzer.
122 Analyzers are not thread-safe. You need to instantiate one per thread.
124 return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
# Write the PHP configuration stub consumed by the website frontend; the
# format() placeholders pull attributes off self, hence the pylint override.
126 # pylint: disable=missing-format-attribute
127 def _install_php(self, phpdir):
128 """ Install the php script for the tokenizer.
130 php_file = self.data_dir / "tokenizer.php"
131 php_file.write_text(dedent("""\
133 @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
134 @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
135 @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
136 require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
137 """.format(self, phpdir)))
# Persist rules and scalar settings as database properties so later runs can
# restore them via init_from_project().
140 def _save_config(self, config):
141 """ Save the configuration that needs to remain stable for the given
142 database as database properties.
144 with connect(self.dsn) as conn:
145 self.naming_rules.save_rules(conn)
147 set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
148 set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
# Create the word table and seed it with partial-word tokens precomputed
# from the `place` table.  Uses a server-side ("named") cursor so the full
# scan of place names is streamed rather than held in memory.
# NOTE(review): the initialisations of `terms` and `words` (and the loop
# filling `words` from `terms`) are elided from this listing — confirm
# against the full file.
151 def _init_db_tables(self, config):
152 """ Set up the word table and fill it with pre-computed word
155 with connect(self.dsn) as conn:
156 sqlp = SQLPreprocessor(conn, config)
157 sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
160 LOG.warning("Precomputing word tokens")
162 # get partial words and their frequencies
164 name_proc = ICUNameProcessor(self.naming_rules)
165 with conn.cursor(name="words") as cur:
166 cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
168 for name, cnt in cur:
170 for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
172 terms.update(word.split())
176 # copy them back into the word table
177 with CopyBuffer() as copystr:
178 for args in words.items():
181 with conn.cursor() as cur:
182 copystr.copy_out(cur, 'word',
183 columns=['word_token', 'search_name_count'])
184 cur.execute("""UPDATE word SET word_id = nextval('seq_word')
185 WHERE word_id is null""")
# NOTE(review): elided, line-numbered listing.  Several methods below take no
# `self` (normalize_postcode, _remove_special_phrases, _compute_full_names,
# _split_housenumbers) — their @staticmethod decorators are presumably on the
# elided lines directly above each `def`; confirm against the full file.
190 class LegacyICUNameAnalyzer:
191 """ The legacy analyzer uses the ICU library for splitting names.
193 Each instance opens a connection to the database to request the
# Opens a dedicated autocommit connection and a per-instance token cache.
197 def __init__(self, dsn, name_proc):
198 self.conn = connect(dsn).connection
199 self.conn.autocommit = True
200 self.name_processor = name_proc
202 self._cache = _TokenCache()
# Context-manager exit; __enter__ and the __exit__ body are elided.  The
# docstring on line 214 belongs to the close() method whose `def` line is
# also elided here.
209 def __exit__(self, exc_type, exc_value, traceback):
214 """ Free all resources used by the analyzer.
# Debug helper: map each query word to its search-normalized token and the
# matching word_id from the word table (None when the token is unknown).
# A leading '#' marks a full name and yields a space-prefixed token.
# NOTE(review): initialisation of `tokens` and the loop header over `words`
# are elided.
221 def get_word_token_info(self, words):
222 """ Return token information for the given list of words.
223 If a word starts with # it is assumed to be a full name
224 otherwise it is a partial name.
226 The function returns a list of tuples with
227 (original word, word token, word id).
229 The function is used for testing and debugging only
230 and not necessarily efficient.
234 if word.startswith('#'):
235 tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
237 tokens[word] = self.name_processor.get_search_normalized(word)
239 with self.conn.cursor() as cur:
240 cur.execute("""SELECT word_token, word_id
241 FROM word, (SELECT unnest(%s::TEXT[]) as term) t
242 WHERE word_token = t.term
243 and class is null and country_code is null""",
244 (list(tokens.values()), ))
245 ids = {r[0]: r[1] for r in cur}
247 return [(k, v, ids.get(v, None)) for k, v in tokens.items()]
# Canonical postcode form: stripped and upper-cased.  Must stay in sync with
# the SQL function named in the docstring.
251 def normalize_postcode(postcode):
252 """ Convert the postcode to a standardized form.
254 This function must yield exactly the same result as the SQL function
255 'token_normalized_postcode()'.
257 return postcode.strip().upper()
# Housenumber normalisation is just the search normalization of the term.
260 def _make_standard_hnr(self, hnr):
261 """ Create a normalised version of a housenumber.
263 This function takes minor shortcuts on transliteration.
265 return self.name_processor.get_search_normalized(hnr)
# Synchronise postcode tokens in the word table with location_postcode:
# the anti-join finds entries present on only one side; missing word rows
# are COPYed in, stale ones deleted.
# NOTE(review): the join clause of the SQL (line 278), the initialisation of
# `to_delete`, the branch adding rows to `copystr`, and the DELETE's
# parameter list are elided.
267 def update_postcodes_from_db(self):
268 """ Update postcode tokens in the word table from the location_postcode
272 with self.conn.cursor() as cur:
273 # This finds us the rows in location_postcode and word that are
274 # missing in the other table.
275 cur.execute("""SELECT * FROM
276 (SELECT pc, word FROM
277 (SELECT distinct(postcode) as pc FROM location_postcode) p
279 (SELECT word FROM word
280 WHERE class ='place' and type = 'postcode') w
282 WHERE pc is null or word is null""")
284 with CopyBuffer() as copystr:
285 for postcode, word in cur:
287 to_delete.append(word)
291 ' ' + self.name_processor.get_search_normalized(postcode),
292 'place', 'postcode', 0)
295 cur.execute("""DELETE FROM WORD
296 WHERE class ='place' and type = 'postcode'
300 copystr.copy_out(cur, 'word',
301 columns=['word', 'word_token', 'class', 'type',
302 'search_name_count'])
# Diff the normalised incoming phrases against what is already in the word
# table, add the new ones and (when should_replace — branch elided around
# line 321) remove the obsolete ones; logs a summary at the end.
305 def update_special_phrases(self, phrases, should_replace):
306 """ Replace the search index for special phrases with the new phrases.
308 norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
311 with self.conn.cursor() as cur:
312 # Get the old phrases.
313 existing_phrases = set()
314 cur.execute("""SELECT word, class, type, operator FROM word
315 WHERE class != 'place'
316 OR (type != 'house' AND type != 'postcode')""")
317 for label, cls, typ, oper in cur:
318 existing_phrases.add((label, cls, typ, oper or '-'))
320 added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
322 deleted = self._remove_special_phrases(cur, norm_phrases,
327 LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
328 len(norm_phrases), added, deleted)
# COPY the phrases that are not yet in the word table; only 'in'/'near'
# survive as operator values, anything else becomes NULL.
331 def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
332 """ Add all phrases to the database that are not yet there.
334 to_add = new_phrases - existing_phrases
337 with CopyBuffer() as copystr:
338 for word, cls, typ, oper in to_add:
339 term = self.name_processor.get_search_normalized(word)
341 copystr.add(word, ' ' + term, cls, typ,
342 oper if oper in ('in', 'near') else None, 0)
345 copystr.copy_out(cursor, 'word',
346 columns=['word', 'word_token', 'class', 'type',
347 'operator', 'search_name_count'])
# Bulk-delete obsolete phrases via execute_values; '-' encodes a NULL
# operator in the phrase tuples.  (The cursor/values arguments to
# execute_values on the elided lines 361/365-366 are not visible.)
353 def _remove_special_phrases(cursor, new_phrases, existing_phrases):
354 """ Remove all phrases from the database that are no longer in the
357 to_delete = existing_phrases - new_phrases
360 psycopg2.extras.execute_values(
362 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
363 WHERE word = name and class = in_class and type = in_type
364 and ((op = '-' and operator is null) or op = operator)""",
367 return len(to_delete)
# Insert country-name tokens, skipping tokens already present for this
# country.  NOTE(review): the country_code is spliced into the SQL via
# str.format rather than passed as a bind parameter — callers visible here
# only pass values matching [A-Za-z]{2} (see process_place), but this is
# worth verifying wherever else add_country_names is called.
370 def add_country_names(self, country_code, names):
371 """ Add names for the given country to the search index.
374 for name in self._compute_full_names(names):
376 word_tokens.add(' ' + self.name_processor.get_search_normalized(name))
378 with self.conn.cursor() as cur:
380 cur.execute("SELECT word_token FROM word WHERE country_code = %s",
382 word_tokens.difference_update((t[0] for t in cur))
385 cur.execute("""INSERT INTO word (word_id, word_token, country_code,
387 (SELECT nextval('seq_word'), token, '{}', 0
388 FROM unnest(%s) as token)
389 """.format(country_code), (list(word_tokens),))
# Main per-place entry point: collects name tokens, country names for
# two-letter country features, and address terms (postcode, housenumbers,
# street, place, generic addr:* keys) into a _TokenInfo dict.
# NOTE(review): guards around names/address (lines 401-402, 412 ff.), the
# housenumber-collecting branch body (line 420), the `elif key == 'place':`
# line (~423) and the initialisations of `hnrs`/`addr_terms` are elided.
392 def process_place(self, place):
393 """ Determine tokenizer information about the given place.
395 Returns a JSON-serialisable structure that will be handed into
396 the database via the token_info field.
398 token_info = _TokenInfo(self._cache)
400 names = place.get('name')
403 fulls, partials = self._compute_name_tokens(names)
405 token_info.add_names(fulls, partials)
407 country_feature = place.get('country_feature')
408 if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
409 self.add_country_names(country_feature.lower(), names)
411 address = place.get('address')
416 for key, value in address.items():
417 if key == 'postcode':
418 self._add_postcode(value)
419 elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
421 elif key == 'street':
422 token_info.add_street(*self._compute_name_tokens({'name': value}))
424 token_info.add_place(*self._compute_name_tokens({'name': value}))
425 elif not key.startswith('_') and \
426 key not in ('country', 'full'):
427 addr_terms.append((key, *self._compute_name_tokens({'name': value})))
430 hnrs = self._split_housenumbers(hnrs)
431 token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
434 token_info.add_address_terms(addr_terms)
436 return token_info.data
# Resolve full/partial token ids for every name, consulting the per-instance
# cache first and falling back to the SQL function getorcreate_full_word().
# NOTE(review): initialisation of `full_tokens` and the `if full is None:`
# cache-miss guard are elided.
439 def _compute_name_tokens(self, names):
440 """ Computes the full name and partial name tokens for the given
443 full_names = self._compute_full_names(names)
445 partial_tokens = set()
447 for name in full_names:
448 norm_name = self.name_processor.get_normalized(name)
449 full, part = self._cache.names.get(norm_name, (None, None))
451 variants = self.name_processor.get_variants_ascii(norm_name)
455 with self.conn.cursor() as cur:
456 cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
457 (norm_name, variants))
458 full, part = cur.fetchone()
460 self._cache.names[norm_name] = (full, part)
462 full_tokens.add(full)
463 partial_tokens.update(part)
465 return full_tokens, partial_tokens
# Split each name value on ';'/',' and additionally add the pre-bracket part
# of names containing '('.  (Set initialisation, the surrounding guard and
# the return statement are elided.)
469 def _compute_full_names(names):
470 """ Return the set of all full name word ids to be used with the
471 given dictionary of names.
474 for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
478 brace_idx = name.find('(')
480 full_names.add(name[:brace_idx].strip())
# Ensure the normalized postcode exists in the word table; postcodes
# containing ':', ',' or ';' are rejected, and an in-memory cache avoids
# re-inserting the same postcode.
485 def _add_postcode(self, postcode):
486 """ Make sure the normalized postcode is present in the word table.
488 if re.search(r'[:,;]', postcode) is None:
489 postcode = self.normalize_postcode(postcode)
491 if postcode not in self._cache.postcodes:
492 term = self.name_processor.get_search_normalized(postcode)
496 with self.conn.cursor() as cur:
497 # no word_id needed for postcodes
498 cur.execute("""INSERT INTO word (word, word_token, class, type,
500 (SELECT pc, %s, 'place', 'postcode', 0
501 FROM (VALUES (%s)) as v(pc)
504 WHERE word = pc and class='place' and type='postcode'))
505 """, (' ' + term, postcode))
506 self._cache.postcodes.add(postcode)
# Split collected housenumber strings on ';'/',' and deduplicate; the
# initialisation of `simple_list`, the loop header over `hnrs`, the `else`
# branch and the final `return hnrs` are elided from this listing.
510 def _split_housenumbers(hnrs):
511 if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
512 # split numbers if necessary
515 simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
517 if len(simple_list) > 1:
518 hnrs = list(set(simple_list))
528 """ Collect token information to be sent back to the database.
# NOTE(review): the enclosing class header (original line ~527, the class the
# analyzer instantiates as `_TokenInfo(self._cache)`) and the whole __init__
# body (lines 531 ff.) are elided from this listing.  Methods below read
# self._cache and self.data, so __init__ presumably sets both — TODO confirm
# against the full file.
530 def __init__(self, cache):
# Render a collection of tokens as a PostgreSQL array literal, e.g.
# "{12,7,99}".  Takes no `self`; the @staticmethod decorator is presumably
# on the elided line 534 — TODO confirm.
535 def _mk_array(tokens):
536 return '{%s}' % ','.join((str(s) for s in tokens))
# Store the combined full+partial name tokens as a PostgreSQL array string
# under the 'names' key.
539 def add_names(self, fulls, partials):
540 """ Adds token information for the normalised names.
542 self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
# Store housenumber token ids (resolved through the shared _TokenCache,
# which may query the database via `conn`) and the ';'-joined raw list.
545 def add_housenumbers(self, conn, hnrs):
546 """ Extract housenumber information from a list of normalised
549 self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
550 self.data['hnr'] = ';'.join(hnrs)
# Store full-name tokens for addr:street matching; partial tokens are
# deliberately ignored (parameter `_`).  A guard line between the docstring
# and the assignment may be elided from this listing.
553 def add_street(self, fulls, _):
554 """ Add addr:street match terms.
557 self.data['street'] = self._mk_array(fulls)
# Store addr:place terms: search uses full+partial tokens, matching only
# full tokens.
560 def add_place(self, fulls, partials):
561 """ Add addr:place search and match terms.
564 self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
565 self.data['place_match'] = self._mk_array(fulls)
# Store generic addr:* terms keyed by address key; each value is a
# [search_tokens, match_tokens] pair of array literals.  The initialisation
# of `tokens` and any guard before the final assignment are elided from this
# listing.
568 def add_address_terms(self, terms):
569 """ Add additional address terms.
573 for key, fulls, partials in terms:
575 tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
576 self._mk_array(fulls)]
579 self.data['addr'] = tokens
583 """ Cache for token information to avoid repeated database queries.
585 This cache is not thread-safe and needs to be instantiated per
# NOTE(review): the enclosing class header (the `_TokenCache` class the
# analyzer instantiates) and the `def __init__(self):` line are elided from
# this listing; the two assignments below are __init__ body lines
# initialising per-instance caches.  A `self.names` dict is also used by the
# analyzer's _compute_name_tokens but its initialisation (line ~589) is not
# visible — TODO confirm.
590 self.postcodes = set()
591 self.housenumbers = {}
# Map housenumber terms to token ids: serve repeats from self.housenumbers
# and resolve misses through the SQL function getorcreate_hnr_id(), caching
# the results.  NOTE(review): the lines partitioning cached vs. new terms,
# the execute() parameter tuple, the result collection and the return
# statement are elided from this listing.
594 def get_hnr_tokens(self, conn, terms):
595 """ Get token ids for a list of housenumbers, looking them up in the
596 database if necessary.
602 token = self.housenumbers.get(term)
609 with conn.cursor() as cur:
610 cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
612 for term, tid in cur:
613 self.housenumbers[term] = tid