"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import logging
import re
from textwrap import dedent
from pathlib import Path

import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules

DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.naming_rules = None
        self.term_normalization = None
        self.max_word_frequency = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'

        loader = ICURuleLoader(cfgfile)
        self.naming_rules = ICUNameProcessorRules(loader=loader)
        self.term_normalization = config.TERM_NORMALIZATION
        self.max_word_frequency = config.MAX_WORD_FREQUENCY

        self._install_php(config.lib_dir.php)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.naming_rules = ICUNameProcessorRules(conn=conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)


    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.naming_rules is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

        return None


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))


    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
            @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
            @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
            require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
            """.format(self, phpdir))) # pylint: disable=missing-format-attribute


    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.naming_rules.save_rules(conn)

            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = Counter()
            name_proc = ICUNameProcessor(self.naming_rules)
            with conn.cursor(name="words") as cur:
                cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")

                for name, cnt in cur:
                    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                        for term in word.split():
                            words[term] += cnt

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for args in words.items():
                    copystr.add(*args)

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['word_token', 'search_name_count'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null""")

            conn.commit()


class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.name_processor = name_proc

        self._cache = _TokenCache()


    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.

            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        tokens = {}
        for word in words:
            if word.startswith('#'):
                tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
            else:
                tokens[word] = self.name_processor.get_search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = t.term
                                 and class is null and country_code is null""",
                        (list(tokens.values()), ))
            ids = {r[0]: r[1] for r in cur}

        return [(k, v, ids.get(v, None)) for k, v in tokens.items()]
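
    # Illustrative sketch of the return format documented above (tokens and
    # ids are hypothetical, not actual database content):
    #
    #   analyzer.get_word_token_info(['#Main Street', 'main'])
    #   -> [('#Main Street', ' main street', 123), ('main', 'main', None)]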

    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
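
    # Normalization only trims whitespace and uppercases, so a hypothetical
    # input behaves like this:
    #
    #   normalize_postcode('  ec1a 1bb ')   # -> 'EC1A 1BB'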

    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(postcode,
                                    ' ' + self.name_processor.get_search_normalized(postcode),
                                    'place', 'postcode', 0)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE class ='place' and type = 'postcode'
                                         and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word', 'word_token', 'class', 'type',
                                          'search_name_count'])


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)
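
    # As consumed above, each phrase is a 4-tuple of (label, class, type,
    # operator). A hypothetical call could look like this (values are
    # illustrative only):
    #
    #   analyzer.update_special_phrases(
    #       [('Restaurants', 'amenity', 'restaurant', 'in')], should_replace=True)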


    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self.name_processor.get_search_normalized(word)
                if term:
                    copystr.add(word, term, cls, typ,
                                oper if oper in ('in', 'near') else None, 0)
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word', 'word_token', 'class', 'type',
                                      'operator', 'search_name_count'])

        return added


    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            psycopg2.extras.execute_values(
                cursor,
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE word = name and class = in_class and type = in_type
                          and ((op = '-' and operator is null) or op = operator)""",
                to_delete)

        return len(to_delete)


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            if name:
                word_tokens.add(' ' + self.name_processor.get_search_normalized(name))

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            if word_tokens:
                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                                 search_name_count)
                               (SELECT nextval('seq_word'), token, '{}', 0
                                FROM unnest(%s) as token)
                            """.format(country_code), (list(word_tokens),))
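
    # Hypothetical usage sketch: names are passed as a tag dictionary, the same
    # structure that process_place() reads from place['name'] (values are
    # illustrative only):
    #
    #   analyzer.add_country_names('de', {'name': 'Deutschland', 'name:en': 'Germany'})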


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), names)

        address = place.get('address')

        if address:
            hnrs = []
            addr_terms = []
            for key, value in address.items():
                if key == 'postcode':
                    self._add_postcode(value)
                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                    hnrs.append(value)
                elif key == 'street':
                    token_info.add_street(*self._compute_name_tokens({'name': value}))
                elif key == 'place':
                    token_info.add_place(*self._compute_name_tokens({'name': value}))
                elif not key.startswith('_') and \
                     key not in ('country', 'full'):
                    addr_terms.append((key, *self._compute_name_tokens({'name': value})))

            if hnrs:
                hnrs = self._split_housenumbers(hnrs)
                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

            if addr_terms:
                token_info.add_address_terms(addr_terms)

        return token_info.data
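
    # Sketch of the structure collected in token_info.data by the _TokenInfo
    # helpers below; keys only appear when the corresponding data is present
    # and the values shown here are purely illustrative:
    #
    #   {'names': '{1,2,3}', 'hnr_tokens': '{42}', 'hnr': '12;14',
    #    'street': '{5}', 'place_search': '{6,7}', 'place_match': '{6}',
    #    'addr': {'city': ['{8,9}', '{8}']}}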


    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)
                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full names to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
            full_names.add(name.strip())

            brace_idx = name.find('(')
            if brace_idx >= 0:
                full_names.add(name[:brace_idx].strip())

        return full_names
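
    # Illustrative example (hypothetical names): parts separated by ';' or ','
    # and bracketed suffixes produce additional full names:
    #
    #   _compute_full_names({'name': 'Hauptstraße;Main Street (old)'})
    #   -> {'Hauptstraße', 'Main Street (old)', 'Main Street'}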


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.name_processor.get_search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word, word_token, class, type,
                                                     search_name_count)
                                   (SELECT pc, %s, 'place', 'postcode', 0
                                    FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE word = pc and class='place' and type='postcode'))
                                """, (' ' + term, postcode))
                self._cache.postcodes.add(postcode)

    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
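
    # Illustrative example: entries containing separators are split up, so a
    # hypothetical input of ['3;5', '7'] yields ['3', '5', '7'] (order not
    # guaranteed because a set is used for deduplication).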


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}

    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))
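
    # _mk_array() renders tokens as a PostgreSQL array literal, e.g. tokens
    # [1, 2, 3] become the string '{1,2,3}'.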


    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)


    def add_street(self, fulls, _):
        """ Add addr:street match terms.
        """
        if fulls:
            self.data['street'] = self._mk_array(fulls)


    def add_place(self, fulls, partials):
        """ Add addr:place search and match terms.
        """
        if fulls:
            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
            self.data['place_match'] = self._mk_array(fulls)


    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, fulls, partials in terms:
            if fulls:
                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
                               self._mk_array(fulls)]

        if tokens:
            self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.postcodes = set()
        self.housenumbers = {}


    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary.
        """
        tokens = []
        askdb = []

        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens