nominatim/tokenizer/legacy_icu_tokenizer.py

   1 """
   2 Tokenizer implementing normalisation as used before Nominatim 4 but using
   3 libICU instead of the PostgreSQL module.
   4 """
   5 from collections import Counter
   6 import functools
   7 import io
   8 import itertools
   9 import json
  10 import logging
  11 import re
  12 from textwrap import dedent
  13 from pathlib import Path
  14
  15 from icu import Transliterator
  16 import psycopg2.extras
  17
  18 from nominatim.db.connection import connect
  19 from nominatim.db.properties import set_property, get_property
  20 from nominatim.db.sql_preprocessor import SQLPreprocessor
  21
# Names of the database properties under which the tokenizer configuration
# is stored (written by _save_config(), read back by init_from_project()
# and update_sql_functions()).
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TRANSLITERATION = "tokenizer_transliteration"
DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"

# getLogger() without a name returns the root logger.
LOG = logging.getLogger()
  28
  29 def create(dsn, data_dir):
  30     """ Create a new instance of the tokenizer provided by this module.
  31     """
  32     return LegacyICUTokenizer(dsn, data_dir)
  33
  34
  35 class LegacyICUTokenizer:
  36     """ This tokenizer uses libICU to covert names and queries to ASCII.
  37         Otherwise it uses the same algorithms and data structures as the
  38         normalization routines in Nominatim 3.
  39     """
  40
  41     def __init__(self, dsn, data_dir):
  42         self.dsn = dsn
  43         self.data_dir = data_dir
  44         self.normalization = None
  45         self.transliteration = None
  46         self.abbreviations = None
  47
  48
  49     def init_new_db(self, config, init_db=True):
  50         """ Set up a new tokenizer for the database.
  51
  52             This copies all necessary data in the project directory to make
  53             sure the tokenizer remains stable even over updates.
  54         """
  55         if config.TOKENIZER_CONFIG:
  56             cfgfile = Path(config.TOKENIZER_CONFIG)
  57         else:
  58             cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
  59
  60         rules = json.loads(cfgfile.read_text())
  61         self.transliteration = ';'.join(rules['normalization']) + ';'
  62         self.abbreviations = rules["abbreviations"]
  63         self.normalization = config.TERM_NORMALIZATION
  64
  65         self._install_php(config)
  66         self._save_config(config)
  67
  68         if init_db:
  69             self.update_sql_functions(config)
  70             self._init_db_tables(config)
  71
  72
  73     def init_from_project(self):
  74         """ Initialise the tokenizer from the project directory.
  75         """
  76         with connect(self.dsn) as conn:
  77             self.normalization = get_property(conn, DBCFG_NORMALIZATION)
  78             self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
  79             self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))
  80
  81
  82     def finalize_import(self, config):
  83         """ Do any required postprocessing to make the tokenizer data ready
  84             for use.
  85         """
  86         with connect(self.dsn) as conn:
  87             sqlp = SQLPreprocessor(conn, config)
  88             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
  89
  90
  91     def update_sql_functions(self, config):
  92         """ Reimport the SQL functions for this tokenizer.
  93         """
  94         with connect(self.dsn) as conn:
  95             max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
  96             sqlp = SQLPreprocessor(conn, config)
  97             sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
  98                               max_word_freq=max_word_freq)
  99
 100
 101     def check_database(self):
 102         """ Check that the tokenizer is set up correctly.
 103         """
 104         self.init_from_project()
 105
 106         if self.normalization is None\
 107            or self.transliteration is None\
 108            or self.abbreviations is None:
 109             return "Configuration for tokenizer 'legacy_icu' are missing."
 110
 111         return None
 112
 113
 114     def name_analyzer(self):
 115         """ Create a new analyzer for tokenizing names and queries
 116             using this tokinzer. Analyzers are context managers and should
 117             be used accordingly:
 118
 119             ```
 120             with tokenizer.name_analyzer() as analyzer:
 121                 analyser.tokenize()
 122             ```
 123
 124             When used outside the with construct, the caller must ensure to
 125             call the close() function before destructing the analyzer.
 126
 127             Analyzers are not thread-safe. You need to instantiate one per thread.
 128         """
 129         norm = Transliterator.createFromRules("normalizer", self.normalization)
 130         trans = Transliterator.createFromRules("trans", self.transliteration)
 131         return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
 132
 133
 134     def _install_php(self, config):
 135         """ Install the php script for the tokenizer.
 136         """
 137         abbr_inverse = list(zip(*self.abbreviations))
 138         php_file = self.data_dir / "tokenizer.php"
 139         php_file.write_text(dedent("""\
 140             <?php
 141             @define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
 142             @define('CONST_Term_Normalization_Rules', "{0.normalization}");
 143             @define('CONST_Transliteration', "{0.transliteration}");
 144             @define('CONST_Abbreviations', array(array('{2}'), array('{3}')));
 145             require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
 146             """.format(self, config,
 147                        "','".join(abbr_inverse[0]),
 148                        "','".join(abbr_inverse[1]))))
 149
 150
 151     def _save_config(self, config):
 152         """ Save the configuration that needs to remain stable for the given
 153             database as database properties.
 154         """
 155         with connect(self.dsn) as conn:
 156             set_property(conn, DBCFG_NORMALIZATION, self.normalization)
 157             set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
 158             set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
 159             set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))
 160
 161
 162     def _init_db_tables(self, config):
 163         """ Set up the word table and fill it with pre-computed word
 164             frequencies.
 165         """
 166         with connect(self.dsn) as conn:
 167             sqlp = SQLPreprocessor(conn, config)
 168             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
 169             conn.commit()
 170
 171             LOG.warning("Precomputing word tokens")
 172
 173             # get partial words and their frequencies
 174             words = Counter()
 175             with self.name_analyzer() as analyzer:
 176                 with conn.cursor(name="words") as cur:
 177                     cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
 178
 179                     for name, cnt in cur:
 180                         term = analyzer.make_standard_word(name)
 181                         if term:
 182                             for word in term.split():
 183                                 words[word] += cnt
 184
 185             # copy them back into the word table
 186             copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
 187
 188
 189             with conn.cursor() as cur:
 190                 copystr.seek(0)
 191                 cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
 192                 cur.execute("""UPDATE word SET word_id = nextval('seq_word')
 193                                WHERE word_id is null""")
 194
 195             conn.commit()
 196
 197
 198 class LegacyICUNameAnalyzer:
 199     """ The legacy analyzer uses the ICU library for splitting names.
 200
 201         Each instance opens a connection to the database to request the
 202         normalization.
 203     """
 204
 205     def __init__(self, dsn, normalizer, transliterator, abbreviations):
 206         self.conn = connect(dsn).connection
 207         self.conn.autocommit = True
 208         self.normalizer = normalizer
 209         self.transliterator = transliterator
 210         self.abbreviations = abbreviations
 211
 212         self._cache = _TokenCache()
 213
 214
 215     def __enter__(self):
 216         return self
 217
 218
 219     def __exit__(self, exc_type, exc_value, traceback):
 220         self.close()
 221
 222
 223     def close(self):
 224         """ Free all resources used by the analyzer.
 225         """
 226         if self.conn:
 227             self.conn.close()
 228             self.conn = None
 229
 230
 231     def get_word_token_info(self, conn, words):
 232         """ Return token information for the given list of words.
 233             If a word starts with # it is assumed to be a full name
 234             otherwise is a partial name.
 235
 236             The function returns a list of tuples with
 237             (original word, word token, word id).
 238
 239             The function is used for testing and debugging only
 240             and not necessarily efficient.
 241         """
 242         tokens = {}
 243         for word in words:
 244             if word.startswith('#'):
 245                 tokens[word] = ' ' + self.make_standard_word(word[1:])
 246             else:
 247                 tokens[word] = self.make_standard_word(word)
 248
 249         with conn.cursor() as cur:
 250             cur.execute("""SELECT word_token, word_id
 251                            FROM word, (SELECT unnest(%s::TEXT[]) as term) t
 252                            WHERE word_token = t.term
 253                                  and class is null and country_code is null""",
 254                         (list(tokens.values()), ))
 255             ids = {r[0]: r[1] for r in cur}
 256
 257         return [(k, v, ids[v]) for k, v in tokens.items()]
 258
 259
 260     def normalize(self, phrase):
 261         """ Normalize the given phrase, i.e. remove all properties that
 262             are irrelevant for search.
 263         """
 264         return self.normalizer.transliterate(phrase)
 265
 266     @staticmethod
 267     def normalize_postcode(postcode):
 268         """ Convert the postcode to a standardized form.
 269
 270             This function must yield exactly the same result as the SQL function
 271             'token_normalized_postcode()'.
 272         """
 273         return postcode.strip().upper()
 274
 275
 276     @functools.lru_cache(maxsize=1024)
 277     def make_standard_word(self, name):
 278         """ Create the normalised version of the input.
 279         """
 280         norm = ' ' + self.transliterator.transliterate(name) + ' '
 281         for full, abbr in self.abbreviations:
 282             if full in norm:
 283                 norm = norm.replace(full, abbr)
 284
 285         return norm.strip()
 286
 287
 288     def _make_standard_hnr(self, hnr):
 289         """ Create a normalised version of a housenumber.
 290
 291             This function takes minor shortcuts on transliteration.
 292         """
 293         if hnr.isdigit():
 294             return hnr
 295
 296         return self.transliterator.transliterate(hnr)
 297
 298     def update_postcodes_from_db(self):
 299         """ Update postcode tokens in the word table from the location_postcode
 300             table.
 301         """
 302         to_delete = []
 303         copystr = io.StringIO()
 304         with self.conn.cursor() as cur:
 305             # This finds us the rows in location_postcode and word that are
 306             # missing in the other table.
 307             cur.execute("""SELECT * FROM
 308                             (SELECT pc, word FROM
 309                               (SELECT distinct(postcode) as pc FROM location_postcode) p
 310                               FULL JOIN
 311                               (SELECT word FROM word
 312                                 WHERE class ='place' and type = 'postcode') w
 313                               ON pc = word) x
 314                            WHERE pc is null or word is null""")
 315
 316             for postcode, word in cur:
 317                 if postcode is None:
 318                     to_delete.append(word)
 319                 else:
 320                     copystr.write(postcode)
 321                     copystr.write('\t ')
 322                     copystr.write(self.transliterator.transliterate(postcode))
 323                     copystr.write('\tplace\tpostcode\t0\n')
 324
 325             if to_delete:
 326                 cur.execute("""DELETE FROM WORD
 327                                WHERE class ='place' and type = 'postcode'
 328                                      and word = any(%s)
 329                             """, (to_delete, ))
 330
 331             if copystr.getvalue():
 332                 copystr.seek(0)
 333                 cur.copy_from(copystr, 'word',
 334                               columns=['word', 'word_token', 'class', 'type',
 335                                        'search_name_count'])
 336
 337
 338     def update_special_phrases(self, phrases, should_replace):
 339         """ Replace the search index for special phrases with the new phrases.
 340         """
 341         norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
 342                             for p in phrases))
 343
 344         with self.conn.cursor() as cur:
 345             # Get the old phrases.
 346             existing_phrases = set()
 347             cur.execute("""SELECT word, class, type, operator FROM word
 348                            WHERE class != 'place'
 349                                  OR (type != 'house' AND type != 'postcode')""")
 350             for label, cls, typ, oper in cur:
 351                 existing_phrases.add((label, cls, typ, oper or '-'))
 352
 353             to_add = norm_phrases - existing_phrases
 354             to_delete = existing_phrases - norm_phrases
 355
 356             if to_add:
 357                 copystr = io.StringIO()
 358                 for word, cls, typ, oper in to_add:
 359                     term = self.make_standard_word(word)
 360                     if term:
 361                         copystr.write(word)
 362                         copystr.write('\t ')
 363                         copystr.write(term)
 364                         copystr.write('\t')
 365                         copystr.write(cls)
 366                         copystr.write('\t')
 367                         copystr.write(typ)
 368                         copystr.write('\t')
 369                         copystr.write(oper if oper in ('in', 'near')  else '\\N')
 370                         copystr.write('\t0\n')
 371
 372                 copystr.seek(0)
 373                 cur.copy_from(copystr, 'word',
 374                               columns=['word', 'word_token', 'class', 'type',
 375                                        'operator', 'search_name_count'])
 376
 377             if to_delete and should_replace:
 378                 psycopg2.extras.execute_values(
 379                     cur,
 380                     """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 381                         WHERE word = name and class = in_class and type = in_type
 382                               and ((op = '-' and operator is null) or op = operator)""",
 383                     to_delete)
 384
 385         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 386                  len(norm_phrases), len(to_add), len(to_delete))
 387
 388
 389     def add_country_names(self, country_code, names):
 390         """ Add names for the given country to the search index.
 391         """
 392         full_names = set((self.make_standard_word(n) for n in names))
 393         full_names.discard('')
 394         self._add_normalized_country_names(country_code, full_names)
 395
 396
 397     def _add_normalized_country_names(self, country_code, names):
 398         """ Add names for the given country to the search index.
 399         """
 400         word_tokens = set((' ' + name for name in names))
 401         with self.conn.cursor() as cur:
 402             # Get existing names
 403             cur.execute("SELECT word_token FROM word WHERE country_code = %s",
 404                         (country_code, ))
 405             word_tokens.difference_update((t[0] for t in cur))
 406
 407             if word_tokens:
 408                 cur.execute("""INSERT INTO word (word_id, word_token, country_code,
 409                                                  search_name_count)
 410                                (SELECT nextval('seq_word'), token, '{}', 0
 411                                 FROM unnest(%s) as token)
 412                             """.format(country_code), (list(word_tokens),))
 413
 414
 415     def process_place(self, place):
 416         """ Determine tokenizer information about the given place.
 417
 418             Returns a JSON-serialisable structure that will be handed into
 419             the database via the token_info field.
 420         """
 421         token_info = _TokenInfo(self._cache)
 422
 423         names = place.get('name')
 424
 425         if names:
 426             full_names = set((self.make_standard_word(name) for name in names.values()))
 427             full_names.discard('')
 428
 429             token_info.add_names(self.conn, full_names)
 430
 431             country_feature = place.get('country_feature')
 432             if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
 433                 self._add_normalized_country_names(country_feature.lower(),
 434                                                    full_names)
 435
 436         address = place.get('address')
 437
 438         if address:
 439             hnrs = []
 440             addr_terms = []
 441             for key, value in address.items():
 442                 if key == 'postcode':
 443                     self._add_postcode(value)
 444                 elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
 445                     hnrs.append(value)
 446                 elif key == 'street':
 447                     token_info.add_street(self.conn, self.make_standard_word(value))
 448                 elif key == 'place':
 449                     token_info.add_place(self.conn, self.make_standard_word(value))
 450                 elif not key.startswith('_') and \
 451                      key not in ('country', 'full'):
 452                     addr_terms.append((key, self.make_standard_word(value)))
 453
 454             if hnrs:
 455                 hnrs = self._split_housenumbers(hnrs)
 456                 token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
 457
 458             if addr_terms:
 459                 token_info.add_address_terms(self.conn, addr_terms)
 460
 461         return token_info.data
 462
 463
 464     def _add_postcode(self, postcode):
 465         """ Make sure the normalized postcode is present in the word table.
 466         """
 467         if re.search(r'[:,;]', postcode) is None:
 468             postcode = self.normalize_postcode(postcode)
 469
 470             if postcode not in self._cache.postcodes:
 471                 term = self.make_standard_word(postcode)
 472                 if not term:
 473                     return
 474
 475                 with self.conn.cursor() as cur:
 476                     # no word_id needed for postcodes
 477                     cur.execute("""INSERT INTO word (word, word_token, class, type,
 478                                                      search_name_count)
 479                                    (SELECT pc, %s, 'place', 'postcode', 0
 480                                     FROM (VALUES (%s)) as v(pc)
 481                                     WHERE NOT EXISTS
 482                                      (SELECT * FROM word
 483                                       WHERE word = pc and class='place' and type='postcode'))
 484                                 """, (' ' + term, postcode))
 485                 self._cache.postcodes.add(postcode)
 486
 487     @staticmethod
 488     def _split_housenumbers(hnrs):
 489         if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
 490             # split numbers if necessary
 491             simple_list = []
 492             for hnr in hnrs:
 493                 simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
 494
 495             if len(simple_list) > 1:
 496                 hnrs = list(set(simple_list))
 497             else:
 498                 hnrs = simple_list
 499
 500         return hnrs
 501
 502
 503
 504
 505 class _TokenInfo:
 506     """ Collect token information to be sent back to the database.
 507     """
 508     def __init__(self, cache):
 509         self.cache = cache
 510         self.data = {}
 511
 512     @staticmethod
 513     def _mk_array(tokens):
 514         return '{%s}' % ','.join((str(s) for s in tokens))
 515
 516
 517     def add_names(self, conn, names):
 518         """ Adds token information for the normalised names.
 519         """
 520         # Start with all partial names
 521         terms = set((part for ns in names for part in ns.split()))
 522         # Add partials for the full terms (TO BE REMOVED)
 523         terms.update((n for n in names))
 524         # Add the full names
 525         terms.update((' ' + n for n in names))
 526
 527         self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))
 528
 529
 530     def add_housenumbers(self, conn, hnrs):
 531         """ Extract housenumber information from a list of normalised
 532             housenumbers.
 533         """
 534         self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
 535         self.data['hnr'] = ';'.join(hnrs)
 536
 537
 538     def add_street(self, conn, street):
 539         """ Add addr:street match terms.
 540         """
 541         if not street:
 542             return
 543
 544         term = ' ' + street
 545
 546         tid = self.cache.names.get(term)
 547
 548         if tid is None:
 549             with conn.cursor() as cur:
 550                 cur.execute("""SELECT word_id FROM word
 551                                 WHERE word_token = %s
 552                                       and class is null and type is null""",
 553                             (term, ))
 554                 if cur.rowcount > 0:
 555                     tid = cur.fetchone()[0]
 556                     self.cache.names[term] = tid
 557
 558         if tid is not None:
 559             self.data['street'] = '{%d}' % tid
 560
 561
 562     def add_place(self, conn, place):
 563         """ Add addr:place search and match terms.
 564         """
 565         if not place:
 566             return
 567
 568         partial_ids = self.cache.get_term_tokens(conn, place.split())
 569         tid = self.cache.get_term_tokens(conn, [' ' + place])
 570
 571         self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
 572         self.data['place_match'] = '{%s}' % tid[0]
 573
 574
 575     def add_address_terms(self, conn, terms):
 576         """ Add additional address terms.
 577         """
 578         tokens = {}
 579
 580         for key, value in terms:
 581             if not value:
 582                 continue
 583             partial_ids = self.cache.get_term_tokens(conn, value.split())
 584             term = ' ' + value
 585             tid = self.cache.names.get(term)
 586
 587             if tid is None:
 588                 with conn.cursor() as cur:
 589                     cur.execute("""SELECT word_id FROM word
 590                                     WHERE word_token = %s
 591                                           and class is null and type is null""",
 592                                 (term, ))
 593                     if cur.rowcount > 0:
 594                         tid = cur.fetchone()[0]
 595                         self.cache.names[term] = tid
 596
 597             tokens[key] = [self._mk_array(partial_ids),
 598                            '{%s}' % ('' if tid is None else str(tid))]
 599
 600         if tokens:
 601             self.data['addr'] = tokens
 602
 603
 604 class _TokenCache:
 605     """ Cache for token information to avoid repeated database queries.
 606
 607         This cache is not thread-safe and needs to be instantiated per
 608         analyzer.
 609     """
 610     def __init__(self):
 611         self.names = {}
 612         self.postcodes = set()
 613         self.housenumbers = {}
 614
 615
 616     def get_term_tokens(self, conn, terms):
 617         """ Get token ids for a list of terms, looking them up in the database
 618             if necessary.
 619         """
 620         tokens = []
 621         askdb = []
 622
 623         for term in terms:
 624             token = self.names.get(term)
 625             if token is None:
 626                 askdb.append(term)
 627             elif token != 0:
 628                 tokens.append(token)
 629
 630         if askdb:
 631             with conn.cursor() as cur:
 632                 cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
 633                             (askdb, ))
 634                 for term, tid in cur:
 635                     self.names[term] = tid
 636                     if tid != 0:
 637                         tokens.append(tid)
 638
 639         return tokens
 640
 641
 642     def get_hnr_tokens(self, conn, terms):
 643         """ Get token ids for a list of housenumbers, looking them up in the
 644             database if necessary.
 645         """
 646         tokens = []
 647         askdb = []
 648
 649         for term in terms:
 650             token = self.housenumbers.get(term)
 651             if token is None:
 652                 askdb.append(term)
 653             else:
 654                 tokens.append(token)
 655
 656         if askdb:
 657             with conn.cursor() as cur:
 658                 cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
 659                             (askdb, ))
 660                 for term, tid in cur:
 661                     self.housenumbers[term] = tid
 662                     tokens.append(tid)
 663
 664         return tokens