nominatim/tokenizer/legacy_icu_tokenizer.py

   1 """
   2 Tokenizer implementing normalisation as used before Nominatim 4 but using
   3 libICU instead of the PostgreSQL module.
   4 """
   5 from collections import Counter
   6 import functools
   7 import io
   8 import itertools
   9 import json
  10 import logging
  11 import re
  12 from textwrap import dedent
  13 from pathlib import Path
  14
  15 from icu import Transliterator
  16 import psycopg2.extras
  17
  18 from nominatim.db.connection import connect
  19 from nominatim.db.properties import set_property, get_property
  20 from nominatim.db.sql_preprocessor import SQLPreprocessor
  21
# Names of the database properties under which the tokenizer stores the
# configuration that must remain stable for an existing database
# (written by _save_config(), read back by init_from_project() and
# update_sql_functions()).
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TRANSLITERATION = "tokenizer_transliteration"
# The abbreviation list is stored JSON-encoded.
DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"

# Note: this is the root logger, not a module-specific one.
LOG = logging.getLogger()
  28
  29 def create(dsn, data_dir):
  30     """ Create a new instance of the tokenizer provided by this module.
  31     """
  32     return LegacyICUTokenizer(dsn, data_dir)
  33
  34
  35 class LegacyICUTokenizer:
  36     """ This tokenizer uses libICU to covert names and queries to ASCII.
  37         Otherwise it uses the same algorithms and data structures as the
  38         normalization routines in Nominatim 3.
  39     """
  40
  41     def __init__(self, dsn, data_dir):
  42         self.dsn = dsn
  43         self.data_dir = data_dir
  44         self.normalization = None
  45         self.transliteration = None
  46         self.abbreviations = None
  47
  48
  49     def init_new_db(self, config, init_db=True):
  50         """ Set up a new tokenizer for the database.
  51
  52             This copies all necessary data in the project directory to make
  53             sure the tokenizer remains stable even over updates.
  54         """
  55         if config.TOKENIZER_CONFIG:
  56             cfgfile = Path(config.TOKENIZER_CONFIG)
  57         else:
  58             cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
  59
  60         rules = json.loads(cfgfile.read_text())
  61         self.transliteration = ';'.join(rules['normalization']) + ';'
  62         self.abbreviations = rules["abbreviations"]
  63         self.normalization = config.TERM_NORMALIZATION
  64
  65         self._install_php(config)
  66         self._save_config(config)
  67
  68         if init_db:
  69             self.update_sql_functions(config)
  70             self._init_db_tables(config)
  71
  72
  73     def init_from_project(self):
  74         """ Initialise the tokenizer from the project directory.
  75         """
  76         with connect(self.dsn) as conn:
  77             self.normalization = get_property(conn, DBCFG_NORMALIZATION)
  78             self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
  79             self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))
  80
  81
  82     def finalize_import(self, config):
  83         """ Do any required postprocessing to make the tokenizer data ready
  84             for use.
  85         """
  86         with connect(self.dsn) as conn:
  87             sqlp = SQLPreprocessor(conn, config)
  88             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
  89
  90
  91     def update_sql_functions(self, config):
  92         """ Reimport the SQL functions for this tokenizer.
  93         """
  94         with connect(self.dsn) as conn:
  95             max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
  96             sqlp = SQLPreprocessor(conn, config)
  97             sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
  98                               max_word_freq=max_word_freq)
  99
 100
 101     def check_database(self):
 102         """ Check that the tokenizer is set up correctly.
 103         """
 104         self.init_from_project()
 105
 106         if self.normalization is None\
 107            or self.transliteration is None\
 108            or self.abbreviations is None:
 109             return "Configuration for tokenizer 'legacy_icu' are missing."
 110
 111         return None
 112
 113
 114     def name_analyzer(self):
 115         """ Create a new analyzer for tokenizing names and queries
 116             using this tokinzer. Analyzers are context managers and should
 117             be used accordingly:
 118
 119             ```
 120             with tokenizer.name_analyzer() as analyzer:
 121                 analyser.tokenize()
 122             ```
 123
 124             When used outside the with construct, the caller must ensure to
 125             call the close() function before destructing the analyzer.
 126
 127             Analyzers are not thread-safe. You need to instantiate one per thread.
 128         """
 129         norm = Transliterator.createFromRules("normalizer", self.normalization)
 130         trans = Transliterator.createFromRules("normalizer", self.transliteration)
 131         return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
 132
 133
 134     def _install_php(self, config):
 135         """ Install the php script for the tokenizer.
 136         """
 137         abbr_inverse = list(zip(*self.abbreviations))
 138         php_file = self.data_dir / "tokenizer.php"
 139         php_file.write_text(dedent("""\
 140             <?php
 141             @define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
 142             @define('CONST_Term_Normalization_Rules', "{0.normalization}");
 143             @define('CONST_Transliteration', "{0.transliteration}");
 144             @define('CONST_Abbreviations', array(array('{2}'), array('{3}')));
 145             require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
 146             """.format(self, config,
 147                        "','".join(abbr_inverse[0]),
 148                        "','".join(abbr_inverse[1]))))
 149
 150
 151     def _save_config(self, config):
 152         """ Save the configuration that needs to remain stable for the given
 153             database as database properties.
 154         """
 155         with connect(self.dsn) as conn:
 156             set_property(conn, DBCFG_NORMALIZATION, self.normalization)
 157             set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
 158             set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
 159             set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))
 160
 161
 162     def _init_db_tables(self, config):
 163         """ Set up the word table and fill it with pre-computed word
 164             frequencies.
 165         """
 166         with connect(self.dsn) as conn:
 167             sqlp = SQLPreprocessor(conn, config)
 168             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
 169             conn.commit()
 170
 171             LOG.warning("Precomputing word tokens")
 172
 173             # get partial words and their frequencies
 174             words = Counter()
 175             with self.name_analyzer() as analyzer:
 176                 with conn.cursor(name="words") as cur:
 177                     cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
 178
 179                     for name, cnt in cur:
 180                         term = analyzer.make_standard_word(name)
 181                         if term:
 182                             for word in term.split():
 183                                 words[word] += cnt
 184
 185             # copy them back into the word table
 186             copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
 187
 188
 189             with conn.cursor() as cur:
 190                 copystr.seek(0)
 191                 cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
 192                 cur.execute("""UPDATE word SET word_id = nextval('seq_word')
 193                                WHERE word_id is null""")
 194
 195             conn.commit()
 196
 197
 198 class LegacyICUNameAnalyzer:
 199     """ The legacy analyzer uses the ICU library for splitting names.
 200
 201         Each instance opens a connection to the database to request the
 202         normalization.
 203     """
 204
 205     def __init__(self, dsn, normalizer, transliterator, abbreviations):
 206         self.conn = connect(dsn).connection
 207         self.conn.autocommit = True
 208         self.normalizer = normalizer
 209         self.transliterator = transliterator
 210         self.abbreviations = abbreviations
 211
 212         self._cache = _TokenCache()
 213
 214
 215     def __enter__(self):
 216         return self
 217
 218
 219     def __exit__(self, exc_type, exc_value, traceback):
 220         self.close()
 221
 222
 223     def close(self):
 224         """ Free all resources used by the analyzer.
 225         """
 226         if self.conn:
 227             self.conn.close()
 228             self.conn = None
 229
 230
 231     def get_word_token_info(self, conn, words):
 232         """ Return token information for the given list of words.
 233             If a word starts with # it is assumed to be a full name
 234             otherwise is a partial name.
 235
 236             The function returns a list of tuples with
 237             (original word, word token, word id).
 238
 239             The function is used for testing and debugging only
 240             and not necessarily efficient.
 241         """
 242         tokens = {}
 243         for word in words:
 244             if word.startswith('#'):
 245                 tokens[word] = ' ' + self.make_standard_word(word[1:])
 246             else:
 247                 tokens[word] = self.make_standard_word(word)
 248
 249         with conn.cursor() as cur:
 250             cur.execute("""SELECT word_token, word_id
 251                            FROM word, (SELECT unnest(%s::TEXT[]) as term) t
 252                            WHERE word_token = t.term
 253                                  and class is null and country_code is null""",
 254                         (list(tokens.values()), ))
 255             ids = {r[0]: r[1] for r in cur}
 256
 257         return [(k, v, ids[v]) for k, v in tokens.items()]
 258
 259
 260     def normalize(self, phrase):
 261         """ Normalize the given phrase, i.e. remove all properties that
 262             are irrelevant for search.
 263         """
 264         return self.normalizer.transliterate(phrase)
 265
 266     @functools.lru_cache(maxsize=1024)
 267     def make_standard_word(self, name):
 268         """ Create the normalised version of the input.
 269         """
 270         norm = ' ' + self.transliterator.transliterate(name) + ' '
 271         for full, abbr in self.abbreviations:
 272             if full in norm:
 273                 norm = norm.replace(full, abbr)
 274
 275         return norm.strip()
 276
 277
 278     def _make_standard_hnr(self, hnr):
 279         """ Create a normalised version of a housenumber.
 280
 281             This function takes minor shortcuts on transliteration.
 282         """
 283         if hnr.isdigit():
 284             return hnr
 285
 286         return self.transliterator.transliterate(hnr)
 287
 288     def add_postcodes_from_db(self):
 289         """ Add postcodes from the location_postcode table to the word table.
 290         """
 291         copystr = io.StringIO()
 292         with self.conn.cursor() as cur:
 293             cur.execute("SELECT distinct(postcode) FROM location_postcode")
 294             for (postcode, ) in cur:
 295                 copystr.write(postcode)
 296                 copystr.write('\t ')
 297                 copystr.write(self.transliterator.transliterate(postcode))
 298                 copystr.write('\tplace\tpostcode\t0\n')
 299
 300             copystr.seek(0)
 301             cur.copy_from(copystr, 'word',
 302                           columns=['word', 'word_token', 'class', 'type',
 303                                    'search_name_count'])
 304             # Don't really need an ID for postcodes....
 305             # cur.execute("""UPDATE word SET word_id = nextval('seq_word')
 306             #                WHERE word_id is null and type = 'postcode'""")
 307
 308
 309     def update_special_phrases(self, phrases):
 310         """ Replace the search index for special phrases with the new phrases.
 311         """
 312         norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
 313                             for p in phrases))
 314
 315         with self.conn.cursor() as cur:
 316             # Get the old phrases.
 317             existing_phrases = set()
 318             cur.execute("""SELECT word, class, type, operator FROM word
 319                            WHERE class != 'place'
 320                                  OR (type != 'house' AND type != 'postcode')""")
 321             for label, cls, typ, oper in cur:
 322                 existing_phrases.add((label, cls, typ, oper or '-'))
 323
 324             to_add = norm_phrases - existing_phrases
 325             to_delete = existing_phrases - norm_phrases
 326
 327             if to_add:
 328                 copystr = io.StringIO()
 329                 for word, cls, typ, oper in to_add:
 330                     term = self.make_standard_word(word)
 331                     if term:
 332                         copystr.write(word)
 333                         copystr.write('\t ')
 334                         copystr.write(term)
 335                         copystr.write('\t')
 336                         copystr.write(cls)
 337                         copystr.write('\t')
 338                         copystr.write(typ)
 339                         copystr.write('\t')
 340                         copystr.write(oper if oper in ('in', 'near')  else '\\N')
 341                         copystr.write('\t0\n')
 342
 343                 copystr.seek(0)
 344                 cur.copy_from(copystr, 'word',
 345                               columns=['word', 'word_token', 'class', 'type',
 346                                        'operator', 'search_name_count'])
 347
 348             if to_delete:
 349                 psycopg2.extras.execute_values(
 350                     cur,
 351                     """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 352                         WHERE word = name and class = in_class and type = in_type
 353                               and ((op = '-' and operator is null) or op = operator)""",
 354                     to_delete)
 355
 356         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 357                  len(norm_phrases), len(to_add), len(to_delete))
 358
 359
 360     def add_country_names(self, country_code, names):
 361         """ Add names for the given country to the search index.
 362         """
 363         full_names = set((self.make_standard_word(n) for n in names))
 364         full_names.discard('')
 365         self._add_normalized_country_names(country_code, full_names)
 366
 367
 368     def _add_normalized_country_names(self, country_code, names):
 369         """ Add names for the given country to the search index.
 370         """
 371         word_tokens = set((' ' + name for name in names))
 372         with self.conn.cursor() as cur:
 373             # Get existing names
 374             cur.execute("SELECT word_token FROM word WHERE country_code = %s",
 375                         (country_code, ))
 376             word_tokens.difference_update((t[0] for t in cur))
 377
 378             if word_tokens:
 379                 cur.execute("""INSERT INTO word (word_id, word_token, country_code,
 380                                                  search_name_count)
 381                                (SELECT nextval('seq_word'), token, '{}', 0
 382                                 FROM unnest(%s) as token)
 383                             """.format(country_code), (list(word_tokens),))
 384
 385
 386     def process_place(self, place):
 387         """ Determine tokenizer information about the given place.
 388
 389             Returns a JSON-serialisable structure that will be handed into
 390             the database via the token_info field.
 391         """
 392         token_info = _TokenInfo(self._cache)
 393
 394         names = place.get('name')
 395
 396         if names:
 397             full_names = set((self.make_standard_word(name) for name in names.values()))
 398             full_names.discard('')
 399
 400             token_info.add_names(self.conn, full_names)
 401
 402             country_feature = place.get('country_feature')
 403             if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
 404                 self._add_normalized_country_names(country_feature.lower(),
 405                                                    full_names)
 406
 407         address = place.get('address')
 408
 409         if address:
 410             hnrs = []
 411             addr_terms = []
 412             for key, value in address.items():
 413                 if key == 'postcode':
 414                     self._add_postcode(value)
 415                 elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
 416                     hnrs.append(value)
 417                 elif key == 'street':
 418                     token_info.add_street(self.conn, self.make_standard_word(value))
 419                 elif key == 'place':
 420                     token_info.add_place(self.conn, self.make_standard_word(value))
 421                 elif not key.startswith('_') and \
 422                      key not in ('country', 'full'):
 423                     addr_terms.append((key, self.make_standard_word(value)))
 424
 425             if hnrs:
 426                 hnrs = self._split_housenumbers(hnrs)
 427                 token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
 428
 429             if addr_terms:
 430                 token_info.add_address_terms(self.conn, addr_terms)
 431
 432         return token_info.data
 433
 434
 435     def _add_postcode(self, postcode):
 436         """ Make sure the normalized postcode is present in the word table.
 437         """
 438         if re.search(r'[:,;]', postcode) is None and not postcode in self._cache.postcodes:
 439             term = self.make_standard_word(postcode)
 440             if not term:
 441                 return
 442
 443             with self.conn.cursor() as cur:
 444                 # no word_id needed for postcodes
 445                 cur.execute("""INSERT INTO word (word, word_token, class, type,
 446                                                  search_name_count)
 447                                (SELECT pc, %s, 'place', 'postcode', 0
 448                                 FROM (VALUES (%s)) as v(pc)
 449                                 WHERE NOT EXISTS
 450                                  (SELECT * FROM word
 451                                   WHERE word = pc and class='place' and type='postcode'))
 452                             """, (' ' + term, postcode))
 453             self._cache.postcodes.add(postcode)
 454
 455     @staticmethod
 456     def _split_housenumbers(hnrs):
 457         if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
 458             # split numbers if necessary
 459             simple_list = []
 460             for hnr in hnrs:
 461                 simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
 462
 463             if len(simple_list) > 1:
 464                 hnrs = list(set(simple_list))
 465             else:
 466                 hnrs = simple_list
 467
 468         return hnrs
 469
 470
 471
 472
 473 class _TokenInfo:
 474     """ Collect token information to be sent back to the database.
 475     """
 476     def __init__(self, cache):
 477         self.cache = cache
 478         self.data = {}
 479
 480     @staticmethod
 481     def _mk_array(tokens):
 482         return '{%s}' % ','.join((str(s) for s in tokens))
 483
 484
 485     def add_names(self, conn, names):
 486         """ Adds token information for the normalised names.
 487         """
 488         # Start with all partial names
 489         terms = set((part for ns in names for part in ns.split()))
 490         # Add partials for the full terms (TO BE REMOVED)
 491         terms.update((n for n in names))
 492         # Add the full names
 493         terms.update((' ' + n for n in names))
 494
 495         self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))
 496
 497
 498     def add_housenumbers(self, conn, hnrs):
 499         """ Extract housenumber information from a list of normalised
 500             housenumbers.
 501         """
 502         self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
 503         self.data['hnr'] = ';'.join(hnrs)
 504
 505
 506     def add_street(self, conn, street):
 507         """ Add addr:street match terms.
 508         """
 509         if not street:
 510             return
 511
 512         term = ' ' + street
 513
 514         tid = self.cache.names.get(term)
 515
 516         if tid is None:
 517             with conn.cursor() as cur:
 518                 cur.execute("""SELECT word_id FROM word
 519                                 WHERE word_token = %s
 520                                       and class is null and type is null""",
 521                             (term, ))
 522                 if cur.rowcount > 0:
 523                     tid = cur.fetchone()[0]
 524                     self.cache.names[term] = tid
 525
 526         if tid is not None:
 527             self.data['street'] = '{%d}' % tid
 528
 529
 530     def add_place(self, conn, place):
 531         """ Add addr:place search and match terms.
 532         """
 533         if not place:
 534             return
 535
 536         partial_ids = self.cache.get_term_tokens(conn, place.split())
 537         tid = self.cache.get_term_tokens(conn, [' ' + place])
 538
 539         self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
 540         self.data['place_match'] = '{%s}' % tid[0]
 541
 542
 543     def add_address_terms(self, conn, terms):
 544         """ Add additional address terms.
 545         """
 546         tokens = {}
 547
 548         for key, value in terms:
 549             if not value:
 550                 continue
 551             partial_ids = self.cache.get_term_tokens(conn, value.split())
 552             term = ' ' + value
 553             tid = self.cache.names.get(term)
 554
 555             if tid is None:
 556                 with conn.cursor() as cur:
 557                     cur.execute("""SELECT word_id FROM word
 558                                     WHERE word_token = %s
 559                                           and class is null and type is null""",
 560                                 (term, ))
 561                     if cur.rowcount > 0:
 562                         tid = cur.fetchone()[0]
 563                         self.cache.names[term] = tid
 564
 565             tokens[key] = [self._mk_array(partial_ids),
 566                            '{%s}' % ('' if tid is None else str(tid))]
 567
 568         if tokens:
 569             self.data['addr'] = tokens
 570
 571
 572 class _TokenCache:
 573     """ Cache for token information to avoid repeated database queries.
 574
 575         This cache is not thread-safe and needs to be instantiated per
 576         analyzer.
 577     """
 578     def __init__(self):
 579         self.names = {}
 580         self.postcodes = set()
 581         self.housenumbers = {}
 582
 583
 584     def get_term_tokens(self, conn, terms):
 585         """ Get token ids for a list of terms, looking them up in the database
 586             if necessary.
 587         """
 588         tokens = []
 589         askdb = []
 590
 591         for term in terms:
 592             token = self.names.get(term)
 593             if token is None:
 594                 askdb.append(term)
 595             elif token != 0:
 596                 tokens.append(token)
 597
 598         if askdb:
 599             with conn.cursor() as cur:
 600                 cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
 601                             (askdb, ))
 602                 for term, tid in cur:
 603                     self.names[term] = tid
 604                     if tid != 0:
 605                         tokens.append(tid)
 606
 607         return tokens
 608
 609
 610     def get_hnr_tokens(self, conn, terms):
 611         """ Get token ids for a list of housenumbers, looking them up in the
 612             database if necessary.
 613         """
 614         tokens = []
 615         askdb = []
 616
 617         for term in terms:
 618             token = self.housenumbers.get(term)
 619             if token is None:
 620                 askdb.append(term)
 621             else:
 622                 tokens.append(token)
 623
 624         if askdb:
 625             with conn.cursor() as cur:
 626                 cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
 627                             (askdb, ))
 628                 for term, tid in cur:
 629                     self.housenumbers[term] = tid
 630                     tokens.append(tid)
 631
 632         return tokens