nominatim/tokenizer/icu_tokenizer.py

   1 """
   2 Tokenizer implementing normalisation as used before Nominatim 4 but using
   3 libICU instead of the PostgreSQL module.
   4 """
   5 from collections import Counter
   6 import itertools
   7 import json
   8 import logging
   9 import re
  10 from textwrap import dedent
  11 from pathlib import Path
  12
  13 from nominatim.db.connection import connect
  14 from nominatim.db.properties import set_property, get_property
  15 from nominatim.db.utils import CopyBuffer
  16 from nominatim.db.sql_preprocessor import SQLPreprocessor
  17 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  18 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
  19 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  20
# Names of the database properties under which the tokenizer settings are
# saved and read back (used with set_property()/get_property() in this file).
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

# getLogger() without a name returns the root logger.
LOG = logging.getLogger()
  25
  26 def create(dsn, data_dir):
  27     """ Create a new instance of the tokenizer provided by this module.
  28     """
  29     return LegacyICUTokenizer(dsn, data_dir)
  30
  31
  32 class LegacyICUTokenizer(AbstractTokenizer):
  33     """ This tokenizer uses libICU to covert names and queries to ASCII.
  34         Otherwise it uses the same algorithms and data structures as the
  35         normalization routines in Nominatim 3.
  36     """
  37
  38     def __init__(self, dsn, data_dir):
  39         self.dsn = dsn
  40         self.data_dir = data_dir
  41         self.naming_rules = None
  42         self.term_normalization = None
  43         self.max_word_frequency = None
  44
  45
  46     def init_new_db(self, config, init_db=True):
  47         """ Set up a new tokenizer for the database.
  48
  49             This copies all necessary data in the project directory to make
  50             sure the tokenizer remains stable even over updates.
  51         """
  52         if config.TOKENIZER_CONFIG:
  53             cfgfile = Path(config.TOKENIZER_CONFIG)
  54         else:
  55             cfgfile = config.config_dir / 'icu_tokenizer.yaml'
  56
  57         loader = ICURuleLoader(cfgfile)
  58         self.naming_rules = ICUNameProcessorRules(loader=loader)
  59         self.term_normalization = config.TERM_NORMALIZATION
  60         self.max_word_frequency = config.MAX_WORD_FREQUENCY
  61
  62         self._install_php(config.lib_dir.php)
  63         self._save_config(config)
  64
  65         if init_db:
  66             self.update_sql_functions(config)
  67             self._init_db_tables(config)
  68
  69
  70     def init_from_project(self):
  71         """ Initialise the tokenizer from the project directory.
  72         """
  73         with connect(self.dsn) as conn:
  74             self.naming_rules = ICUNameProcessorRules(conn=conn)
  75             self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
  76             self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
  77
  78
  79     def finalize_import(self, _):
  80         """ Do any required postprocessing to make the tokenizer data ready
  81             for use.
  82         """
  83
  84
  85     def update_sql_functions(self, config):
  86         """ Reimport the SQL functions for this tokenizer.
  87         """
  88         with connect(self.dsn) as conn:
  89             max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
  90             sqlp = SQLPreprocessor(conn, config)
  91             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql',
  92                               max_word_freq=max_word_freq)
  93
  94
  95     def check_database(self):
  96         """ Check that the tokenizer is set up correctly.
  97         """
  98         self.init_from_project()
  99
 100         if self.naming_rules is None:
 101             return "Configuration for tokenizer 'icu' are missing."
 102
 103         return None
 104
 105
 106     def name_analyzer(self):
 107         """ Create a new analyzer for tokenizing names and queries
 108             using this tokinzer. Analyzers are context managers and should
 109             be used accordingly:
 110
 111             ```
 112             with tokenizer.name_analyzer() as analyzer:
 113                 analyser.tokenize()
 114             ```
 115
 116             When used outside the with construct, the caller must ensure to
 117             call the close() function before destructing the analyzer.
 118
 119             Analyzers are not thread-safe. You need to instantiate one per thread.
 120         """
 121         return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
 122
 123
 124     def _install_php(self, phpdir):
 125         """ Install the php script for the tokenizer.
 126         """
 127         php_file = self.data_dir / "tokenizer.php"
 128         php_file.write_text(dedent(f"""\
 129             <?php
 130             @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
 131             @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
 132             @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
 133             require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
 134
 135
 136     def _save_config(self, config):
 137         """ Save the configuration that needs to remain stable for the given
 138             database as database properties.
 139         """
 140         with connect(self.dsn) as conn:
 141             self.naming_rules.save_rules(conn)
 142
 143             set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
 144             set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
 145
 146
 147     def _init_db_tables(self, config):
 148         """ Set up the word table and fill it with pre-computed word
 149             frequencies.
 150         """
 151         with connect(self.dsn) as conn:
 152             sqlp = SQLPreprocessor(conn, config)
 153             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
 154             conn.commit()
 155
 156             LOG.warning("Precomputing word tokens")
 157
 158             # get partial words and their frequencies
 159             words = self._count_partial_terms(conn)
 160
 161             # copy them back into the word table
 162             with CopyBuffer() as copystr:
 163                 for term, cnt in words.items():
 164                     copystr.add('w', term, json.dumps({'count': cnt}))
 165
 166                 with conn.cursor() as cur:
 167                     copystr.copy_out(cur, 'word',
 168                                      columns=['type', 'word_token', 'info'])
 169                     cur.execute("""UPDATE word SET word_id = nextval('seq_word')
 170                                    WHERE word_id is null and type = 'w'""")
 171
 172             conn.commit()
 173
 174     def _count_partial_terms(self, conn):
 175         """ Count the partial terms from the names in the place table.
 176         """
 177         words = Counter()
 178         name_proc = ICUNameProcessor(self.naming_rules)
 179
 180         with conn.cursor(name="words") as cur:
 181             cur.execute(""" SELECT v, count(*) FROM
 182                               (SELECT svals(name) as v FROM place)x
 183                             WHERE length(v) < 75 GROUP BY v""")
 184
 185             for name, cnt in cur:
 186                 terms = set()
 187                 for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
 188                     if ' ' in word:
 189                         terms.update(word.split())
 190                 for term in terms:
 191                     words[term] += cnt
 192
 193         return words
 194
 195
 196 class LegacyICUNameAnalyzer(AbstractAnalyzer):
 197     """ The legacy analyzer uses the ICU library for splitting names.
 198
 199         Each instance opens a connection to the database to request the
 200         normalization.
 201     """
 202
 203     def __init__(self, dsn, name_proc):
 204         self.conn = connect(dsn).connection
 205         self.conn.autocommit = True
 206         self.name_processor = name_proc
 207
 208         self._cache = _TokenCache()
 209
 210
 211     def close(self):
 212         """ Free all resources used by the analyzer.
 213         """
 214         if self.conn:
 215             self.conn.close()
 216             self.conn = None
 217
 218
 219     def get_word_token_info(self, words):
 220         """ Return token information for the given list of words.
 221             If a word starts with # it is assumed to be a full name
 222             otherwise is a partial name.
 223
 224             The function returns a list of tuples with
 225             (original word, word token, word id).
 226
 227             The function is used for testing and debugging only
 228             and not necessarily efficient.
 229         """
 230         full_tokens = {}
 231         partial_tokens = {}
 232         for word in words:
 233             if word.startswith('#'):
 234                 full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
 235             else:
 236                 partial_tokens[word] = self.name_processor.get_search_normalized(word)
 237
 238         with self.conn.cursor() as cur:
 239             cur.execute("""SELECT word_token, word_id
 240                             FROM word WHERE word_token = ANY(%s) and type = 'W'
 241                         """, (list(full_tokens.values()),))
 242             full_ids = {r[0]: r[1] for r in cur}
 243             cur.execute("""SELECT word_token, word_id
 244                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
 245                         (list(partial_tokens.values()),))
 246             part_ids = {r[0]: r[1] for r in cur}
 247
 248         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
 249                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
 250
 251
 252     @staticmethod
 253     def normalize_postcode(postcode):
 254         """ Convert the postcode to a standardized form.
 255
 256             This function must yield exactly the same result as the SQL function
 257             'token_normalized_postcode()'.
 258         """
 259         return postcode.strip().upper()
 260
 261
 262     def _make_standard_hnr(self, hnr):
 263         """ Create a normalised version of a housenumber.
 264
 265             This function takes minor shortcuts on transliteration.
 266         """
 267         return self.name_processor.get_search_normalized(hnr)
 268
 269     def update_postcodes_from_db(self):
 270         """ Update postcode tokens in the word table from the location_postcode
 271             table.
 272         """
 273         to_delete = []
 274         with self.conn.cursor() as cur:
 275             # This finds us the rows in location_postcode and word that are
 276             # missing in the other table.
 277             cur.execute("""SELECT * FROM
 278                             (SELECT pc, word FROM
 279                               (SELECT distinct(postcode) as pc FROM location_postcode) p
 280                               FULL JOIN
 281                               (SELECT word FROM word WHERE type = 'P') w
 282                               ON pc = word) x
 283                            WHERE pc is null or word is null""")
 284
 285             with CopyBuffer() as copystr:
 286                 for postcode, word in cur:
 287                     if postcode is None:
 288                         to_delete.append(word)
 289                     else:
 290                         copystr.add(self.name_processor.get_search_normalized(postcode),
 291                                     'P', postcode)
 292
 293                 if to_delete:
 294                     cur.execute("""DELETE FROM WORD
 295                                    WHERE type ='P' and word = any(%s)
 296                                 """, (to_delete, ))
 297
 298                 copystr.copy_out(cur, 'word',
 299                                  columns=['word_token', 'type', 'word'])
 300
 301
 302     def update_special_phrases(self, phrases, should_replace):
 303         """ Replace the search index for special phrases with the new phrases.
 304             If `should_replace` is True, then the previous set of will be
 305             completely replaced. Otherwise the phrases are added to the
 306             already existing ones.
 307         """
 308         norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
 309                             for p in phrases))
 310
 311         with self.conn.cursor() as cur:
 312             # Get the old phrases.
 313             existing_phrases = set()
 314             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
 315             for word, info in cur:
 316                 existing_phrases.add((word, info['class'], info['type'],
 317                                       info.get('op') or '-'))
 318
 319             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
 320             if should_replace:
 321                 deleted = self._remove_special_phrases(cur, norm_phrases,
 322                                                        existing_phrases)
 323             else:
 324                 deleted = 0
 325
 326         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 327                  len(norm_phrases), added, deleted)
 328
 329
 330     def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
 331         """ Add all phrases to the database that are not yet there.
 332         """
 333         to_add = new_phrases - existing_phrases
 334
 335         added = 0
 336         with CopyBuffer() as copystr:
 337             for word, cls, typ, oper in to_add:
 338                 term = self.name_processor.get_search_normalized(word)
 339                 if term:
 340                     copystr.add(term, 'S', word,
 341                                 json.dumps({'class': cls, 'type': typ,
 342                                             'op': oper if oper in ('in', 'near') else None}))
 343                     added += 1
 344
 345             copystr.copy_out(cursor, 'word',
 346                              columns=['word_token', 'type', 'word', 'info'])
 347
 348         return added
 349
 350
 351     @staticmethod
 352     def _remove_special_phrases(cursor, new_phrases, existing_phrases):
 353         """ Remove all phrases from the databse that are no longer in the
 354             new phrase list.
 355         """
 356         to_delete = existing_phrases - new_phrases
 357
 358         if to_delete:
 359             cursor.execute_values(
 360                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 361                     WHERE type = 'S' and word = name
 362                           and info->>'class' = in_class and info->>'type' = in_type
 363                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
 364                 """, to_delete)
 365
 366         return len(to_delete)
 367
 368
 369     def add_country_names(self, country_code, names):
 370         """ Add names for the given country to the search index.
 371         """
 372         word_tokens = set()
 373         for name in self._compute_full_names(names):
 374             norm_name = self.name_processor.get_search_normalized(name)
 375             if norm_name:
 376                 word_tokens.add(norm_name)
 377
 378         with self.conn.cursor() as cur:
 379             # Get existing names
 380             cur.execute("""SELECT word_token FROM word
 381                             WHERE type = 'C' and word = %s""",
 382                         (country_code, ))
 383             word_tokens.difference_update((t[0] for t in cur))
 384
 385             # Only add those names that are not yet in the list.
 386             if word_tokens:
 387                 cur.execute("""INSERT INTO word (word_token, type, word)
 388                                (SELECT token, 'C', %s
 389                                 FROM unnest(%s) as token)
 390                             """, (country_code, list(word_tokens)))
 391
 392             # No names are deleted at the moment.
 393             # If deletion is made possible, then the static names from the
 394             # initial 'country_name' table should be kept.
 395
 396
 397     def process_place(self, place):
 398         """ Determine tokenizer information about the given place.
 399
 400             Returns a JSON-serialisable structure that will be handed into
 401             the database via the token_info field.
 402         """
 403         token_info = _TokenInfo(self._cache)
 404
 405         names = place.get('name')
 406
 407         if names:
 408             fulls, partials = self._compute_name_tokens(names)
 409
 410             token_info.add_names(fulls, partials)
 411
 412             country_feature = place.get('country_feature')
 413             if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
 414                 self.add_country_names(country_feature.lower(), names)
 415
 416         address = place.get('address')
 417         if address:
 418             self._process_place_address(token_info, address)
 419
 420         return token_info.data
 421
 422
 423     def _process_place_address(self, token_info, address):
 424         hnrs = []
 425         addr_terms = []
 426         for key, value in address.items():
 427             if key == 'postcode':
 428                 self._add_postcode(value)
 429             elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
 430                 hnrs.append(value)
 431             elif key == 'street':
 432                 token_info.add_street(*self._compute_name_tokens({'name': value}))
 433             elif key == 'place':
 434                 token_info.add_place(*self._compute_name_tokens({'name': value}))
 435             elif not key.startswith('_') and \
 436                  key not in ('country', 'full'):
 437                 addr_terms.append((key, *self._compute_name_tokens({'name': value})))
 438
 439         if hnrs:
 440             hnrs = self._split_housenumbers(hnrs)
 441             token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
 442
 443         if addr_terms:
 444             token_info.add_address_terms(addr_terms)
 445
 446
 447     def _compute_name_tokens(self, names):
 448         """ Computes the full name and partial name tokens for the given
 449             dictionary of names.
 450         """
 451         full_names = self._compute_full_names(names)
 452         full_tokens = set()
 453         partial_tokens = set()
 454
 455         for name in full_names:
 456             norm_name = self.name_processor.get_normalized(name)
 457             full, part = self._cache.names.get(norm_name, (None, None))
 458             if full is None:
 459                 variants = self.name_processor.get_variants_ascii(norm_name)
 460                 if not variants:
 461                     continue
 462
 463                 with self.conn.cursor() as cur:
 464                     cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
 465                                 (norm_name, variants))
 466                     full, part = cur.fetchone()
 467
 468                 self._cache.names[norm_name] = (full, part)
 469
 470             full_tokens.add(full)
 471             partial_tokens.update(part)
 472
 473         return full_tokens, partial_tokens
 474
 475
 476     @staticmethod
 477     def _compute_full_names(names):
 478         """ Return the set of all full name word ids to be used with the
 479             given dictionary of names.
 480         """
 481         full_names = set()
 482         for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
 483             if name:
 484                 full_names.add(name)
 485
 486                 brace_idx = name.find('(')
 487                 if brace_idx >= 0:
 488                     full_names.add(name[:brace_idx].strip())
 489
 490         return full_names
 491
 492
 493     def _add_postcode(self, postcode):
 494         """ Make sure the normalized postcode is present in the word table.
 495         """
 496         if re.search(r'[:,;]', postcode) is None:
 497             postcode = self.normalize_postcode(postcode)
 498
 499             if postcode not in self._cache.postcodes:
 500                 term = self.name_processor.get_search_normalized(postcode)
 501                 if not term:
 502                     return
 503
 504                 with self.conn.cursor() as cur:
 505                     # no word_id needed for postcodes
 506                     cur.execute("""INSERT INTO word (word_token, type, word)
 507                                    (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
 508                                     WHERE NOT EXISTS
 509                                      (SELECT * FROM word
 510                                       WHERE type = 'P' and word = pc))
 511                                 """, (term, postcode))
 512                 self._cache.postcodes.add(postcode)
 513
 514
 515     @staticmethod
 516     def _split_housenumbers(hnrs):
 517         if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
 518             # split numbers if necessary
 519             simple_list = []
 520             for hnr in hnrs:
 521                 simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
 522
 523             if len(simple_list) > 1:
 524                 hnrs = list(set(simple_list))
 525             else:
 526                 hnrs = simple_list
 527
 528         return hnrs
 529
 530
 531
 532
 533 class _TokenInfo:
 534     """ Collect token information to be sent back to the database.
 535     """
 536     def __init__(self, cache):
 537         self._cache = cache
 538         self.data = {}
 539
 540     @staticmethod
 541     def _mk_array(tokens):
 542         return '{%s}' % ','.join((str(s) for s in tokens))
 543
 544
 545     def add_names(self, fulls, partials):
 546         """ Adds token information for the normalised names.
 547         """
 548         self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
 549
 550
 551     def add_housenumbers(self, conn, hnrs):
 552         """ Extract housenumber information from a list of normalised
 553             housenumbers.
 554         """
 555         self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
 556         self.data['hnr'] = ';'.join(hnrs)
 557
 558
 559     def add_street(self, fulls, _):
 560         """ Add addr:street match terms.
 561         """
 562         if fulls:
 563             self.data['street'] = self._mk_array(fulls)
 564
 565
 566     def add_place(self, fulls, partials):
 567         """ Add addr:place search and match terms.
 568         """
 569         if fulls:
 570             self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
 571             self.data['place_match'] = self._mk_array(fulls)
 572
 573
 574     def add_address_terms(self, terms):
 575         """ Add additional address terms.
 576         """
 577         tokens = {}
 578
 579         for key, fulls, partials in terms:
 580             if fulls:
 581                 tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
 582                                self._mk_array(fulls)]
 583
 584         if tokens:
 585             self.data['addr'] = tokens
 586
 587
 588 class _TokenCache:
 589     """ Cache for token information to avoid repeated database queries.
 590
 591         This cache is not thread-safe and needs to be instantiated per
 592         analyzer.
 593     """
 594     def __init__(self):
 595         self.names = {}
 596         self.postcodes = set()
 597         self.housenumbers = {}
 598
 599
 600     def get_hnr_tokens(self, conn, terms):
 601         """ Get token ids for a list of housenumbers, looking them up in the
 602             database if necessary. `terms` is an iterable of normalized
 603             housenumbers.
 604         """
 605         tokens = []
 606         askdb = []
 607
 608         for term in terms:
 609             token = self.housenumbers.get(term)
 610             if token is None:
 611                 askdb.append(term)
 612             else:
 613                 tokens.append(token)
 614
 615         if askdb:
 616             with conn.cursor() as cur:
 617                 cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
 618                             (askdb, ))
 619                 for term, tid in cur:
 620                     self.housenumbers[term] = tid
 621                     tokens.append(tid)
 622
 623         return tokens