"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import json
import logging
import re
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.naming_rules = None
        self.term_normalization = None
        self.max_word_frequency = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
                                                             config='TOKENIZER_CONFIG'))
        self.naming_rules = ICUNameProcessorRules(loader=loader)
        self.term_normalization = config.TERM_NORMALIZATION
        self.max_word_frequency = config.MAX_WORD_FREQUENCY

        self._install_php(config.lib_dir.php)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.naming_rules = ICUNameProcessorRules(conn=conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)

    def finalize_import(self, _):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql',
                              max_word_freq=max_word_freq)

    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.naming_rules is None:
            return "Configuration for tokenizer 'icu' is missing."

        return None

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    ...

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))

    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))

    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.naming_rules.save_rules(conn)

            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = self._count_partial_terms(conn)

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for term, cnt in words.items():
                    copystr.add('w', term, json.dumps({'count': cnt}))

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['type', 'word_token', 'info'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null and type = 'w'""")

            conn.commit()

    def _count_partial_terms(self, conn):
        """ Count the partial terms from the names in the place table.
        """
        words = Counter()
        name_proc = ICUNameProcessor(self.naming_rules)

        with conn.cursor(name="words") as cur:
            cur.execute(""" SELECT v, count(*) FROM
                              (SELECT svals(name) as v FROM place)x
                            WHERE length(v) < 75 GROUP BY v""")

            for name, cnt in cur:
                terms = set()
                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                    if ' ' in word:
                        terms.update(word.split())
                for term in terms:
                    words[term] += cnt

        return words


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.name_processor = name_proc

        self._cache = _TokenCache()

    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is taken to be a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
            else:
                partial_tokens[word] = self.name_processor.get_search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                             FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
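
    # Illustrative usage (not part of the original module), given an analyzer on
    # an already imported database; words prefixed with '#' are looked up as full
    # names, all others as partial names:
    #
    #   info = analyzer.get_word_token_info(['#Main Street', 'main'])
    #   # -> [('#Main Street', <search-normalized token>, <word id or None>),
    #   #     ('main', <search-normalized token>, <word id or None>)]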

    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
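
    # For illustration (not from the original source): normalization only strips
    # surrounding whitespace and upper-cases, e.g. ' se1 9gf ' becomes 'SE1 9GF'.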

    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self.name_processor.get_search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)
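
    # For illustration (not part of the original module): each entry in `phrases`
    # is a 4-tuple of (phrase, class, type, operator), e.g. the hypothetical
    # ('bar', 'amenity', 'bar', '-') where '-' means "no operator". The normalised
    # phrases are diffed against the existing type 'S' rows of the word table.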

    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self.name_processor.get_search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added

    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            norm_name = self.name_processor.get_search_normalized(name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Fetch the names already stored for this country.
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.
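
    # Illustrative call (not part of the original module), assuming OSM-style name
    # tags: add_country_names('de', {'name': 'Deutschland', 'name:en': 'Germany'})
    # stores the search-normalized token of each spelling as a type 'C' row keyed
    # by the country code.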

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), names)

        address = place.get('address')
        if address:
            self._process_place_address(token_info, address)

        return token_info.data
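
    # Shape of the returned structure, for illustration (derived from _TokenInfo
    # below, not a guaranteed schema): a dict that may contain the keys 'names',
    # 'hnr_tokens', 'hnr', 'street', 'place_search', 'place_match' and 'addr',
    # with token sets encoded as PostgreSQL array literals such as '{1,2,3}'.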

    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        for key, value in address.items():
            if key == 'postcode':
                self._add_postcode(value)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(*self._compute_name_tokens({'name': value}))
            elif key == 'place':
                token_info.add_place(*self._compute_name_tokens({'name': value}))
            elif not key.startswith('_') and \
                 key not in ('country', 'full'):
                addr_terms.append((key, *self._compute_name_tokens({'name': value})))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)

    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full names to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
            if name:
                full_names.add(name)

                brace_idx = name.find('(')
                if brace_idx >= 0:
                    full_names.add(name[:brace_idx].strip())

        return full_names
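
    # For illustration (not from the original source): a name value such as
    # 'Dom St. Peter;Regensburger Dom (Dom)' is first split on ';' and ',' into
    # 'Dom St. Peter' and 'Regensburger Dom (Dom)'; for the bracketed entry the
    # variant without the bracket part, 'Regensburger Dom', is added as well.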

    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.name_processor.get_search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)

    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
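
    # For illustration (not from the original source): ['4;5a', '7'] is split on
    # ';' and ',' into '4', '5a' and '7'; duplicates are removed via a set, so the
    # order of the returned list is not guaranteed.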


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}

    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))
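
    # For illustration (not from the original source): _mk_array([1, 2, 3])
    # returns the PostgreSQL array literal '{1,2,3}'.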

    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, fulls, _):
        """ Add addr:street match terms.
        """
        if fulls:
            self.data['street'] = self._mk_array(fulls)

    def add_place(self, fulls, partials):
        """ Add addr:place search and match terms.
        """
        if fulls:
            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
            self.data['place_match'] = self._mk_array(fulls)

    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, fulls, partials in terms:
            if fulls:
                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
                               self._mk_array(fulls)]

        if tokens:
            self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.postcodes = set()
        self.housenumbers = {}

    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []

        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens
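

# Illustrative end-to-end sketch (not part of the original module), assuming a
# configured project directory and an imported database; `dsn` and `project_dir`
# are placeholders:
#
#   tokenizer = create(dsn, project_dir)
#   tokenizer.init_from_project()
#   with tokenizer.name_analyzer() as analyzer:
#       token_info = analyzer.process_place({'name': {'name': 'Main Street'},
#                                            'address': {'housenumber': '4;5'}})
#   # token_info is the JSON-serialisable dict handed to the database via the
#   # token_info field.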