nominatim/tokenizer/legacy_tokenizer.py

   1 """
   2 Tokenizer implementing normalisation as used before Nominatim 4.
   3 """
   4 from collections import OrderedDict
   5 import logging
   6 import re
   7 import shutil
   8 from textwrap import dedent
   9
  10 from icu import Transliterator
  11 import psycopg2
  12 import psycopg2.extras
  13
  14 from nominatim.db.connection import connect
  15 from nominatim.db import properties
  16 from nominatim.db import utils as db_utils
  17 from nominatim.db.sql_preprocessor import SQLPreprocessor
  18 from nominatim.errors import UsageError
  19
# Names of the database properties under which the tokenizer stores its
# settings: the term normalization rules and the maximum word frequency
# (written in LegacyTokenizer._save_config()).
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

# getLogger() without a name returns the root logger.
LOG = logging.getLogger()
  24
  25 def create(dsn, data_dir):
  26     """ Create a new instance of the tokenizer provided by this module.
  27     """
  28     return LegacyTokenizer(dsn, data_dir)
  29
  30
  31 def _install_module(config_module_path, src_dir, module_dir):
  32     """ Copies the PostgreSQL normalisation module into the project
  33         directory if necessary. For historical reasons the module is
  34         saved in the '/module' subdirectory and not with the other tokenizer
  35         data.
  36
  37         The function detects when the installation is run from the
  38         build directory. It doesn't touch the module in that case.
  39     """
  40     # Custom module locations are simply used as is.
  41     if config_module_path:
  42         LOG.info("Using custom path for database module at '%s'", config_module_path)
  43         return config_module_path
  44
  45     # Compatibility mode for builddir installations.
  46     if module_dir.exists() and src_dir.samefile(module_dir):
  47         LOG.info('Running from build directory. Leaving database module as is.')
  48         return module_dir
  49
  50     # In any other case install the module in the project directory.
  51     if not module_dir.exists():
  52         module_dir.mkdir()
  53
  54     destfile = module_dir / 'nominatim.so'
  55     shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
  56     destfile.chmod(0o755)
  57
  58     LOG.info('Database module installed at %s', str(destfile))
  59
  60     return module_dir
  61
  62
  63 def _check_module(module_dir, conn):
  64     """ Try to use the PostgreSQL module to confirm that it is correctly
  65         installed and accessible from PostgreSQL.
  66     """
  67     with conn.cursor() as cur:
  68         try:
  69             cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
  70                            RETURNS text AS '{}/nominatim.so', 'transliteration'
  71                            LANGUAGE c IMMUTABLE STRICT;
  72                            DROP FUNCTION nominatim_test_import_func(text)
  73                         """.format(module_dir))
  74         except psycopg2.DatabaseError as err:
  75             LOG.fatal("Error accessing database module: %s", err)
  76             raise UsageError("Database module cannot be accessed.") from err
  77
  78
  79 class LegacyTokenizer:
  80     """ The legacy tokenizer uses a special PostgreSQL module to normalize
  81         names and queries. The tokenizer thus implements normalization through
  82         calls to the database.
  83     """
  84
  85     def __init__(self, dsn, data_dir):
  86         self.dsn = dsn
  87         self.data_dir = data_dir
  88         self.normalization = None
  89
  90
  91     def init_new_db(self, config, init_db=True):
  92         """ Set up a new tokenizer for the database.
  93
  94             This copies all necessary data in the project directory to make
  95             sure the tokenizer remains stable even over updates.
  96         """
  97         module_dir = _install_module(config.DATABASE_MODULE_PATH,
  98                                      config.lib_dir.module,
  99                                      config.project_dir / 'module')
 100
 101         self.normalization = config.TERM_NORMALIZATION
 102
 103         self._install_php(config)
 104
 105         with connect(self.dsn) as conn:
 106             _check_module(module_dir, conn)
 107             self._save_config(conn, config)
 108             conn.commit()
 109
 110         if init_db:
 111             self.update_sql_functions(config)
 112             self._init_db_tables(config)
 113
 114
 115     def init_from_project(self):
 116         """ Initialise the tokenizer from the project directory.
 117         """
 118         with connect(self.dsn) as conn:
 119             self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
 120
 121
 122     def finalize_import(self, config):
 123         """ Do any required postprocessing to make the tokenizer data ready
 124             for use.
 125         """
 126         with connect(self.dsn) as conn:
 127             sqlp = SQLPreprocessor(conn, config)
 128             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
 129
 130
 131     def update_sql_functions(self, config):
 132         """ Reimport the SQL functions for this tokenizer.
 133         """
 134         with connect(self.dsn) as conn:
 135             max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
 136             modulepath = config.DATABASE_MODULE_PATH or \
 137                          str((config.project_dir / 'module').resolve())
 138             sqlp = SQLPreprocessor(conn, config)
 139             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
 140                               max_word_freq=max_word_freq,
 141                               modulepath=modulepath)
 142
 143
 144     def check_database(self):
 145         """ Check that the tokenizer is set up correctly.
 146         """
 147         hint = """\
 148              The Postgresql extension nominatim.so was not correctly loaded.
 149
 150              Error: {error}
 151
 152              Hints:
 153              * Check the output of the CMmake/make installation step
 154              * Does nominatim.so exist?
 155              * Does nominatim.so exist on the database server?
 156              * Can nominatim.so be accessed by the database user?
 157              """
 158         with connect(self.dsn) as conn:
 159             with conn.cursor() as cur:
 160                 try:
 161                     out = cur.scalar("SELECT make_standard_name('a')")
 162                 except psycopg2.Error as err:
 163                     return hint.format(error=str(err))
 164
 165         if out != 'a':
 166             return hint.format(error='Unexpected result for make_standard_name()')
 167
 168         return None
 169
 170
 171     def migrate_database(self, config):
 172         """ Initialise the project directory of an existing database for
 173             use with this tokenizer.
 174
 175             This is a special migration function for updating existing databases
 176             to new software versions.
 177         """
 178         self.normalization = config.TERM_NORMALIZATION
 179         module_dir = _install_module(config.DATABASE_MODULE_PATH,
 180                                      config.lib_dir.module,
 181                                      config.project_dir / 'module')
 182
 183         with connect(self.dsn) as conn:
 184             _check_module(module_dir, conn)
 185             self._save_config(conn, config)
 186
 187
 188     def name_analyzer(self):
 189         """ Create a new analyzer for tokenizing names and queries
 190             using this tokinzer. Analyzers are context managers and should
 191             be used accordingly:
 192
 193             ```
 194             with tokenizer.name_analyzer() as analyzer:
 195                 analyser.tokenize()
 196             ```
 197
 198             When used outside the with construct, the caller must ensure to
 199             call the close() function before destructing the analyzer.
 200
 201             Analyzers are not thread-safe. You need to instantiate one per thread.
 202         """
 203         normalizer = Transliterator.createFromRules("phrase normalizer",
 204                                                     self.normalization)
 205         return LegacyNameAnalyzer(self.dsn, normalizer)
 206
 207
 208     def _install_php(self, config):
 209         """ Install the php script for the tokenizer.
 210         """
 211         php_file = self.data_dir / "tokenizer.php"
 212         php_file.write_text(dedent("""\
 213             <?php
 214             @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
 215             @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
 216             require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
 217             """.format(config)))
 218
 219
 220     def _init_db_tables(self, config):
 221         """ Set up the word table and fill it with pre-computed word
 222             frequencies.
 223         """
 224         with connect(self.dsn) as conn:
 225             sqlp = SQLPreprocessor(conn, config)
 226             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
 227             conn.commit()
 228
 229         LOG.warning("Precomputing word tokens")
 230         db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
 231
 232
 233     def _save_config(self, conn, config):
 234         """ Save the configuration that needs to remain stable for the given
 235             database as database properties.
 236         """
 237         properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
 238         properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
 239
 240
 241 class LegacyNameAnalyzer:
 242     """ The legacy analyzer uses the special Postgresql module for
 243         splitting names.
 244
 245         Each instance opens a connection to the database to request the
 246         normalization.
 247     """
 248
 249     def __init__(self, dsn, normalizer):
 250         self.conn = connect(dsn).connection
 251         self.conn.autocommit = True
 252         self.normalizer = normalizer
 253         psycopg2.extras.register_hstore(self.conn)
 254
 255         self._cache = _TokenCache(self.conn)
 256
 257
 258     def __enter__(self):
 259         return self
 260
 261
 262     def __exit__(self, exc_type, exc_value, traceback):
 263         self.close()
 264
 265
 266     def close(self):
 267         """ Free all resources used by the analyzer.
 268         """
 269         if self.conn:
 270             self.conn.close()
 271             self.conn = None
 272
 273
 274     @staticmethod
 275     def get_word_token_info(conn, words):
 276         """ Return token information for the given list of words.
 277             If a word starts with # it is assumed to be a full name
 278             otherwise is a partial name.
 279
 280             The function returns a list of tuples with
 281             (original word, word token, word id).
 282
 283             The function is used for testing and debugging only
 284             and not necessarily efficient.
 285         """
 286         with conn.cursor() as cur:
 287             cur.execute("""SELECT t.term, word_token, word_id
 288                            FROM word, (SELECT unnest(%s::TEXT[]) as term) t
 289                            WHERE word_token = (CASE
 290                                    WHEN left(t.term, 1) = '#' THEN
 291                                      ' ' || make_standard_name(substring(t.term from 2))
 292                                    ELSE
 293                                      make_standard_name(t.term)
 294                                    END)
 295                                  and class is null and country_code is null""",
 296                         (words, ))
 297
 298             return [(r[0], r[1], r[2]) for r in cur]
 299
 300
 301     def normalize(self, phrase):
 302         """ Normalize the given phrase, i.e. remove all properties that
 303             are irrelevant for search.
 304         """
 305         return self.normalizer.transliterate(phrase)
 306
 307
 308     def add_postcodes_from_db(self):
 309         """ Add postcodes from the location_postcode table to the word table.
 310         """
 311         with self.conn.cursor() as cur:
 312             cur.execute("""SELECT count(create_postcode_id(pc))
 313                            FROM (SELECT distinct(postcode) as pc
 314                                  FROM location_postcode) x""")
 315
 316
 317     def update_special_phrases(self, phrases, should_replace):
 318         """ Replace the search index for special phrases with the new phrases.
 319         """
 320         norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
 321                             for p in phrases))
 322
 323         with self.conn.cursor() as cur:
 324             # Get the old phrases.
 325             existing_phrases = set()
 326             cur.execute("""SELECT word, class, type, operator FROM word
 327                            WHERE class != 'place'
 328                                  OR (type != 'house' AND type != 'postcode')""")
 329             for label, cls, typ, oper in cur:
 330                 existing_phrases.add((label, cls, typ, oper or '-'))
 331
 332             to_add = norm_phrases - existing_phrases
 333             to_delete = existing_phrases - norm_phrases
 334
 335             if to_add:
 336                 psycopg2.extras.execute_values(
 337                     cur,
 338                     """ INSERT INTO word (word_id, word_token, word, class, type,
 339                                           search_name_count, operator)
 340                         (SELECT nextval('seq_word'), make_standard_name(name), name,
 341                                 class, type, 0,
 342                                 CASE WHEN op in ('in', 'near') THEN op ELSE null END
 343                            FROM (VALUES %s) as v(name, class, type, op))""",
 344                     to_add)
 345
 346             if to_delete and should_replace:
 347                 psycopg2.extras.execute_values(
 348                     cur,
 349                     """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 350                         WHERE word = name and class = in_class and type = in_type
 351                               and ((op = '-' and operator is null) or op = operator)""",
 352                     to_delete)
 353
 354         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 355                  len(norm_phrases), len(to_add), len(to_delete))
 356
 357
 358     def add_country_names(self, country_code, names):
 359         """ Add names for the given country to the search index.
 360         """
 361         with self.conn.cursor() as cur:
 362             cur.execute(
 363                 """INSERT INTO word (word_id, word_token, country_code)
 364                    (SELECT nextval('seq_word'), lookup_token, %s
 365                       FROM (SELECT ' ' || make_standard_name(n) as lookup_token
 366                             FROM unnest(%s)n) y
 367                       WHERE NOT EXISTS(SELECT * FROM word
 368                                        WHERE word_token = lookup_token and country_code = %s))
 369                 """, (country_code, names, country_code))
 370
 371
 372     def process_place(self, place):
 373         """ Determine tokenizer information about the given place.
 374
 375             Returns a JSON-serialisable structure that will be handed into
 376             the database via the token_info field.
 377         """
 378         token_info = _TokenInfo(self._cache)
 379
 380         names = place.get('name')
 381
 382         if names:
 383             token_info.add_names(self.conn, names)
 384
 385             country_feature = place.get('country_feature')
 386             if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
 387                 self.add_country_names(country_feature.lower(), list(names.values()))
 388
 389         address = place.get('address')
 390
 391         if address:
 392             hnrs = []
 393             addr_terms = []
 394             for key, value in address.items():
 395                 if key == 'postcode':
 396                     self._add_postcode(value)
 397                 elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
 398                     hnrs.append(value)
 399                 elif key == 'street':
 400                     token_info.add_street(self.conn, value)
 401                 elif key == 'place':
 402                     token_info.add_place(self.conn, value)
 403                 elif not key.startswith('_') and \
 404                      key not in ('country', 'full'):
 405                     addr_terms.append((key, value))
 406
 407             if hnrs:
 408                 token_info.add_housenumbers(self.conn, hnrs)
 409
 410             if addr_terms:
 411                 token_info.add_address_terms(self.conn, addr_terms)
 412
 413         return token_info.data
 414
 415
 416     def _add_postcode(self, postcode):
 417         """ Make sure the normalized postcode is present in the word table.
 418         """
 419         def _create_postcode_from_db(pcode):
 420             with self.conn.cursor() as cur:
 421                 cur.execute('SELECT create_postcode_id(%s)', (pcode, ))
 422
 423         if re.search(r'[:,;]', postcode) is None:
 424             self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)
 425
 426
 427 class _TokenInfo:
 428     """ Collect token information to be sent back to the database.
 429     """
 430     def __init__(self, cache):
 431         self.cache = cache
 432         self.data = {}
 433
 434
 435     def add_names(self, conn, names):
 436         """ Add token information for the names of the place.
 437         """
 438         with conn.cursor() as cur:
 439             # Create the token IDs for all names.
 440             self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
 441                                             (names, ))
 442
 443
 444     def add_housenumbers(self, conn, hnrs):
 445         """ Extract housenumber information from the address.
 446         """
 447         if len(hnrs) == 1:
 448             token = self.cache.get_housenumber(hnrs[0])
 449             if token is not None:
 450                 self.data['hnr_tokens'] = token
 451                 self.data['hnr'] = hnrs[0]
 452                 return
 453
 454         # split numbers if necessary
 455         simple_list = []
 456         for hnr in hnrs:
 457             simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
 458
 459         if len(simple_list) > 1:
 460             simple_list = list(set(simple_list))
 461
 462         with conn.cursor() as cur:
 463             cur.execute("SELECT (create_housenumbers(%s)).* ", (simple_list, ))
 464             self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
 465
 466
 467     def add_street(self, conn, street):
 468         """ Add addr:street match terms.
 469         """
 470         def _get_street(name):
 471             with conn.cursor() as cur:
 472                 return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))
 473
 474         self.data['street'] = self.cache.streets.get(street, _get_street)
 475
 476
 477     def add_place(self, conn, place):
 478         """ Add addr:place search and match terms.
 479         """
 480         def _get_place(name):
 481             with conn.cursor() as cur:
 482                 cur.execute("""SELECT (addr_ids_from_name(%s)
 483                                        || getorcreate_name_id(make_standard_name(%s), ''))::text,
 484                                       word_ids_from_name(%s)::text""",
 485                             (name, name, name))
 486                 return cur.fetchone()
 487
 488         self.data['place_search'], self.data['place_match'] = \
 489             self.cache.places.get(place, _get_place)
 490
 491
 492     def add_address_terms(self, conn, terms):
 493         """ Add additional address terms.
 494         """
 495         def _get_address_term(name):
 496             with conn.cursor() as cur:
 497                 cur.execute("""SELECT addr_ids_from_name(%s)::text,
 498                                       word_ids_from_name(%s)::text""",
 499                             (name, name))
 500                 return cur.fetchone()
 501
 502         tokens = {}
 503         for key, value in terms:
 504             tokens[key] = self.cache.address_terms.get(value, _get_address_term)
 505
 506         self.data['addr'] = tokens
 507
 508
 509 class _LRU:
 510     """ Least recently used cache that accepts a generator function to
 511         produce the item when there is a cache miss.
 512     """
 513
 514     def __init__(self, maxsize=128, init_data=None):
 515         self.data = init_data or OrderedDict()
 516         self.maxsize = maxsize
 517         if init_data is not None and len(init_data) > maxsize:
 518             self.maxsize = len(init_data)
 519
 520     def get(self, key, generator):
 521         """ Get the item with the given key from the cache. If nothing
 522             is found in the cache, generate the value through the
 523             generator function and store it in the cache.
 524         """
 525         value = self.data.get(key)
 526         if value is not None:
 527             self.data.move_to_end(key)
 528         else:
 529             value = generator(key)
 530             if len(self.data) >= self.maxsize:
 531                 self.data.popitem(last=False)
 532             self.data[key] = value
 533
 534         return value
 535
 536
 537 class _TokenCache:
 538     """ Cache for token information to avoid repeated database queries.
 539
 540         This cache is not thread-safe and needs to be instantiated per
 541         analyzer.
 542     """
 543     def __init__(self, conn):
 544         # various LRU caches
 545         self.streets = _LRU(maxsize=256)
 546         self.places = _LRU(maxsize=128)
 547         self.address_terms = _LRU(maxsize=1024)
 548
 549         # Lookup houseunumbers up to 100 and cache them
 550         with conn.cursor() as cur:
 551             cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
 552                            FROM generate_series(1, 100) as i""")
 553             self._cached_housenumbers = {str(r[0]) : r[1] for r in cur}
 554
 555         # Get postcodes that are already saved
 556         postcodes = OrderedDict()
 557         with conn.cursor() as cur:
 558             cur.execute("""SELECT word FROM word
 559                            WHERE class ='place' and type = 'postcode'""")
 560             for row in cur:
 561                 postcodes[row[0]] = None
 562         self.postcodes = _LRU(maxsize=32, init_data=postcodes)
 563
 564     def get_housenumber(self, number):
 565         """ Get a housenumber token from the cache.
 566         """
 567         return self._cached_housenumbers.get(number)