"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
import logging
import re
import shutil

import psycopg2
import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError
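
# Keys under which the tokenizer stores its configuration as database
# properties (see _save_config() and init_from_project() below).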
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyTokenizer(dsn, data_dir)
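
# Illustrative sketch only (not part of the original module): callers are
# expected to obtain a tokenizer through the create() factory above and then
# either set up a fresh database or restore a saved configuration, roughly:
#
#   tokenizer = create(dsn, data_dir)
#   tokenizer.init_new_db(config)      # on a fresh import
#   tokenizer.init_from_project()      # on an already initialised database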


def _install_module(config_module_path, src_dir, module_dir):
    """ Copies the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.
    """
    # Custom module locations are simply used as is.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        return config_module_path

    # Compatibility mode for builddir installations.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return module_dir

    # In any other case install the module in the project directory.
    if not module_dir.exists():
        module_dir.mkdir()

    destfile = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
    # The module must be readable and executable by the PostgreSQL server.
    destfile.chmod(0o755)

    LOG.info('Database module installed at %s', str(destfile))

    return module_dir


def _check_module(module_dir, conn):
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.
    """
    with conn.cursor() as cur:
        try:
            # Create a throw-away C function from the module and drop it
            # again. If this succeeds, PostgreSQL can load nominatim.so.
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS '{}/nominatim.so', 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """.format(module_dir))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err


class LegacyTokenizer:
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization = None


    def init_new_db(self, config):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        self.normalization = config.TERM_NORMALIZATION

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)

        self.update_sql_functions(config)
        self._init_db_tables(config)


    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
            modulepath = config.DATABASE_MODULE_PATH or \
                         str((config.project_dir / 'module').resolve())
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                              max_word_freq=max_word_freq,
                              modulepath=modulepath)


    def migrate_database(self, config):
        """ Initialise the project directory of an existing database for
            use with this tokenizer.

            This is a special migration function for updating existing databases
            to new software versions.
        """
        self.normalization = config.TERM_NORMALIZATION
        module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                     config.lib_dir.module,
                                     config.project_dir / 'module')

        with connect(self.dsn) as conn:
            _check_module(module_dir, conn)
            self._save_config(conn, config)


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyNameAnalyzer(self.dsn)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')


    def _save_config(self, conn, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)


class LegacyNameAnalyzer:
    """ The legacy analyzer uses the special PostgreSQL module for
        splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        # Name dictionaries are passed as hstore parameters in the queries below.
        psycopg2.extras.register_hstore(self.conn)


    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        token_info.add_names(self.conn, place.get('name'), place.get('country_feature'))

        return token_info.data


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self):
        self.data = {}


    def add_names(self, conn, names, country_feature):
        """ Add token information for the names of the place.
        """
        if not names:
            return

        with conn.cursor() as cur:
            # Create the token IDs for all names.
            self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
                                            (names, ))

            # Add country tokens to word table if necessary.
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                cur.execute("SELECT create_country(%s, %s)",
                            (names, country_feature.lower()))
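
# Rough usage sketch for the analyzer side (illustrative only, based on the
# docstrings above): one analyzer is created per indexing thread and the
# structure returned by process_place() ends up in the database's token_info
# field, roughly:
#
#   with tokenizer.name_analyzer() as analyzer:
#       place['token_info'] = analyzer.process_place(place)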