from textwrap import dedent
from nominatim.db.connection import connect
-from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
self.dsn = dsn
self.data_dir = data_dir
self.loader = None
- self.term_normalization = None
def init_new_db(self, config, init_db=True):
"""
self.loader = ICURuleLoader(config)
- self.term_normalization = config.TERM_NORMALIZATION
-
self._install_php(config.lib_dir.php)
self._save_config()
with connect(self.dsn) as conn:
self.loader.load_config_from_db(conn)
- self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
def finalize_import(self, config):
def check_database(self, config):
""" Check that the tokenizer is set up correctly.
"""
+ # Will raise an error if the tokenizer configuration is missing or invalid.
self.init_from_project(config)
- if self.term_normalization is None:
- return "Configuration for tokenizer 'icu' are missing."
-
- return None
-
def update_statistics(self):
""" Recompute frequencies for all name words.
"""
with connect(self.dsn) as conn:
- with conn.cursor() as cur:
- cur.drop_table("word_frequencies")
- LOG.info("Computing word frequencies")
- cur.execute("""CREATE TEMP TABLE word_frequencies AS
- SELECT unnest(name_vector) as id, count(*)
- FROM search_name GROUP BY id""")
- cur.execute("CREATE INDEX ON word_frequencies(id)")
- LOG.info("Update word table with recomputed frequencies")
- cur.execute("""UPDATE word
- SET info = info || jsonb_build_object('count', count)
- FROM word_frequencies WHERE word_id = id""")
- cur.drop_table("word_frequencies")
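+ # Frequencies are derived from search_name, which is absent on
+ # reverse-only imports, so skip the recount in that case.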
+ if conn.table_exists('search_name'):
+ with conn.cursor() as cur:
+ cur.drop_table("word_frequencies")
+ LOG.info("Computing word frequencies")
+ cur.execute("""CREATE TEMP TABLE word_frequencies AS
+ SELECT unnest(name_vector) as id, count(*)
+ FROM search_name GROUP BY id""")
+ cur.execute("CREATE INDEX ON word_frequencies(id)")
+ LOG.info("Update word table with recomputed frequencies")
+ cur.execute("""UPDATE word
+ SET info = info || jsonb_build_object('count', count)
+ FROM word_frequencies WHERE word_id = id""")
+ cur.drop_table("word_frequencies")
conn.commit()
php_file.write_text(dedent(f"""\
<?php
@define('CONST_Max_Word_Frequency', 10000000);
- @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
+ @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
@define('CONST_Transliteration', "{self.loader.get_search_rules()}");
require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
"""
with connect(self.dsn) as conn:
self.loader.save_config_to_db(conn)
- set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
def _init_db_tables(self, config):
def _process_place_address(self, token_info, address):
hnrs = []
addr_terms = []
+ streets = []
for item in address:
if item.kind == 'postcode':
self._add_postcode(item.name)
elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
hnrs.append(item.name)
elif item.kind == 'street':
- token_info.add_street(self._compute_partial_tokens(item.name))
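+ # Streets match on their full name token only; drop the entry
+ # when no full token exists for the name.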
+ token = self._retrieve_full_token(item.name)
+ if token:
+ streets.append(token)
elif item.kind == 'place':
- token_info.add_place(self._compute_partial_tokens(item.name))
- elif not item.kind.startswith('_') and \
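+ # Suffixed variants of the tag are ignored for matching.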
+ if not item.suffix:
+ token_info.add_place(self._compute_partial_tokens(item.name))
+ elif not item.kind.startswith('_') and not item.suffix and \
item.kind not in ('country', 'full'):
addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
if addr_terms:
token_info.add_address_terms(addr_terms)
+ if streets:
+ token_info.add_street(streets)
+
def _compute_partial_tokens(self, name):
""" Normalize the given term, split it into partial words and return
return tokens
+ def _retrieve_full_token(self, name):
+ """ Get the full name token for the given name, if it exists.
+ The name is only retrieved for the standard analyser.
+ """
+ norm_name = self._normalized(name)
+
+ # return cached if possible
+ if norm_name in self._cache.fulls:
+ return self._cache.fulls[norm_name]
+
+ # otherwise check the name cache filled by _compute_name_tokens
+ full, _ = self._cache.names.get(norm_name, (None, None))
+
+ if full is None:
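+ # not cached yet: look the name up in the word table directly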
+ with self.conn.cursor() as cur:
+ cur.execute("SELECT word_id FROM word WHERE word = %s and type = 'W' LIMIT 1",
+ (norm_name, ))
+ if cur.rowcount > 0:
+ full = cur.fetchone()[0]
+
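+ # cache the result, including misses, so each name hits the database at most once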
+ self._cache.fulls[norm_name] = full
+
+ return full
+
+
def _compute_name_tokens(self, names):
""" Computes the full name and partial name tokens for the given
dictionary of names.
def add_street(self, tokens):
""" Add addr:street match terms.
"""
- if tokens:
- self.data['street'] = self._mk_array(tokens)
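+ # The caller guarantees a non-empty token list, so no guard is needed here.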
+ self.data['street'] = self._mk_array(tokens)
def add_place(self, tokens):
def __init__(self):
self.names = {}
self.partials = {}
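+ # full-name tokens looked up from the word table, keyed by normalized name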
+ self.fulls = {}
self.postcodes = set()
self.housenumbers = {}