X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/3eb4d8805700ba12bd601e552c3bc48064423083..a5970d75486bae0f0532b4e4bb12e82d625c4750:/nominatim/tokenizer/legacy_tokenizer.py diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index d6755835..6040f88f 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -119,6 +119,15 @@ class LegacyTokenizer: self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION) + def finalize_import(self, config): + """ Do any required postprocessing to make the tokenizer data ready + for use. + """ + with connect(self.dsn) as conn: + sqlp = SQLPreprocessor(conn, config) + sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql') + + def update_sql_functions(self, config): """ Reimport the SQL functions for this tokenizer. """ @@ -132,6 +141,33 @@ class LegacyTokenizer: modulepath=modulepath) + def check_database(self): + """ Check that the tokenizer is set up correctly. + """ + hint = """\ + The Postgresql extension nominatim.so was not correctly loaded. + + Error: {error} + + Hints: + * Check the output of the CMmake/make installation step + * Does nominatim.so exist? + * Does nominatim.so exist on the database server? + * Can nominatim.so be accessed by the database user? + """ + with connect(self.dsn) as conn: + with conn.cursor() as cur: + try: + out = cur.scalar("SELECT make_standard_name('a')") + except psycopg2.Error as err: + return hint.format(error=str(err)) + + if out != 'a': + return hint.format(error='Unexpected result for make_standard_name()') + + return None + + def migrate_database(self, config): """ Initialise the project directory of an existing database for use with this tokenizer. @@ -235,6 +271,32 @@ class LegacyNameAnalyzer: self.conn = None + def get_word_token_info(self, words): + """ Return token information for the given list of words. + If a word starts with # it is assumed to be a full name + otherwise is a partial name. + + The function returns a list of tuples with + (original word, word token, word id). + + The function is used for testing and debugging only + and not necessarily efficient. + """ + with self.conn.cursor() as cur: + cur.execute("""SELECT t.term, word_token, word_id + FROM word, (SELECT unnest(%s::TEXT[]) as term) t + WHERE word_token = (CASE + WHEN left(t.term, 1) = '#' THEN + ' ' || make_standard_name(substring(t.term from 2)) + ELSE + make_standard_name(t.term) + END) + and class is null and country_code is null""", + (words, )) + + return [(r[0], r[1], r[2]) for r in cur] + + def normalize(self, phrase): """ Normalize the given phrase, i.e. remove all properties that are irrelevant for search. @@ -242,16 +304,54 @@ class LegacyNameAnalyzer: return self.normalizer.transliterate(phrase) - def add_postcodes_from_db(self): - """ Add postcodes from the location_postcode table to the word table. + @staticmethod + def normalize_postcode(postcode): + """ Convert the postcode to a standardized form. + + This function must yield exactly the same result as the SQL function + 'token_normalized_postcode()'. + """ + return postcode.strip().upper() + + + def update_postcodes_from_db(self): + """ Update postcode tokens in the word table from the location_postcode + table. """ with self.conn.cursor() as cur: - cur.execute("""SELECT count(create_postcode_id(pc)) - FROM (SELECT distinct(postcode) as pc - FROM location_postcode) x""") + # This finds us the rows in location_postcode and word that are + # missing in the other table. + cur.execute("""SELECT * FROM + (SELECT pc, word FROM + (SELECT distinct(postcode) as pc FROM location_postcode) p + FULL JOIN + (SELECT word FROM word + WHERE class ='place' and type = 'postcode') w + ON pc = word) x + WHERE pc is null or word is null""") + + to_delete = [] + to_add = [] + + for postcode, word in cur: + if postcode is None: + to_delete.append(word) + else: + to_add.append(postcode) + + if to_delete: + cur.execute("""DELETE FROM WORD + WHERE class ='place' and type = 'postcode' + and word = any(%s) + """, (to_delete, )) + if to_add: + cur.execute("""SELECT count(create_postcode_id(pc)) + FROM unnest(%s) as pc + """, (to_add, )) - def update_special_phrases(self, phrases): + + def update_special_phrases(self, phrases, should_replace): """ Replace the search index for special phrases with the new phrases. """ norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3]) @@ -274,13 +374,13 @@ class LegacyNameAnalyzer: cur, """ INSERT INTO word (word_id, word_token, word, class, type, search_name_count, operator) - (SELECT nextval('seq_word'), make_standard_name(name), name, + (SELECT nextval('seq_word'), ' ' || make_standard_name(name), name, class, type, 0, CASE WHEN op in ('in', 'near') THEN op ELSE null END FROM (VALUES %s) as v(name, class, type, op))""", to_add) - if to_delete: + if to_delete and should_replace: psycopg2.extras.execute_values( cur, """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op) @@ -299,11 +399,11 @@ class LegacyNameAnalyzer: cur.execute( """INSERT INTO word (word_id, word_token, country_code) (SELECT nextval('seq_word'), lookup_token, %s - FROM (SELECT ' ' || make_standard_name(n) as lookup_token + FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token FROM unnest(%s)n) y WHERE NOT EXISTS(SELECT * FROM word WHERE word_token = lookup_token and country_code = %s)) - """, (country_code, names, country_code)) + """, (country_code, list(names.values()), country_code)) def process_place(self, place): @@ -321,7 +421,7 @@ class LegacyNameAnalyzer: country_feature = place.get('country_feature') if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): - self.add_country_names(country_feature.lower(), list(names.values())) + self.add_country_names(country_feature.lower(), names) address = place.get('address') @@ -353,12 +453,8 @@ class LegacyNameAnalyzer: def _add_postcode(self, postcode): """ Make sure the normalized postcode is present in the word table. """ - def _create_postcode_from_db(pcode): - with self.conn.cursor() as cur: - cur.execute('SELECT create_postcode_id(%s)', (pcode, )) - if re.search(r'[:,;]', postcode) is None: - self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db) + self._cache.add_postcode(self.conn, self.normalize_postcode(postcode)) class _TokenInfo: @@ -416,10 +512,9 @@ class _TokenInfo: """ def _get_place(name): with conn.cursor() as cur: - cur.execute("""SELECT (addr_ids_from_name(%s) - || getorcreate_name_id(make_standard_name(%s), ''))::text, + cur.execute("""SELECT make_keywords(hstore('name' , %s))::text, word_ids_from_name(%s)::text""", - (name, name, name)) + (name, name)) return cur.fetchone() self.data['place_search'], self.data['place_match'] = \ @@ -489,16 +584,19 @@ class _TokenCache: FROM generate_series(1, 100) as i""") self._cached_housenumbers = {str(r[0]) : r[1] for r in cur} - # Get postcodes that are already saved - postcodes = OrderedDict() - with conn.cursor() as cur: - cur.execute("""SELECT word FROM word - WHERE class ='place' and type = 'postcode'""") - for row in cur: - postcodes[row[0]] = None - self.postcodes = _LRU(maxsize=32, init_data=postcodes) + # For postcodes remember the ones that have already been added + self.postcodes = set() def get_housenumber(self, number): """ Get a housenumber token from the cache. """ return self._cached_housenumbers.get(number) + + + def add_postcode(self, conn, postcode): + """ Make sure the given postcode is in the database. + """ + if postcode not in self.postcodes: + with conn.cursor() as cur: + cur.execute('SELECT create_postcode_id(%s)', (postcode, )) + self.postcodes.add(postcode)