X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/63e35574d4998ea64c1b26b1680d14ff4fd14036..78fcabade88c885f2506911201e52de912204b56:/nominatim/tokenizer/legacy_tokenizer.py diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index bf117783..8957426b 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -16,6 +16,7 @@ from nominatim.db import properties from nominatim.db import utils as db_utils from nominatim.db.sql_preprocessor import SQLPreprocessor from nominatim.errors import UsageError +from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer DBCFG_NORMALIZATION = "tokenizer_normalization" DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq" @@ -76,7 +77,7 @@ def _check_module(module_dir, conn): raise UsageError("Database module cannot be accessed.") from err -class LegacyTokenizer: +class LegacyTokenizer(AbstractTokenizer): """ The legacy tokenizer uses a special PostgreSQL module to normalize names and queries. The tokenizer thus implements normalization through calls to the database. @@ -238,7 +239,7 @@ class LegacyTokenizer: properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY) -class LegacyNameAnalyzer: +class LegacyNameAnalyzer(AbstractAnalyzer): """ The legacy analyzer uses the special Postgresql module for splitting names. @@ -255,14 +256,6 @@ class LegacyNameAnalyzer: self._cache = _TokenCache(self.conn) - def __enter__(self): - return self - - - def __exit__(self, exc_type, exc_value, traceback): - self.close() - - def close(self): """ Free all resources used by the analyzer. """ @@ -271,8 +264,7 @@ class LegacyNameAnalyzer: self.conn = None - @staticmethod - def get_word_token_info(conn, words): + def get_word_token_info(self, words): """ Return token information for the given list of words. If a word starts with # it is assumed to be a full name otherwise is a partial name. @@ -283,7 +275,7 @@ class LegacyNameAnalyzer: The function is used for testing and debugging only and not necessarily efficient. """ - with conn.cursor() as cur: + with self.conn.cursor() as cur: cur.execute("""SELECT t.term, word_token, word_id FROM word, (SELECT unnest(%s::TEXT[]) as term) t WHERE word_token = (CASE @@ -352,7 +344,7 @@ class LegacyNameAnalyzer: - def update_special_phrases(self, phrases): + def update_special_phrases(self, phrases, should_replace): """ Replace the search index for special phrases with the new phrases. """ norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3]) @@ -371,19 +363,17 @@ class LegacyNameAnalyzer: to_delete = existing_phrases - norm_phrases if to_add: - psycopg2.extras.execute_values( - cur, + cur.execute_values( """ INSERT INTO word (word_id, word_token, word, class, type, search_name_count, operator) - (SELECT nextval('seq_word'), make_standard_name(name), name, + (SELECT nextval('seq_word'), ' ' || make_standard_name(name), name, class, type, 0, CASE WHEN op in ('in', 'near') THEN op ELSE null END FROM (VALUES %s) as v(name, class, type, op))""", to_add) - if to_delete: - psycopg2.extras.execute_values( - cur, + if to_delete and should_replace: + cur.execute_values( """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op) WHERE word = name and class = in_class and type = in_type and ((op = '-' and operator is null) or op = operator)""", @@ -400,11 +390,11 @@ class LegacyNameAnalyzer: cur.execute( """INSERT INTO word (word_id, word_token, country_code) (SELECT nextval('seq_word'), lookup_token, %s - FROM (SELECT ' ' || make_standard_name(n) as lookup_token + FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token FROM unnest(%s)n) y WHERE NOT EXISTS(SELECT * FROM word WHERE word_token = lookup_token and country_code = %s)) - """, (country_code, names, country_code)) + """, (country_code, list(names.values()), country_code)) def process_place(self, place): @@ -422,45 +412,40 @@ class LegacyNameAnalyzer: country_feature = place.get('country_feature') if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): - self.add_country_names(country_feature.lower(), list(names.values())) + self.add_country_names(country_feature.lower(), names) address = place.get('address') - if address: - hnrs = [] - addr_terms = [] - for key, value in address.items(): - if key == 'postcode': - self._add_postcode(value) - elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'): - hnrs.append(value) - elif key == 'street': - token_info.add_street(self.conn, value) - elif key == 'place': - token_info.add_place(self.conn, value) - elif not key.startswith('_') and \ - key not in ('country', 'full'): - addr_terms.append((key, value)) - - if hnrs: - token_info.add_housenumbers(self.conn, hnrs) - - if addr_terms: - token_info.add_address_terms(self.conn, addr_terms) + self._process_place_address(token_info, address) return token_info.data - def _add_postcode(self, postcode): - """ Make sure the normalized postcode is present in the word table. - """ - def _create_postcode_from_db(pcode): - with self.conn.cursor() as cur: - cur.execute('SELECT create_postcode_id(%s)', (pcode, )) + def _process_place_address(self, token_info, address): + hnrs = [] + addr_terms = [] + + for key, value in address.items(): + if key == 'postcode': + # Make sure the normalized postcode is present in the word table. + if re.search(r'[:,;]', value) is None: + self._cache.add_postcode(self.conn, + self.normalize_postcode(value)) + elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'): + hnrs.append(value) + elif key == 'street': + token_info.add_street(self.conn, value) + elif key == 'place': + token_info.add_place(self.conn, value) + elif not key.startswith('_') and key not in ('country', 'full'): + addr_terms.append((key, value)) + + if hnrs: + token_info.add_housenumbers(self.conn, hnrs) + + if addr_terms: + token_info.add_address_terms(self.conn, addr_terms) - if re.search(r'[:,;]', postcode) is None: - self._cache.postcodes.get(self.normalize_postcode(postcode), - _create_postcode_from_db) class _TokenInfo: @@ -518,10 +503,9 @@ class _TokenInfo: """ def _get_place(name): with conn.cursor() as cur: - cur.execute("""SELECT (addr_ids_from_name(%s) - || getorcreate_name_id(make_standard_name(%s), ''))::text, + cur.execute("""SELECT make_keywords(hstore('name' , %s))::text, word_ids_from_name(%s)::text""", - (name, name, name)) + (name, name)) return cur.fetchone() self.data['place_search'], self.data['place_match'] = \ @@ -589,18 +573,21 @@ class _TokenCache: with conn.cursor() as cur: cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text FROM generate_series(1, 100) as i""") - self._cached_housenumbers = {str(r[0]) : r[1] for r in cur} + self._cached_housenumbers = {str(r[0]): r[1] for r in cur} - # Get postcodes that are already saved - postcodes = OrderedDict() - with conn.cursor() as cur: - cur.execute("""SELECT word FROM word - WHERE class ='place' and type = 'postcode'""") - for row in cur: - postcodes[row[0]] = None - self.postcodes = _LRU(maxsize=32, init_data=postcodes) + # For postcodes remember the ones that have already been added + self.postcodes = set() def get_housenumber(self, number): """ Get a housenumber token from the cache. """ return self._cached_housenumbers.get(number) + + + def add_postcode(self, conn, postcode): + """ Make sure the given postcode is in the database. + """ + if postcode not in self.postcodes: + with conn.cursor() as cur: + cur.execute('SELECT create_postcode_id(%s)', (postcode, )) + self.postcodes.add(postcode)