CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB)
RETURNS BOOLEAN
AS $$
- SELECT info->>'place_match' is not null;
+ SELECT info->>'place' is not null;
$$ LANGUAGE SQL IMMUTABLE;
CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[])
RETURNS BOOLEAN
AS $$
- SELECT (info->>'street')::INTEGER[] && street_tokens
+ SELECT (info->>'street')::INTEGER[] <@ street_tokens
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[])
RETURNS BOOLEAN
AS $$
- SELECT (info->>'place_match')::INTEGER[] && place_tokens
+ SELECT (info->>'place')::INTEGER[] <@ place_tokens
$$ LANGUAGE SQL IMMUTABLE STRICT;
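Note on the operator change: `&&` tests mere overlap, while `<@` requires containment, so every token stored on the object must now appear among the tokens passed in, not just any one of them. A minimal sketch of the two operators in Python set terms, with invented token ids:

    # Illustrative only -- token ids are invented for this example.
    object_tokens = {101, 102}            # (info->>'street')::INTEGER[]
    query_tokens = {100, 101, 102, 103}   # street_tokens argument

    assert object_tokens <= query_tokens  # <@ : containment (new behaviour)
    assert object_tokens & query_tokens   # && : any overlap sufficed before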
CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB)
RETURNS INTEGER[]
AS $$
- SELECT (info->>'place_search')::INTEGER[]
+ SELECT (info->>'place')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT)
RETURNS INTEGER[]
AS $$
- SELECT (info->'addr'->key->>0)::INTEGER[];
+ SELECT (info->'addr'->>key)::INTEGER[];
$$ LANGUAGE SQL IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
RETURNS BOOLEAN
AS $$
- SELECT (info->'addr'->key->>1)::INTEGER[] && tokens;
+ SELECT (info->'addr'->>key)::INTEGER[] <@ tokens;
$$ LANGUAGE SQL IMMUTABLE STRICT;
VALUES (term_id, term, 'w', json_build_object('count', term_count));
END IF;
- IF term_count < {{ max_word_freq }} THEN
- partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
- END IF;
+ partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
END LOOP;
END;
$$
LANGUAGE plpgsql;
+CREATE OR REPLACE FUNCTION getorcreate_partial_word(partial TEXT)
+ RETURNS INTEGER
+ AS $$
+DECLARE
+ token INTEGER;
+BEGIN
+ SELECT min(word_id) INTO token
+ FROM word WHERE word_token = partial and type = 'w';
+
+ IF token IS NULL THEN
+ token := nextval('seq_word');
+ INSERT INTO word (word_id, word_token, type, info)
+ VALUES (token, partial, 'w', json_build_object('count', 0));
+ END IF;
+
+ RETURN token;
+END;
+$$
+LANGUAGE plpgsql;
+
+
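The new function follows a get-or-create contract: the first call inserts the partial word with a zero count, every later call returns the existing id. A hypothetical check, assuming a psycopg2 connection `conn` to a database where the function above is installed:

    # Hypothetical usage sketch -- `conn` is assumed, not part of the patch.
    with conn.cursor() as cur:
        cur.execute("SELECT getorcreate_partial_word('rose')")
        first = cur.fetchone()[0]
        cur.execute("SELECT getorcreate_partial_word('rose')")
        assert cur.fetchone()[0] == first  # same word_id is reused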
CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT)
RETURNS INTEGER
AS $$
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
-DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
LOG = logging.getLogger()
self.data_dir = data_dir
self.naming_rules = None
self.term_normalization = None
- self.max_word_frequency = None
def init_new_db(self, config, init_db=True):
config='TOKENIZER_CONFIG'))
self.naming_rules = ICUNameProcessorRules(loader=loader)
self.term_normalization = config.TERM_NORMALIZATION
- self.max_word_frequency = config.MAX_WORD_FREQUENCY
self._install_php(config.lib_dir.php)
self._save_config(config)
with connect(self.dsn) as conn:
self.naming_rules = ICUNameProcessorRules(conn=conn)
self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
- self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
def finalize_import(self, _):
""" Reimport the SQL functions for this tokenizer.
"""
with connect(self.dsn) as conn:
- max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
sqlp = SQLPreprocessor(conn, config)
- sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql',
- max_word_freq=max_word_freq)
+ sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
def check_database(self):
php_file = self.data_dir / "tokenizer.php"
php_file.write_text(dedent(f"""\
<?php
- @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
+ @define('CONST_Max_Word_Frequency', 10000000);
@define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
@define('CONST_Transliteration', "{self.naming_rules.search_rules}");
require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
with connect(self.dsn) as conn:
self.naming_rules.save_rules(conn)
- set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
hnrs.append(value)
elif key == 'street':
- token_info.add_street(*self._compute_name_tokens({'name': value}))
+ token_info.add_street(self._compute_partial_tokens(value))
elif key == 'place':
- token_info.add_place(*self._compute_name_tokens({'name': value}))
+ token_info.add_place(self._compute_partial_tokens(value))
elif not key.startswith('_') and \
key not in ('country', 'full'):
- addr_terms.append((key, *self._compute_name_tokens({'name': value})))
+ addr_terms.append((key, self._compute_partial_tokens(value)))
if hnrs:
hnrs = self._split_housenumbers(hnrs)
if addr_terms:
token_info.add_address_terms(addr_terms)
+ def _compute_partial_tokens(self, name):
+ """ Normalize the given term, split it into partial words and return
+ the token list for them.
+ """
+ norm_name = self.name_processor.get_search_normalized(name)
+
+ tokens = []
+ need_lookup = []
+ for partial in norm_name.split():
+ token = self._cache.partials.get(partial)
+ if token:
+ tokens.append(token)
+ else:
+ need_lookup.append(partial)
+
+ if need_lookup:
+ with self.conn.cursor() as cur:
+ cur.execute("""SELECT word, getorcreate_partial_word(word)
+ FROM unnest(%s) word""",
+ (need_lookup, ))
+
+ for partial, token in cur:
+ tokens.append(token)
+ self._cache.partials[partial] = token
+
+ return tokens
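For illustration, assuming an analyzer instance `analyzer` whose search normalization lower-cases the input, a call could look like this (token ids invented):

    # Illustrative only: 'Rose Street' normalizes to 'rose street', which
    # splits into the partials ['rose', 'street'].
    tokens = analyzer._compute_partial_tokens('Rose Street')
    # e.g. [101, 102]; both ids are now cached in analyzer._cache.partials,
    # so repeated terms skip the database round trip.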
def _compute_name_tokens(self, names):
""" Computes the full name and partial name tokens for the given
self.data['hnr'] = ';'.join(hnrs)
- def add_street(self, fulls, _):
+ def add_street(self, tokens):
""" Add addr:street match terms.
"""
- if fulls:
- self.data['street'] = self._mk_array(fulls)
+ if tokens:
+ self.data['street'] = self._mk_array(tokens)
- def add_place(self, fulls, partials):
+ def add_place(self, tokens):
""" Add addr:place search and match terms.
"""
- if fulls:
- self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
- self.data['place_match'] = self._mk_array(fulls)
+ if tokens:
+ self.data['place'] = self._mk_array(tokens)
def add_address_terms(self, terms):
""" Add additional address terms.
"""
- tokens = {}
-
- for key, fulls, partials in terms:
- if fulls:
- tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
- self._mk_array(fulls)]
+ tokens = {key: self._mk_array(partials)
+ for key, partials in terms if partials}
if tokens:
self.data['addr'] = tokens
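Taken together, these changes flatten the token info: `street`, `place` and each extra address key now carry a single array of partial-word tokens instead of the earlier search/match pairs. A hypothetical resulting payload (ids invented, assuming `_mk_array` renders a PostgreSQL array literal) that the SQL accessors above read back via `info->>'street'`, `info->>'place'` and `info->'addr'->>key`:

    # Hypothetical token_info.data after processing one place.
    data = {
        'street': '{101,102}',            # partial tokens of addr:street
        'place': '{201}',                 # partial tokens of addr:place
        'addr': {'suburb': '{301,302}'}   # one token array per address key
    }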
"""
def __init__(self):
self.names = {}
+ self.partials = {}
self.postcodes = set()
self.housenumbers = {}
Then placex contains
| object | parent_place_id |
| N1 | N2 |
- Then search_name contains
- | object | name_vector | nameaddress_vector |
- | N1 | #Walltown | Strange, Town |
When sending search query "23 Rose Street"
Then exactly 1 result is returned
And results contain
| W1 | highway | residential | Rose Street | :w-north |
| N2 | place | city | Strange Town | :p-N1 |
When importing
- Then search_name contains
- | object | name_vector | nameaddress_vector |
- | N1 | #Walltown, #Blue house | Walltown, Strange, Town |
When sending search query "23 Walltown, Strange Town"
Then results contain
| osm | display_name |
| W1 | highway | residential | Rose Street | :w-north |
| N2 | place | city | Strange Town | :p-N1 |
When importing
- Then search_name contains
- | object | name_vector | nameaddress_vector |
- | N1 | #Moon sun, #Blue house | Moon, Sun, Strange, Town |
When sending search query "23 Moon Sun, Strange Town"
Then results contain
| osm | display_name |
| W1 | highway | residential | Rose Street | Walltown | :w-north |
| N2 | place | suburb | Strange Town | Walltown | :p-N1 |
When importing
- Then search_name contains
- | object | name_vector | nameaddress_vector |
- | N1 | #Walltown | Strange, Town |
When sending search query "23 Rose Street, Walltown"
Then exactly 1 result is returned
And results contain
| W1 | highway | residential | Rose Street | :w-north |
| N2 | place | suburb | Strange Town | :p-N1 |
When importing
- Then search_name contains
- | object | name_vector | nameaddress_vector |
- | N1 | #Green Moss | Walltown |
When sending search query "Green Moss, Rose Street, Walltown"
Then exactly 0 results are returned
When sending search query "Green Moss, Walltown"