addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
addr_tokens = list({t.token for t in addr_partials})
- exp_count = min(t.count for t in name_partials.values()) / (2**(len(name_partials) - 1))
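+ # Heuristic: assume each additional partial cuts the candidate set to about
+ # a third, e.g. counts (9000, 6000, 5000) give 5000 / 3**2 ≈ 556 expected rows.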
+ exp_count = min(t.count for t in name_partials.values()) / (3**(len(name_partials) - 1))
if (len(name_partials) > 3 or exp_count < 8000):
yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens)
if name_fulls:
fulls_count = sum(t.count for t in name_fulls)
- if fulls_count < 50000 or addr_count < 50000:
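+ # Attempt a full-name lookup unless both the name and the address
+ # are very frequent.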
+ if fulls_count < 80000 or addr_count < 50000:
yield penalty, fulls_count / (2**len(addr_tokens)), \
self.get_full_name_ranking(name_fulls, addr_partials,
fulls_count > 30000 / max(1, len(addr_tokens)))
address lookups will use the index when there are not too many
occurrences.
"""
- # At this point drop unindexed partials from the address.
- # This might yield wrong results, nothing we can do about that.
if use_lookup:
addr_restrict_tokens = []
addr_lookup_tokens = [t.token for t in addr_partials]
SELECT unnest(nameaddress_vector) as id, count(*)
FROM search_name GROUP BY id)
SELECT coalesce(a.id, w.id) as id,
- (CASE WHEN w.count is null THEN '{}'::JSONB
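+ -- Counts of 1 add nothing useful, so leave them out to keep the JSONB small.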
+ (CASE WHEN w.count is null or w.count <= 1 THEN '{}'::JSONB
ELSE jsonb_build_object('count', w.count) END
||
- CASE WHEN a.count is null THEN '{}'::JSONB
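+ -- The same rule applies to address counts.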
+ CASE WHEN a.count is null or a.count <= 1 THEN '{}'::JSONB
ELSE jsonb_build_object('addr_count', a.count) END) as info
FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
""")
drop_tables(conn, 'tmp_word')
cur.execute("""CREATE TABLE tmp_word AS
SELECT word_id, word_token, type, word,
- (CASE WHEN wf.info is null THEN word.info
- ELSE coalesce(word.info, '{}'::jsonb) || wf.info
- END) as info
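+ -- Strip any stale counts from the existing entry before
+ -- merging in the newly computed ones.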
+ coalesce(word.info, '{}'::jsonb)
+ - 'count' - 'addr_count' ||
+ coalesce(wf.info, '{}'::jsonb)
+ as info
FROM word LEFT JOIN word_frequencies wf
ON word.word_id = wf.id
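+ -- Write the new table sorted by word_id, so that lookups for
+ -- consecutive ids touch neighbouring rows.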
+ ORDER BY word_id
""")
drop_tables(conn, 'word_frequencies')
if word_id:
result = self._cache.housenumbers.get(word_id, result)
if result[0] is None:
- variants = analyzer.compute_variants(word_id)
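+ # compute_variants() may return a plain list or a
+ # (variants, lookup variants) tuple; only the variants matter here.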
+ varout = analyzer.compute_variants(word_id)
+ if isinstance(varout, tuple):
+ variants = varout[0]
+ else:
+ variants = varout
if variants:
hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)",
- (word_id, list(variants)))
+ (word_id, variants))
result = hid, variants[0]
self._cache.housenumbers[word_id] = result
full, part = self._cache.names.get(token_id, (None, None))
if full is None:
- variants = analyzer.compute_variants(word_id)
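+ # Unpack the lookup variants as well, falling back to None for
+ # analyzers that return only a list.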
+ varset = analyzer.compute_variants(word_id)
+ if isinstance(varset, tuple):
+ variants, lookups = varset
+ else:
+ variants, lookups = varset, None
if not variants:
continue
with self.conn.cursor() as cur:
- cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
- (token_id, variants))
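+ # Pass the lookup variants through to the SQL function (may be None).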
+ cur.execute("SELECT * FROM getorcreate_full_word(%s, %s, %s)",
+ (token_id, variants, lookups))
full, part = cast(Tuple[int, List[int]], cur.fetchone())
self._cache.names[token_id] = (full, part)