self.loader.make_token_analysis())
+ def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
+ """ Return a list of the `num` most frequent full words
+ in the database.
+ """
+ with conn.cursor() as cur:
+ cur.execute("""SELECT word, sum((info->>'count')::int) as count
+ FROM word WHERE type = 'W'
+ GROUP BY word
+ ORDER BY count DESC LIMIT %s""", (num,))
+ return list(s[0].split('@')[0] for s in cur)
+
+
def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
""" Install the php script for the tokenizer.
"""
result = self._cache.housenumbers.get(norm_name, result)
if result[0] is None:
with self.conn.cursor() as cur:
- cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
- result = cur.fetchone()[0], norm_name # type: ignore[no-untyped-call]
+ hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
+
+ result = hid, norm_name
self._cache.housenumbers[norm_name] = result
else:
# Otherwise use the analyzer to determine the canonical name.
variants = analyzer.compute_variants(word_id)
if variants:
with self.conn.cursor() as cur:
- cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
- (word_id, list(variants)))
- result = cur.fetchone()[0], variants[0] # type: ignore[no-untyped-call]
+ hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
+ (word_id, list(variants)))
+ result = hid, variants[0]
self._cache.housenumbers[word_id] = result
return result
with self.conn.cursor() as cur:
cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
(token_id, variants))
- full, part = cast(Tuple[int, List[int]],
- cur.fetchone()) # type: ignore[no-untyped-call]
+ full, part = cast(Tuple[int, List[int]], cur.fetchone())
self._cache.names[token_id] = (full, part)
self.names: Optional[str] = None
self.housenumbers: Set[str] = set()
self.housenumber_tokens: Set[int] = set()
- self.street_tokens: Set[int] = set()
+ self.street_tokens: Optional[Set[int]] = None
self.place_tokens: Set[int] = set()
self.address_tokens: Dict[str, str] = {}
self.postcode: Optional[str] = None
out['hnr'] = ';'.join(self.housenumbers)
out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
- if self.street_tokens:
+ if self.street_tokens is not None:
out['street'] = self._mk_array(self.street_tokens)
if self.place_tokens:
def add_street(self, tokens: Iterable[int]) -> None:
""" Add addr:street match terms.
"""
+ if self.street_tokens is None:
+ self.street_tokens = set()
self.street_tokens.update(tokens)