"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
-from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, cast, Dict, Set
+from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, \
+ cast, Dict, Set, Iterable
from collections import OrderedDict
import logging
from pathlib import Path
This copies all necessary data in the project directory to make
sure the tokenizer remains stable even over updates.
"""
+ assert config.project_dir is not None
module_dir = _install_module(config.DATABASE_MODULE_PATH,
config.lib_dir.module,
config.project_dir / 'module')
def init_from_project(self, config: Configuration) -> None:
""" Initialise the tokenizer from the project directory.
"""
+ assert config.project_dir is not None
+
with connect(self.dsn) as conn:
self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
def update_sql_functions(self, config: Configuration) -> None:
""" Reimport the SQL functions for this tokenizer.
"""
+ assert config.project_dir is not None
+
with connect(self.dsn) as conn:
max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
modulepath = config.DATABASE_MODULE_PATH or \
This is a special migration function for updating existing databases
to new software versions.
"""
+ assert config.project_dir is not None
+
self.normalization = config.TERM_NORMALIZATION
module_dir = _install_module(config.DATABASE_MODULE_PATH,
config.lib_dir.module,
self._save_config(conn, config)
- def update_statistics(self) -> None:
+ def update_statistics(self, config: Configuration, threads: int = 1) -> None:
""" Recompute the frequency of full words.
"""
with connect(self.dsn) as conn:
return LegacyNameAnalyzer(self.dsn, normalizer)
+ def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
+ """ Return a list of the `num` most frequent full words
+ in the database.
+ """
+ with conn.cursor() as cur:
+ cur.execute(""" SELECT word FROM word WHERE word is not null
+ ORDER BY search_name_count DESC LIMIT %s""", (num,))  # NULL words are excluded; highest search_name_count first; `num` is bound as the LIMIT
+ return list(s[0] for s in cur)  # each row has a single column (word); keep just that value
+
+
def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
""" Install the php script for the tokenizer.
"""
- php_file = self.data_dir / "tokenizer.php"
+ if config.lib_dir.php is not None:  # guard: nothing is written without a PHP library dir (nesting inferred from the re-indented lines below -- confirm)
+ php_file = self.data_dir / "tokenizer.php"
- if not php_file.exists() or overwrite:
- php_file.write_text(dedent(f"""\
- <?php
- @define('CONST_Max_Word_Frequency', {config.MAX_WORD_FREQUENCY});
- @define('CONST_Term_Normalization_Rules', "{config.TERM_NORMALIZATION}");
- require_once('{config.lib_dir.php}/tokenizer/legacy_tokenizer.php');
- """), encoding='utf-8')
+ if not php_file.exists() or overwrite:  # write when the file is missing, or always when overwrite is set
+ php_file.write_text(dedent(f"""\
+ <?php
+ @define('CONST_Max_Word_Frequency', {config.MAX_WORD_FREQUENCY});
+ @define('CONST_Term_Normalization_Rules', "{config.TERM_NORMALIZATION}");
+ require_once('{config.lib_dir.php}/tokenizer/legacy_tokenizer.php');
+ """), encoding='utf-8')
def _init_db_tables(self, config: Configuration) -> None:
- def update_special_phrases(self, phrases: Sequence[Tuple[str, str, str, str]],
+ def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
should_replace: bool) -> None:
""" Replace the search index for special phrases with the new phrases.
"""
with conn.cursor() as cur:
cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
- self.data['hnr_tokens'], self.data['hnr'] = \
- cur.fetchone() # type: ignore[no-untyped-call]
+ result = cur.fetchone()
+ assert result is not None
+ self.data['hnr_tokens'], self.data['hnr'] = result
def set_postcode(self, postcode: str) -> None:
def add_street(self, conn: Connection, street: str) -> None:
""" Add addr:street match terms.
"""
- def _get_street(name: str) -> List[int]:
+ def _get_street(name: str) -> Optional[str]:  # the query casts its result to ::text, so the value is a string (or None per the annotation)
with conn.cursor() as cur:
- return cast(List[int],
+ return cast(Optional[str],
cur.scalar("SELECT word_ids_from_name(%s)::text", (name, )))  # `name` is passed as a bound query parameter
tokens = self.cache.streets.get(street, _get_street)  # NOTE(review): _get_street is presumably called by the cache on a miss -- confirm against the cache class
- if tokens:
- self.data['street'] = tokens
+ self.data['street'] = tokens or '{}'  # 'street' is now always set; '{}' is the fallback for a falsy result (presumably an empty PostgreSQL array literal -- confirm)
def add_place(self, conn: Connection, place: str) -> None:
cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
word_ids_from_name(%s)::text""",
(name, name))
- return cast(Tuple[List[int], List[int]],
- cur.fetchone()) # type: ignore[no-untyped-call]
+ return cast(Tuple[List[int], List[int]], cur.fetchone())
self.data['place_search'], self.data['place_match'] = \
self.cache.places.get(place, _get_place)
cur.execute("""SELECT addr_ids_from_name(%s)::text,
word_ids_from_name(%s)::text""",
(name, name))
- return cast(Tuple[List[int], List[int]],
- cur.fetchone()) # type: ignore[no-untyped-call]
+ return cast(Tuple[List[int], List[int]], cur.fetchone())
tokens = {}
for key, value in terms: