X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/9963261d8d572f7a0d88ef27f5d938f085c603ba..1c33cb3186a38ceb5cc4de0975ae1956c861f9b5:/nominatim/tokenizer/base.py?ds=sidebyside diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py index 1c1ca9f7..f0fd9dd0 100644 --- a/nominatim/tokenizer/base.py +++ b/nominatim/tokenizer/base.py @@ -5,17 +5,17 @@ # Copyright (C) 2022 by the Nominatim developer community. # For a full list of authors see the git log. """ -Abstract class defintions for tokenizers. These base classes are here +Abstract class definitions for tokenizers. These base classes are here mainly for documentation purposes. """ from abc import ABC, abstractmethod from typing import List, Tuple, Dict, Any, Optional, Iterable from pathlib import Path -from typing_extensions import Protocol - from nominatim.config import Configuration +from nominatim.db.connection import Connection from nominatim.data.place_info import PlaceInfo +from nominatim.typing import Protocol class AbstractAnalyzer(ABC): """ The analyzer provides the functions for analysing names and building @@ -114,7 +114,7 @@ class AbstractAnalyzer(ABC): the search index. Arguments: - place: Place information retrived from the database. + place: Place information retrieved from the database. Returns: A JSON-serialisable structure that will be handed into @@ -142,7 +142,7 @@ class AbstractTokenizer(ABC): init_db: When set to False, then initialisation of database tables should be skipped. This option is only required for - migration purposes and can be savely ignored by custom + migration purposes and can be safely ignored by custom tokenizers. TODO: can we move the init_db parameter somewhere else? @@ -234,6 +234,13 @@ class AbstractTokenizer(ABC): """ + @abstractmethod + def most_frequent_words(self, conn: Connection, num: int) -> List[str]: + """ Return a list of the `num` most frequent full words + in the database. + """ + + class TokenizerModule(Protocol): """ Interface that must be exported by modules that implement their own tokenizer.