installation to your needs. If the provided modules are not enough, you can
also provide your own implementations. This section describes how to do that.
+!!! warning
+ This API is currently in early alpha status. While it is meant to be
+ a public API on which custom sanitizers and token analyzers can be
+ implemented, it is not guaranteed to be stable at the moment.
+
+
## Using non-standard sanitizers and token analyzers
Sanitizer names (in the `step` property) and token analysis names (in the
postcode_name = place.name.strip().upper()
variant_base = None
else:
- postcode_name = analyzer.normalize(place.name)
+ postcode_name = analyzer.get_canonical_id(place)
variant_base = place.get_attr("variant")
if variant_base:
if analyzer is None:
variants = [term]
else:
- variants = analyzer.get_variants_ascii(variant)
+ variants = analyzer.compute_variants(variant)
if term not in variants:
variants.append(term)
else:
# Otherwise use the analyzer to determine the canonical name.
# Per convention we use the first variant as the 'lookup name', the
# name that gets saved in the housenumber field of the place.
- norm_name = analyzer.normalize(hnr.name)
- if norm_name:
- result = self._cache.housenumbers.get(norm_name, result)
+ word_id = analyzer.get_canonical_id(hnr)
+ if word_id:
+ result = self._cache.housenumbers.get(word_id, result)
if result[0] is None:
- variants = analyzer.get_variants_ascii(norm_name)
+ variants = analyzer.compute_variants(word_id)
if variants:
with self.conn.cursor() as cur:
cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
- (norm_name, list(variants)))
+ (word_id, list(variants)))
result = cur.fetchone()[0], variants[0] # type: ignore[no-untyped-call]
- self._cache.housenumbers[norm_name] = result
+ self._cache.housenumbers[word_id] = result
return result
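The caching above relies on the stability guarantee of `get_canonical_id()`: the same canonical ID always yields the same variants, so the ID can serve directly as the cache key. A minimal sketch of the pattern, with a running counter standing in for the `create_analyzed_hnr_id()` database call (all names below are hypothetical):

```python
from itertools import count
from typing import Dict, List, Tuple

_next_token = count(1)
_cache: Dict[str, Tuple[int, str]] = {}

def token_for(word_id: str, variants: List[str]) -> Tuple[int, str]:
    """ Return (token id, lookup name) for a canonical housenumber ID,
        creating the token only once per ID. Per the convention above,
        the first variant becomes the lookup name.
    """
    if word_id not in _cache:
        _cache[word_id] = (next(_next_token), variants[0])
    return _cache[word_id]

assert token_for('34 a', ['34 a', '34a']) == token_for('34 a', ['34 a', '34a'])
```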
for name in names:
analyzer_id = name.get_attr('analyzer')
analyzer = self.token_analysis.get_analyzer(analyzer_id)
- norm_name = analyzer.normalize(name.name)
+ word_id = analyzer.get_canonical_id(name)
if analyzer_id is None:
- token_id = norm_name
+ token_id = word_id
else:
- token_id = f'{norm_name}@{analyzer_id}'
+ token_id = f'{word_id}@{analyzer_id}'
full, part = self._cache.names.get(token_id, (None, None))
if full is None:
- variants = analyzer.get_variants_ascii(norm_name)
+ variants = analyzer.compute_variants(word_id)
if not variants:
continue
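The `@` suffix keeps canonical IDs from different analyzers apart in the shared name cache. A small illustration of the key scheme (the analyzer name `de` is made up for the example):

```python
from typing import Optional

def cache_key(word_id: str, analyzer_id: Optional[str]) -> str:
    """ Names handled by the default analyzer use the bare canonical ID;
        all others get the analyzer name appended to avoid collisions.
    """
    return word_id if analyzer_id is None else f'{word_id}@{analyzer_id}'

assert cache_key('main st', None) == 'main st'
assert cache_key('hauptstrasse', 'de') == 'hauptstrasse@de'
```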
postcode_name = item.name.strip().upper()
variant_base = None
else:
- postcode_name = analyzer.normalize(item.name)
+ postcode_name = analyzer.get_canonical_id(item)
variant_base = item.get_attr("variant")
if variant_base:
variants = {term}
if analyzer is not None and variant_base:
- variants.update(analyzer.get_variants_ascii(variant_base))
+ variants.update(analyzer.compute_variants(variant_base))
with self.conn.cursor() as cur:
cur.execute("SELECT create_postcode_word(%s, %s)",
from typing import Mapping, List, Any
from nominatim.typing import Protocol
+from nominatim.data.place_name import PlaceName
class Analyzer(Protocol):
""" The `create()` function of an analysis module needs to return an
object that implements the following functions.
"""
- def normalize(self, name: str) -> str:
- """ Return the normalized form of the name. This is the standard form
- from which possible variants for the name can be derived.
+ def get_canonical_id(self, name: PlaceName) -> str:
+ """ Return the canonical form of the given name. The canonical ID must
+ be unique (the same ID must always yield the same variants) and
+ must be a form from which the variants can be derived.
+
+ Arguments:
+ name: Extended place name description as prepared by
+ the sanitizers.
+
+ Returns:
+ ID string with a canonical form of the name. The string may
+                be empty when the analyzer cannot analyze the name at all,
+ for example because the character set in use does not match.
"""
- def get_variants_ascii(self, norm_name: str) -> List[str]:
- """ Compute the spelling variants for the given normalized name
- and transliterate the result.
+ def compute_variants(self, canonical_id: str) -> List[str]:
+ """ Compute the transliterated spelling variants for the given
+ canonical ID.
+
+ Arguments:
+ canonical_id: ID string previously computed with
+ `get_canonical_id()`.
+
+ Returns:
+ A list of possible spelling variants. All strings must have
+ been transformed with the global normalizer and
+ transliterator ICU rules. Otherwise they cannot be matched
+ against the query later.
+                The list may be empty when there are no useful
+                spelling variants. This may happen when an analyzer only
+                produces extra variants in addition to the canonical spelling.
"""
+
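To make the contract concrete, a minimal analyzer satisfying this protocol could look like the sketch below. The identity behaviour is chosen for brevity only; a real implementation must return variants that have been run through the global normalization and transliteration rules:

```python
from typing import List

from nominatim.data.place_name import PlaceName

class IdentityAnalyzer:
    """ Hypothetical analyzer: the canonical ID is the stripped,
        lower-cased name and the only variant is the ID itself.
    """

    def get_canonical_id(self, name: PlaceName) -> str:
        return name.name.strip().lower()

    def compute_variants(self, canonical_id: str) -> List[str]:
        # Stable by construction: the same ID always yields the same list.
        return [canonical_id] if canonical_id else []
```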
class AnalysisModule(Protocol):
""" Protocol for analysis modules.
"""
as specified in the tokenizer configuration.
normalizer: an ICU Transliterator with the compiled normalization
rules.
- transliterator: an ICU tranliterator with the compiled
+ transliterator: an ICU transliterator with the compiled
transliteration rules.
Returns:
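Although the excerpt breaks off here, hooking an analyzer into a module comes down to a `create()` function taking the arguments documented above (a sketch; `IdentityAnalyzer` is the hypothetical class from the previous example and the trailing `config` argument is assumed to carry the result of the configuration step):

```python
from typing import Any

def create(normalizer: Any, transliterator: Any, config: Any) -> IdentityAnalyzer:
    # A real module would hand the compiled ICU rules and the
    # configuration on to its analyzer instance.
    return IdentityAnalyzer()
```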
import datrie
from nominatim.errors import UsageError
+from nominatim.data.place_name import PlaceName
from nominatim.tokenizer.token_analysis.config_variants import get_variant_config
from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
- def normalize(self, name: str) -> str:
+ def get_canonical_id(self, name: PlaceName) -> str:
""" Return the normalized form of the name. This is the standard form
from which possible variants for the name can be derived.
"""
- return cast(str, self.norm.transliterate(name)).strip()
+ return cast(str, self.norm.transliterate(name.name)).strip()
- def get_variants_ascii(self, norm_name: str) -> List[str]:
+ def compute_variants(self, norm_name: str) -> List[str]:
""" Compute the spelling variants for the given normalized name
and transliterate the result.
"""
from typing import Any, List, cast
import re
+from nominatim.data.place_name import PlaceName
from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
RE_NON_DIGIT = re.compile('[^0-9]')
self.mutator = MutationVariantGenerator('␣', (' ', ''))
- def normalize(self, name: str) -> str:
+ def get_canonical_id(self, name: PlaceName) -> str:
""" Return the normalized form of the housenumber.
"""
# shortcut for number-only numbers, which make up 90% of the data.
- if RE_NON_DIGIT.search(name) is None:
- return name
+ if RE_NON_DIGIT.search(name.name) is None:
+ return name.name
- norm = cast(str, self.trans.transliterate(self.norm.transliterate(name)))
+ norm = cast(str, self.trans.transliterate(self.norm.transliterate(name.name)))
# If there is a significant non-numeric part, use as is.
if RE_NAMED_PART.search(norm) is None:
# Otherwise add optional spaces between digits and letters.
return norm
- def get_variants_ascii(self, norm_name: str) -> List[str]:
+ def compute_variants(self, norm_name: str) -> List[str]:
""" Compute the spelling variants for the given normalized housenumber.
Generates variants for optional spaces (marked with '␣').
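The effect of the '␣' marker can be spelled out explicitly. The sketch below mimics what `MutationVariantGenerator('␣', (' ', ''))` produces by expanding every marker to either a space or nothing:

```python
from itertools import product
from typing import List

def expand_optional_spaces(norm_name: str) -> List[str]:
    """ Expand each '␣' marker into a space or nothing,
        yielding all combinations.
    """
    parts = norm_name.split('␣')
    variants = []
    for seps in product((' ', ''), repeat=len(parts) - 1):
        variant = parts[0]
        for sep, part in zip(seps, parts[1:]):
            variant += sep + part
        variants.append(variant)
    return variants

assert expand_optional_spaces('34␣a') == ['34 a', '34a']
```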
from typing import Any, List
from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
+from nominatim.data.place_name import PlaceName
### Configuration section
""" Special normalization and variant generation for postcodes.
This analyser must not be used with anything but postcodes as
- it follows some special rules: `normalize` doesn't necessarily
- need to return a standard form as per normalization rules. It
- needs to return the canonical form of the postcode that is also
- used for output. `get_variants_ascii` then needs to ensure that
+ it follows some special rules: the canonical ID is the form that
+ is used for the output. `compute_variants` then needs to ensure that
the generated variants once more follow the standard normalization
and transliteration, so that postcodes are correctly recognised by
the search algorithm.
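In other words, the canonical ID keeps the display spelling while the variants are re-normalized for matching. A minimal sketch of this split, assuming a trivial lower-casing normalizer instead of the real ICU rules:

```python
from typing import List

def get_canonical_id(name: str) -> str:
    return name.strip().upper()           # output form, e.g. 'AB 998'

def compute_variants(canonical_id: str) -> List[str]:
    norm = canonical_id.lower()           # re-normalized for matching
    return [norm, norm.replace(' ', '')]

assert compute_variants(get_canonical_id(' ab 998 ')) == ['ab 998', 'ab998']
```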
self.mutator = MutationVariantGenerator(' ', (' ', ''))
- def normalize(self, name: str) -> str:
+ def get_canonical_id(self, name: PlaceName) -> str:
""" Return the standard form of the postcode.
"""
- return name.strip().upper()
+ return name.name.strip().upper()
- def get_variants_ascii(self, norm_name: str) -> List[str]:
+ def compute_variants(self, norm_name: str) -> List[str]:
""" Compute the spelling variants for the given normalized postcode.
Takes the canonical form of the postcode, normalizes it using the
from icu import Transliterator
import nominatim.tokenizer.token_analysis.postcodes as module
+from nominatim.data.place_name import PlaceName
from nominatim.errors import UsageError
DEFAULT_NORMALIZATION = """ :: NFD ();
def get_normalized_variants(proc, name):
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
- return proc.get_variants_ascii(norm.transliterate(name).strip())
+ return proc.compute_variants(norm.transliterate(name).strip())
@pytest.mark.parametrize('name,norm', [('12', '12'),
('A 34 ', 'A 34'),
('34-av', '34-AV')])
-def test_normalize(analyser, name, norm):
- assert analyser.normalize(name) == norm
+def test_get_canonical_id(analyser, name, norm):
+ assert analyser.get_canonical_id(PlaceName(name=name, kind='', suffix='')) == norm
@pytest.mark.parametrize('postcode,variants', [('12345', {'12345'}),
('AB-998', {'ab 998', 'ab998'}),
('23 FGH D3', {'23 fgh d3', '23fgh d3',
'23 fghd3', '23fghd3'})])
-def test_get_variants_ascii(analyser, postcode, variants):
- out = analyser.get_variants_ascii(postcode)
+def test_compute_variants(analyser, postcode, variants):
+ out = analyser.compute_variants(postcode)
assert len(out) == len(set(out))
assert set(out) == variants
def get_normalized_variants(proc, name):
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
- return proc.get_variants_ascii(norm.transliterate(name).strip())
+ return proc.compute_variants(norm.transliterate(name).strip())
def test_no_variants():
def variants(self, name):
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
- return set(self.analysis.get_variants_ascii(norm.transliterate(name).strip()))
+ return set(self.analysis.compute_variants(norm.transliterate(name).strip()))
@pytest.mark.parametrize('pattern', ('(capture)', ['a list']))