from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
Analyzers are not thread-safe. You need to instantiate one per thread.
"""
- return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis())
+ return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
+ self.loader.make_token_analysis())
def _install_php(self, phpdir):
normalization.
"""
- def __init__(self, dsn, name_proc):
+ def __init__(self, dsn, sanitizer, token_analysis):
self.conn = connect(dsn).connection
self.conn.autocommit = True
- self.name_processor = name_proc
+ self.sanitizer = sanitizer
+ self.token_analysis = token_analysis
self._cache = _TokenCache()
self.conn = None
+ def _search_normalized(self, name):
+ """ Return the search token transliteration of the given name.
+ """
+ return self.token_analysis.get_search_normalized(name)
+
+
+ def _normalized(self, name):
+ """ Return the normalized version of the given name with all
+ non-relevant information removed.
+ """
+ return self.token_analysis.get_normalized(name)
+
+
def get_word_token_info(self, words):
""" Return token information for the given list of words.
If a word starts with # it is assumed to be a full name
partial_tokens = {}
for word in words:
if word.startswith('#'):
- full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
+ full_tokens[word] = self._search_normalized(word[1:])
else:
- partial_tokens[word] = self.name_processor.get_search_normalized(word)
+ partial_tokens[word] = self._search_normalized(word)
with self.conn.cursor() as cur:
cur.execute("""SELECT word_token, word_id
This function takes minor shortcuts on transliteration.
"""
- return self.name_processor.get_search_normalized(hnr)
+ return self._search_normalized(hnr)
def update_postcodes_from_db(self):
""" Update postcode tokens in the word table from the location_postcode
if postcode is None:
to_delete.append(word)
else:
- copystr.add(self.name_processor.get_search_normalized(postcode),
+ copystr.add(self._search_normalized(postcode),
'P', postcode)
if to_delete:
completely replaced. Otherwise the phrases are added to the
already existing ones.
"""
- norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
+ norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
for p in phrases))
with self.conn.cursor() as cur:
added = 0
with CopyBuffer() as copystr:
for word, cls, typ, oper in to_add:
- term = self.name_processor.get_search_normalized(word)
+ term = self._search_normalized(word)
if term:
copystr.add(term, 'S', word,
json.dumps({'class': cls, 'type': typ,
def add_country_names(self, country_code, names):
""" Add names for the given country to the search index.
"""
+ # Make sure any name preprocessing for country names applies.
+ info = PlaceInfo({'name': names, 'country_code': country_code,
+ 'rank_address': 4, 'class': 'boundary',
+ 'type': 'administrative'})
+ self._add_country_full_names(country_code,
+ self.sanitizer.process_names(info)[0])
+
+
+ def _add_country_full_names(self, country_code, names):
+ """ Add names for the given country from an already sanitized
+ name list.
+ """
word_tokens = set()
- for name in self._compute_full_names(names):
- norm_name = self.name_processor.get_search_normalized(name)
+ for name in names:
+ norm_name = self._search_normalized(name.name)
if norm_name:
word_tokens.add(norm_name)
def process_place(self, place):
""" Determine tokenizer information about the given place.
- Returns a JSON-serialisable structure that will be handed into
+ Returns a JSON-serializable structure that will be handed into
the database via the token_info field.
"""
token_info = _TokenInfo(self._cache)
- names = place.name
+ names, address = self.sanitizer.process_names(place)
if names:
fulls, partials = self._compute_name_tokens(names)
token_info.add_names(fulls, partials)
if place.is_country():
- self.add_country_names(place.country_code, names)
+ self._add_country_full_names(place.country_code, names)
- address = place.address
if address:
self._process_place_address(token_info, address)
def _process_place_address(self, token_info, address):
hnrs = []
addr_terms = []
- for key, value in address.items():
- if key == 'postcode':
- self._add_postcode(value)
- elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
- hnrs.append(value)
- elif key == 'street':
- token_info.add_street(self._compute_partial_tokens(value))
- elif key == 'place':
- token_info.add_place(self._compute_partial_tokens(value))
- elif not key.startswith('_') and \
- key not in ('country', 'full'):
- addr_terms.append((key, self._compute_partial_tokens(value)))
+ for item in address:
+ if item.kind == 'postcode':
+ self._add_postcode(item.name)
+ elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+ hnrs.append(item.name)
+ elif item.kind == 'street':
+ token_info.add_street(self._compute_partial_tokens(item.name))
+ elif item.kind == 'place':
+ token_info.add_place(self._compute_partial_tokens(item.name))
+ elif not item.kind.startswith('_') and \
+ item.kind not in ('country', 'full'):
+ addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
if hnrs:
hnrs = self._split_housenumbers(hnrs)
""" Normalize the given term, split it into partial words and return
the token list for them.
"""
- norm_name = self.name_processor.get_search_normalized(name)
+ norm_name = self._search_normalized(name)
tokens = []
need_lookup = []
return tokens
+
def _compute_name_tokens(self, names):
""" Computes the full name and partial name tokens for the given
- dictionary of names.
+ sanitized list of names.
"""
- full_names = self._compute_full_names(names)
full_tokens = set()
partial_tokens = set()
- for name in full_names:
- norm_name = self.name_processor.get_normalized(name)
+ for name in names:
+ norm_name = self._normalized(name.name)
full, part = self._cache.names.get(norm_name, (None, None))
if full is None:
- variants = self.name_processor.get_variants_ascii(norm_name)
+ variants = self.token_analysis.get_variants_ascii(norm_name)
if not variants:
continue
return full_tokens, partial_tokens
- @staticmethod
- def _compute_full_names(names):
- """ Return the set of all full name word ids to be used with the
- given dictionary of names.
- """
- full_names = set()
- for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
- if name:
- full_names.add(name)
-
- brace_idx = name.find('(')
- if brace_idx >= 0:
- full_names.add(name[:brace_idx].strip())
-
- return full_names
-
-
def _add_postcode(self, postcode):
""" Make sure the normalized postcode is present in the word table.
"""
postcode = self.normalize_postcode(postcode)
if postcode not in self._cache.postcodes:
- term = self.name_processor.get_search_normalized(postcode)
+ term = self._search_normalized(postcode)
if not term:
return
--- /dev/null
+"""
+Handler for cleaning name and address tags in place information before it
+is handed to the token analysis.
+"""
+import importlib
+
+from nominatim.errors import UsageError
+
+class PlaceName:
+ """ A searchable name for a place together with properties.
+ Every name object saves the name proper and two basic properties:
+ * 'kind' contains the OSM key without any suffix
+ (i.e. with the part after the first colon removed)
+ * 'suffix' contains the suffix of the OSM tag, if any. The suffix
+ is the part of the key after the first colon.
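+ (For example, the key 'name:de' yields kind 'name' and suffix 'de',
+ while a plain 'name' key yields kind 'name' and no suffix.)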
+ In addition to that, the name may have arbitrary additional attributes.
+ Which attributes are used depends on the token analyser.
+ """
+
+ def __init__(self, name, kind, suffix):
+ self.name = name
+ self.kind = kind
+ self.suffix = suffix
+ self.attr = {}
+
+
+ def __repr__(self):
+ return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"
+
+
+ def clone(self, name=None, kind=None, suffix=None, attr=None):
+ """ Create a deep copy of the place name, optionally with the
+ given parameters replaced. In the attribute dictionary only the given
+ keys are updated; the dictionary is not replaced completely.
+ In particular, the function cannot be used to remove an
+ attribute from a place name.
+ """
+ newobj = PlaceName(name or self.name,
+ kind or self.kind,
+ suffix or self.suffix)
+
+ newobj.attr.update(self.attr)
+ if attr:
+ newobj.attr.update(attr)
+
+ return newobj
+
+
+ def set_attr(self, key, value):
+ """ Add the given property to the name. If the property was already
+ set, then the value is overwritten.
+ """
+ self.attr[key] = value
+
+
+ def get_attr(self, key, default=None):
+ """ Return the given property or the value of 'default' if it
+ is not set.
+ """
+ return self.attr.get(key, default)
+
+
+ def has_attr(self, key):
+ """ Check if the given attribute is set.
+ """
+ return key in self.attr
+
+
+class _ProcessInfo:
+ """ Container class for information handed into to handler functions.
+ The 'names' and 'address' members are mutable. A handler must change
+ them by either modifying the lists place or replacing the old content
+ with a new list.
+ """
+
+ def __init__(self, place):
+ self.place = place
+ self.names = self._convert_name_dict(place.name)
+ self.address = self._convert_name_dict(place.address)
+
+
+ @staticmethod
+ def _convert_name_dict(names):
+ """ Convert a dictionary of names into a list of PlaceNames.
+ The dictionary key is split into the primary part of the key
+ and the suffix (the part after an optional colon).
+ """
+ out = []
+
+ if names:
+ for key, value in names.items():
+ parts = key.split(':', 1)
+ out.append(PlaceName(value.strip(),
+ parts[0].strip(),
+ parts[1].strip() if len(parts) > 1 else None))
+
+ return out
+
+
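+# Sanitizer modules are loaded by name in PlaceSanitizer below. Each module
+# is expected to expose a create(config) function returning a callable that
+# mutates a _ProcessInfo. A minimal sketch of such a step (the kind filtered
+# here is purely illustrative):
+#
+#   def create(config):
+#       def _filter(obj):
+#           obj.names = [n for n in obj.names if n.kind != 'old_name']
+#       return _filter
+
+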
+class PlaceSanitizer:
+ """ Controller class which applies sanitizer functions on the place
+ names and address before they are used by the token analysers.
+ """
+
+ def __init__(self, rules):
+ self.handlers = []
+
+ if rules:
+ for func in rules:
+ if 'step' not in func:
+ raise UsageError("Sanitizer rule is missing the 'step' attribute.")
+ module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
+ handler_module = importlib.import_module(module_name)
+ self.handlers.append(handler_module.create(func))
+
+
+ def process_names(self, place):
+ """ Extract a sanitized list of names and address parts from the
+ given place. The function returns a tuple
+ (list of names, list of address names)
+ """
+ obj = _ProcessInfo(place)
+
+ for func in self.handlers:
+ func(obj)
+
+ return obj.names, obj.address
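+
+
+# Example of setting up a sanitizer pipeline; the rule list corresponds to
+# the 'sanitizers' section of icu_tokenizer.yaml:
+#
+#   sanitizer = PlaceSanitizer([{'step': 'split-name-list'},
+#                               {'step': 'strip-brace-terms'}])
+#   names, address = sanitizer.process_names(place_info)
+#   (where place_info is a nominatim.indexer.place_info.PlaceInfo)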
monkeypatch.undo()
def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
- variants=('~gasse -> gasse', 'street => st', )):
+ variants=('~gasse -> gasse', 'street => st', ),
+ sanitizers=[]):
cfgstr = {'normalization' : list(norm),
- 'transliteration' : list(trans),
- 'variants' : [ {'words': list(variants)}]}
+ 'sanitizers' : sanitizers,
+ 'transliteration' : list(trans),
+ 'variants' : [ {'words': list(variants)}]}
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
tok.loader = ICURuleLoader(test_config)
@pytest.fixture(autouse=True)
def setup(self, analyzer, sql_functions):
- with analyzer() as anl:
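+ # The sanitizer steps below take over the name-list splitting and brace
+ # handling previously done in the removed _compute_full_names.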
+ sanitizers = [{'step': 'split-name-list'},
+ {'step': 'strip-brace-terms'}]
+ with analyzer(sanitizers=sanitizers) as anl:
self.analyzer = anl
yield anl
def expect_name_terms(self, info, *expected_terms):
tokens = self.analyzer.get_word_token_info(expected_terms)
- print (tokens)
for token in tokens:
assert token[2] is not None, "No token for {0}".format(token)
def process_named_place(self, names):
- place = {'name': names}
-
- return self.analyzer.process_place(PlaceInfo(place))
+ return self.analyzer.process_place(PlaceInfo({'name': names}))
def test_simple_names(self):