def get_search_rules(self):
""" Return the ICU rules to be used during search.
- The rules combine normalization, compound decomposition (including
- abbreviated compounds) and transliteration.
+ The rules combine normalization and transliteration.
"""
# First apply the normalization rules.
rules = io.StringIO()
rules.write(self.normalization_rules)
- # For all compound suffixes: add them in their full and any abbreviated form.
- suffixes = set()
- for suffix in self.compound_suffixes:
- suffixes.add(suffix)
- suffixes.update(self.abbreviations.get(suffix, []))
-
- for suffix in sorted(suffixes, key=len, reverse=True):
- rules.write("'{0} ' > ' {0} ';".format(suffix))
-
- # Finally add transliteration.
+ # Then add transliteration.
rules.write(self.transliteration_rules)
return rules.getvalue()
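# Usage sketch (not part of this patch): the returned rule string can be
# compiled into an ICU transliterator and applied to a query term, just as
# the tests further down do via Transliterator.createFromRules().  The
# config path is a made-up placeholder and the module path of ICURuleLoader
# is assumed from the surrounding source layout.
from icu import Transliterator

from nominatim.tokenizer.icu_rule_loader import ICURuleLoader

loader = ICURuleLoader('icu_tokenizer.yaml')       # hypothetical rule file
search_trans = Transliterator.createFromRules('search',
                                              loader.get_search_rules())
# Compounds are no longer split during search normalization, e.g.:
print(search_trans.transliterate(" Baumstraße "))  # " baumstraße "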
"""
synonyms = defaultdict(set)
+ # First add entries for compound decomposition.
+ for suffix in self.compound_suffixes:
+ variants = (suffix + ' ', ' ' + suffix + ' ')
+ for key in variants:
+ synonyms[key].update(variants)
+
for full, abbr in self.abbreviations.items():
key = ' ' + full + ' '
# Entries in the abbreviation list always apply to full words:
# Replacements are optional, so add a noop
synonyms[key].add(key)
- # Entries in the compound list expand to themselves and to
- # abbreviations.
- for suffix in self.compound_suffixes:
- keyset = synonyms[suffix + ' ']
- keyset.add(' ' + suffix + ' ')
- keyset.update((' ' + a + ' ' for a in self.abbreviations.get(suffix, [])))
- # The terms the entries are shortended to, need to be decompunded as well.
- for abbr in self.abbreviations.get(suffix, []):
- synonyms[abbr + ' '].add(' ' + abbr + ' ')
+ if full in self.compound_suffixes:
+ # Full word abbreviating to compounded version.
+ synonyms[key].update((a + ' ' for a in abbr))
+
+ key = full + ' '
+ # Uncompounded suffix abbreviating to decompounded version.
+ synonyms[key].update((' ' + a + ' ' for a in abbr))
+ # Uncompounded suffix abbreviating to compounded version.
+ synonyms[key].update((a + ' ' for a in abbr))
# Sort the resulting list by descending length (longer matches are preferred).
sorted_keys = sorted(synonyms.keys(), key=len, reverse=True)
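# Illustration (not part of this patch): how the synonym table above works
# out for a toy configuration mirroring the tests further down.  The values
# of compound_suffixes and abbreviations are assumptions chosen so that the
# result matches the replacement pairs asserted below.
from collections import defaultdict

compound_suffixes = {'strasse', 'pfad'}
abbreviations = {'strasse': ['str', 'st']}

synonyms = defaultdict(set)
# Compound decomposition: attached and detached suffix map to each other.
for suffix in compound_suffixes:
    variants = (suffix + ' ', ' ' + suffix + ' ')
    for key in variants:
        synonyms[key].update(variants)

for full, abbr in abbreviations.items():
    key = ' ' + full + ' '
    # Abbreviation entries always apply to the full word (see comment above).
    synonyms[key].update(' ' + a + ' ' for a in abbr)
    synonyms[key].add(key)                            # noop replacement
    if full in compound_suffixes:
        synonyms[key].update(a + ' ' for a in abbr)   # full word -> compounded abbr
        key = full + ' '
        synonyms[key].update(' ' + a + ' ' for a in abbr)
        synonyms[key].update(a + ' ' for a in abbr)

for key in sorted(synonyms, key=len, reverse=True):
    print(repr(key), sorted(synonyms[key]))
# ' strasse ' [' st ', ' str ', ' strasse ', 'st ', 'str ', 'strasse ']
# 'strasse '  [' st ', ' str ', ' strasse ', 'st ', 'str ', 'strasse ']
# ' pfad '    [' pfad ', 'pfad ']
# 'pfad '     [' pfad ', 'pfad ']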
Scenario: Special characters in name
Given the places
| osm | class | type | name |
- | N1 | place | locality | Jim-Knopf-Str |
+ | N1 | place | locality | Jim-Knopf-Straße |
| N2 | place | locality | Smith/Weston |
| N3 | place | locality | space mountain |
| N4 | place | locality | space |
proc = ICUNameProcessor(rules)
assert set(get_normalized_variants(proc, "Bauwegstraße")) \
- == {'bauweg straße', 'bauweg str'}
- assert get_normalized_variants(proc, "Bauwegstr") == ['bauweg str']
- assert get_normalized_variants(proc, "holzweg") == ['holz weg']
+ == {'bauweg straße', 'bauweg str', 'bauwegstraße', 'bauwegstr'}
+ assert get_normalized_variants(proc, "Bauwegstr") == ['bauwegstr']
+ assert set(get_normalized_variants(proc, "holzweg")) \
+ == {'holz weg', 'holzweg'}
assert get_normalized_variants(proc, "hallo") == ['hallo']
rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
proc = ICUNameProcessor(rules)
- assert proc.get_search_normalized('Master Street') == 'master street'
- assert proc.get_search_normalized('Earnes St') == 'earne s st'
- assert proc.get_search_normalized('Nostreet') == 'no street'
+ assert proc.get_search_normalized('Master Street') == 'master street'
+ assert proc.get_search_normalized('Earnes St') == 'earnes st'
+ assert proc.get_search_normalized('Nostreet') == 'nostreet'
trans = Transliterator.createFromRules("test", rules)
assert trans.transliterate(" Baum straße ") == " baum straße "
- assert trans.transliterate(" Baumstraße ") == " baum straße "
- assert trans.transliterate(" Baumstrasse ") == " baum strasse "
- assert trans.transliterate(" Baumstr ") == " baum str "
- assert trans.transliterate(" Baumwegstr ") == " baumweg str "
+ assert trans.transliterate(" Baumstraße ") == " baumstraße "
+ assert trans.transliterate(" Baumstrasse ") == " baumstrasse "
+ assert trans.transliterate(" Baumstr ") == " baumstr "
+ assert trans.transliterate(" Baumwegstr ") == " baumwegstr "
assert trans.transliterate(" Αθήνα ") == " athēna "
assert trans.transliterate(" проспект ") == " prospekt "
repl = ICURuleLoader(fpath).get_replacement_pairs()
assert [(a, sorted(b)) for a, b in repl] == \
- [(' strasse ', [' st ', ' str ', ' strasse ']),
- ('strasse ', [' st ', ' str ', ' strasse ']),
- ('pfad ', [' pfad ']),
- ('str ' , [' str ']),
- ('st ' , [' st '])]
+ [(' strasse ', [' st ', ' str ', ' strasse ', 'st ', 'str ', 'strasse ']),
+ ('strasse ', [' st ', ' str ', ' strasse ', 'st ', 'str ', 'strasse ']),
+ (' pfad ', [' pfad ', 'pfad ']),
+ ('pfad ', [' pfad ', 'pfad '])]
def test_get_replacement_pairs_multi_from(cfgfile):
tok = tokenizer_factory()
tok.init_new_db(test_config)
- assert word_table.get_partial_words() == {('te', 1), ('st', 1), ('52', 1),
+ assert word_table.get_partial_words() == {('test', 1), ('52', 1),
('no', 1), ('area', 2),
+ ('holzstrasse', 1), ('holzstr', 1),
('holz', 1), ('strasse', 1),
('str', 1)}