def get_replacement_pairs(self):
    """ Return the list of possible compound decompositions with
        application of abbreviations included.

        Reads `self.compound_suffixes` (an iterable of suffix strings that
        also supports `in`) and `self.abbreviations` (a mapping from a
        full word to an iterable of its abbreviations).

        The result is a list of pairs: the first item is the sequence to
        replace, the second is a list of replacements. Pairs are ordered
        by descending length of the sequence to replace (longer matches
        are preferred), ties are broken alphabetically. Each replacement
        list is sorted alphabetically, so the result is reproducible and
        does not depend on set iteration order.
    """
    synonyms = defaultdict(set)

    # First add entries for compound decomposition: the attached form
    # ('suffix ') and the detached form (' suffix ') may each be replaced
    # by either form.
    for suffix in self.compound_suffixes:
        variants = (suffix + ' ', ' ' + suffix + ' ')
        for key in variants:
            synonyms[key].update(variants)

    for full, abbr in self.abbreviations.items():
        # Abbreviations written as separate words.
        detached = [' ' + a + ' ' for a in abbr]

        key = ' ' + full + ' '
        # Entries in the abbreviation list always apply to full words.
        synonyms[key].update(detached)
        # Replacements are optional, so add a no-op.
        synonyms[key].add(key)

        if full in self.compound_suffixes:
            # Abbreviations attached to the preceding word.
            attached = [a + ' ' for a in abbr]

            # Full word abbreviating to the compounded version.
            synonyms[key].update(attached)
            # Compounded suffix abbreviating to the decompounded as well
            # as to the compounded version.
            synonyms[full + ' '].update(detached, attached)

    # Longer matches are preferred, so they come first. The secondary
    # sort on the key text and the sorted replacement lists keep the
    # output stable across interpreter runs.
    ordered_keys = sorted(synonyms, key=lambda k: (-len(k), k))

    return [(k, sorted(synonyms[k])) for k in ordered_keys]
-
-
def _load_from_yaml(self):
    """ Read the rule configuration from the YAML file `self.configfile`
        and initialise the rule attributes from it.

        Sets `self.normalization_rules` and `self.transliteration_rules`
        from the sections 'normalization' and 'transliteration' and hands
        the sections 'compound_suffixes' and 'abbreviations' to their
        respective parsers.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale encoding, so that rule files with non-ASCII characters load
    # the same way on every system.
    rules = yaml.safe_load(self.configfile.read_text(encoding='utf-8'))

    self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
    self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
    self._parse_compound_suffix_list(self._get_section(rules, 'compound_suffixes'))
    self._parse_abbreviation_list(self._get_section(rules, 'abbreviations'))