config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
normalization_rules)
config['variant_only'] = rules.get('mode', '') == 'variant-only'
- config['mutations'] = rules.get('mutations', [])
+
+ # parse mutation rules
+ config['mutations'] = []
+ for rule in rules.get('mutations', []):
+ if 'pattern' not in rule:
+ raise UsageError("Missing field 'pattern' in mutation configuration.")
+ if not isinstance(rule['pattern'], str):
+ raise UsageError("Field 'pattern' in mutation configuration "
+ "must be a simple text field.")
+ if 'replacements' not in rule:
+ raise UsageError("Missing field 'replacements' in mutation configuration.")
+ if not isinstance(rule['replacements'], list):
+ raise UsageError("Field 'replacements' in mutation configuration "
+ "must be a list of texts.")
+
+ config['mutations'].append((rule['pattern'], rule['replacements']))
return config
### Analysis section
-def create(transliterator, config):
+def create(normalizer, transliterator, config):
""" Create a new token analysis instance for this module.
+
+ All arguments are handed on unchanged, and in the same order, to the
+ constructor of GenericTokenAnalysis.
"""
- return GenericTokenAnalysis(transliterator, config)
+ return GenericTokenAnalysis(normalizer, transliterator, config)
class GenericTokenAnalysis:
and provides the functions to apply the transformations.
"""
- def __init__(self, to_ascii, config):
+ def __init__(self, norm, to_ascii, config):
+ # Kept for normalize(), which calls the transliterate() method of `norm`.
+ self.norm = norm
self.to_ascii = to_ascii
self.variant_only = config['variant_only']
self.replacements = None
# set up mutation rules
- self.mutations = []
- for cfg in config['mutations']:
- if 'pattern' not in cfg:
- raise UsageError("Missing field 'pattern' in mutation configuration.")
- if not isinstance(cfg['pattern'], str):
- raise UsageError("Field 'pattern' in mutation configuration "
- "must be a simple text field.")
- if 'replacements' not in cfg:
- raise UsageError("Missing field 'replacements' in mutation configuration.")
- if not isinstance(cfg['replacements'], list):
- raise UsageError("Field 'replacements' in mutation configuration "
- "must be a list of texts.")
-
- self.mutations.append(MutationVariantGenerator(cfg['pattern'],
- cfg['replacements']))
+ # Every entry of config['mutations'] is unpacked positionally into
+ # MutationVariantGenerator, so each entry must be a (pattern, replacements)
+ # pair. The field checks that used to live here are now done by the
+ # configuration parsing code, which appends exactly such pairs
+ # (presumably that is where this `config` comes from; verify against caller).
+ # NOTE(review): the parsing code only checks that 'replacements' is a list;
+ # the types of its elements are not verified. Confirm that
+ # MutationVariantGenerator copes with non-text entries.
+ self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
+
+
+ def normalize(self, name):
+ """ Return the normalized form of the name. This is the standard form
+ from which possible variants for the name can be derived.
+
+ The name is run through the transliterate() method of the normalizer
+ that was handed to the constructor and strip() is applied to the
+ result before it is returned.
+ """
+ return self.norm.transliterate(name).strip()
def get_variants_ascii(self, norm_name):