seq = difflib.SequenceMatcher(a=self.lookup_word, b=norm)
distance = 0
for tag, afrom, ato, bfrom, bto in seq.get_opcodes():
- if tag == 'delete' and (afrom == 0 or ato == len(self.lookup_word)):
+ if tag in ('delete', 'insert') and (afrom == 0 or ato == len(self.lookup_word)):
distance += 1
elif tag == 'replace':
distance += max((ato-afrom), (bto-bfrom))
penalty = 0.0
if row.type == 'w':
penalty = 0.3
+ elif row.type == 'W':
+ if len(row.word_token) == 1 and row.word_token == row.word:
+ penalty = 0.2 if row.word.isdigit() else 0.3
elif row.type == 'H':
penalty = sum(0.1 for c in row.word_token if c != ' ' and not c.isdigit())
if all(not c.isdigit() for c in row.word_token):
penalty += 0.2 * (len(row.word_token) - 1)
+ elif row.type == 'C':
+ if len(row.word_token) == 1:
+ penalty = 0.3
if row.info is None:
lookup_word = row.word
async def setup(self) -> None:
""" Set up static data structures needed for the analysis.
"""
- rules = await self.conn.get_property('tokenizer_import_normalisation')
- self.normalizer = Transliterator.createFromRules("normalization", rules)
- rules = await self.conn.get_property('tokenizer_import_transliteration')
- self.transliterator = Transliterator.createFromRules("transliteration", rules)
+ async def _make_normalizer() -> Any:
+ rules = await self.conn.get_property('tokenizer_import_normalisation')
+ return Transliterator.createFromRules("normalization", rules)
+
+ self.normalizer = await self.conn.get_cached_value('ICUTOK', 'normalizer',
+ _make_normalizer)
+
+ async def _make_transliterator() -> Any:
+ rules = await self.conn.get_property('tokenizer_import_transliteration')
+ return Transliterator.createFromRules("transliteration", rules)
+
+ self.transliterator = await self.conn.get_cached_value('ICUTOK', 'transliterator',
+ _make_transliterator)
if 'word' not in self.conn.t.meta.tables:
sa.Table('word', self.conn.t.meta,