"""
Generic processor for names that creates abbreviation variants.
"""
from collections import defaultdict
import itertools

import datrie
from icu import Transliterator
def create(norm_rules, trans_rules, config):
    """ Create a new token analysis instance for this module.

        `norm_rules` and `trans_rules` are ICU transliteration rule strings;
        `config` must contain a 'variants' entry with the list of
        pre-parsed replacement variants to apply.
    """
    return GenericTokenAnalysis(norm_rules, trans_rules, config['variants'])
class GenericTokenAnalysis:
    """ Collects the different transformation rules for normalisation of names
        and provides the functions to apply the transformations.
    """

    def __init__(self, norm_rules, trans_rules, replacements):
        self.normalizer = Transliterator.createFromRules("icu_normalization",
                                                         norm_rules)
        # Collapse any whitespace runs to a single space after transliteration.
        self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
                                                       trans_rules +
                                                       ";[:Space:]+ > ' '")
        self.search = Transliterator.createFromRules("icu_search",
                                                     norm_rules + trans_rules)

        # Intermediate reorder by source. Also compute required character set.
        immediate = defaultdict(list)
        chars = set()
        for variant in replacements:
            # When both source and replacement end in a space, drop the
            # trailing space of the replacement so that matching at word
            # boundaries does not produce double spaces.
            if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
                replstr = variant.replacement[:-1]
            else:
                replstr = variant.replacement
            immediate[variant.source].append(replstr)
            chars.update(variant.source)
        # Then copy to datrie for fast longest-prefix lookup.
        self.replacements = datrie.Trie(''.join(chars))
        for src, repllist in immediate.items():
            self.replacements[src] = repllist


    def get_normalized(self, name):
        """ Normalize the given name, i.e. remove all elements not relevant
            for search.
        """
        return self.normalizer.transliterate(name).strip()


    def get_variants_ascii(self, norm_name):
        """ Compute the spelling variants for the given normalized name
            and transliterate the result.
        """
        # '^' markers delimit the term so replacements can anchor at the
        # beginning/end of the full name.
        baseform = '^ ' + norm_name + ' ^'
        partials = ['']

        startpos = 0
        pos = 0
        force_space = False
        while pos < len(baseform):
            full, repl = self.replacements.longest_prefix_item(baseform[pos:],
                                                               (None, None))
            if full is not None:
                done = baseform[startpos:pos]
                partials = [v + done + r
                            for v, r in itertools.product(partials, repl)
                            if not force_space or r.startswith(' ')]
                if len(partials) > 128:
                    # If too many variants are produced, they are unlikely
                    # to be helpful. Only use the original term.
                    startpos = 0
                    break
                startpos = pos + len(full)
                if full[-1] == ' ':
                    # Re-use the trailing space as the separator for the
                    # next match and require the next replacement to
                    # start with a space.
                    startpos -= 1
                    force_space = True
                else:
                    force_space = False
                pos = startpos
            else:
                pos += 1
                force_space = False

        # No variants detected? Fast return.
        if startpos == 0:
            trans_name = self.to_ascii.transliterate(norm_name).strip()
            return [trans_name] if trans_name else []

        return self._compute_result_set(partials, baseform[startpos:])


    def _compute_result_set(self, partials, prefix):
        """ Transliterate each variant (with the unmatched tail `prefix`
            appended) and return the deduplicated, non-empty results.
        """
        results = set()

        for variant in partials:
            vname = variant + prefix
            # Strip the '^' anchor characters before transliterating.
            trans_name = self.to_ascii.transliterate(vname[1:-1]).strip()
            if trans_name:
                results.add(trans_name)

        return list(results)


    def get_search_normalized(self, name):
        """ Return the normalized version of the name (including transliteration)
            to be applied at search time.
        """
        return self.search.transliterate(' ' + name + ' ').strip()