apply variants by languages

author Sarah Hoffmann <lonvia@denofr.de>

Tue, 5 Oct 2021 15:18:10 +0000 (17:18 +0200)

committer Sarah Hoffmann <lonvia@denofr.de>

Wed, 6 Oct 2021 09:09:54 +0000 (11:09 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Tue, 5 Oct 2021 15:18:10 +0000 (17:18 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Wed, 6 Oct 2021 09:09:54 +0000 (11:09 +0200)
diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py

index 361b67d46c8411eb0ec2d4da28f6c84ce1573cc0..b3e9c4c7d3daa6e49c5cb3452a15ee5936487d82 100644 (file)
--- a/nominatim/tokenizer/icu_rule_loader.py
+++ b/nominatim/tokenizer/icu_rule_loader.py
@@ -11,6 +11,7 @@ from nominatim.db.properties import set_property, get_property
  from nominatim.errors import UsageError
  from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
  from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
+import nominatim.tools.country_info
  
  LOG = logging.getLogger()
  
@@ -38,6 +39,9 @@ class ICURuleLoader:
          rules = config.load_sub_configuration('icu_tokenizer.yaml',
                                                config='TOKENIZER_CONFIG')
  
+        # Make sure country information is available to analyzers and sanitizers.
+        nominatim.tools.country_info.setup_country_config(config)
+
          self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
          self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
          self.analysis_rules = _get_section(rules, 'token-analysis')
diff --git a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py

new file mode 100644 (file)

index 0000000..c98c825
--- /dev/null
+++ b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
@@ -0,0 +1,100 @@
+"""
+Name processor for tagging the language of the name
+"""
+import re
+
+from nominatim.tools import country_info
+
+class _AnalyzerByLanguage:
+    """ Processor for tagging the language of names in a place.
+    """
+
+    def __init__(self, config):
+        if 'filter-kind' in config:
+            self.regexes = [re.compile(regex) for regex in config['filter-kind']]
+        else:
+            self.regexes = None
+
+        self.use_defaults = config.get('use-defaults', 'no')
+        if self.use_defaults not in ('mono', 'all'):
+            self.use_defaults = False
+
+        self.replace = config.get('mode', 'replace') != 'append'
+        self.whitelist = config.get('whitelist')
+
+        # Compute the languages to use when no suffix is given.
+        self.deflangs = {}
+        for ccode, prop in country_info.iterate():
+            clangs = prop['languages']
+            if len(clangs) == 1 or self.use_defaults == 'all':
+                if self.whitelist:
+                    self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
+                else:
+                    self.deflangs[ccode] = clangs
+
+
+
+    def _kind_matches(self, kind):
+        if self.regexes is None:
+            return True
+
+        return any(regex.search(kind) for regex in self.regexes)
+
+
+    def _suffix_matches(self, suffix):
+        if self.whitelist is None:
+            return len(suffix) in (2, 3) and suffix.islower()
+
+        return suffix in self.whitelist
+
+
+    def __call__(self, obj):
+        if not obj.names:
+            return
+
+        more_names = []
+
+        for name in (n for n in obj.names
+                     if not n.has_attr('analyzer') and self._kind_matches(n.kind)):
+            if name.suffix:
+                langs = [name.suffix] if self._suffix_matches(name.suffix) else None
+            else:
+                if self.use_defaults:
+                    langs = self.deflangs.get(obj.place.country_code)
+                    if self.use_defaults == 'mono' and len(langs) > 1:
+                        langs = None
+
+            if langs:
+                if self.replace:
+                    name.set_attr('analyzer', langs[0])
+                else:
+                    more_names.append(name.clone(attr={'analyzer': langs[0]}))
+
+                more_names.extend(name.clone(attr={'analyzer': l}) for l in langs[1:])
+
+        obj.names.extend(more_names)
+
+
+def create(config):
+    """ Create a function that sets the analyzer property depending on the
+        language of the tag. The language is taken from the suffix.
+
+        To restrict the set of languages that should be tagged, use
+        'whitelist', a list of acceptable suffixes. When unset, all 2- and
+        3-letter codes are accepted.
+
+        'use-defaults' configures what happens when the name has no suffix
+        with a language tag. When set to 'all', a variant is created for
+        each of the spoken languages in the country the feature is in. When
+        set to 'mono', a variant is created only when a single language is
+        spoken in the country. The default is to do nothing with the default
+        languages of a country.
+
+        'mode' may be 'replace' (the default) or 'append' and configures if
+        the original name (without any analyzer tagged) is retained.
+
+        With 'filter-kind' the set of names the sanitizer should be applied
+        to can be restricted to the given patterns of 'kind'. It expects a
+        list of regular expressions to be matched against 'kind'.
+    """
+    return _AnalyzerByLanguage(config)
diff --git a/nominatim/tokenizer/token_analysis/generic.py b/nominatim/tokenizer/token_analysis/generic.py

index c904d87d4eb18e15143b12935891955930dd5113..b8cfde3997640788fa36a686d65d63c4367a6e32 100644 (file)
--- a/nominatim/tokenizer/token_analysis/generic.py
+++ b/nominatim/tokenizer/token_analysis/generic.py
@@ -18,7 +18,19 @@ ICUVariant = namedtuple('ICUVariant', ['source', 'replacement'])
  def configure(rules, normalization_rules):
      """ Extract and preprocess the configuration for this module.
      """
-    rules = rules.get('variants')
+    config = {}
+
+    config['replacements'], config['chars'] = _get_variant_config(rules.get('variants'),
+                                                                  normalization_rules)
+    config['variant_only'] = rules.get('mode', '') == 'variant-only'
+
+    return config
+
+
+def _get_variant_config(rules, normalization_rules):
+    """ Convert the variant definition from the configuration into
+        replacement sets.
+    """
      immediate = defaultdict(list)
      chars = set()
  
@@ -41,8 +53,7 @@ def configure(rules, normalization_rules):
              immediate[variant.source].append(replstr)
              chars.update(variant.source)
  
-    return {'replacements': list(immediate.items()),
-            'chars': ''.join(chars)}
+    return list(immediate.items()), ''.join(chars)
  
  
  class _VariantMaker:
@@ -144,11 +155,15 @@ class GenericTokenAnalysis:
  
      def __init__(self, to_ascii, config):
          self.to_ascii = to_ascii
+        self.variant_only = config['variant_only']
  
          # Set up datrie
-        self.replacements = datrie.Trie(config['chars'])
-        for src, repllist in config['replacements']:
-            self.replacements[src] = repllist
+        if config['replacements']:
+            self.replacements = datrie.Trie(config['chars'])
+            for src, repllist in config['replacements']:
+                self.replacements[src] = repllist
+        else:
+            self.replacements = None
  
  
      def get_variants_ascii(self, norm_name):
@@ -159,45 +174,51 @@ class GenericTokenAnalysis:
          partials = ['']
  
          startpos = 0
-        pos = 0
-        force_space = False
-        while pos < len(baseform):
-            full, repl = self.replacements.longest_prefix_item(baseform[pos:],
-                                                               (None, None))
-            if full is not None:
-                done = baseform[startpos:pos]
-                partials = [v + done + r
-                            for v, r in itertools.product(partials, repl)
-                            if not force_space or r.startswith(' ')]
-                if len(partials) > 128:
-                    # If too many variants are produced, they are unlikely
-                    # to be helpful. Only use the original term.
-                    startpos = 0
-                    break
-                startpos = pos + len(full)
-                if full[-1] == ' ':
-                    startpos -= 1
-                    force_space = True
-                pos = startpos
-            else:
-                pos += 1
-                force_space = False
+        if self.replacements is not None:
+            pos = 0
+            force_space = False
+            while pos < len(baseform):
+                full, repl = self.replacements.longest_prefix_item(baseform[pos:],
+                                                                   (None, None))
+                if full is not None:
+                    done = baseform[startpos:pos]
+                    partials = [v + done + r
+                                for v, r in itertools.product(partials, repl)
+                                if not force_space or r.startswith(' ')]
+                    if len(partials) > 128:
+                        # If too many variants are produced, they are unlikely
+                        # to be helpful. Only use the original term.
+                        startpos = 0
+                        break
+                    startpos = pos + len(full)
+                    if full[-1] == ' ':
+                        startpos -= 1
+                        force_space = True
+                    pos = startpos
+                else:
+                    pos += 1
+                    force_space = False
  
          # No variants detected? Fast return.
          if startpos == 0:
+            if self.variant_only:
+                return []
+
              trans_name = self.to_ascii.transliterate(norm_name).strip()
              return [trans_name] if trans_name else []
  
-        return self._compute_result_set(partials, baseform[startpos:])
+        return self._compute_result_set(partials, baseform[startpos:],
+                                        norm_name if self.variant_only else '')
  
  
-    def _compute_result_set(self, partials, prefix):
+    def _compute_result_set(self, partials, prefix, exclude):
          results = set()
  
          for variant in partials:
-            vname = variant + prefix
-            trans_name = self.to_ascii.transliterate(vname[1:-1]).strip()
-            if trans_name:
-                results.add(trans_name)
+            vname = (variant + prefix)[1:-1].strip()
+            if vname != exclude:
+                trans_name = self.to_ascii.transliterate(vname).strip()
+                if trans_name:
+                    results.add(trans_name)
  
          return list(results)
diff --git a/nominatim/tools/country_info.py b/nominatim/tools/country_info.py

index e04a8693f116bccd6d7e609de0c463b74170e46a..635d15840a84b8197efb9f5cb358344a78a0c2b9 100644 (file)
--- a/nominatim/tools/country_info.py
+++ b/nominatim/tools/country_info.py
@@ -13,12 +13,21 @@ class _CountryInfo:
      def __init__(self):
          self._info = {}
  
+
      def load(self, config):
          """ Load the country properties from the configuration files,
              if they are not loaded yet.
          """
          if not self._info:
              self._info = config.load_sub_configuration('country_settings.yaml')
+            # Convert languages into a list for simpler handling.
+            for prop in self._info.values():
+                if 'languages' not in prop:
+                    prop['languages'] = []
+                elif not isinstance(prop['languages'], list):
+                    prop['languages'] = [x.strip()
+                                         for x in prop['languages'].split(',')]
+
  
      def items(self):
          """ Return tuples of (country_code, property dict) as iterable.
@@ -36,6 +45,12 @@ def setup_country_config(config):
      _COUNTRY_INFO.load(config)
  
  
+def iterate():
+    """ Iterate over country code and properties.
+    """
+    return _COUNTRY_INFO.items()
+
+
  def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
      """ Create and populate the tables with basic static data that provides
          the background for geocoding. Data is assumed to not yet exist.
@@ -50,10 +65,7 @@ def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
                  partition = 0
              else:
                  partition = props.get('partition')
-            if ',' in (props.get('languages', ',') or ','):
-                lang = None
-            else:
-                lang = props['languages']
+            lang = props['languages'][0] if len(props['languages']) == 1 else None
              params.append((ccode, partition, lang))
  
      with connect(dsn) as conn:
diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml

index 77b137a1b8019fcfce1facbee8c9f7fc32891e94..dcbb1847f8fd1d7158d4dae8122081346e061e34 100644 (file)
--- a/settings/country_settings.yaml
+++ b/settings/country_settings.yaml
@@ -171,7 +171,7 @@ bt:
  #  (Bouvet Island)
  bv:
      partition: 185
-    languages: no
+    languages: "no"
  
  # Botswana (Botswana)
  bw:
@@ -1006,7 +1006,7 @@ si:
  #  (Svalbard and Jan Mayen)
  sj:
      partition: 197
-    languages: no
+    languages: "no"
  
  # Slovakia (Slovensko)
  sk:
diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml

index d070adcbbd649122aa5a37c621271b7d8635cb01..41760c49e0fbd2122d2f1e7fd1966fc4278d1975 100644 (file)
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -27,36 +27,160 @@ transliteration:
  sanitizers:
      - step: split-name-list
      - step: strip-brace-terms
+    - step: tag-analyzer-by-language
+      filter-kind: [".*name.*"]
+      whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,no,pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
+      use-defaults: all
+      mode: append
  token-analysis:
      - analyzer: generic
+    - id: bg
+      analyzer: generic
+      mode: variant-only
        variants:
            - !include icu-rules/variants-bg.yaml
+    - id: ca
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-ca.yaml
+    - id: cs
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-cs.yaml
+    - id: da
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-da.yaml
+    - id: de
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-de.yaml
+    - id: el
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-el.yaml
+    - id: en
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-en.yaml
+    - id: es
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-es.yaml
+    - id: et
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-et.yaml
+    - id: eu
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-eu.yaml
+    - id: fi
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-fi.yaml
+    - id: fr
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-fr.yaml
+    - id: gl
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-gl.yaml
+    - id: hu
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-hu.yaml
+    - id: it
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-it.yaml
+    - id: ja
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-ja.yaml
+    - id: mg
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-mg.yaml
+    - id: ms
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-ms.yaml
+    - id: nl
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-nl.yaml
+    - id: no
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-no.yaml
+    - id: pl
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-pl.yaml
+    - id: pt
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-pt.yaml
+    - id: ro
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-ro.yaml
+    - id: ru
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-ru.yaml
+    - id: sk
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-sk.yaml
+    - id: sl
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-sl.yaml
+    - id: sv
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-sv.yaml
+    - id: tr
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-tr.yaml
+    - id: uk
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-uk.yaml
+    - id: vi
+      analyzer: generic
+      mode: variant-only
+      variants:
            - !include icu-rules/variants-vi.yaml
diff --git a/test/bdd/db/query/normalization.feature b/test/bdd/db/query/normalization.feature

index b8a760f99bd0bc03e127c14ae60de5f93fdf0290..deaa635e0b190733a99306795d2bef779f5caaa0 100644 (file)
--- a/test/bdd/db/query/normalization.feature
+++ b/test/bdd/db/query/normalization.feature
@@ -52,7 +52,7 @@ Feature: Import and search of names
  
      Scenario: Special characters in name
          Given the places
-          | osm | class | type      | name |
+          | osm | class | type      | name+name:de |
            | N1  | place | locality  | Jim-Knopf-Straße |
            | N2  | place | locality  | Smith/Weston |
            | N3  | place | locality  | space mountain |
diff --git a/test/python/tokenizer/token_analysis/test_generic.py b/test/python/tokenizer/token_analysis/test_generic.py

index f0ce4208e288afbe9fdce86f255abd6220c69de2..a9b09ea43fecd51a60d15a83cc0e7cfdd764b675 100644 (file)
--- a/test/python/tokenizer/token_analysis/test_generic.py
+++ b/test/python/tokenizer/token_analysis/test_generic.py
@@ -40,7 +40,7 @@ def cfgfile(def_config, tmp_path):
  
  
  def get_normalized_variants(proc, name):
-    return proc.get_variants_ascii(proc.get_normalized(name))
+    return proc.analysis[None].get_variants_ascii(proc.normalizer.transliterate(name).strip())
  
  
  def test_variants_empty(cfgfile):
@@ -99,6 +99,6 @@ def test_search_normalized(cfgfile):
      config = cfgfile('~street => s,st', 'master => mstr')
      proc = ICURuleLoader(config).make_token_analysis()
  
-    assert proc.get_search_normalized('Master Street') == 'master street'
-    assert proc.get_search_normalized('Earnes St') == 'earnes st'
-    assert proc.get_search_normalized('Nostreet') == 'nostreet'
+    assert proc.search.transliterate('Master Street').strip() == 'master street'
+    assert proc.search.transliterate('Earnes St').strip() == 'earnes st'
+    assert proc.search.transliterate('Nostreet').strip() == 'nostreet'
author	Sarah Hoffmann <lonvia@denofr.de>
	Tue, 5 Oct 2021 15:18:10 +0000 (17:18 +0200)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 6 Oct 2021 09:09:54 +0000 (11:09 +0200)
nominatim/tokenizer/icu_rule_loader.py		patch \| blob \| history
nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py	[new file with mode: 0644]	patch \| blob
nominatim/tokenizer/token_analysis/generic.py		patch \| blob \| history
nominatim/tools/country_info.py		patch \| blob \| history
settings/country_settings.yaml		patch \| blob \| history
settings/icu_tokenizer.yaml		patch \| blob \| history
test/bdd/db/query/normalization.feature		patch \| blob \| history
test/python/tokenizer/token_analysis/test_generic.py		patch \| blob \| history