]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/icu_tokenizer.py
Merge remote-tracking branch 'upstream/master'
[nominatim.git] / nominatim / tokenizer / icu_tokenizer.py
index 87906d71d75484639078c56d7b1dd9c0295a8572..12d1eccd15f1799b6b45af4df6b0b39ec6a93674 100644 (file)
@@ -13,6 +13,7 @@ from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.indexer.place_info import PlaceInfo
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
@@ -107,7 +108,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
 
             Analyzers are not thread-safe. You need to instantiate one per thread.
         """
-        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis())
+        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
+                                     self.loader.make_token_analysis())
 
 
     def _install_php(self, phpdir):
@@ -162,7 +164,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         """ Count the partial terms from the names in the place table.
         """
         words = Counter()
-        name_proc = self.loader.make_token_analysis()
+        analysis = self.loader.make_token_analysis()
 
         with conn.cursor(name="words") as cur:
             cur.execute(""" SELECT v, count(*) FROM
@@ -170,12 +172,10 @@ class LegacyICUTokenizer(AbstractTokenizer):
                             WHERE length(v) < 75 GROUP BY v""")
 
             for name, cnt in cur:
-                terms = set()
-                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
-                    if ' ' in word:
-                        terms.update(word.split())
-                for term in terms:
-                    words[term] += cnt
+                word = analysis.search.transliterate(name)
+                if word and ' ' in word:
+                    for term in set(word.split()):
+                        words[term] += cnt
 
         return words
 
@@ -187,10 +187,11 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         normalization.
     """
 
-    def __init__(self, dsn, name_proc):
+    def __init__(self, dsn, sanitizer, token_analysis):
         self.conn = connect(dsn).connection
         self.conn.autocommit = True
-        self.name_processor = name_proc
+        self.sanitizer = sanitizer
+        self.token_analysis = token_analysis
 
         self._cache = _TokenCache()
 
@@ -203,6 +204,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
             self.conn = None
 
 
+    def _search_normalized(self, name):
+        """ Return the search token transliteration of the given name.
+        """
+        return self.token_analysis.search.transliterate(name).strip()
+
+
+    def _normalized(self, name):
+        """ Return the normalized version of the given name with all
+            non-relevant information removed.
+        """
+        return self.token_analysis.normalizer.transliterate(name).strip()
+
+
     def get_word_token_info(self, words):
         """ Return token information for the given list of words.
             If a word starts with # it is assumed to be a full name
@@ -218,9 +232,9 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         partial_tokens = {}
         for word in words:
             if word.startswith('#'):
-                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
+                full_tokens[word] = self._search_normalized(word[1:])
             else:
-                partial_tokens[word] = self.name_processor.get_search_normalized(word)
+                partial_tokens[word] = self._search_normalized(word)
 
         with self.conn.cursor() as cur:
             cur.execute("""SELECT word_token, word_id
@@ -251,7 +265,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
             This function takes minor shortcuts on transliteration.
         """
-        return self.name_processor.get_search_normalized(hnr)
+        return self._search_normalized(hnr)
 
     def update_postcodes_from_db(self):
         """ Update postcode tokens in the word table from the location_postcode
@@ -274,7 +288,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                     if postcode is None:
                         to_delete.append(word)
                     else:
-                        copystr.add(self.name_processor.get_search_normalized(postcode),
+                        copystr.add(self._search_normalized(postcode),
                                     'P', postcode)
 
                 if to_delete:
@@ -292,7 +306,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
             completely replaced. Otherwise the phrases are added to the
             already existing ones.
         """
-        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                             for p in phrases))
 
         with self.conn.cursor() as cur:
@@ -322,7 +336,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         added = 0
         with CopyBuffer() as copystr:
             for word, cls, typ, oper in to_add:
-                term = self.name_processor.get_search_normalized(word)
+                term = self._search_normalized(word)
                 if term:
                     copystr.add(term, 'S', word,
                                 json.dumps({'class': cls, 'type': typ,
@@ -356,9 +370,21 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def add_country_names(self, country_code, names):
         """ Add names for the given country to the search index.
         """
+        # Make sure any name preprocessing for country names applies.
+        info = PlaceInfo({'name': names, 'country_code': country_code,
+                          'rank_address': 4, 'class': 'boundary',
+                          'type': 'administrative'})
+        self._add_country_full_names(country_code,
+                                     self.sanitizer.process_names(info)[0])
+
+
+    def _add_country_full_names(self, country_code, names):
+        """ Add names for the given country from an already sanitized
+            name list.
+        """
         word_tokens = set()
-        for name in self._compute_full_names(names):
-            norm_name = self.name_processor.get_search_normalized(name)
+        for name in names:
+            norm_name = self._search_normalized(name.name)
             if norm_name:
                 word_tokens.add(norm_name)
 
@@ -384,12 +410,12 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def process_place(self, place):
         """ Determine tokenizer information about the given place.
 
-            Returns a JSON-serialisable structure that will be handed into
+            Returns a JSON-serializable structure that will be handed into
             the database via the token_info field.
         """
         token_info = _TokenInfo(self._cache)
 
-        names = place.name
+        names, address = self.sanitizer.process_names(place)
 
         if names:
             fulls, partials = self._compute_name_tokens(names)
@@ -397,9 +423,8 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
             token_info.add_names(fulls, partials)
 
             if place.is_country():
-                self.add_country_names(place.country_code, names)
+                self._add_country_full_names(place.country_code, names)
 
-        address = place.address
         if address:
             self._process_place_address(token_info, address)
 
@@ -409,18 +434,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def _process_place_address(self, token_info, address):
         hnrs = []
         addr_terms = []
-        for key, value in address.items():
-            if key == 'postcode':
-                self._add_postcode(value)
-            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                hnrs.append(value)
-            elif key == 'street':
-                token_info.add_street(self._compute_partial_tokens(value))
-            elif key == 'place':
-                token_info.add_place(self._compute_partial_tokens(value))
-            elif not key.startswith('_') and \
-                 key not in ('country', 'full'):
-                addr_terms.append((key, self._compute_partial_tokens(value)))
+        for item in address:
+            if item.kind == 'postcode':
+                self._add_postcode(item.name)
+            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                hnrs.append(item.name)
+            elif item.kind == 'street':
+                token_info.add_street(self._compute_partial_tokens(item.name))
+            elif item.kind == 'place':
+                token_info.add_place(self._compute_partial_tokens(item.name))
+            elif not item.kind.startswith('_') and \
+                 item.kind not in ('country', 'full'):
+                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
 
         if hnrs:
             hnrs = self._split_housenumbers(hnrs)
@@ -429,11 +454,12 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         if addr_terms:
             token_info.add_address_terms(addr_terms)
 
+
     def _compute_partial_tokens(self, name):
         """ Normalize the given term, split it into partial words and return
             then token list for them.
         """
-        norm_name = self.name_processor.get_search_normalized(name)
+        norm_name = self._search_normalized(name)
 
         tokens = []
         need_lookup = []
@@ -456,28 +482,34 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
         return tokens
 
+
     def _compute_name_tokens(self, names):
         """ Computes the full name and partial name tokens for the given
             dictionary of names.
         """
-        full_names = self._compute_full_names(names)
         full_tokens = set()
         partial_tokens = set()
 
-        for name in full_names:
-            norm_name = self.name_processor.get_normalized(name)
-            full, part = self._cache.names.get(norm_name, (None, None))
+        for name in names:
+            analyzer_id = name.get_attr('analyzer')
+            norm_name = self._normalized(name.name)
+            if analyzer_id is None:
+                token_id = norm_name
+            else:
+                token_id = f'{norm_name}@{analyzer_id}'
+
+            full, part = self._cache.names.get(token_id, (None, None))
             if full is None:
-                variants = self.name_processor.get_variants_ascii(norm_name)
+                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
                 if not variants:
                     continue
 
                 with self.conn.cursor() as cur:
                     cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
-                                (norm_name, variants))
+                                (token_id, variants))
                     full, part = cur.fetchone()
 
-                self._cache.names[norm_name] = (full, part)
+                self._cache.names[token_id] = (full, part)
 
             full_tokens.add(full)
             partial_tokens.update(part)
@@ -485,23 +517,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         return full_tokens, partial_tokens
 
 
-    @staticmethod
-    def _compute_full_names(names):
-        """ Return the set of all full name word ids to be used with the
-            given dictionary of names.
-        """
-        full_names = set()
-        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
-            if name:
-                full_names.add(name)
-
-                brace_idx = name.find('(')
-                if brace_idx >= 0:
-                    full_names.add(name[:brace_idx].strip())
-
-        return full_names
-
-
     def _add_postcode(self, postcode):
         """ Make sure the normalized postcode is present in the word table.
         """
@@ -509,7 +524,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
             postcode = self.normalize_postcode(postcode)
 
             if postcode not in self._cache.postcodes:
-                term = self.name_processor.get_search_normalized(postcode)
+                term = self._search_normalized(postcode)
                 if not term:
                     return