X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/a0a7b05c9fd88ac1f808ef2345d71ba577096e40..4c52777ef03738803845f9ee58d269d93bbb9c3d:/nominatim/tokenizer/legacy_icu_tokenizer.py diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index af53e825..c585c5af 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -3,7 +3,6 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using libICU instead of the PostgreSQL module. """ from collections import Counter -import io import itertools import logging import re @@ -124,7 +123,7 @@ class LegacyICUTokenizer: """ return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules)) - + # pylint: disable=missing-format-attribute def _install_php(self, phpdir): """ Install the php script for the tokenizer. """ @@ -135,7 +134,7 @@ class LegacyICUTokenizer: @define('CONST_Term_Normalization_Rules', "{0.term_normalization}"); @define('CONST_Transliteration', "{0.naming_rules.search_rules}"); require_once('{1}/tokenizer/legacy_icu_tokenizer.php'); - """.format(self, phpdir))) # pylint: disable=missing-format-attribute + """.format(self, phpdir))) def _save_config(self, config): @@ -164,12 +163,17 @@ class LegacyICUTokenizer: words = Counter() name_proc = ICUNameProcessor(self.naming_rules) with conn.cursor(name="words") as cur: - cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v") + cur.execute(""" SELECT v, count(*) FROM + (SELECT svals(name) as v FROM place)x + WHERE length(v) < 75 GROUP BY v""") for name, cnt in cur: + terms = set() for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)): - for term in word.split(): - words[term] += cnt + if ' ' in word: + terms.update(word.split()) + for term in terms: + words[term] += cnt # copy them back into the word table with CopyBuffer() as copystr: @@ -178,7 +182,7 @@ class LegacyICUTokenizer: with conn.cursor() as cur: copystr.copy_out(cur, 'word', - columns=['word_token', 'search_name_count']) + columns=['word_token', 'search_name_count']) cur.execute("""UPDATE word SET word_id = nextval('seq_word') WHERE word_id is null""") @@ -336,7 +340,7 @@ class LegacyICUNameAnalyzer: for word, cls, typ, oper in to_add: term = self.name_processor.get_search_normalized(word) if term: - copystr.add(word, term, cls, typ, + copystr.add(word, ' ' + term, cls, typ, oper if oper in ('in', 'near') else None, 0) added += 1 @@ -447,6 +451,9 @@ class LegacyICUNameAnalyzer: full, part = self._cache.names.get(norm_name, (None, None)) if full is None: variants = self.name_processor.get_variants_ascii(norm_name) + if not variants: + continue + with self.conn.cursor() as cur: cur.execute("SELECT (getorcreate_full_word(%s, %s)).*", (norm_name, variants)) @@ -466,12 +473,13 @@ class LegacyICUNameAnalyzer: given dictionary of names. """ full_names = set() - for name in (n for ns in names.values() for n in re.split('[;,]', ns)): - full_names.add(name.strip()) + for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)): + if name: + full_names.add(name) - brace_idx = name.find('(') - if brace_idx >= 0: - full_names.add(name[:brace_idx].strip()) + brace_idx = name.find('(') + if brace_idx >= 0: + full_names.add(name[:brace_idx].strip()) return full_names