X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/f44af49df9b86c9a35db000f7a42e65a6c4307ba..87907916ff06741bd4627101d30e6e11b8ea1a1a:/nominatim/tokenizer/legacy_icu_tokenizer.py

diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index 09d4059e..2bd22c72 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -3,6 +3,7 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
 from collections import Counter
+import functools
 import io
 import itertools
 import json
@@ -34,7 +35,7 @@ def create(dsn, data_dir):
 class LegacyICUTokenizer:
     """ This tokenizer uses libICU to convert names and queries to ASCII.
         Otherwise it uses the same algorithms and data structures as the
-        normalization routines in Nominatm 3.
+        normalization routines in Nominatim 3.
     """
 
     def __init__(self, dsn, data_dir):
@@ -126,22 +127,25 @@ class LegacyICUTokenizer:
             Analyzers are not thread-safe. You need to instantiate one per thread.
         """
         norm = Transliterator.createFromRules("normalizer", self.normalization)
-        trans = Transliterator.createFromRules("normalizer", self.transliteration)
+        trans = Transliterator.createFromRules("trans", self.transliteration)
         return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
 
 
     def _install_php(self, config):
         """ Install the php script for the tokenizer.
         """
+        abbr_inverse = list(zip(*self.abbreviations))
         php_file = self.data_dir / "tokenizer.php"
         php_file.write_text(dedent("""\
[...]
+                    word = self.make_standard_word(brace_split[0])
+                    if word:
+                        full_names.add(word)
+
+        return full_names
+
+
     def _add_postcode(self, postcode):
         """ Make sure the normalized postcode is present in the word table.
         """
-        if re.search(r'[:,;]', postcode) is None and not postcode in self._cache.postcodes:
-            term = self.make_standard_word(postcode)
-            if not term:
-                return
-
-            with self.conn.cursor() as cur:
-                # no word_id needed for postcodes
-                cur.execute("""INSERT INTO word (word, word_token, class, type,
-                                                 search_name_count)
-                               (SELECT pc, %s, 'place', 'postcode', 0
-                                FROM (VALUES (%s)) as v(pc)
-                                WHERE NOT EXISTS
-                                 (SELECT * FROM word
-                                  WHERE word = pc and class='place' and type='postcode'))
-                               """, (' ' + term, postcode))
-            self._cache.postcodes.add(postcode)
+        if re.search(r'[:,;]', postcode) is None:
+            postcode = self.normalize_postcode(postcode)
+
+            if postcode not in self._cache.postcodes:
+                term = self.make_standard_word(postcode)
+                if not term:
+                    return
+
+                with self.conn.cursor() as cur:
+                    # no word_id needed for postcodes
+                    cur.execute("""INSERT INTO word (word, word_token, class, type,
+                                                     search_name_count)
+                                   (SELECT pc, %s, 'place', 'postcode', 0
+                                    FROM (VALUES (%s)) as v(pc)
+                                    WHERE NOT EXISTS
+                                     (SELECT * FROM word
+                                      WHERE word = pc and class='place' and type='postcode'))
+                                   """, (' ' + term, postcode))
+                self._cache.postcodes.add(postcode)
 
     @staticmethod
     def _split_housenumbers(hnrs):
@@ -449,8 +537,6 @@ class _TokenInfo:
         """
         # Start with all partial names
         terms = set((part for ns in names for part in ns.split()))
-        # Add partials for the full terms (TO BE REMOVED)
-        terms.update((n for n in names))
 
         # Add the full names
         terms.update((' ' + n for n in names))
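
The third hunk touches the analyzer factory, whose docstring warns that analyzers are not thread-safe and must be instantiated once per thread. A minimal usage sketch under that constraint, assuming the factory is exposed as the tokenizer's name_analyzer() method (the method name is not visible in the hunk) and that `tokenizer` is an already initialised LegacyICUTokenizer:

    import threading

    _local = threading.local()

    def get_analyzer(tokenizer):
        # Create one analyzer per thread and cache it in thread-local
        # storage; the ICU Transliterator objects held by the analyzer
        # must never be shared between threads.
        if not hasattr(_local, 'analyzer'):
            _local.analyzer = tokenizer.name_analyzer()
        return _local.analyzer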
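
The added fragment just above the _add_postcode hunk (part of the new full-name computation) splits each name at an opening brace and also indexes the part before the brace as a full name. A toy illustration of that split; the example name is made up:

    name = 'Main Street (north section)'

    brace_split = name.split('(', 2)
    # brace_split == ['Main Street ', 'north section)']

    if len(brace_split) > 1:
        # The text before the brace becomes an additional full name,
        # so a search for just "Main Street" still matches.
        print(brace_split[0].strip())   # -> Main Street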
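
The _add_postcode hunk moves the cache lookup behind a normalize_postcode() call, so equivalent but differently spelled postcodes collapse to one cache entry and one database insert. A sketch of the effect, assuming for illustration only that normalization strips whitespace and uppercases (the real normalize_postcode() may do more):

    def normalize_postcode(postcode):
        # Hypothetical stand-in for the analyzer's normalize_postcode().
        return postcode.strip().upper()

    seen = set()
    for raw in ('ab1 2cd', ' AB1 2CD', 'Ab1 2Cd'):
        pc = normalize_postcode(raw)
        if pc not in seen:
            # Only the first spelling would trigger the INSERT; the
            # WHERE NOT EXISTS clause guards the database side as well.
            seen.add(pc)

    assert seen == {'AB1 2CD'}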
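
The final hunk drops the duplicate update that also fed full names into the partial-term set. What remains is a clean split between partial terms (single words) and full terms (whole names, marked with a leading blank). A small worked example using the two expressions from the hunk:

    names = ['north street', 'n st']

    # Partial terms: every whitespace-separated word.
    terms = set(part for ns in names for part in ns.split())
    assert terms == {'north', 'street', 'n', 'st'}

    # Full terms: the complete name with a leading blank as marker.
    terms.update(' ' + n for n in names)
    assert {' north street', ' n st'} <= terms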