libICU instead of the PostgreSQL module.
"""
from collections import Counter
+import functools
import io
import itertools
import json
class LegacyICUTokenizer:
""" This tokenizer uses libICU to convert names and queries to ASCII.
Otherwise it uses the same algorithms and data structures as the
- normalization routines in Nominatm 3.
+ normalization routines in Nominatim 3.
"""
def __init__(self, dsn, data_dir):
Analyzers are not thread-safe. You need to instantiate one per thread.
"""
norm = Transliterator.createFromRules("normalizer", self.normalization)
- trans = Transliterator.createFromRules("normalizer", self.transliteration)
+ trans = Transliterator.createFromRules("trans", self.transliteration)
return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
# copy them back into the word table
copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
+
with conn.cursor() as cur:
+ copystr.seek(0)
cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
cur.execute("""UPDATE word SET word_id = nextval('seq_word')
WHERE word_id is null""")
self.normalizer = normalizer
self.transliterator = transliterator
self.abbreviations = abbreviations
- #psycopg2.extras.register_hstore(self.conn)
self._cache = _TokenCache()
self.conn = None
+ def get_word_token_info(self, conn, words):
+     """ Return token information for the given list of words.
+         If a word starts with # it is assumed to be a full name,
+         otherwise it is a partial name.
+
+         The function returns a list of tuples with
+         (original word, word token, word id). The word id is None
+         when the token has no matching row in the word table.
+
+         The function is used for testing and debugging only
+         and is not necessarily efficient.
+     """
+     tokens = {}
+     for word in words:
+         if word.startswith('#'):
+             # Full-name tokens carry a leading blank; the '#' marker
+             # itself is not part of the name.
+             tokens[word] = ' ' + self.make_standard_word(word[1:])
+         else:
+             tokens[word] = self.make_standard_word(word)
+
+     with conn.cursor() as cur:
+         cur.execute("""SELECT word_token, word_id
+                        FROM word, (SELECT unnest(%s::TEXT[]) as term) t
+                        WHERE word_token = t.term
+                              and class is null and country_code is null""",
+                     (list(tokens.values()), ))
+         ids = {r[0]: r[1] for r in cur}
+
+     # A token without a row in the word table must not abort the whole
+     # lookup with a KeyError; report it with a word id of None instead.
+     return [(k, v, ids.get(v)) for k, v in tokens.items()]
+
+
def normalize(self, phrase):
    """ Run the phrase through the normalizer and return the result,
        i.e. the phrase without the properties that are irrelevant
        for search.
    """
    normalized = self.normalizer.transliterate(phrase)
    return normalized
+ @staticmethod
+ def normalize_postcode(postcode):
+     """ Bring the postcode into its standardized form: surrounding
+         whitespace removed and all letters in upper case.
+
+         The result must be exactly what the SQL function
+         'token_normalized_postcode()' yields.
+     """
+     trimmed = postcode.strip()
+     return trimmed.upper()
+
+
+ @functools.lru_cache(maxsize=1024)
def make_standard_word(self, name):
- """ Create the normalised version of the name.
+ """ Create the normalised version of the input.
"""
norm = ' ' + self.transliterator.transliterate(name) + ' '
for full, abbr in self.abbreviations:
return self.transliterator.transliterate(hnr)
- def add_postcodes_from_db(self):
- """ Add postcodes from the location_postcode table to the word table.
+ def update_postcodes_from_db(self):
+ """ Update postcode tokens in the word table from the location_postcode
+ table.
+
+ Postcodes found only in location_postcode get a new row in the word
+ table; postcode rows of the word table that no longer appear in
+ location_postcode are deleted.
"""
+ to_delete = []
copystr = io.StringIO()
with self.conn.cursor() as cur:
- cur.execute("SELECT distinct(postcode) FROM location_postcode")
- for (postcode, ) in cur:
- copystr.write(postcode)
- copystr.write('\t ')
- copystr.write(self.transliterator.transliterate(postcode))
- copystr.write('\tplace\tpostcode\t0\n')
-
- cur.copy_from(copystr, 'word',
- columns=['word', 'word_token', 'class', 'type',
- 'search_name_count'])
- # Don't really need an ID for postcodes....
- # cur.execute("""UPDATE word SET word_id = nextval('seq_word')
- # WHERE word_id is null and type = 'postcode'""")
-
-
- def update_special_phrases(self, phrases):
+ # This finds us the rows in location_postcode and word that are
+ # missing in the other table.
+ cur.execute("""SELECT * FROM
+ (SELECT pc, word FROM
+ (SELECT distinct(postcode) as pc FROM location_postcode) p
+ FULL JOIN
+ (SELECT word FROM word
+ WHERE class ='place' and type = 'postcode') w
+ ON pc = word) x
+ WHERE pc is null or word is null""")
+
+ # pc = NULL: the postcode exists only in the word table and is
+ # collected for deletion. Otherwise it exists only in
+ # location_postcode and a tab-separated row is buffered for COPY.
+ for postcode, word in cur:
+ if postcode is None:
+ to_delete.append(word)
+ else:
+ copystr.write(postcode)
+ # the word_token column starts with a blank
+ copystr.write('\t ')
+ copystr.write(self.transliterator.transliterate(postcode))
+ copystr.write('\tplace\tpostcode\t0\n')
+
+ if to_delete:
+ cur.execute("""DELETE FROM WORD
+ WHERE class ='place' and type = 'postcode'
+ and word = any(%s)
+ """, (to_delete, ))
+
+ if copystr.getvalue():
+ # Rewind the buffer so that copy_from() reads the rows written above.
+ copystr.seek(0)
+ cur.copy_from(copystr, 'word',
+ columns=['word', 'word_token', 'class', 'type',
+ 'search_name_count'])
+
+
+ def update_special_phrases(self, phrases, should_replace):
""" Replace the search index for special phrases with the new phrases.
"""
norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
copystr.write(oper if oper in ('in', 'near') else '\\N')
copystr.write('\t0\n')
+ copystr.seek(0)
cur.copy_from(copystr, 'word',
columns=['word', 'word_token', 'class', 'type',
'operator', 'search_name_count'])
- if to_delete:
+ if to_delete and should_replace:
psycopg2.extras.execute_values(
cur,
""" DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
"""
full_names = set((self.make_standard_word(n) for n in names))
full_names.discard('')
- self._add_normalised_country_names(country_code, full_names)
+ self._add_normalized_country_names(country_code, full_names)
- def _add_normalised_country_names(self, country_code, names):
+ def _add_normalized_country_names(self, country_code, names):
      """ Add names for the given country to the search index.
+
+         'names' is the collection of normalized names; each one is turned
+         into a token with a leading blank. Tokens that already exist in
+         the word table for 'country_code' are not inserted again.
      """
+     word_tokens = set((' ' + name for name in names))
      with self.conn.cursor() as cur:
          # Get existing names
          cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                      (country_code, ))
-         new_names = names.difference((t[0] for t in cur))
+         word_tokens.difference_update((t[0] for t in cur))
-         if new_names:
+         if word_tokens:
+             # The country code is passed as a bound parameter instead of
+             # being spliced into the SQL text with str.format().
              cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                               search_name_count)
-                            (SELECT nextval('seq_word'), token, '{}', 0
+                            (SELECT nextval('seq_word'), token, %s, 0
                              FROM unnest(%s) as token)
-                           """.format(country_code), (list(new_names),))
+                           """, (country_code, list(word_tokens)))
def process_place(self, place):
names = place.get('name')
if names:
- full_names = set((self.make_standard_word(name) for name in names.values()))
- full_names.discard('')
+ full_names = self._compute_full_names(names)
token_info.add_names(self.conn, full_names)
country_feature = place.get('country_feature')
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
- self._add_normalised_country_names(country_feature.lower(),
+ self._add_normalized_country_names(country_feature.lower(),
full_names)
address = place.get('address')
return token_info.data
+ def _compute_full_names(self, names):
+ """ Return the set of all normalized full names to be used with the
+ given dictionary of names.
+
+ Each value of the dictionary is split at ';' and ',' and every part
+ is run through make_standard_word(); empty results are dropped.
+ For a part containing '(', the text in front of the first bracket
+ is processed the same way and added as well when non-empty.
+ """
+ full_names = set()
+ for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
+ word = self.make_standard_word(name)
+ if word:
+ full_names.add(word)
+
+ # Only element 0 (the text before the first '(') is used below.
+ brace_split = name.split('(', 2)
+ if len(brace_split) > 1:
+ word = self.make_standard_word(brace_split[0])
+ if word:
+ full_names.add(word)
+
+ return full_names
+
+
def _add_postcode(self, postcode):
""" Make sure the normalized postcode is present in the word table.
"""
- if re.search(r'[:,;]', postcode) is None and not postcode in self._cache.postcodes:
- term = self.make_standard_word(postcode)
- if not term:
- return
-
- with self.conn.cursor() as cur:
- # no word_id needed for postcodes
- cur.execute("""INSERT INTO word (word, word_token, class, type,
- search_name_count)
- (SELECT pc, %s, 'place', 'postcode', 0
- FROM (VALUES (%s)) as v(pc)
- WHERE NOT EXISTS
- (SELECT * FROM word
- WHERE word = pc and class='place' and type='postcode'))
- """, (' ' + term, postcode))
- self._cache.postcodes.add(postcode)
+ # Only values without ':', ',' or ';' are processed.
+ if re.search(r'[:,;]', postcode) is None:
+ # Normalize first, so that the cache lookup, the inserted 'word'
+ # value and the cache entry all use the normalized form.
+ postcode = self.normalize_postcode(postcode)
+
+ if postcode not in self._cache.postcodes:
+ term = self.make_standard_word(postcode)
+ if not term:
+ return
+
+ with self.conn.cursor() as cur:
+ # no word_id needed for postcodes
+ cur.execute("""INSERT INTO word (word, word_token, class, type,
+ search_name_count)
+ (SELECT pc, %s, 'place', 'postcode', 0
+ FROM (VALUES (%s)) as v(pc)
+ WHERE NOT EXISTS
+ (SELECT * FROM word
+ WHERE word = pc and class='place' and type='postcode'))
+ """, (' ' + term, postcode))
+ self._cache.postcodes.add(postcode)
@staticmethod
def _split_housenumbers(hnrs):
"""
# Start with all partial names
terms = set((part for ns in names for part in ns.split()))
- # Add partials for the full terms (TO BE REMOVED)
- terms.update((n for n in names))
# Add the full names
terms.update((' ' + n for n in names))