libICU instead of the PostgreSQL module.
"""
from collections import Counter
-import functools
-import io
import itertools
-import json
import logging
import re
from textwrap import dedent
from pathlib import Path
-from icu import Transliterator
-import psycopg2.extras
-
from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
+from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
"""
self.init_from_project()
- if self.normalization is None\
- or self.transliteration is None\
- or self.abbreviations is None:
+ if self.naming_rules is None:
return "Configuration for tokenizer 'legacy_icu' are missing."
return None
"""
return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
-
+ # pylint: disable=missing-format-attribute
def _install_php(self, phpdir):
""" Install the php script for the tokenizer.
"""
words = Counter()
name_proc = ICUNameProcessor(self.naming_rules)
with conn.cursor(name="words") as cur:
- cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
+ cur.execute(""" SELECT v, count(*) FROM
+ (SELECT svals(name) as v FROM place)x
+ WHERE length(v) < 75 GROUP BY v""")
for name, cnt in cur:
+ terms = set()
for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
- for term in word.split():
- words[term] += cnt
+ if ' ' in word:
+ terms.update(word.split())
+ for term in terms:
+ words[term] += cnt
# copy them back into the word table
- copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
+ with CopyBuffer() as copystr:
+ for args in words.items():
+ copystr.add(*args)
-
- with conn.cursor() as cur:
- copystr.seek(0)
- cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
- cur.execute("""UPDATE word SET word_id = nextval('seq_word')
- WHERE word_id is null""")
+ with conn.cursor() as cur:
+ copystr.copy_out(cur, 'word',
+ columns=['word_token', 'search_name_count'])
+ cur.execute("""UPDATE word SET word_id = nextval('seq_word')
+ WHERE word_id is null""")
conn.commit()
self.conn = None
- def get_word_token_info(self, conn, words):
+ def get_word_token_info(self, words):
""" Return token information for the given list of words.
If a word starts with # it is assumed to be a full name
otherwise is a partial name.
tokens = {}
for word in words:
if word.startswith('#'):
- tokens[word] = ' ' + self.name_processor.get_normalized(word[1:])
+ tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
else:
- tokens[word] = self.name_processor.get_normalized(word)
+ tokens[word] = self.name_processor.get_search_normalized(word)
- with conn.cursor() as cur:
+ with self.conn.cursor() as cur:
cur.execute("""SELECT word_token, word_id
FROM word, (SELECT unnest(%s::TEXT[]) as term) t
WHERE word_token = t.term
(list(tokens.values()), ))
ids = {r[0]: r[1] for r in cur}
- return [(k, v, ids[v]) for k, v in tokens.items()]
+ return [(k, v, ids.get(v, None)) for k, v in tokens.items()]
@staticmethod
table.
"""
to_delete = []
- copystr = io.StringIO()
with self.conn.cursor() as cur:
# This finds us the rows in location_postcode and word that are
# missing in the other table.
ON pc = word) x
WHERE pc is null or word is null""")
- for postcode, word in cur:
- if postcode is None:
- to_delete.append(word)
- else:
- copystr.write(postcode)
- copystr.write('\t ')
- copystr.write(self.name_processor.get_search_normalized(postcode))
- copystr.write('\tplace\tpostcode\t0\n')
+ with CopyBuffer() as copystr:
+ for postcode, word in cur:
+ if postcode is None:
+ to_delete.append(word)
+ else:
+ copystr.add(
+ postcode,
+ ' ' + self.name_processor.get_search_normalized(postcode),
+ 'place', 'postcode', 0)
- if to_delete:
- cur.execute("""DELETE FROM WORD
- WHERE class ='place' and type = 'postcode'
- and word = any(%s)
- """, (to_delete, ))
+ if to_delete:
+ cur.execute("""DELETE FROM WORD
+ WHERE class ='place' and type = 'postcode'
+ and word = any(%s)
+ """, (to_delete, ))
- if copystr.getvalue():
- copystr.seek(0)
- cur.copy_from(copystr, 'word',
- columns=['word', 'word_token', 'class', 'type',
- 'search_name_count'])
+ copystr.copy_out(cur, 'word',
+ columns=['word', 'word_token', 'class', 'type',
+ 'search_name_count'])
def update_special_phrases(self, phrases, should_replace):
""" Replace the search index for special phrases with the new phrases.
"""
- norm_phrases = set(((self.name_processor.get_search_normalized(p[0]), p[1], p[2], p[3])
+ norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
for p in phrases))
with self.conn.cursor() as cur:
for label, cls, typ, oper in cur:
existing_phrases.add((label, cls, typ, oper or '-'))
- to_add = norm_phrases - existing_phrases
- to_delete = existing_phrases - norm_phrases
-
- if to_add:
- copystr = io.StringIO()
- for word, cls, typ, oper in to_add:
- term = self.name_processor.get_search_normalized(word)
- if term:
- copystr.write(word)
- copystr.write('\t ')
- copystr.write(term)
- copystr.write('\t')
- copystr.write(cls)
- copystr.write('\t')
- copystr.write(typ)
- copystr.write('\t')
- copystr.write(oper if oper in ('in', 'near') else '\\N')
- copystr.write('\t0\n')
-
- copystr.seek(0)
- cur.copy_from(copystr, 'word',
- columns=['word', 'word_token', 'class', 'type',
- 'operator', 'search_name_count'])
-
- if to_delete and should_replace:
- psycopg2.extras.execute_values(
- cur,
- """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
- WHERE word = name and class = in_class and type = in_type
- and ((op = '-' and operator is null) or op = operator)""",
- to_delete)
+ added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
+ if should_replace:
+ deleted = self._remove_special_phrases(cur, norm_phrases,
+ existing_phrases)
+ else:
+ deleted = 0
LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
- len(norm_phrases), len(to_add), len(to_delete))
+ len(norm_phrases), added, deleted)
+
+
+ def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
+ """ Add all phrases to the database that are not yet there.
+ """
+ to_add = new_phrases - existing_phrases
+
+ added = 0
+ with CopyBuffer() as copystr:
+ for word, cls, typ, oper in to_add:
+ term = self.name_processor.get_search_normalized(word)
+ if term:
+ copystr.add(word, ' ' + term, cls, typ,
+ oper if oper in ('in', 'near') else None, 0)
+ added += 1
+
+ copystr.copy_out(cursor, 'word',
+ columns=['word', 'word_token', 'class', 'type',
+ 'operator', 'search_name_count'])
+
+ return added
+
+
+ @staticmethod
+ def _remove_special_phrases(cursor, new_phrases, existing_phrases):
+ """ Remove all phrases from the databse that are no longer in the
+ new phrase list.
+ """
+ to_delete = existing_phrases - new_phrases
+
+ if to_delete:
+ cursor.execute_values(
+ """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
+ WHERE word = name and class = in_class and type = in_type
+ and ((op = '-' and operator is null) or op = operator)""",
+ to_delete)
+
+ return len(to_delete)
def add_country_names(self, country_code, names):
if word_tokens:
cur.execute("""INSERT INTO word (word_id, word_token, country_code,
search_name_count)
- (SELECT nextval('seq_word'), token, '{}', 0
+ (SELECT nextval('seq_word'), token, %s, 0
FROM unnest(%s) as token)
- """.format(country_code), (list(word_tokens),))
+ """, (country_code, list(word_tokens)))
def process_place(self, place):
self.add_country_names(country_feature.lower(), names)
address = place.get('address')
-
if address:
- hnrs = []
- addr_terms = []
- for key, value in address.items():
- if key == 'postcode':
- self._add_postcode(value)
- elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
- hnrs.append(value)
- elif key == 'street':
- token_info.add_street(*self._compute_name_tokens({'name': value}))
- elif key == 'place':
- token_info.add_place(*self._compute_name_tokens({'name': value}))
- elif not key.startswith('_') and \
- key not in ('country', 'full'):
- addr_terms.append((key, *self._compute_name_tokens({'name': value})))
-
- if hnrs:
- hnrs = self._split_housenumbers(hnrs)
- token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
-
- if addr_terms:
- token_info.add_address_terms(addr_terms)
+ self._process_place_address(token_info, address)
return token_info.data
+ def _process_place_address(self, token_info, address):
+ hnrs = []
+ addr_terms = []
+ for key, value in address.items():
+ if key == 'postcode':
+ self._add_postcode(value)
+ elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+ hnrs.append(value)
+ elif key == 'street':
+ token_info.add_street(*self._compute_name_tokens({'name': value}))
+ elif key == 'place':
+ token_info.add_place(*self._compute_name_tokens({'name': value}))
+ elif not key.startswith('_') and \
+ key not in ('country', 'full'):
+ addr_terms.append((key, *self._compute_name_tokens({'name': value})))
+
+ if hnrs:
+ hnrs = self._split_housenumbers(hnrs)
+ token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
+
+ if addr_terms:
+ token_info.add_address_terms(addr_terms)
+
+
def _compute_name_tokens(self, names):
""" Computes the full name and partial name tokens for the given
dictionary of names.
full, part = self._cache.names.get(norm_name, (None, None))
if full is None:
variants = self.name_processor.get_variants_ascii(norm_name)
+ if not variants:
+ continue
+
with self.conn.cursor() as cur:
cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
(norm_name, variants))
return full_tokens, partial_tokens
- def _compute_full_names(self, names):
+ @staticmethod
+ def _compute_full_names(names):
""" Return the set of all full name word ids to be used with the
given dictionary of names.
"""
full_names = set()
- for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
- full_names.add(name.strip())
+ for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
+ if name:
+ full_names.add(name)
- brace_idx = name.find('(')
- if brace_idx >= 0:
- full_names.add(name[:brace_idx].strip())
+ brace_idx = name.find('(')
+ if brace_idx >= 0:
+ full_names.add(name[:brace_idx].strip())
return full_names
self.data['hnr'] = ';'.join(hnrs)
- def add_street(self, fulls, partials):
+ def add_street(self, fulls, _):
""" Add addr:street match terms.
"""
if fulls: