from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
+from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
return LegacyICUTokenizer(dsn, data_dir)
-class LegacyICUTokenizer:
+class LegacyICUTokenizer(AbstractTokenizer):
""" This tokenizer uses libICU to covert names and queries to ASCII.
Otherwise it uses the same algorithms and data structures as the
normalization routines in Nominatim 3.
""" Do any required postprocessing to make the tokenizer data ready
for use.
"""
- pass
def update_sql_functions(self, config):
LOG.warning("Precomputing word tokens")
# get partial words and their frequencies
- words = Counter()
- name_proc = ICUNameProcessor(self.naming_rules)
- with conn.cursor(name="words") as cur:
- cur.execute(""" SELECT v, count(*) FROM
- (SELECT svals(name) as v FROM place)x
- WHERE length(v) < 75 GROUP BY v""")
-
- for name, cnt in cur:
- terms = set()
- for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
- if ' ' in word:
- terms.update(word.split())
- for term in terms:
- words[term] += cnt
+ words = self._count_partial_terms(conn)
# copy them back into the word table
with CopyBuffer() as copystr:
- for k, v in words.items():
- copystr.add('w', k, json.dumps({'count': v}))
+ for term, cnt in words.items():
+ copystr.add('w', term, json.dumps({'count': cnt}))
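+                    # Each row is e.g. ('w', 'hauptstr', '{"count": 42}') for the
+                    # columns (type, word_token, info); values here are illustrative.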
with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['type', 'word_token', 'info'])
conn.commit()
+ def _count_partial_terms(self, conn):
+ """ Count the partial terms from the names in the place table.
+ """
+ words = Counter()
+ name_proc = ICUNameProcessor(self.naming_rules)
+
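+        # The named cursor makes psycopg2 create a server-side cursor, so the
+        # place names are streamed rather than loaded into memory all at once.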
+ with conn.cursor(name="words") as cur:
+ cur.execute(""" SELECT v, count(*) FROM
+ (SELECT svals(name) as v FROM place)x
+ WHERE length(v) < 75 GROUP BY v""")
+
+ for name, cnt in cur:
+ terms = set()
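+                # Only variants consisting of several words contribute partial
+                # terms; each distinct term is counted once per name.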
+ for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
+ if ' ' in word:
+ terms.update(word.split())
+ for term in terms:
+ words[term] += cnt
+
+ return words
+
-class LegacyICUNameAnalyzer:
+class LegacyICUNameAnalyzer(AbstractAnalyzer):
""" The legacy analyzer uses the ICU library for splitting names.
        Each instance opens a connection to the database to request the
        normalization.
self._cache = _TokenCache()
- def __enter__(self):
- return self
-
-
- def __exit__(self, exc_type, exc_value, traceback):
- self.close()
-
-
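+    # Context-manager support (__enter__/__exit__) is now inherited from
+    # AbstractAnalyzer and no longer needs to be implemented here.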
def close(self):
""" Free all resources used by the analyzer.
"""
partial_tokens[word] = self.name_processor.get_search_normalized(word)
with self.conn.cursor() as cur:
- cur.execute("""(SELECT word_token, word_id
- FROM word WHERE word_token = ANY(%s) and type = 'W')
- UNION
- (SELECT word_token, word_id
- FROM word WHERE word_token = ANY(%s) and type = 'w')""",
- (list(full_tokens.values()),
- list(partial_tokens.values())))
- ids = {r[0]: r[1] for r in cur}
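+            # Full words (type 'W') and partial words (type 'w') are looked up
+            # separately, so a token string present as both types keeps both ids.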
+ cur.execute("""SELECT word_token, word_id
+ FROM word WHERE word_token = ANY(%s) and type = 'W'
+ """, (list(full_tokens.values()),))
+ full_ids = {r[0]: r[1] for r in cur}
+ cur.execute("""SELECT word_token, word_id
+ FROM word WHERE word_token = ANY(%s) and type = 'w'""",
+ (list(partial_tokens.values()),))
+ part_ids = {r[0]: r[1] for r in cur}
- return [(k, v, ids.get(v, None)) for k, v in full_tokens.items()] \
- + [(k, v, ids.get(v, None)) for k, v in partial_tokens.items()]
+ return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
+ + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
@staticmethod
(SELECT pc, word FROM
(SELECT distinct(postcode) as pc FROM location_postcode) p
FULL JOIN
- (SELECT info->>'postcode' as word FROM word WHERE type = 'P') w
+ (SELECT word FROM word WHERE type = 'P') w
ON pc = word) x
WHERE pc is null or word is null""")
to_delete.append(word)
else:
copystr.add(self.name_processor.get_search_normalized(postcode),
- 'P', json.dumps({'postcode': postcode}))
+ 'P', postcode)
if to_delete:
cur.execute("""DELETE FROM WORD
- WHERE type ='P' and info->>'postcode' = any(%s)
+                           WHERE type = 'P' and word = any(%s)
""", (to_delete, ))
copystr.copy_out(cur, 'word',
- columns=['word_token', 'type', 'info'])
+ columns=['word_token', 'type', 'word'])
def update_special_phrases(self, phrases, should_replace):
with self.conn.cursor() as cur:
# Get the old phrases.
existing_phrases = set()
- cur.execute("SELECT info FROM word WHERE type = 'S'")
- for (info, ) in cur:
- existing_phrases.add((info['word'], info['class'], info['type'],
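+            # The phrase term now lives in the word column; class, type and
+            # operator remain in the info JSON.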
+ cur.execute("SELECT word, info FROM word WHERE type = 'S'")
+ for word, info in cur:
+ existing_phrases.add((word, info['class'], info['type'],
info.get('op') or '-'))
added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
for word, cls, typ, oper in to_add:
term = self.name_processor.get_search_normalized(word)
if term:
- copystr.add(term, 'S',
- json.dumps({'word': word, 'class': cls, 'type': typ,
+ copystr.add(term, 'S', word,
+ json.dumps({'class': cls, 'type': typ,
'op': oper if oper in ('in', 'near') else None}))
added += 1
copystr.copy_out(cursor, 'word',
- columns=['word_token', 'type', 'info'])
+ columns=['word_token', 'type', 'word', 'info'])
return added
if to_delete:
cursor.execute_values(
""" DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
- WHERE info->>'word' = name
+ WHERE type = 'S' and word = name
and info->>'class' = in_class and info->>'type' = in_type
and ((op = '-' and info->>'op' is null) or op = info->>'op')
""", to_delete)
with self.conn.cursor() as cur:
# Get existing names
cur.execute("""SELECT word_token FROM word
- WHERE type = 'C' and info->>'cc'= %s""",
+ WHERE type = 'C' and word = %s""",
(country_code, ))
word_tokens.difference_update((t[0] for t in cur))
# Only add those names that are not yet in the list.
if word_tokens:
- cur.execute("""INSERT INTO word (word_token, type, info)
- (SELECT token, 'C', json_build_object('cc', %s)
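+                # The country code is now stored verbatim in the word column.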
+ cur.execute("""INSERT INTO word (word_token, type, word)
+ (SELECT token, 'C', %s
FROM unnest(%s) as token)
""", (country_code, list(word_tokens)))
with self.conn.cursor() as cur:
# no word_id needed for postcodes
- cur.execute("""INSERT INTO word (word_token, type, info)
- (SELECT %s, 'P', json_build_object('postcode', pc)
- FROM (VALUES (%s)) as v(pc)
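+            # The postcode is now stored verbatim in the word column; it is
+            # only inserted when not yet present.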
+ cur.execute("""INSERT INTO word (word_token, type, word)
+ (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
WHERE NOT EXISTS
(SELECT * FROM word
- WHERE type = 'P' and info->>postcode = pc))
+ WHERE type = 'P' and word = pc))
""", (term, postcode))
self._cache.postcodes.add(postcode)