import itertools
import json
import logging
-import re
from textwrap import dedent
from nominatim.db.connection import connect
"""
self.loader = ICURuleLoader(config)
- self._install_php(config.lib_dir.php)
+ self._install_php(config.lib_dir.php, overwrite=True)
self._save_config()
if init_db:
with connect(self.dsn) as conn:
self.loader.load_config_from_db(conn)
+ self._install_php(config.lib_dir.php, overwrite=False)
+
def finalize_import(self, config):
""" Do any required postprocessing to make the tokenizer data ready
if not conn.table_exists('search_name'):
return
with conn.cursor(name="hnr_counter") as cur:
- cur.execute("""SELECT word_id, word_token FROM word
+ cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
+ FROM word
WHERE type = 'H'
AND NOT EXISTS(SELECT * FROM search_name
WHERE ARRAY[word.word_id] && name_vector)
- AND (char_length(word_token) > 6
- OR word_token not similar to '\\d+')
+ AND (char_length(coalesce(word, word_token)) > 6
+ OR coalesce(word, word_token) not similar to '\\d+')
""")
candidates = {token: wid for wid, token in cur}
with conn.cursor(name="hnr_counter") as cur:
for hnr in row[0].split(';'):
candidates.pop(hnr, None)
LOG.info("There are %s outdated housenumbers.", len(candidates))
+ LOG.debug("Outdated housenumbers: %s", candidates.keys())
if candidates:
with conn.cursor() as cur:
cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
self.loader.make_token_analysis())
- def _install_php(self, phpdir):
+ def _install_php(self, phpdir, overwrite=True):
""" Install the php script for the tokenizer.
"""
php_file = self.data_dir / "tokenizer.php"
- php_file.write_text(dedent(f"""\
- <?php
- @define('CONST_Max_Word_Frequency', 10000000);
- @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
- @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
- require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
+
+ # With overwrite=False an already existing tokenizer.php is left untouched
+ # and the file is only created when it is missing. With the default
+ # overwrite=True it is regenerated on every call. The file is written
+ # explicitly as UTF-8.
+ if not php_file.exists() or overwrite:
+ php_file.write_text(dedent(f"""\
+ <?php
+ @define('CONST_Max_Word_Frequency', 10000000);
+ @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
+ @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
+ require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
def _save_config(self):
+ [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
- @staticmethod
- def normalize_postcode(postcode):
+ def normalize_postcode(self, postcode):
""" Convert the postcode to a standardized form.
This function must yield exactly the same result as the SQL function
def _process_place_address(self, token_info, address):
for item in address:
if item.kind == 'postcode':
- self._add_postcode(item.name)
+ token_info.set_postcode(self._add_postcode(item))
elif item.kind == 'housenumber':
token_info.add_housenumber(*self._compute_housenumber_token(item))
elif item.kind == 'street':
if not item.suffix:
token_info.add_place(self._compute_partial_tokens(item.name))
elif not item.kind.startswith('_') and not item.suffix and \
- item.kind not in ('country', 'full'):
+ item.kind not in ('country', 'full', 'inclusion'):
token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
continue
with self.conn.cursor() as cur:
- cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
+ cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
(token_id, variants))
full, part = cur.fetchone()
return full_tokens, partial_tokens
- def _add_postcode(self, postcode):
+    def _add_postcode(self, item):
""" Make sure the normalized postcode is present in the word table.
+
+            The name of the address item is normalized with the '@postcode'
+            analyzer when one is configured, otherwise it is stripped and
+            upper-cased.
+
+            Returns the normalized postcode so that the caller can hand it
+            to the token info. Returns None when no search term can be
+            computed for the postcode; nothing is added in that case.
"""
- if re.search(r'[:,;]', postcode) is None:
- postcode = self.normalize_postcode(postcode)
+        analyzer = self.token_analysis.get_analyzer('@postcode')
- if postcode not in self._cache.postcodes:
- term = self._search_normalized(postcode)
- if not term:
- return
+
+        if analyzer is None:
+            postcode_name = item.name.strip().upper()
+            variant_base = None
+        else:
+            postcode_name = analyzer.normalize(item.name)
+            variant_base = item.get_attr("variant")
- with self.conn.cursor() as cur:
- # no word_id needed for postcodes
- cur.execute("""INSERT INTO word (word_token, type, word)
- (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
- WHERE NOT EXISTS
- (SELECT * FROM word
- WHERE type = 'P' and word = pc))
- """, (term, postcode))
- self._cache.postcodes.add(postcode)
+
+        # Cache key: postcodes that differ only in their variant base get
+        # separate entries.
+        if variant_base is not None:
+            postcode = f'{postcode_name}@{variant_base}'
+        else:
+            postcode = postcode_name
+
+        if postcode not in self._cache.postcodes:
+            term = self._search_normalized(postcode_name)
+            if not term:
+                return None
+
+            variants = {term}
+            if analyzer is not None and variant_base is not None:
+                variants.update(analyzer.get_variants_ascii(variant_base))
+
+            with self.conn.cursor() as cur:
+                cur.execute("SELECT create_postcode_word(%s, %s)",
+                            (postcode, list(variants)))
+            self._cache.postcodes.add(postcode)
+
+        # _process_place_address() passes the result on to
+        # token_info.set_postcode(), so the normalized name has to be
+        # returned on the cached as well as on the freshly created path.
+        return postcode_name
class _TokenInfo:
self.street_tokens = set()
self.place_tokens = set()
self.address_tokens = {}
+ self.postcode = None
@staticmethod
if partials:
self.address_tokens[key] = self._mk_array(partials)
+ def set_postcode(self, postcode):
+ """ Set the postcode to the given one.
+
+ The value is stored unchanged in `self.postcode` and replaces
+ whatever was set before.
+ """
+ self.postcode = postcode
+
class _TokenCache:
""" Cache for token information to avoid repeated database queries.