"""
return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
-
+ # pylint: disable=missing-format-attribute
def _install_php(self, phpdir):
""" Install the php script for the tokenizer.
"""
@define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
@define('CONST_Transliteration', "{0.naming_rules.search_rules}");
require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
- """.format(self, phpdir))) # pylint: disable=missing-format-attribute
+ """.format(self, phpdir)))
def _save_config(self, config):
words = Counter()
name_proc = ICUNameProcessor(self.naming_rules)
with conn.cursor(name="words") as cur:
- cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
+ cur.execute(""" SELECT v, count(*) FROM
+ (SELECT svals(name) as v FROM place)x
+ WHERE length(v) < 75 GROUP BY v""")
for name, cnt in cur:
+ terms = set()
for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
- for term in word.split():
- words[term] += cnt
+ if ' ' in word:
+ terms.update(word.split())
+ for term in terms:
+ words[term] += cnt
# copy them back into the word table
with CopyBuffer() as copystr:
self.add_country_names(country_feature.lower(), names)
address = place.get('address')
-
if address:
- hnrs = []
- addr_terms = []
- for key, value in address.items():
- if key == 'postcode':
- self._add_postcode(value)
- elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
- hnrs.append(value)
- elif key == 'street':
- token_info.add_street(*self._compute_name_tokens({'name': value}))
- elif key == 'place':
- token_info.add_place(*self._compute_name_tokens({'name': value}))
- elif not key.startswith('_') and \
- key not in ('country', 'full'):
- addr_terms.append((key, *self._compute_name_tokens({'name': value})))
-
- if hnrs:
- hnrs = self._split_housenumbers(hnrs)
- token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
-
- if addr_terms:
- token_info.add_address_terms(addr_terms)
+ self._process_place_address(token_info, address)
return token_info.data
+ def _process_place_address(self, token_info, address):  # route every address entry to the matching token_info call
+ hnrs = []  # raw values of the house-number-like keys, processed after the loop
+ addr_terms = []  # (key, *name tokens) tuples for all remaining address keys
+ for key, value in address.items():
+ if key == 'postcode':
+ self._add_postcode(value)
+ elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+ hnrs.append(value)  # all three key kinds are collected into the same list
+ elif key == 'street':
+ token_info.add_street(*self._compute_name_tokens({'name': value}))  # value is wrapped as a one-entry name dict
+ elif key == 'place':
+ token_info.add_place(*self._compute_name_tokens({'name': value}))
+ elif not key.startswith('_') and \
+ key not in ('country', 'full'):  # keys starting with '_' and the 'country'/'full' keys are ignored here
+ addr_terms.append((key, *self._compute_name_tokens({'name': value})))
+
+ if hnrs:
+ hnrs = self._split_housenumbers(hnrs)
+ token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])  # each entry passes through _make_standard_hnr first
+
+ if addr_terms:
+ token_info.add_address_terms(addr_terms)  # only called when at least one term was collected
+
+
def _compute_name_tokens(self, names):
""" Computes the full name and partial name tokens for the given
dictionary of names.
full, part = self._cache.names.get(norm_name, (None, None))
if full is None:
variants = self.name_processor.get_variants_ascii(norm_name)
+ if not variants:
+ continue
+
with self.conn.cursor() as cur:
cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
(norm_name, variants))
given dictionary of names.
"""
full_names = set()
- for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
- full_names.add(name.strip())
+ for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
+ if name:
+ full_names.add(name)
- brace_idx = name.find('(')
- if brace_idx >= 0:
- full_names.add(name[:brace_idx].strip())
+ brace_idx = name.find('(')
+ if brace_idx >= 0:
+ full_names.add(name[:brace_idx].strip())
return full_names