"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import io
import itertools
import json
import logging
import re
from textwrap import dedent
from pathlib import Path

from icu import Transliterator
import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.sql_preprocessor import SQLPreprocessor
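
# Keys under which the tokenizer configuration is persisted in the database
# property table. Storing the rules with the database keeps an import
# self-contained: later runs read back exactly the normalisation and
# transliteration rules that were used when the data was imported.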
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TRANSLITERATION = "tokenizer_transliteration"
DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization = None
        self.transliteration = None
        self.abbreviations = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
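
        # The rule file is expected to provide a list of ICU transliteration
        # rules under 'normalization' and a list of (full, abbreviated) term
        # pairs under 'abbreviations' (see make_standard_word() below).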
        rules = json.loads(cfgfile.read_text())
        self.transliteration = ';'.join(rules['normalization']) + ';'
        self.abbreviations = rules["abbreviations"]
        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.normalization = get_property(conn, DBCFG_NORMALIZATION)
            self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
            self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))

    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)

    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.normalization is None \
           or self.transliteration is None \
           or self.abbreviations is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

        return None

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    ...

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
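        # Both transliterators are compiled from the rule strings saved for
        # this database: 'norm' applies the term normalisation rules, 'trans'
        # the transliteration (to-ASCII) rules. The first argument to
        # createFromRules() is only a label for the compiled rule set.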
        norm = Transliterator.createFromRules("normalizer", self.normalization)
        trans = Transliterator.createFromRules("normalizer", self.transliteration)

        return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
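
    # The PHP frontend cannot read the Python configuration, so the relevant
    # settings are mirrored into a small generated PHP file in the project
    # directory; the values match what _save_config() stores in the database.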
    def _install_php(self, config):
        """ Install the php script for the tokenizer.
        """
        abbr_inverse = list(zip(*self.abbreviations))
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
            @define('CONST_Term_Normalization_Rules', "{0.normalization}");
            @define('CONST_Transliteration', "{0.transliteration}");
            @define('CONST_Abbreviations', array(array('{2}'), array('{3}')));
            require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
            """.format(self, config,
                       "','".join(abbr_inverse[0]),
                       "','".join(abbr_inverse[1]))))

    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            set_property(conn, DBCFG_NORMALIZATION, self.normalization)
            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
            set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = Counter()
            with self.name_analyzer() as analyzer:
                with conn.cursor(name="words") as cur:
                    cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")

                    for name, cnt in cur:
                        term = analyzer.make_standard_word(name)
                        if term:
                            for word in term.split():
                                words[word] += cnt

            # copy them back into the word table
            copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args)
                                           for args in words.items())))

            with conn.cursor() as cur:
                cur.copy_from(copystr, 'word',
                              columns=['word_token', 'search_name_count'])
                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                               WHERE word_id is null""")

            conn.commit()


class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, normalizer, transliterator, abbreviations):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
        self.transliterator = transliterator
        self.abbreviations = abbreviations
        #psycopg2.extras.register_hstore(self.conn)

        self._cache = _TokenCache()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def normalize(self, phrase):
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return self.normalizer.transliterate(phrase)
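
    # Illustrative example only: with an abbreviation pair (' street ', ' st ')
    # from the rule file, a name whose transliteration is 'main street' is
    # padded to ' main street ', rewritten to ' main st ' and finally stripped
    # back to 'main st'. The actual replacements depend on the configured rules.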
    def make_standard_word(self, name):
        """ Create the normalised version of the name.
        """
        norm = ' ' + self.transliterator.transliterate(name) + ' '
        for full, abbr in self.abbreviations:
            if full in norm:
                norm = norm.replace(full, abbr)

        return norm.strip()

    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.transliterator.transliterate(hnr)

    def add_postcodes_from_db(self):
        """ Add postcodes from the location_postcode table to the word table.
        """
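        # Each line written to the buffer below must match the column list
        # passed to copy_from(): word, word_token, class, type and
        # search_name_count, separated by tabs. The leading space on the
        # word_token marks the token as a full term.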
        copystr = io.StringIO()
        with self.conn.cursor() as cur:
            cur.execute("SELECT distinct(postcode) FROM location_postcode")
            for (postcode, ) in cur:
                copystr.write(postcode)
                copystr.write('\t ')
                copystr.write(self.transliterator.transliterate(postcode))
                copystr.write('\tplace\tpostcode\t0\n')

            copystr.seek(0)
            cur.copy_from(copystr, 'word',
                          columns=['word', 'word_token', 'class', 'type',
                                   'search_name_count'])
            # Don't really need an ID for postcodes....
            # cur.execute("""UPDATE word SET word_id = nextval('seq_word')
            #                WHERE word_id is null and type = 'postcode'""")

    def update_special_phrases(self, phrases):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))
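
            # A NULL operator is mapped to the placeholder '-' above so that
            # the tuples can be compared as plain sets; the DELETE below
            # translates the placeholder back into an IS NULL check.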
            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                copystr = io.StringIO()
                for word, cls, typ, oper in to_add:
                    term = self.make_standard_word(word)
                    if term:
                        copystr.write(word)
                        copystr.write('\t ')
                        copystr.write(term)
                        copystr.write('\t')
                        copystr.write(cls)
                        copystr.write('\t')
                        copystr.write(typ)
                        copystr.write('\t')
                        copystr.write(oper if oper in ('in', 'near') else '\\N')
                        copystr.write('\t0\n')

                copystr.seek(0)
                cur.copy_from(copystr, 'word',
                              columns=['word', 'word_token', 'class', 'type',
                                       'operator', 'search_name_count'])

            if to_delete:
                psycopg2.extras.execute_values(
                    cur,
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        full_names = set((self.make_standard_word(n) for n in names))
        full_names.discard('')

        self._add_normalised_country_names(country_code, full_names)

    def _add_normalised_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        with self.conn.cursor() as cur:
            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                        (country_code, ))
            new_names = names.difference((t[0] for t in cur))

            if new_names:
                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                                 search_name_count)
                               (SELECT nextval('seq_word'), token, '{}', 0
                                FROM unnest(%s) as token)
                            """.format(country_code), (list(new_names),))
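
    # The dict returned by process_place() is handed to the database via the
    # token_info field. Depending on the available name and address tags it
    # may contain the keys 'names', 'hnr_tokens', 'hnr', 'street',
    # 'place_search', 'place_match' and 'addr' (see _TokenInfo below).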
    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            full_names = set((self.make_standard_word(name) for name in names.values()))
            full_names.discard('')

            token_info.add_names(self.conn, full_names)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self._add_normalised_country_names(country_feature.lower(),
                                                   full_names)

        address = place.get('address')

        if address:
            hnrs = []
            addr_terms = []
            for key, value in address.items():
                if key == 'postcode':
                    self._add_postcode(value)
                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                    hnrs.append(value)
                elif key == 'street':
                    token_info.add_street(self.conn, self.make_standard_word(value))
                elif key == 'place':
                    token_info.add_place(self.conn, self.make_standard_word(value))
                elif not key.startswith('_') and \
                     key not in ('country', 'full'):
                    addr_terms.append((key, self.make_standard_word(value)))

            if hnrs:
                hnrs = self._split_housenumbers(hnrs)
                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

            if addr_terms:
                token_info.add_address_terms(self.conn, addr_terms)

        return token_info.data

    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None and postcode not in self._cache.postcodes:
            term = self.make_standard_word(postcode)
            if not term:
                return

            with self.conn.cursor() as cur:
                # no word_id needed for postcodes
                cur.execute("""INSERT INTO word (word, word_token, class, type,
                                                 search_name_count)
                               (SELECT pc, %s, 'place', 'postcode', 0
                                FROM (VALUES (%s)) as v(pc)
                                WHERE NOT EXISTS
                                 (SELECT * FROM word
                                  WHERE word = pc and class='place' and type='postcode'))
                            """, (' ' + term, postcode))
            self._cache.postcodes.add(postcode)
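
    # Housenumber values may contain several numbers in one tag, e.g. '4;6'
    # or '12,14'. They are split into individual numbers here; the order of
    # the result is not significant because duplicates are removed via a set.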
    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self.cache = cache
        self.data = {}
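
    # Helper producing a PostgreSQL array literal from a list of token ids,
    # e.g. [1, 2, 3] becomes '{1,2,3}'.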
    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))

    def add_names(self, conn, names):
        """ Adds token information for the normalised names.
        """
        # Start with all partial names
        terms = set((part for ns in names for part in ns.split()))
        # Add partials for the full terms (TO BE REMOVED)
        terms.update((n for n in names))
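        # Finally add the full names; a leading space marks a term as a full
        # name (the same convention is used for postcodes, addr:street and
        # addr:place terms).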
        terms.update((' ' + n for n in names))

        self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, conn, street):
        """ Add addr:street match terms.
        """
        if not street:
            return

        term = ' ' + street
        tid = self.cache.names.get(term)

        if tid is None:
            with conn.cursor() as cur:
                cur.execute("""SELECT word_id FROM word
                                WHERE word_token = %s
                                      and class is null and type is null""",
                            (term, ))
                if cur.rowcount > 0:
                    tid = cur.fetchone()[0]
                    self.cache.names[term] = tid

        if tid is not None:
            self.data['street'] = '{%d}' % tid

    def add_place(self, conn, place):
        """ Add addr:place search and match terms.
        """
        if not place:
            return

        partial_ids = self.cache.get_term_tokens(conn, place.split())
        tid = self.cache.get_term_tokens(conn, [' ' + place])

        self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
        self.data['place_match'] = '{%s}' % tid[0]

    def add_address_terms(self, conn, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, value in terms:
            if not value:
                continue

            partial_ids = self.cache.get_term_tokens(conn, value.split())
            term = ' ' + value
            tid = self.cache.names.get(term)

            if tid is None:
                with conn.cursor() as cur:
                    cur.execute("""SELECT word_id FROM word
                                    WHERE word_token = %s
                                          and class is null and type is null""",
                                (term, ))
                    if cur.rowcount > 0:
                        tid = cur.fetchone()[0]
                        self.cache.names[term] = tid

            tokens[key] = [self._mk_array(partial_ids),
                           '{%s}' % ('' if tid is None else str(tid))]

        if tokens:
            self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.postcodes = set()
        self.housenumbers = {}
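
    # The getorcreate_*_id() functions used below are SQL-side helpers loaded
    # with the tokenizer's SQL (see update_sql_functions()); they look up a
    # token and create it on the fly if it does not yet exist.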
    def get_term_tokens(self, conn, terms):
        """ Get token ids for a list of terms, looking them up in the database
            if necessary.
        """
        tokens = []
        askdb = []

        for term in terms:
            token = self.names.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
                            (askdb, ))
                for term, tid in cur:
                    self.names[term] = tid
                    tokens.append(tid)

        return tokens

    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary.
        """
        tokens = []
        askdb = []

        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens