]> git.openstreetmap.org Git - nominatim.git/blob - nominatim/tokenizer/icu_tokenizer.py
Merge remote-tracking branch 'upstream/master'
[nominatim.git] / nominatim / tokenizer / icu_tokenizer.py
1 # SPDX-License-Identifier: GPL-2.0-only
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Tokenizer implementing normalisation as used before Nominatim 4 but using
9 libICU instead of the PostgreSQL module.
10 """
11 import itertools
12 import json
13 import logging
14 import re
15 from textwrap import dedent
16
17 from nominatim.db.connection import connect
18 from nominatim.db.utils import CopyBuffer
19 from nominatim.db.sql_preprocessor import SQLPreprocessor
20 from nominatim.indexer.place_info import PlaceInfo
21 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
22 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
23
# NOTE(review): not referenced anywhere in the visible part of this module;
# presumably the name of the database property that holds the term
# normalization rules - confirm against the other modules before touching it.
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

# logging.getLogger() without a name hands back the root logger.
LOG = logging.getLogger()
27
def create(dsn, data_dir):
    """ Factory function of this module: build the tokenizer object for
        the given database DSN and data directory and hand it back.
    """
    tokenizer = LegacyICUTokenizer(dsn, data_dir)
    return tokenizer
32
33
34 class LegacyICUTokenizer(AbstractTokenizer):
35     """ This tokenizer uses libICU to convert names and queries to ASCII.
36         Otherwise it uses the same algorithms and data structures as the
37         normalization routines in Nominatim 3.
38     """
39
40     def __init__(self, dsn, data_dir):
41         self.dsn = dsn
42         self.data_dir = data_dir
43         self.loader = None
44
45
46     def init_new_db(self, config, init_db=True):
47         """ Set up a new tokenizer for the database.
48
49             This copies all necessary data in the project directory to make
50             sure the tokenizer remains stable even over updates.
51         """
52         self.loader = ICURuleLoader(config)
53
54         self._install_php(config.lib_dir.php, overwrite=True)
55         self._save_config()
56
57         if init_db:
58             self.update_sql_functions(config)
59             self._init_db_tables(config)
60
61
62     def init_from_project(self, config):
63         """ Initialise the tokenizer from the project directory.
64         """
65         self.loader = ICURuleLoader(config)
66
67         with connect(self.dsn) as conn:
68             self.loader.load_config_from_db(conn)
69
70         self._install_php(config.lib_dir.php, overwrite=False)
71
72
73     def finalize_import(self, config):
74         """ Do any required postprocessing to make the tokenizer data ready
75             for use.
76         """
77         with connect(self.dsn) as conn:
78             sqlp = SQLPreprocessor(conn, config)
79             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
80
81
82     def update_sql_functions(self, config):
83         """ Reimport the SQL functions for this tokenizer.
84         """
85         with connect(self.dsn) as conn:
86             sqlp = SQLPreprocessor(conn, config)
87             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
88
89
90     def check_database(self, config):
91         """ Check that the tokenizer is set up correctly.
92         """
93         # Will throw an error if there is an issue.
94         self.init_from_project(config)
95
96
97     def update_statistics(self):
98         """ Recompute frequencies for all name words.
99         """
100         with connect(self.dsn) as conn:
101             if conn.table_exists('search_name'):
102                 with conn.cursor() as cur:
103                     cur.drop_table("word_frequencies")
104                     LOG.info("Computing word frequencies")
105                     cur.execute("""CREATE TEMP TABLE word_frequencies AS
106                                      SELECT unnest(name_vector) as id, count(*)
107                                      FROM search_name GROUP BY id""")
108                     cur.execute("CREATE INDEX ON word_frequencies(id)")
109                     LOG.info("Update word table with recomputed frequencies")
110                     cur.execute("""UPDATE word
111                                    SET info = info || jsonb_build_object('count', count)
112                                    FROM word_frequencies WHERE word_id = id""")
113                     cur.drop_table("word_frequencies")
114             conn.commit()
115
116
117     def _cleanup_housenumbers(self):
118         """ Remove unused house numbers.
119         """
120         with connect(self.dsn) as conn:
121             if not conn.table_exists('search_name'):
122                 return
123             with conn.cursor(name="hnr_counter") as cur:
124                 cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
125                                FROM word
126                                WHERE type = 'H'
127                                  AND NOT EXISTS(SELECT * FROM search_name
128                                                 WHERE ARRAY[word.word_id] && name_vector)
129                                  AND (char_length(coalesce(word, word_token)) > 6
130                                       OR coalesce(word, word_token) not similar to '\\d+')
131                             """)
132                 candidates = {token: wid for wid, token in cur}
133             with conn.cursor(name="hnr_counter") as cur:
134                 cur.execute("""SELECT housenumber FROM placex
135                                WHERE housenumber is not null
136                                      AND (char_length(housenumber) > 6
137                                           OR housenumber not similar to '\\d+')
138                             """)
139                 for row in cur:
140                     for hnr in row[0].split(';'):
141                         candidates.pop(hnr, None)
142             LOG.info("There are %s outdated housenumbers.", len(candidates))
143             LOG.debug("Outdated housenumbers: %s", candidates.keys())
144             if candidates:
145                 with conn.cursor() as cur:
146                     cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
147                                 (list(candidates.values()), ))
148                 conn.commit()
149
150
151
152     def update_word_tokens(self):
153         """ Remove unused tokens.
154         """
155         LOG.warning("Cleaning up housenumber tokens.")
156         self._cleanup_housenumbers()
157         LOG.warning("Tokenizer house-keeping done.")
158
159
160     def name_analyzer(self):
161         """ Create a new analyzer for tokenizing names and queries
162             using this tokinzer. Analyzers are context managers and should
163             be used accordingly:
164
165             ```
166             with tokenizer.name_analyzer() as analyzer:
167                 analyser.tokenize()
168             ```
169
170             When used outside the with construct, the caller must ensure to
171             call the close() function before destructing the analyzer.
172
173             Analyzers are not thread-safe. You need to instantiate one per thread.
174         """
175         return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
176                                      self.loader.make_token_analysis())
177
178
179     def _install_php(self, phpdir, overwrite=True):
180         """ Install the php script for the tokenizer.
181         """
182         php_file = self.data_dir / "tokenizer.php"
183
184         if not php_file.exists() or overwrite:
185             php_file.write_text(dedent(f"""\
186                 <?php
187                 @define('CONST_Max_Word_Frequency', 10000000);
188                 @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
189                 @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
190                 require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
191
192
193     def _save_config(self):
194         """ Save the configuration that needs to remain stable for the given
195             database as database properties.
196         """
197         with connect(self.dsn) as conn:
198             self.loader.save_config_to_db(conn)
199
200
201     def _init_db_tables(self, config):
202         """ Set up the word table and fill it with pre-computed word
203             frequencies.
204         """
205         with connect(self.dsn) as conn:
206             sqlp = SQLPreprocessor(conn, config)
207             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
208             conn.commit()
209
210
211 class LegacyICUNameAnalyzer(AbstractAnalyzer):
212     """ The legacy analyzer uses the ICU library for splitting names.
213
214         Each instance opens a connection to the database to request the
215         normalization.
216     """
217
218     def __init__(self, dsn, sanitizer, token_analysis):
219         self.conn = connect(dsn).connection
220         self.conn.autocommit = True
221         self.sanitizer = sanitizer
222         self.token_analysis = token_analysis
223
224         self._cache = _TokenCache()
225
226
227     def close(self):
228         """ Free all resources used by the analyzer.
229         """
230         if self.conn:
231             self.conn.close()
232             self.conn = None
233
234
235     def _search_normalized(self, name):
236         """ Return the search token transliteration of the given name.
237         """
238         return self.token_analysis.search.transliterate(name).strip()
239
240
241     def _normalized(self, name):
242         """ Return the normalized version of the given name with all
243             non-relevant information removed.
244         """
245         return self.token_analysis.normalizer.transliterate(name).strip()
246
247
248     def get_word_token_info(self, words):
249         """ Return token information for the given list of words.
250             If a word starts with # it is assumed to be a full name
251             otherwise is a partial name.
252
253             The function returns a list of tuples with
254             (original word, word token, word id).
255
256             The function is used for testing and debugging only
257             and not necessarily efficient.
258         """
259         full_tokens = {}
260         partial_tokens = {}
261         for word in words:
262             if word.startswith('#'):
263                 full_tokens[word] = self._search_normalized(word[1:])
264             else:
265                 partial_tokens[word] = self._search_normalized(word)
266
267         with self.conn.cursor() as cur:
268             cur.execute("""SELECT word_token, word_id
269                             FROM word WHERE word_token = ANY(%s) and type = 'W'
270                         """, (list(full_tokens.values()),))
271             full_ids = {r[0]: r[1] for r in cur}
272             cur.execute("""SELECT word_token, word_id
273                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
274                         (list(partial_tokens.values()),))
275             part_ids = {r[0]: r[1] for r in cur}
276
277         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
278                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
279
280
281     def normalize_postcode(self, postcode):
282         """ Convert the postcode to a standardized form.
283
284             This function must yield exactly the same result as the SQL function
285             'token_normalized_postcode()'.
286         """
287         return postcode.strip().upper()
288
289
290     def update_postcodes_from_db(self):
291         """ Update postcode tokens in the word table from the location_postcode
292             table.
293         """
294         to_delete = []
295         with self.conn.cursor() as cur:
296             # This finds us the rows in location_postcode and word that are
297             # missing in the other table.
298             cur.execute("""SELECT * FROM
299                             (SELECT pc, word FROM
300                               (SELECT distinct(postcode) as pc FROM location_postcode) p
301                               FULL JOIN
302                               (SELECT word FROM word WHERE type = 'P') w
303                               ON pc = word) x
304                            WHERE pc is null or word is null""")
305
306             with CopyBuffer() as copystr:
307                 for postcode, word in cur:
308                     if postcode is None:
309                         to_delete.append(word)
310                     else:
311                         copystr.add(self._search_normalized(postcode),
312                                     'P', postcode)
313
314                 if to_delete:
315                     cur.execute("""DELETE FROM WORD
316                                    WHERE type ='P' and word = any(%s)
317                                 """, (to_delete, ))
318
319                 copystr.copy_out(cur, 'word',
320                                  columns=['word_token', 'type', 'word'])
321
322
323     def update_special_phrases(self, phrases, should_replace):
324         """ Replace the search index for special phrases with the new phrases.
325             If `should_replace` is True, then the previous set of will be
326             completely replaced. Otherwise the phrases are added to the
327             already existing ones.
328         """
329         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
330                             for p in phrases))
331
332         with self.conn.cursor() as cur:
333             # Get the old phrases.
334             existing_phrases = set()
335             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
336             for word, info in cur:
337                 existing_phrases.add((word, info['class'], info['type'],
338                                       info.get('op') or '-'))
339
340             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
341             if should_replace:
342                 deleted = self._remove_special_phrases(cur, norm_phrases,
343                                                        existing_phrases)
344             else:
345                 deleted = 0
346
347         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
348                  len(norm_phrases), added, deleted)
349
350
351     def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
352         """ Add all phrases to the database that are not yet there.
353         """
354         to_add = new_phrases - existing_phrases
355
356         added = 0
357         with CopyBuffer() as copystr:
358             for word, cls, typ, oper in to_add:
359                 term = self._search_normalized(word)
360                 if term:
361                     copystr.add(term, 'S', word,
362                                 json.dumps({'class': cls, 'type': typ,
363                                             'op': oper if oper in ('in', 'near') else None}))
364                     added += 1
365
366             copystr.copy_out(cursor, 'word',
367                              columns=['word_token', 'type', 'word', 'info'])
368
369         return added
370
371
372     @staticmethod
373     def _remove_special_phrases(cursor, new_phrases, existing_phrases):
374         """ Remove all phrases from the databse that are no longer in the
375             new phrase list.
376         """
377         to_delete = existing_phrases - new_phrases
378
379         if to_delete:
380             cursor.execute_values(
381                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
382                     WHERE type = 'S' and word = name
383                           and info->>'class' = in_class and info->>'type' = in_type
384                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
385                 """, to_delete)
386
387         return len(to_delete)
388
389
390     def add_country_names(self, country_code, names):
391         """ Add default names for the given country to the search index.
392         """
393         # Make sure any name preprocessing for country names applies.
394         info = PlaceInfo({'name': names, 'country_code': country_code,
395                           'rank_address': 4, 'class': 'boundary',
396                           'type': 'administrative'})
397         self._add_country_full_names(country_code,
398                                      self.sanitizer.process_names(info)[0],
399                                      internal=True)
400
401
402     def _add_country_full_names(self, country_code, names, internal=False):
403         """ Add names for the given country from an already sanitized
404             name list.
405         """
406         word_tokens = set()
407         for name in names:
408             norm_name = self._search_normalized(name.name)
409             if norm_name:
410                 word_tokens.add(norm_name)
411
412         with self.conn.cursor() as cur:
413             # Get existing names
414             cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
415                              FROM word
416                              WHERE type = 'C' and word = %s""",
417                         (country_code, ))
418             existing_tokens = {True: set(), False: set()} # internal/external names
419             for word in cur:
420                 existing_tokens[word[1]].add(word[0])
421
422             # Delete names that no longer exist.
423             gone_tokens = existing_tokens[internal] - word_tokens
424             if internal:
425                 gone_tokens.update(existing_tokens[False] & word_tokens)
426             if gone_tokens:
427                 cur.execute("""DELETE FROM word
428                                USING unnest(%s) as token
429                                WHERE type = 'C' and word = %s
430                                      and word_token = token""",
431                             (list(gone_tokens), country_code))
432
433             # Only add those names that are not yet in the list.
434             new_tokens = word_tokens - existing_tokens[True]
435             if not internal:
436                 new_tokens -= existing_tokens[False]
437             if new_tokens:
438                 if internal:
439                     sql = """INSERT INTO word (word_token, type, word, info)
440                                (SELECT token, 'C', %s, '{"internal": "yes"}'
441                                   FROM unnest(%s) as token)
442                            """
443                 else:
444                     sql = """INSERT INTO word (word_token, type, word)
445                                    (SELECT token, 'C', %s
446                                     FROM unnest(%s) as token)
447                           """
448                 cur.execute(sql, (country_code, list(new_tokens)))
449
450
451     def process_place(self, place):
452         """ Determine tokenizer information about the given place.
453
454             Returns a JSON-serializable structure that will be handed into
455             the database via the token_info field.
456         """
457         token_info = _TokenInfo()
458
459         names, address = self.sanitizer.process_names(place)
460
461         if names:
462             token_info.set_names(*self._compute_name_tokens(names))
463
464             if place.is_country():
465                 self._add_country_full_names(place.country_code, names)
466
467         if address:
468             self._process_place_address(token_info, address)
469
470         return token_info.to_dict()
471
472
473     def _process_place_address(self, token_info, address):
474         for item in address:
475             if item.kind == 'postcode':
476                 self._add_postcode(item.name)
477             elif item.kind == 'housenumber':
478                 token_info.add_housenumber(*self._compute_housenumber_token(item))
479             elif item.kind == 'street':
480                 token_info.add_street(self._retrieve_full_tokens(item.name))
481             elif item.kind == 'place':
482                 if not item.suffix:
483                     token_info.add_place(self._compute_partial_tokens(item.name))
484             elif not item.kind.startswith('_') and not item.suffix and \
485                  item.kind not in ('country', 'full'):
486                 token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
487
488
489     def _compute_housenumber_token(self, hnr):
490         """ Normalize the housenumber and return the word token and the
491             canonical form.
492         """
493         analyzer = self.token_analysis.analysis.get('@housenumber')
494         result = None, None
495
496         if analyzer is None:
497             # When no custom analyzer is set, simply normalize and transliterate
498             norm_name = self._search_normalized(hnr.name)
499             if norm_name:
500                 result = self._cache.housenumbers.get(norm_name, result)
501                 if result[0] is None:
502                     with self.conn.cursor() as cur:
503                         cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
504                         result = cur.fetchone()[0], norm_name
505                         self._cache.housenumbers[norm_name] = result
506         else:
507             # Otherwise use the analyzer to determine the canonical name.
508             # Per convention we use the first variant as the 'lookup name', the
509             # name that gets saved in the housenumber field of the place.
510             norm_name = analyzer.normalize(hnr.name)
511             if norm_name:
512                 result = self._cache.housenumbers.get(norm_name, result)
513                 if result[0] is None:
514                     variants = analyzer.get_variants_ascii(norm_name)
515                     if variants:
516                         with self.conn.cursor() as cur:
517                             cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
518                                         (norm_name, list(variants)))
519                             result = cur.fetchone()[0], variants[0]
520                             self._cache.housenumbers[norm_name] = result
521
522         return result
523
524
525     def _compute_partial_tokens(self, name):
526         """ Normalize the given term, split it into partial words and return
527             then token list for them.
528         """
529         norm_name = self._search_normalized(name)
530
531         tokens = []
532         need_lookup = []
533         for partial in norm_name.split():
534             token = self._cache.partials.get(partial)
535             if token:
536                 tokens.append(token)
537             else:
538                 need_lookup.append(partial)
539
540         if need_lookup:
541             with self.conn.cursor() as cur:
542                 cur.execute("""SELECT word, getorcreate_partial_word(word)
543                                FROM unnest(%s) word""",
544                             (need_lookup, ))
545
546                 for partial, token in cur:
547                     tokens.append(token)
548                     self._cache.partials[partial] = token
549
550         return tokens
551
552
553     def _retrieve_full_tokens(self, name):
554         """ Get the full name token for the given name, if it exists.
555             The name is only retrived for the standard analyser.
556         """
557         norm_name = self._search_normalized(name)
558
559         # return cached if possible
560         if norm_name in self._cache.fulls:
561             return self._cache.fulls[norm_name]
562
563         with self.conn.cursor() as cur:
564             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
565                         (norm_name, ))
566             full = [row[0] for row in cur]
567
568         self._cache.fulls[norm_name] = full
569
570         return full
571
572
573     def _compute_name_tokens(self, names):
574         """ Computes the full name and partial name tokens for the given
575             dictionary of names.
576         """
577         full_tokens = set()
578         partial_tokens = set()
579
580         for name in names:
581             analyzer_id = name.get_attr('analyzer')
582             analyzer = self.token_analysis.get_analyzer(analyzer_id)
583             norm_name = analyzer.normalize(name.name)
584             if analyzer_id is None:
585                 token_id = norm_name
586             else:
587                 token_id = f'{norm_name}@{analyzer_id}'
588
589             full, part = self._cache.names.get(token_id, (None, None))
590             if full is None:
591                 variants = analyzer.get_variants_ascii(norm_name)
592                 if not variants:
593                     continue
594
595                 with self.conn.cursor() as cur:
596                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
597                                 (token_id, variants))
598                     full, part = cur.fetchone()
599
600                 self._cache.names[token_id] = (full, part)
601
602             full_tokens.add(full)
603             partial_tokens.update(part)
604
605         return full_tokens, partial_tokens
606
607
608     def _add_postcode(self, postcode):
609         """ Make sure the normalized postcode is present in the word table.
610         """
611         if re.search(r'[:,;]', postcode) is None:
612             postcode = self.normalize_postcode(postcode)
613
614             if postcode not in self._cache.postcodes:
615                 term = self._search_normalized(postcode)
616                 if not term:
617                     return
618
619                 with self.conn.cursor() as cur:
620                     # no word_id needed for postcodes
621                     cur.execute("""INSERT INTO word (word_token, type, word)
622                                    (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
623                                     WHERE NOT EXISTS
624                                      (SELECT * FROM word
625                                       WHERE type = 'P' and word = pc))
626                                 """, (term, postcode))
627                 self._cache.postcodes.add(postcode)
628
629
630 class _TokenInfo:
631     """ Collect token information to be sent back to the database.
632     """
633     def __init__(self):
634         self.names = None
635         self.housenumbers = set()
636         self.housenumber_tokens = set()
637         self.street_tokens = set()
638         self.place_tokens = set()
639         self.address_tokens = {}
640
641
642     @staticmethod
643     def _mk_array(tokens):
644         return f"{{{','.join((str(s) for s in tokens))}}}"
645
646
647     def to_dict(self):
648         """ Return the token information in database importable format.
649         """
650         out = {}
651
652         if self.names:
653             out['names'] = self.names
654
655         if self.housenumbers:
656             out['hnr'] = ';'.join(self.housenumbers)
657             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
658
659         if self.street_tokens:
660             out['street'] = self._mk_array(self.street_tokens)
661
662         if self.place_tokens:
663             out['place'] = self._mk_array(self.place_tokens)
664
665         if self.address_tokens:
666             out['addr'] = self.address_tokens
667
668         return out
669
670
671     def set_names(self, fulls, partials):
672         """ Adds token information for the normalised names.
673         """
674         self.names = self._mk_array(itertools.chain(fulls, partials))
675
676
677     def add_housenumber(self, token, hnr):
678         """ Extract housenumber information from a list of normalised
679             housenumbers.
680         """
681         if token:
682             self.housenumbers.add(hnr)
683             self.housenumber_tokens.add(token)
684
685
686     def add_street(self, tokens):
687         """ Add addr:street match terms.
688         """
689         self.street_tokens.update(tokens)
690
691
692     def add_place(self, tokens):
693         """ Add addr:place search and match terms.
694         """
695         self.place_tokens.update(tokens)
696
697
698     def add_address_term(self, key, partials):
699         """ Add additional address terms.
700         """
701         if partials:
702             self.address_tokens[key] = self._mk_array(partials)
703
704
705 class _TokenCache:
706     """ Cache for token information to avoid repeated database queries.
707
708         This cache is not thread-safe and needs to be instantiated per
709         analyzer.
710     """
711     def __init__(self):
712         self.names = {}
713         self.partials = {}
714         self.fulls = {}
715         self.postcodes = set()
716         self.housenumbers = {}