]> git.openstreetmap.org Git - nominatim.git/blob - nominatim/tokenizer/icu_tokenizer.py
0dc551e1b4ce6e828720fad913502c185e49c327
[nominatim.git] / nominatim / tokenizer / icu_tokenizer.py
1 # SPDX-License-Identifier: GPL-2.0-only
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Tokenizer implementing normalisation as used before Nominatim 4 but using
9 libICU instead of the PostgreSQL module.
10 """
11 import itertools
12 import json
13 import logging
14 from textwrap import dedent
15
16 from nominatim.db.connection import connect
17 from nominatim.db.utils import CopyBuffer
18 from nominatim.db.sql_preprocessor import SQLPreprocessor
19 from nominatim.indexer.place_info import PlaceInfo
20 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
21 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
22
# Key name for the term normalization setting. It is not referenced by any
# code visible in this module; presumably it names a database property
# handled elsewhere (e.g. by the rule loader) - TODO confirm.
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

# getLogger() is called without a name, so this is the root logger.
LOG = logging.getLogger()
26
def create(dsn, data_dir):
    """ Factory function of this module: build and return the tokenizer
        it implements.

        Both arguments are handed on unchanged to the constructor of
        LegacyICUTokenizer.
    """
    tokenizer = LegacyICUTokenizer(dsn, data_dir)
    return tokenizer
31
32
class LegacyICUTokenizer(AbstractTokenizer):
    """ Tokenizer that converts names and queries to ASCII with the help
        of libICU. Apart from that it relies on the same algorithms and
        data structures as the normalization routines of Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        # The rule loader only becomes available once init_new_db() or
        # init_from_project() has been called.
        self.loader = None
        self.dsn = dsn
        self.data_dir = data_dir


    def init_new_db(self, config, init_db=True):
        """ Prepare a fresh tokenizer for the database.

            The rules are loaded from the configuration, the PHP stub is
            (re)written into the project directory and the configuration is
            saved in the database, so that the tokenizer stays stable across
            updates. With `init_db` set, the SQL functions and the tokenizer
            tables are installed as well.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php, overwrite=True)
        self._save_config()

        if not init_db:
            return

        self.update_sql_functions(config)
        self._init_db_tables(config)


    def init_from_project(self, config):
        """ Set up the tokenizer from an already existing project.

            The rule configuration is read back from the database. The PHP
            stub is only written when it does not exist yet.
        """
        loader = ICURuleLoader(config)
        self.loader = loader

        with connect(self.dsn) as conn:
            loader.load_config_from_db(conn)

        self._install_php(config.lib_dir.php, overwrite=False)


    def finalize_import(self, config):
        """ Run the postprocessing that makes the tokenizer data ready
            for use after an import.
        """
        with connect(self.dsn) as conn:
            SQLPreprocessor(conn, config).run_sql_file(
                conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Load the SQL functions of this tokenizer into the database again.
        """
        with connect(self.dsn) as conn:
            SQLPreprocessor(conn, config).run_sql_file(
                conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config):
        """ Verify the tokenizer setup.

            The check consists of initialising from the project; any
            problem surfaces as an exception raised from there.
        """
        self.init_from_project(config)


    def update_statistics(self):
        """ Recalculate the frequency counts of all name words.

            The counts are only recomputed when the 'search_name' table
            exists; the transaction is committed in either case.
        """
        with connect(self.dsn) as conn:
            has_search_names = conn.table_exists('search_name')
            if has_search_names:
                with conn.cursor() as cur:
                    # Start from a clean slate in case a previous run
                    # left the temporary table behind.
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()


    def _cleanup_housenumbers(self):
        """ Delete house number tokens that are not in use anymore.

            Only house numbers that are longer than 6 characters or are
            not purely numeric are considered for deletion.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return

            # Step 1: collect the tokens no search_name entry refers to.
            candidates = {}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                for word_id, lookup in cur:
                    candidates[lookup] = word_id

            # Step 2: keep everything that still appears as a house number
            # of a place. The column may hold a semicolon-separated list.
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                in_use = itertools.chain.from_iterable(row[0].split(';') for row in cur)
                for housenumber in in_use:
                    candidates.pop(housenumber, None)

            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())

            if not candidates:
                return

            # Step 3: remove what is left over.
            with conn.cursor() as cur:
                cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                            (list(candidates.values()), ))
            conn.commit()



    def update_word_tokens(self):
        """ Clean up tokens that are no longer needed.

            Currently this covers house number tokens only.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")


    def name_analyzer(self):
        """ Return a new analyzer for tokenizing names and queries
            with this tokenizer. Analyzers are context managers and are
            best used like this:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used without the with statement, the caller is responsible
            for calling close() before the analyzer is discarded.

            Analyzers are not thread-safe. Create a separate one for
            each thread.
        """
        sanitizer = self.loader.make_sanitizer()
        token_analysis = self.loader.make_token_analysis()

        return LegacyICUNameAnalyzer(self.dsn, sanitizer, token_analysis)


    def _install_php(self, phpdir, overwrite=True):
        """ Write the PHP stub of the tokenizer into the data directory.

            An existing file is left alone unless `overwrite` is set.
        """
        php_file = self.data_dir / "tokenizer.php"

        if php_file.exists() and not overwrite:
            return

        php_code = dedent(f"""\
                <?php
                @define('CONST_Max_Word_Frequency', 10000000);
                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
                require_once('{phpdir}/tokenizer/icu_tokenizer.php');""")
        php_file.write_text(php_code, encoding='utf-8')


    def _save_config(self):
        """ Store the configuration that must stay stable for this
            database via the rule loader in the database.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)


    def _init_db_tables(self, config):
        """ Create the tokenizer tables by running the table setup
            script and commit the result.
        """
        with connect(self.dsn) as conn:
            SQLPreprocessor(conn, config).run_sql_file(
                conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()
208
209
class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens its own database connection, which is used to
        look up and create tokens in the word table. Looked-up tokens are
        remembered in a per-instance cache.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        # Work on the raw connection in autocommit mode, so that every
        # statement takes effect immediately.
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()


    def close(self):
        """ Free all resources used by the analyzer.

            Calling the function more than once is harmless.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()


    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id). The word id is None
            when the token is not in the word table. Full names are
            listed before partial names.

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                # The leading marker is not part of the name.
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]


    def normalize_postcode(self, postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.

            Postcode words without a matching entry in location_postcode are
            deleted, postcodes without a word are added.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        # Word without a postcode: outdated.
                        to_delete.append(word)
                    else:
                        # Postcode without a word: needs to be created.
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases
            will be completely replaced. Otherwise the phrases are added to
            the already existing ones.

            Each phrase is expected to be indexable with the positions 0 to 3,
            used as (label, class, type, operator).
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                # A missing operator is represented by '-' so that the
                # tuples compare equal to the ones in norm_phrases.
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)


    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.

            Returns the number of phrases added. Phrases whose search
            normalization is empty are skipped.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    # Only the operators 'in' and 'near' are saved,
                    # anything else becomes null.
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added


    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.

            Returns the number of phrases that were marked for deletion.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)


    def add_country_names(self, country_code, names):
        """ Add default names for the given country to the search index.

            The names are registered as internal names.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)


    def _add_country_full_names(self, country_code, names, internal=False):
        """ Add names for the given country from an already sanitized
            name list.

            Country words are either internal (marked with an 'internal'
            key in their info) or external. The function synchronises the
            kind selected by `internal` with the given names.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            existing_tokens = {True: set(), False: set()} # internal/external names
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                # External names that are now added as internal ones are
                # removed first and reinserted below.
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                                   (SELECT token, 'C', %s
                                    FROM unnest(%s) as token)
                          """
                cur.execute(sql, (country_code, list(new_tokens)))


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()


    def _process_place_address(self, token_info, address):
        """ Compute the tokens for all address parts and save them in
            the given token info.

            Parts of kind 'country', 'full' and 'inclusion', parts whose
            kind starts with an underscore and, except for postcodes,
            housenumbers and streets, parts with a suffix are ignored.
        """
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full', 'inclusion'):
                token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))


    def _compute_housenumber_token(self, hnr):
        """ Normalize the housenumber and return the word token and the
            canonical form.

            Returns the tuple (None, None) when the housenumber does not
            yield a token.
        """
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result = None, None

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    with self.conn.cursor() as cur:
                        cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
                        result = cur.fetchone()[0], norm_name
                        self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            norm_name = analyzer.normalize(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    variants = analyzer.get_variants_ascii(norm_name)
                    if variants:
                        with self.conn.cursor() as cur:
                            cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
                                        (norm_name, list(variants)))
                            result = cur.fetchone()[0], variants[0]
                            self._cache.housenumbers[norm_name] = result

        return result


    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.

            Tokens that are not in the cache yet are requested from the
            database in a single query.
        """
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens


    def _retrieve_full_tokens(self, name):
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.

            Returns a list of word ids, which is empty when no matching
            word exists. Tokens are never created here.
        """
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full


    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            list of names.

            Returns a tuple of two sets: the full name tokens and the
            partial name tokens. Names without any variants are skipped.
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            norm_name = analyzer.normalize(name.name)
            # Names handled by a non-default analyzer get the analyzer
            # name appended to keep them apart.
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    def _add_postcode(self, item):
        """ Make sure the normalized postcode is present in the word table.

            Returns the normalized postcode or None when the postcode
            has no searchable form. The caller saves the result as the
            postcode of the place, so the function must not fall through
            without a return value.
        """
        analyzer = self.token_analysis.analysis.get('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.normalize(item.name)
            variant_base = item.get_attr("variant")

        # The word (and cache key) carries the variant base, if there is one.
        if variant_base is not None:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return None

            variants = {term}
            if analyzer is not None and variant_base is not None:
                variants.update(analyzer.get_variants_ascii(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)

        return postcode_name
637
638
class _TokenInfo:
    """ Accumulates the token data of a place before it is handed back
        to the database.
    """
    def __init__(self):
        self.postcode = None
        self.names = None
        self.address_tokens = {}
        self.housenumbers = set()
        self.housenumber_tokens = set()
        self.street_tokens = set()
        self.place_tokens = set()


    @staticmethod
    def _mk_array(tokens):
        # Brace-enclosed, comma-separated list, e.g. '{1,2,3}'.
        return '{' + ','.join(map(str, tokens)) + '}'


    def to_dict(self):
        """ Export the collected information as a dictionary in the
            format the database imports. Parts that were never set
            or are empty are left out.
        """
        result = {}

        if self.names:
            result['names'] = self.names

        if self.housenumbers:
            result.update(hnr=';'.join(self.housenumbers),
                          hnr_tokens=self._mk_array(self.housenumber_tokens))

        for key, tokens in (('street', self.street_tokens),
                            ('place', self.place_tokens)):
            if tokens:
                result[key] = self._mk_array(tokens)

        if self.address_tokens:
            result['addr'] = self.address_tokens

        if self.postcode:
            result['postcode'] = self.postcode

        return result


    def set_names(self, fulls, partials):
        """ Save the tokens of the normalised names: full name tokens
            first, followed by the partial ones.
        """
        self.names = self._mk_array([*fulls, *partials])


    def add_housenumber(self, token, hnr):
        """ Register a normalised housenumber together with its token.
            The call is ignored when there is no token.
        """
        if not token:
            return

        self.housenumber_tokens.add(token)
        self.housenumbers.add(hnr)


    def add_street(self, tokens):
        """ Register match terms for addr:street.
        """
        self.street_tokens.update(tokens)


    def add_place(self, tokens):
        """ Register search and match terms for addr:place.
        """
        self.place_tokens.update(tokens)


    def add_address_term(self, key, partials):
        """ Register the terms of an additional address part.
            An empty term list is ignored.
        """
        if not partials:
            return

        self.address_tokens[key] = self._mk_array(partials)

    def set_postcode(self, postcode):
        """ Use the given postcode as the postcode of the place.
        """
        self.postcode = postcode
721
722
class _TokenCache:
    """ Remembers token information that was already looked up, so that
        the database does not need to be queried again.

        The cache is not thread-safe. Every analyzer needs an instance
        of its own.
    """
    def __init__(self):
        # Lookup tables, all empty at the start.
        self.names = dict()
        self.partials = dict()
        self.fulls = dict()
        self.housenumbers = dict()
        # Postcodes only need to be remembered as 'seen'.
        self.postcodes = set()