# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
import itertools
import json
import logging
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.data.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php, overwrite=True)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

        self._install_php(config.lib_dir.php, overwrite=False)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)


    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
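        # The recomputed frequencies are stored in the 'count' field of the
        # word table's info column.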
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()


    def _cleanup_housenumbers(self):
        """ Remove unused house numbers.
        """
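        # Only housenumber tokens that are longer than 6 characters or not
        # purely numeric are considered for removal, and they are only deleted
        # when they are neither referenced from search_name nor present in placex.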
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()



    def update_word_tokens(self):
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must make sure to
            call the close() function before the analyzer is destroyed.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())


    def _install_php(self, phpdir, overwrite=True):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"

        if not php_file.exists() or overwrite:
            php_file.write_text(dedent(f"""\
                <?php
                @define('CONST_Max_Word_Frequency', 10000000);
                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')


    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


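    # The two helpers below apply the transliterators configured through the
    # ICU rules: _search_normalized() yields the form stored in the word_token
    # column, _normalized() the normalized form stored in the word column.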
    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()


    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
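        # Full names are stored with type 'W', partial names with type 'w' in
        # the word table. (Other type codes used in this module: 'H' house
        # numbers, 'P' postcodes, 'S' special phrases, 'C' country names.)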
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]


    def normalize_postcode(self, postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
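        # For example, ' ab1 2cd ' becomes 'AB1 2CD'.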
        return postcode.strip().upper()


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
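        # Postcodes are compared as plain names or, when the analyzer provides
        # a variant base, as 'name@variant' strings (see
        # _add_missing_postcode_words() below).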
        analyzer = self.token_analysis.analysis.get('@postcode')

        with self.conn.cursor() as cur:
            # First get all postcode names currently in the word table.
            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
            word_entries = set((entry[0] for entry in cur))

            # Then compute the required postcode names from the postcode table.
            needed_entries = set()
            cur.execute("SELECT country_code, postcode FROM location_postcode")
            for cc, postcode in cur:
                info = PlaceInfo({'country_code': cc,
                                  'class': 'place', 'type': 'postcode',
                                  'address': {'postcode': postcode}})
                address = self.sanitizer.process_names(info)[1]
                for place in address:
                    if place.kind == 'postcode':
                        if analyzer is None:
                            postcode_name = place.name.strip().upper()
                            variant_base = None
                        else:
                            postcode_name = analyzer.normalize(place.name)
                            variant_base = place.get_attr("variant")

                        if variant_base:
                            needed_entries.add(f'{postcode_name}@{variant_base}')
                        else:
                            needed_entries.add(postcode_name)
                        break

        # Now update the word table.
        self._delete_unused_postcode_words(word_entries - needed_entries)
        self._add_missing_postcode_words(needed_entries - word_entries)

    def _delete_unused_postcode_words(self, tokens):
        if tokens:
            with self.conn.cursor() as cur:
                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                            (list(tokens), ))

    def _add_missing_postcode_words(self, tokens):
        if not tokens:
            return

        analyzer = self.token_analysis.analysis.get('@postcode')
        terms = []

        for postcode_name in tokens:
            if '@' in postcode_name:
                term, variant = postcode_name.split('@', 2)
                term = self._search_normalized(term)
                variants = {term}
                if analyzer is not None:
                    variants.update(analyzer.get_variants_ascii(variant))
                    variants = list(variants)
            else:
                variants = [self._search_normalized(postcode_name)]
            terms.append((postcode_name, variants))

        if terms:
            with self.conn.cursor() as cur:
                cur.execute_values("""SELECT create_postcode_word(pc, var)
                                      FROM (VALUES %s) AS v(pc, var)""",
                                   terms)




    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases will
            be completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
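        # Each phrase is a sequence of (label, class, type, operator); an
        # operator of '-' stands for 'no operator'. Illustrative example:
        # ('Restaurant', 'amenity', 'restaurant', '-').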
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)


    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added


    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)


    def add_country_names(self, country_code, names):
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)


    def _add_country_full_names(self, country_code, names, internal=False):
        """ Add names for the given country from an already sanitized
            name list.
        """
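        # Names added with internal=True (from add_country_names()) are marked
        # with an 'internal' flag in the word table's info column; names coming
        # from OSM country objects via process_place() are added without it.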
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            existing_tokens = {True: set(), False: set()} # internal/external names
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                                   (SELECT token, 'C', %s
                                    FROM unnest(%s) as token)
                          """
                cur.execute(sql, (country_code, list(new_tokens)))


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()


    def _process_place_address(self, token_info, address):
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full', 'inclusion'):
                token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))


    def _compute_housenumber_token(self, hnr):
        """ Normalize the housenumber and return the word token and the
            canonical form.
        """
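        # The result is a pair (word token id, lookup name); (None, None) is
        # returned when no token can be computed for the housenumber.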
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result = None, None

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    with self.conn.cursor() as cur:
                        cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
                        result = cur.fetchone()[0], norm_name
                        self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            norm_name = analyzer.normalize(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    variants = analyzer.get_variants_ascii(norm_name)
                    if variants:
                        with self.conn.cursor() as cur:
                            cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
                                        (norm_name, list(variants)))
                            result = cur.fetchone()[0], variants[0]
                            self._cache.housenumbers[norm_name] = result

        return result


    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
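        # Illustrative example (the exact split depends on the configured ICU
        # rules): a name like 'Rue de la Paix' yields one partial token per
        # space-separated word of its search-normalized form.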
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens


    def _retrieve_full_tokens(self, name):
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyzer.
        """
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full


    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            norm_name = analyzer.normalize(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    def _add_postcode(self, item):
        """ Make sure the normalized postcode is present in the word table.
        """
        analyzer = self.token_analysis.analysis.get('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.normalize(item.name)
            variant_base = item.get_attr("variant")

        if variant_base:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return None

            variants = {term}
            if analyzer is not None and variant_base:
                variants.update(analyzer.get_variants_ascii(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)

        return postcode_name


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self):
        self.names = None
        self.housenumbers = set()
        self.housenumber_tokens = set()
        self.street_tokens = set()
        self.place_tokens = set()
        self.address_tokens = {}
        self.postcode = None


    @staticmethod
    def _mk_array(tokens):
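        # Render the tokens as a PostgreSQL array literal, e.g. [1, 2, 3]
        # becomes '{1,2,3}'.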
        return f"{{{','.join((str(s) for s in tokens))}}}"


    def to_dict(self):
        """ Return the token information in database importable format.
        """
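        # Illustrative result (token ids are made up):
        #   {'names': '{1,2,3}', 'hnr': '12;12a', 'hnr_tokens': '{45,46}',
        #    'street': '{101}', 'addr': {'city': '{7,8}'}, 'postcode': 'AB1 2CD'}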
        out = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        if self.postcode:
            out['postcode'] = self.postcode

        return out


    def set_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumber(self, token, hnr):
        """ Add token information for a single normalised housenumber.
        """
        if token:
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)


    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        self.street_tokens.update(tokens)


    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)


    def add_address_term(self, key, partials):
        """ Add additional address terms.
        """
        if partials:
            self.address_tokens[key] = self._mk_array(partials)

    def set_postcode(self, postcode):
        """ Set the postcode to the given one.
        """
        self.postcode = postcode


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.partials = {}
        self.fulls = {}
        self.postcodes = set()
        self.housenumbers = {}