# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
                   Dict, Set, Iterable
import itertools
import json
import logging
from pathlib import Path
from textwrap import dedent

from nominatim.db.connection import connect, Connection, Cursor
from nominatim.config import Configuration
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.data.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.data.place_name import PlaceName
from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()
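
# Added note: the word table stores several kinds of tokens that are
# distinguished by a one-letter type code. The codes below are taken from the
# SQL statements in this module (the housenumber code 'H' is an assumption
# based on the housenumber clean-up query):
#   'W' - full words, 'w' - partial words, 'H' - housenumbers,
#   'P' - postcodes, 'S' - special phrases, 'C' - country names.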


def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    return ICUTokenizer(dsn, data_dir)
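
# A minimal usage sketch for the factory above (illustrative only; in a real
# setup the tokenizer is normally created through Nominatim's tokenizer
# factory, and `config` and `place` are assumed to be an already loaded
# Configuration and a PlaceInfo object):
#
#   tokenizer = create('dbname=nominatim', Path('<project_dir>/tokenizer'))
#   tokenizer.init_from_project(config)
#   with tokenizer.name_analyzer() as analyzer:
#       info = analyzer.process_place(place)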


class ICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader: Optional[ICURuleLoader] = None

    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php, overwrite=True)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

        self._install_php(config.lib_dir.php, overwrite=False)

    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
            conn.commit()

    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
            conn.commit()

    def check_database(self, config: Configuration) -> None:
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)

    def update_statistics(self) -> None:
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()
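
    # Added note: the UPDATE above folds the recomputed count into the jsonb
    # `info` column of the word table (info->>'count'). That value is read
    # back further down in most_frequent_words(); whether other parts of
    # Nominatim consume it as well is not visible from this module.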

    def _cleanup_housenumbers(self) -> None:
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()
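
    # Added note: `candidates` maps the housenumber lookup form to its word_id.
    # Any housenumber still referenced from placex is removed from the map, so
    # only tokens that no longer occur anywhere get deleted from the word table.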

    def update_word_tokens(self) -> None:
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")

    def name_analyzer(self) -> 'ICUNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        assert self.loader is not None
        return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                               self.loader.make_token_analysis())

    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
        """
        with conn.cursor() as cur:
            cur.execute("""SELECT word, sum((info->>'count')::int) as count
                             FROM word WHERE type = 'W'
                             GROUP BY word
                             ORDER BY count DESC LIMIT %s""", (num,))
            return list(s[0].split('@')[0] for s in cur)
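
    # Added note: word entries created by a named analysis module are stored
    # as 'name@analyzer' (see _compute_name_tokens below), hence the
    # split('@')[0] to strip the analyzer suffix before returning the word.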

    def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
        """ Install the php script for the tokenizer.
        """
        assert self.loader is not None
        php_file = self.data_dir / "tokenizer.php"

        if not php_file.exists() or overwrite:
            php_file.write_text(dedent(f"""\
                <?php
                @define('CONST_Max_Word_Frequency', 10000000);
                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
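
    # Added note: this writes a small PHP stub into the project directory that
    # exposes the normalization and transliteration rules as PHP constants and
    # then pulls in the tokenizer implementation shipped with the PHP frontend.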

    def _save_config(self) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.loader is not None
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)

    def _init_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()


class ICUNameAnalyzer(AbstractAnalyzer):
    """ The ICU analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
                 token_analysis: ICUTokenAnalysis) -> None:
        self.conn: Optional[Connection] = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()

    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def _search_normalized(self, name: str) -> str:
        """ Return the search token transliteration of the given name.
        """
        return cast(str, self.token_analysis.search.transliterate(name)).strip()

    def _normalized(self, name: str) -> str:
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()

    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        assert self.conn is not None
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                           FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
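
    # Added note: an illustrative call (with hypothetical tokens and ids):
    #   analyzer.get_word_token_info(['#Main Street', 'main'])
    # might return
    #   [('#Main Street', 'main street', 1234), ('main', 'main', 5678)]
    # where the word id is None if the token is not yet in the word table.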

    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
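
    # Added note: standardization here is just trimming and upper-casing,
    # e.g. ' ab1 2cd ' -> 'AB1 2CD'.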

    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        with self.conn.cursor() as cur:
            # First get all postcode names currently in the word table.
            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
            word_entries = set((entry[0] for entry in cur))

            # Then compute the required postcode names from the postcode table.
            needed_entries = set()
            cur.execute("SELECT country_code, postcode FROM location_postcode")
            for cc, postcode in cur:
                info = PlaceInfo({'country_code': cc,
                                  'class': 'place', 'type': 'postcode',
                                  'address': {'postcode': postcode}})
                address = self.sanitizer.process_names(info)[1]
                for place in address:
                    if place.kind == 'postcode':
                        if analyzer is None:
                            postcode_name = place.name.strip().upper()
                            variant_base = None
                        else:
                            postcode_name = analyzer.get_canonical_id(place)
                            variant_base = place.get_attr("variant")

                        if variant_base:
                            needed_entries.add(f'{postcode_name}@{variant_base}')
                        else:
                            needed_entries.add(postcode_name)

            # Now update the word table.
            self._delete_unused_postcode_words(word_entries - needed_entries)
            self._add_missing_postcode_words(needed_entries - word_entries)

    def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if tokens:
            with self.conn.cursor() as cur:
                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                            (list(tokens), ))

    def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if not tokens:
            return

        analyzer = self.token_analysis.analysis.get('@postcode')
        terms = []

        for postcode_name in tokens:
            if '@' in postcode_name:
                term, variant = postcode_name.split('@', 2)
                term = self._search_normalized(term)
                if analyzer is None:
                    variants = [term]
                else:
                    variants = analyzer.compute_variants(variant)
                    if term not in variants:
                        variants.append(term)
            else:
                variants = [self._search_normalized(postcode_name)]
            terms.append((postcode_name, variants))

        if terms:
            with self.conn.cursor() as cur:
                cur.execute_values("""SELECT create_postcode_word(pc, var)
                                      FROM (VALUES %s) AS v(pc, var)""",
                                   terms)
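
    # Added note: postcode entries that were produced by a custom '@postcode'
    # analysis module carry their variant source as '<canonical>@<variant>';
    # plain entries are just the upper-cased postcode itself.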

    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        assert self.conn is not None
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)
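
    # Added note: each phrase is handled as a 4-tuple of
    # (normalized name, class, type, operator), where the operator is '-'
    # when no 'op' is set in the word table entry.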

    def _add_special_phrases(self, cursor: Cursor,
                             new_phrases: Set[Tuple[str, str, str, str]],
                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added

    def _remove_special_phrases(self, cursor: Cursor,
                                new_phrases: Set[Tuple[str, str, str, str]],
                                existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)

    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
            name list.
        """
        assert self.conn is not None
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names.
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            # internal/external names
            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                  FROM unnest(%s) as token)
                          """
                cur.execute(sql, (country_code, list(new_tokens)))
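
    # Added note: names added by Nominatim itself (internal=True) are flagged
    # with info->'internal' in the word table, presumably so that they can be
    # told apart from country names coming from actual OSM objects and be
    # replaced independently on the next update.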

    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                assert place.country_code is not None
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()

    def _process_place_address(self, token_info: '_TokenInfo',
                               address: Sequence[PlaceName]) -> None:
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full', 'inclusion'):
                token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))

    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
        """ Normalize the housenumber and return the word token and the
            canonical form of the housenumber.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result: Tuple[Optional[int], Optional[str]] = (None, None)

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate.
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    with self.conn.cursor() as cur:
                        hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))

                        result = hid, norm_name
                        self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            word_id = analyzer.get_canonical_id(hnr)
            if word_id:
                result = self._cache.housenumbers.get(word_id, result)
                if result[0] is None:
                    variants = analyzer.compute_variants(word_id)
                    if variants:
                        with self.conn.cursor() as cur:
                            hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
                                             (word_id, list(variants)))
                            result = hid, variants[0]
                            self._cache.housenumbers[word_id] = result

        return result

    def _compute_partial_tokens(self, name: str) -> List[int]:
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    assert token is not None
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens

    def _retrieve_full_tokens(self, name: str) -> List[int]:
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full

    def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
        """ Computes the full name and partial name tokens for the given
            list of names.
        """
        assert self.conn is not None
        full_tokens: Set[int] = set()
        partial_tokens: Set[int] = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            word_id = analyzer.get_canonical_id(name)
            if analyzer_id is None:
                token_id = word_id
            else:
                token_id = f'{word_id}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.compute_variants(word_id)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cast(Tuple[int, List[int]], cur.fetchone())

                self._cache.names[token_id] = (full, part)

            assert part is not None

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens
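
    # Added note: full tokens stand for a complete name ('W' entries in the
    # word table), partial tokens for the individual words it consists of
    # ('w' entries); getorcreate_full_word() is expected to return both in a
    # single call, as the fetchone() above suggests.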

    def _add_postcode(self, item: PlaceName) -> Optional[str]:
        """ Make sure the normalized postcode is present in the word table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.get_canonical_id(item)
            variant_base = item.get_attr("variant")

        if variant_base:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return None

            variants = {term}
            if analyzer is not None and variant_base:
                variants.update(analyzer.compute_variants(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)

        return postcode
729 """ Collect token information to be sent back to the database.
731 def __init__(self) -> None:
732 self.names: Optional[str] = None
733 self.housenumbers: Set[str] = set()
734 self.housenumber_tokens: Set[int] = set()
735 self.street_tokens: Optional[Set[int]] = None
736 self.place_tokens: Set[int] = set()
737 self.address_tokens: Dict[str, str] = {}
738 self.postcode: Optional[str] = None

    def _mk_array(self, tokens: Iterable[Any]) -> str:
        return f"{{{','.join((str(s) for s in tokens))}}}"

    def to_dict(self) -> Dict[str, Any]:
        """ Return the token information in database importable format.
        """
        out: Dict[str, Any] = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens is not None:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        if self.postcode:
            out['postcode'] = self.postcode

        return out

    def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
        """ Adds token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        if token:
            assert hnr is not None
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)

    def add_street(self, tokens: Iterable[int]) -> None:
        """ Add addr:street match terms.
        """
        if self.street_tokens is None:
            self.street_tokens = set()
        self.street_tokens.update(tokens)

    def add_place(self, tokens: Iterable[int]) -> None:
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)

    def add_address_term(self, key: str, partials: Iterable[int]) -> None:
        """ Add additional address terms.
        """
        if partials:
            self.address_tokens[key] = self._mk_array(partials)

    def set_postcode(self, postcode: Optional[str]) -> None:
        """ Set the postcode to the given one.
        """
        self.postcode = postcode
815 """ Cache for token information to avoid repeated database queries.
817 This cache is not thread-safe and needs to be instantiated per
820 def __init__(self) -> None:
821 self.names: Dict[str, Tuple[int, List[int]]] = {}
822 self.partials: Dict[str, int] = {}
823 self.fulls: Dict[str, List[int]] = {}
824 self.postcodes: Set[str] = set()
825 self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}