git.openstreetmap.org Git - nominatim.git/commitdiff
Merge remote-tracking branch 'upstream/master'
author Sarah Hoffmann <lonvia@denofr.de>
Wed, 13 Nov 2024 18:35:54 +0000 (19:35 +0100)
committer Sarah Hoffmann <lonvia@denofr.de>
Wed, 13 Nov 2024 18:35:54 +0000 (19:35 +0100)
src/nominatim_api/search/db_search_builder.py
src/nominatim_api/search/icu_tokenizer.py
src/nominatim_db/tokenizer/icu_tokenizer.py

index 0d7487a4c04fac42a0248ab97acacaf2e11c02e0,632270ef04176f394a10e29d9397141bdeb5a457..1fbb7168bb44a963f31e83bfd99f6f534bcf9be5
@@@ -42,7 -42,7 +42,7 @@@ def build_poi_search(category: List[Tup
      class _PoiData(dbf.SearchData):
          penalty = 0.0
          qualifiers = dbf.WeightedCategories(category, [0.0] * len(category))
-         countries=ccs
+         countries = ccs
  
      return dbs.PoiSearch(_PoiData())
  
@@@ -55,15 -55,13 +55,13 @@@ class SearchBuilder
          self.query = query
          self.details = details
  
      @property
      def configured_for_country(self) -> bool:
          """ Return true if the search details are configured to
              allow countries in the result.
          """
          return self.details.min_rank <= 4 and self.details.max_rank >= 4 \
-                and self.details.layer_enabled(DataLayer.ADDRESS)
+             and self.details.layer_enabled(DataLayer.ADDRESS)
  
      @property
      def configured_for_postcode(self) -> bool:
@@@ -71,8 -69,7 +69,7 @@@
              allow postcodes in the result.
          """
          return self.details.min_rank <= 5 and self.details.max_rank >= 11\
-                and self.details.layer_enabled(DataLayer.ADDRESS)
+             and self.details.layer_enabled(DataLayer.ADDRESS)
  
      @property
      def configured_for_housenumbers(self) -> bool:
@@@ -80,8 -77,7 +77,7 @@@
              allow addresses in the result.
          """
          return self.details.max_rank >= 30 \
-                and self.details.layer_enabled(DataLayer.ADDRESS)
+             and self.details.layer_enabled(DataLayer.ADDRESS)
  
      def build(self, assignment: TokenAssignment) -> Iterator[dbs.AbstractSearch]:
          """ Yield all possible abstract searches for the given token assignment.
@@@ -92,7 -88,7 +88,7 @@@
  
          near_items = self.get_near_items(assignment)
          if near_items is not None and not near_items:
-             return # impossible compbination of near items and category parameter
+             return  # impossible combination of near items and category parameter
  
          if assignment.name is None:
              if near_items and not sdata.postcodes:
                  search.penalty += assignment.penalty
                  yield search
  
      def build_poi_search(self, sdata: dbf.SearchData) -> Iterator[dbs.AbstractSearch]:
          """ Build abstract search query for a simple category search.
              This kind of search requires an additional geographic constraint.
             and ((self.details.viewbox and self.details.bounded_viewbox) or self.details.near):
              yield dbs.PoiSearch(sdata)
  
      def build_special_search(self, sdata: dbf.SearchData,
                               address: List[TokenRange],
                               is_category: bool) -> Iterator[dbs.AbstractSearch]:
                  penalty += 0.2
              yield dbs.PostcodeSearch(penalty, sdata)
  
      def build_housenumber_search(self, sdata: dbf.SearchData, hnrs: List[Token],
                                   address: List[TokenRange]) -> Iterator[dbs.AbstractSearch]:
          """ Build a simple address search for special entries where the
          expected_count = sum(t.count for t in hnrs)
  
          partials = {t.token: t.addr_count for trange in address
-                        for t in self.query.get_partials_list(trange)}
+                     for t in self.query.get_partials_list(trange)}
  
          if not partials:
              # can happen when none of the partials is indexed
          sdata.housenumbers = dbf.WeightedStrings([], [])
          yield dbs.PlaceSearch(0.05, sdata, expected_count)
  
      def build_name_search(self, sdata: dbf.SearchData,
                            name: TokenRange, address: List[TokenRange],
                            is_category: bool) -> Iterator[dbs.AbstractSearch]:
                  sdata.lookups = lookup
                  yield dbs.PlaceSearch(penalty + name_penalty, sdata, count)
  
-     def yield_lookups(self, name: TokenRange, address: List[TokenRange])\
-                           -> Iterator[Tuple[float, int, List[dbf.FieldLookup]]]:
+     def yield_lookups(self, name: TokenRange, address: List[TokenRange]
+                       ) -> Iterator[Tuple[float, int, List[dbf.FieldLookup]]]:
          """ Yield all variants how the given name and address should best
              be searched for. This takes into account how frequent the terms
              are and tries to find a lookup that optimizes index use.
          """
-         penalty = 0.0 # extra penalty
+         penalty = 0.0  # extra penalty
          name_partials = {t.token: t for t in self.query.get_partials_list(name)}
  
          addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
              yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens)
              return
  
 -        addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 30000
 +        addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 50000
          # Partial term too frequent. Try looking up by rare full names first.
          name_fulls = self.query.get_tokens(name, TokenType.WORD)
          if name_fulls:
              fulls_count = sum(t.count for t in name_fulls)
  
 -            if fulls_count < 50000 or addr_count < 30000:
 +            if fulls_count < 80000 or addr_count < 50000:
-                 yield penalty,fulls_count / (2**len(addr_tokens)), \
+                 yield penalty, fulls_count / (2**len(addr_tokens)), \
                      self.get_full_name_ranking(name_fulls, addr_partials,
                                                 fulls_count > 30000 / max(1, len(addr_tokens)))
  
          if exp_count < 10000 and addr_count < 20000:
              penalty += 0.35 * max(1 if name_fulls else 0.1,
                                    5 - len(name_partials) - len(addr_tokens))
-             yield penalty, exp_count,\
-                   self.get_name_address_ranking(list(name_partials.keys()), addr_partials)
+             yield penalty, exp_count, \
+                 self.get_name_address_ranking(list(name_partials.keys()), addr_partials)
  
      def get_name_address_ranking(self, name_tokens: List[int],
                                   addr_partials: List[Token]) -> List[dbf.FieldLookup]:
  
          return lookup
  
      def get_full_name_ranking(self, name_fulls: List[Token], addr_partials: List[Token],
                                use_lookup: bool) -> List[dbf.FieldLookup]:
          """ Create a ranking expression with full name terms and
          # This might yield wrong results, nothing we can do about that.
          if use_lookup:
              addr_restrict_tokens = []
 -            addr_lookup_tokens = []
 -            for t in addr_partials:
 -                if t.addr_count > 20000:
 -                    addr_restrict_tokens.append(t.token)
 -                else:
 -                    addr_lookup_tokens.append(t.token)
 +            addr_lookup_tokens = [t.token for t in addr_partials]
          else:
              addr_restrict_tokens = [t.token for t in addr_partials]
              addr_lookup_tokens = []
          return dbf.lookup_by_any_name([t.token for t in name_fulls],
                                        addr_restrict_tokens, addr_lookup_tokens)
  
      def get_name_ranking(self, trange: TokenRange,
                           db_field: str = 'name_vector') -> dbf.FieldRanking:
          """ Create a ranking expression for a name term in the given range.
          default = sum(t.penalty for t in name_partials) + 0.2
          return dbf.FieldRanking(db_field, default, ranks)
  
      def get_addr_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
          """ Create a list of ranking expressions for an address term
              for the given ranges.
          heapq.heappush(todo, (0, trange.start, dbf.RankedTokens(0.0, [])))
          ranks: List[dbf.RankedTokens] = []
  
-         while todo: # pylint: disable=too-many-nested-blocks
+         while todo:
              neglen, pos, rank = heapq.heappop(todo)
              for tlist in self.query.nodes[pos].starting:
                  if tlist.ttype in (TokenType.PARTIAL, TokenType.WORD):
  
          return dbf.FieldRanking('nameaddress_vector', default, ranks)
  
      def get_search_data(self, assignment: TokenAssignment) -> Optional[dbf.SearchData]:
          """ Collect the tokens for the non-name search fields in the
              assignment.
  
          return sdata
  
      def get_country_tokens(self, trange: TokenRange) -> List[Token]:
          """ Return the list of country tokens for the given range,
              optionally filtered by the country list from the details
  
          return tokens
  
      def get_qualifier_tokens(self, trange: TokenRange) -> List[Token]:
          """ Return the list of qualifier tokens for the given range,
              optionally filtered by the qualifier list from the details
  
          return tokens
  
      def get_near_items(self, assignment: TokenAssignment) -> Optional[dbf.WeightedCategories]:
          """ Collect tokens for near items search or use the categories
              requested per parameter.
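
The db_search_builder.py hunks above raise the frequency cut-offs (full-name threshold 50000 -> 80000, address threshold 30000 -> 50000) and simplify how get_full_name_ranking() buckets address partials: when use_lookup is set, every partial now becomes a lookup token instead of being split on addr_count > 20000. A minimal standalone sketch of the new bucketing follows; Tok and split_addr_tokens are hypothetical stand-ins for illustration, not part of the Nominatim API.

from dataclasses import dataclass
from typing import List, Tuple

@dataclass
class Tok:  # hypothetical stand-in for the Token objects in the diff
    token: int
    addr_count: int

def split_addr_tokens(addr_partials: List[Tok],
                      use_lookup: bool) -> Tuple[List[int], List[int]]:
    """Return (addr_restrict_tokens, addr_lookup_tokens) the way the new code does."""
    if use_lookup:
        # new behaviour: all address partials are used for lookup;
        # the removed loop split them by addr_count > 20000
        return [], [t.token for t in addr_partials]
    # unchanged branch: all partials merely restrict the result set
    return [t.token for t in addr_partials], []

print(split_addr_tokens([Tok(1, 50), Tok(2, 30000)], use_lookup=True))   # ([], [1, 2])
print(split_addr_tokens([Tok(1, 50), Tok(2, 30000)], use_lookup=False))  # ([1, 2], [])
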
index c2a265105a69d08eb3d7d8a75331e4a8c4d61dc9,fa14531aed0d6c07cf79c277255324495b1b063d..c18dd8be62ed1190284e9c0751464b5e54091a47
@@@ -48,6 -48,7 +48,7 @@@ class QueryPart(NamedTuple)
  QueryParts = List[QueryPart]
  WordDict = Dict[str, List[qmod.TokenRange]]
  
  def yield_words(terms: List[QueryPart], start: int) -> Iterator[Tuple[str, qmod.TokenRange]]:
      """ Return all combinations of words in the terms list after the
          given position.
@@@ -72,7 -73,6 +73,6 @@@ class ICUToken(qmod.Token)
          assert self.info
          return self.info.get('class', ''), self.info.get('type', '')
  
      def rematch(self, norm: str) -> None:
          """ Check how well the token matches the given normalized string
              and add a penalty, if necessary.
@@@ -91,7 -91,6 +91,6 @@@
                  distance += abs((ato-afrom) - (bto-bfrom))
          self.penalty += (distance/len(self.lookup_word))
  
      @staticmethod
      def from_db_row(row: SaRow) -> 'ICUToken':
          """ Create a ICUToken from the row of the word table.
                          addr_count=max(1, addr_count))
  
  
  class ICUQueryAnalyzer(AbstractQueryAnalyzer):
      """ Converter for query strings into a tokenized query
          using the tokens created by an ICU tokenizer.
      """
      def __init__(self, conn: SearchConnection) -> None:
          self.conn = conn
  
      async def setup(self) -> None:
          """ Set up static data structures needed for the analysis.
          """
                       sa.Column('word', sa.Text),
                       sa.Column('info', Json))
  
      async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
          """ Analyze the given list of phrases and return the
              tokenized query.
  
          return query
  
      def normalize_text(self, text: str) -> str:
          """ Bring the given text into a normalized form. That is the
              standardized form search will work with. All information removed
              at this stage is inevitably lost.
          """
 -        return cast(str, self.normalizer.transliterate(text))
 +        norm = cast(str, self.normalizer.transliterate(text))
 +        numspaces = norm.count(' ')
 +        if numspaces > 4 and len(norm) <= (numspaces + 1) * 3:
 +            return ''
 +
 +        return norm
  
      def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
          """ Transliterate the phrases and split them into tokens.
  
  
          return parts, words
  
      async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]':
          """ Return the token information from the database for the
              given word tokens.
          t = self.conn.t.meta.tables['word']
          return await self.conn.execute(t.select().where(t.c.word_token.in_(words)))
  
      def add_extra_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
          """ Add tokens to query that are not saved in the database.
          """
                                           count=1, addr_count=1, lookup_word=part.token,
                                           word_token=part.token, info=None))
  
      def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
          """ Add penalties to tokens that depend on presence of other token.
          """
                         and (repl.ttype != qmod.TokenType.HOUSENUMBER
                              or len(tlist.tokens[0].lookup_word) > 4):
                          repl.add_penalty(0.39)
-             elif tlist.ttype == qmod.TokenType.HOUSENUMBER \
-                  and len(tlist.tokens[0].lookup_word) <= 3:
+             elif (tlist.ttype == qmod.TokenType.HOUSENUMBER
+                   and len(tlist.tokens[0].lookup_word) <= 3):
                  if any(c.isdigit() for c in tlist.tokens[0].lookup_word):
                      for repl in node.starting:
                          if repl.end == tlist.end and repl.ttype != qmod.TokenType.HOUSENUMBER:
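
The icu_tokenizer.py hunk above adds a guard to normalize_text(): a normalized query made up of more than five space-separated fragments averaging three characters or fewer per fragment (separators included) is rejected by returning an empty string. A minimal sketch of that check in isolation; looks_degenerate is a hypothetical helper used only for illustration.

def looks_degenerate(norm: str) -> bool:
    """True for queries of many very short fragments; the new normalize_text()
    returns '' for such input so it is never tokenized."""
    numspaces = norm.count(' ')
    return numspaces > 4 and len(norm) <= (numspaces + 1) * 3

print(looks_degenerate('a b c d e f'))            # True  -> query is dropped
print(looks_degenerate('main street 12 berlin'))  # False -> query is kept
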
index 452bf26ce951c949e2bda958b3fb7e1581134879,83928644a9c3a9964e26af05c81ef061b8cfeb05..19818adb9d3df610ed04ec39b547cf99d5adc590
@@@ -17,7 -17,7 +17,7 @@@ from pathlib import Pat
  from psycopg.types.json import Jsonb
  from psycopg import sql as pysql
  
- from ..db.connection import connect, Connection, Cursor, server_version_tuple,\
+ from ..db.connection import connect, Connection, Cursor, server_version_tuple, \
                              drop_tables, table_exists, execute_scalar
  from ..config import Configuration
  from ..db.sql_preprocessor import SQLPreprocessor
@@@ -32,10 -32,11 +32,11 @@@ DBCFG_TERM_NORMALIZATION = "tokenizer_t
  
  LOG = logging.getLogger()
  
- WORD_TYPES =(('country_names', 'C'),
-              ('postcodes', 'P'),
-              ('full_word', 'W'),
-              ('housenumbers', 'H'))
+ WORD_TYPES = (('country_names', 'C'),
+               ('postcodes', 'P'),
+               ('full_word', 'W'),
+               ('housenumbers', 'H'))
  
  def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
      """ Create a new instance of the tokenizer provided by this module.
@@@ -54,7 -55,6 +55,6 @@@ class ICUTokenizer(AbstractTokenizer)
          self.data_dir = data_dir
          self.loader: Optional[ICURuleLoader] = None
  
      def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
          """ Set up a new tokenizer for the database.
  
@@@ -70,7 -70,6 +70,6 @@@
              self._setup_db_tables(config)
              self._create_base_indices(config, 'word')
  
      def init_from_project(self, config: Configuration) -> None:
          """ Initialise the tokenizer from the project directory.
          """
          with connect(self.dsn) as conn:
              self.loader.load_config_from_db(conn)
  
      def finalize_import(self, config: Configuration) -> None:
          """ Do any required postprocessing to make the tokenizer data ready
              for use.
          """
          self._create_lookup_indices(config, 'word')
  
      def update_sql_functions(self, config: Configuration) -> None:
          """ Reimport the SQL functions for this tokenizer.
          """
              sqlp = SQLPreprocessor(conn, config)
              sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  
      def check_database(self, config: Configuration) -> None:
          """ Check that the tokenizer is set up correctly.
          """
          # Will throw an error if there is an issue.
          self.init_from_project(config)
  
      def update_statistics(self, config: Configuration, threads: int = 2) -> None:
          """ Recompute frequencies for all name words.
          """
                                       SELECT unnest(nameaddress_vector) as id, count(*)
                                       FROM search_name GROUP BY id""")
                      cur.execute('CREATE INDEX ON addressword_frequencies(id)')
-                     cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
-                                                                                INOUT info JSONB)
-                                    AS $$
-                                    DECLARE rec RECORD;
-                                    BEGIN
-                                    IF info is null THEN
-                                      info = '{}'::jsonb;
-                                    END IF;
-                                    FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
-                                    LOOP
-                                      info = info || jsonb_build_object('count', rec.count);
-                                    END LOOP;
-                                    FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
-                                    LOOP
-                                      info = info || jsonb_build_object('addr_count', rec.count);
-                                    END LOOP;
-                                    IF info = '{}'::jsonb THEN
-                                      info = null;
-                                    END IF;
-                                    END;
-                                    $$ LANGUAGE plpgsql IMMUTABLE;
-                                 """)
+                     cur.execute("""
+                         CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
+                                                                     INOUT info JSONB)
+                         AS $$
+                         DECLARE rec RECORD;
+                         BEGIN
+                         IF info is null THEN
+                           info = '{}'::jsonb;
+                         END IF;
+                         FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
+                         LOOP
+                           info = info || jsonb_build_object('count', rec.count);
+                         END LOOP;
+                         FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
+                         LOOP
+                           info = info || jsonb_build_object('addr_count', rec.count);
+                         END LOOP;
+                         IF info = '{}'::jsonb THEN
+                           info = null;
+                         END IF;
+                         END;
+                         $$ LANGUAGE plpgsql IMMUTABLE;
+                         """)
                      LOG.info('Update word table with recomputed frequencies')
                      drop_tables(conn, 'tmp_word')
                      cur.execute("""CREATE TABLE tmp_word AS
                                              END) as info
                                      FROM word LEFT JOIN word_frequencies wf
                                           ON word.word_id = wf.id
 +                                    ORDER BY word_id
                                  """)
                      drop_tables(conn, 'word_frequencies')
  
          self._create_lookup_indices(config, 'tmp_word')
          self._move_temporary_word_table('tmp_word')
  
      def _cleanup_housenumbers(self) -> None:
          """ Remove unused house numbers.
          """
                                  (list(candidates.values()), ))
                  conn.commit()
  
      def update_word_tokens(self) -> None:
          """ Remove unused tokens.
          """
          self._cleanup_housenumbers()
          LOG.warning("Tokenizer house-keeping done.")
  
      def name_analyzer(self) -> 'ICUNameAnalyzer':
          """ Create a new analyzer for tokenizing names and queries
              using this tokenizer. Analyzers are context managers and should
          return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                 self.loader.make_token_analysis())
  
      def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
          """ Return a list of the `num` most frequent full words
              in the database.
                               ORDER BY count DESC LIMIT %s""", (num,))
              return list(s[0].split('@')[0] for s in cur)
  
      def _save_config(self) -> None:
          """ Save the configuration that needs to remain stable for the given
              database as database properties.
          with connect(self.dsn) as conn:
              self.loader.save_config_to_db(conn)
  
      def _setup_db_tables(self, config: Configuration) -> None:
          """ Set up the word table and fill it with pre-computed word
              frequencies.
              """)
              conn.commit()
  
      def _create_base_indices(self, config: Configuration, table_name: str) -> None:
          """ Set up the word table and fill it with pre-computed word
              frequencies.
                                  column_type=ctype)
              conn.commit()
  
      def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
          """ Create additional indexes used when running the API.
          """
          with connect(self.dsn) as conn:
              sqlp = SQLPreprocessor(conn, config)
              # Index required for details lookup.
-             sqlp.run_string(conn, """
+             sqlp.run_string(
+                 conn,
+                 """
                  CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
                    ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
-             """,
-             table_name=table_name)
+                 """,
+                 table_name=table_name)
              conn.commit()
  
      def _move_temporary_word_table(self, old: str) -> None:
          """ Rename all tables and indexes used by the tokenizer.
          """
              conn.commit()
  
  
  class ICUNameAnalyzer(AbstractAnalyzer):
      """ The ICU analyzer uses the ICU library for splitting names.
  
  
          self._cache = _TokenCache()
  
      def close(self) -> None:
          """ Free all resources used by the analyzer.
          """
              self.conn.close()
              self.conn = None
  
      def _search_normalized(self, name: str) -> str:
          """ Return the search token transliteration of the given name.
          """
          return cast(str, self.token_analysis.search.transliterate(name)).strip()
  
      def _normalized(self, name: str) -> str:
          """ Return the normalized version of the given name with all
              non-relevant information removed.
          """
          return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
  
      def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
          """ Return token information for the given list of words.
              If a word starts with # it is assumed to be a full name
              part_ids = {r[0]: r[1] for r in cur}
  
          return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
-                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
+             + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
  
      def normalize_postcode(self, postcode: str) -> str:
          """ Convert the postcode to a standardized form.
          """
          return postcode.strip().upper()
  
      def update_postcodes_from_db(self) -> None:
          """ Update postcode tokens in the word table from the location_postcode
              table.
              with self.conn.cursor() as cur:
                  cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)
  
      def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                                 should_replace: bool) -> None:
          """ Replace the search index for special phrases with the new phrases.
          LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                   len(norm_phrases), added, deleted)
  
      def _add_special_phrases(self, cursor: Cursor,
                               new_phrases: Set[Tuple[str, str, str, str]],
                               existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
  
          return added
  
      def _remove_special_phrases(self, cursor: Cursor,
-                              new_phrases: Set[Tuple[str, str, str, str]],
-                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
+                                 new_phrases: Set[Tuple[str, str, str, str]],
+                                 existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
          """ Remove all phrases from the database that are no longer in the
              new phrase list.
          """
  
          return len(to_delete)
  
      def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
          """ Add default names for the given country to the search index.
          """
                                       self.sanitizer.process_names(info)[0],
                                       internal=True)
  
      def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                  internal: bool = False) -> None:
          """ Add names for the given country from an already sanitized
                            """
                  cur.execute(sql, (country_code, list(new_tokens)))
  
      def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
          """ Determine tokenizer information about the given place.
  
  
          return token_info.to_dict()
  
      def _process_place_address(self, token_info: '_TokenInfo',
                                 address: Sequence[PlaceName]) -> None:
          for item in address:
              elif item.kind == 'place':
                  if not item.suffix:
                      token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
-             elif not item.kind.startswith('_') and not item.suffix and \
-                  item.kind not in ('country', 'full', 'inclusion'):
+             elif (not item.kind.startswith('_') and not item.suffix and
+                   item.kind not in ('country', 'full', 'inclusion')):
                  token_info.add_address_term(item.kind,
                                              itertools.chain(*self._compute_name_tokens([item])))
  
      def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
          """ Normalize the housenumber and return the word token and the
              canonical form.
  
          return result
  
      def _retrieve_full_tokens(self, name: str) -> List[int]:
          """ Get the full name token for the given name, if it exists.
              The name is only retrieved for the standard analyser.
  
          return full
  
      def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
          """ Computes the full name and partial name tokens for the given
              dictionary of names.
  
          return full_tokens, partial_tokens
  
      def _add_postcode(self, item: PlaceName) -> Optional[str]:
          """ Make sure the normalized postcode is present in the word table.
          """
@@@ -836,11 -801,9 +802,9 @@@ class _TokenInfo
          self.address_tokens: Dict[str, str] = {}
          self.postcode: Optional[str] = None
  
      def _mk_array(self, tokens: Iterable[Any]) -> str:
          return f"{{{','.join((str(s) for s in tokens))}}}"
  
      def to_dict(self) -> Dict[str, Any]:
          """ Return the token information in database importable format.
          """
  
          return out
  
      def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
          """ Adds token information for the normalised names.
          """
          self.names = self._mk_array(itertools.chain(fulls, partials))
  
      def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
          """ Extract housenumber information from a list of normalised
              housenumbers.
              self.housenumbers.add(hnr)
              self.housenumber_tokens.add(token)
  
      def add_street(self, tokens: Iterable[int]) -> None:
          """ Add addr:street match terms.
          """
              self.street_tokens = set()
          self.street_tokens.update(tokens)
  
      def add_place(self, tokens: Iterable[int]) -> None:
          """ Add addr:place search and match terms.
          """
          self.place_tokens.update(tokens)
  
      def add_address_term(self, key: str, partials: Iterable[int]) -> None:
          """ Add additional address terms.
          """