Merge remote-tracking branch 'upstream/master'

[nominatim.git] / src / nominatim_db / tokenizer / icu_tokenizer.py
diff --git a/src/nominatim_db/tokenizer/icu_tokenizer.py b/src/nominatim_db/tokenizer/icu_tokenizer.py

index 452bf26ce951c949e2bda958b3fb7e1581134879..16122d081ad9f15184d3c6cb53598e0c584ee82b 100644 (file)
--- a/src/nominatim_db/tokenizer/icu_tokenizer.py
+++ b/src/nominatim_db/tokenizer/icu_tokenizer.py
@@ -17,7 +17,7 @@ from pathlib import Path
  from psycopg.types.json import Jsonb
  from psycopg import sql as pysql
  
  from psycopg.types.json import Jsonb
  from psycopg import sql as pysql
  
-from ..db.connection import connect, Connection, Cursor, server_version_tuple,\
+from ..db.connection import connect, Connection, Cursor, \
                              drop_tables, table_exists, execute_scalar
  from ..config import Configuration
  from ..db.sql_preprocessor import SQLPreprocessor
                              drop_tables, table_exists, execute_scalar
  from ..config import Configuration
  from ..db.sql_preprocessor import SQLPreprocessor
@@ -32,10 +32,11 @@ DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
  
  LOG = logging.getLogger()
  
  
  LOG = logging.getLogger()
  
-WORD_TYPES =(('country_names', 'C'),
-             ('postcodes', 'P'),
-             ('full_word', 'W'),
-             ('housenumbers', 'H'))
+WORD_TYPES = (('country_names', 'C'),
+              ('postcodes', 'P'),
+              ('full_word', 'W'),
+              ('housenumbers', 'H'))
+
  
  def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
      """ Create a new instance of the tokenizer provided by this module.
  
  def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
      """ Create a new instance of the tokenizer provided by this module.
@@ -54,7 +55,6 @@ class ICUTokenizer(AbstractTokenizer):
          self.data_dir = data_dir
          self.loader: Optional[ICURuleLoader] = None
  
          self.data_dir = data_dir
          self.loader: Optional[ICURuleLoader] = None
  
-
      def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
          """ Set up a new tokenizer for the database.
  
      def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
          """ Set up a new tokenizer for the database.
  
@@ -70,7 +70,6 @@ class ICUTokenizer(AbstractTokenizer):
              self._setup_db_tables(config)
              self._create_base_indices(config, 'word')
  
              self._setup_db_tables(config)
              self._create_base_indices(config, 'word')
  
-
      def init_from_project(self, config: Configuration) -> None:
          """ Initialise the tokenizer from the project directory.
          """
      def init_from_project(self, config: Configuration) -> None:
          """ Initialise the tokenizer from the project directory.
          """
@@ -79,14 +78,12 @@ class ICUTokenizer(AbstractTokenizer):
          with connect(self.dsn) as conn:
              self.loader.load_config_from_db(conn)
  
          with connect(self.dsn) as conn:
              self.loader.load_config_from_db(conn)
  
-
      def finalize_import(self, config: Configuration) -> None:
          """ Do any required postprocessing to make the tokenizer data ready
              for use.
          """
          self._create_lookup_indices(config, 'word')
  
      def finalize_import(self, config: Configuration) -> None:
          """ Do any required postprocessing to make the tokenizer data ready
              for use.
          """
          self._create_lookup_indices(config, 'word')
  
-
      def update_sql_functions(self, config: Configuration) -> None:
          """ Reimport the SQL functions for this tokenizer.
          """
      def update_sql_functions(self, config: Configuration) -> None:
          """ Reimport the SQL functions for this tokenizer.
          """
@@ -94,14 +91,12 @@ class ICUTokenizer(AbstractTokenizer):
              sqlp = SQLPreprocessor(conn, config)
              sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  
              sqlp = SQLPreprocessor(conn, config)
              sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  
-
      def check_database(self, config: Configuration) -> None:
          """ Check that the tokenizer is set up correctly.
          """
          # Will throw an error if there is an issue.
          self.init_from_project(config)
  
      def check_database(self, config: Configuration) -> None:
          """ Check that the tokenizer is set up correctly.
          """
          # Will throw an error if there is an issue.
          self.init_from_project(config)
  
-
      def update_statistics(self, config: Configuration, threads: int = 2) -> None:
          """ Recompute frequencies for all name words.
          """
      def update_statistics(self, config: Configuration, threads: int = 2) -> None:
          """ Recompute frequencies for all name words.
          """
@@ -115,80 +110,38 @@ class ICUTokenizer(AbstractTokenizer):
                      cur.execute(pysql.SQL('SET max_parallel_workers_per_gather TO {}')
                                       .format(pysql.Literal(min(threads, 6),)))
  
                      cur.execute(pysql.SQL('SET max_parallel_workers_per_gather TO {}')
                                       .format(pysql.Literal(min(threads, 6),)))
  
-                if server_version_tuple(conn) < (12, 0):
-                    LOG.info('Computing word frequencies')
-                    drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
-                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
-                                     SELECT unnest(name_vector) as id, count(*)
-                                     FROM search_name GROUP BY id""")
-                    cur.execute('CREATE INDEX ON word_frequencies(id)')
-                    cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
-                                     SELECT unnest(nameaddress_vector) as id, count(*)
-                                     FROM search_name GROUP BY id""")
-                    cur.execute('CREATE INDEX ON addressword_frequencies(id)')
-                    cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
-                                                                               INOUT info JSONB)
-                                   AS $$
-                                   DECLARE rec RECORD;
-                                   BEGIN
-                                   IF info is null THEN
-                                     info = '{}'::jsonb;
-                                   END IF;
-                                   FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
-                                   LOOP
-                                     info = info || jsonb_build_object('count', rec.count);
-                                   END LOOP;
-                                   FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
-                                   LOOP
-                                     info = info || jsonb_build_object('addr_count', rec.count);
-                                   END LOOP;
-                                   IF info = '{}'::jsonb THEN
-                                     info = null;
-                                   END IF;
-                                   END;
-                                   $$ LANGUAGE plpgsql IMMUTABLE;
-                                """)
-                    LOG.info('Update word table with recomputed frequencies')
-                    drop_tables(conn, 'tmp_word')
-                    cur.execute("""CREATE TABLE tmp_word AS
-                                    SELECT word_id, word_token, type, word,
-                                           word_freq_update(word_id, info) as info
-                                    FROM word
-                                """)
-                    drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
-                else:
-                    LOG.info('Computing word frequencies')
-                    drop_tables(conn, 'word_frequencies')
-                    cur.execute("""
-                      CREATE TEMP TABLE word_frequencies AS
-                      WITH word_freq AS MATERIALIZED (
-                               SELECT unnest(name_vector) as id, count(*)
-                                     FROM search_name GROUP BY id),
-                           addr_freq AS MATERIALIZED (
-                               SELECT unnest(nameaddress_vector) as id, count(*)
-                                     FROM search_name GROUP BY id)
-                      SELECT coalesce(a.id, w.id) as id,
-                             (CASE WHEN w.count is null THEN '{}'::JSONB
-                                  ELSE jsonb_build_object('count', w.count) END
-                              ||
-                              CASE WHEN a.count is null THEN '{}'::JSONB
-                                  ELSE jsonb_build_object('addr_count', a.count) END) as info
-                      FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
-                      """)
-                    cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
-                    cur.execute('ANALYSE word_frequencies')
-                    LOG.info('Update word table with recomputed frequencies')
-                    drop_tables(conn, 'tmp_word')
-                    cur.execute("""CREATE TABLE tmp_word AS
-                                    SELECT word_id, word_token, type, word,
-                                           (CASE WHEN wf.info is null THEN word.info
-                                            ELSE coalesce(word.info, '{}'::jsonb) || wf.info
-                                            END) as info
-                                    FROM word LEFT JOIN word_frequencies wf
-                                         ON word.word_id = wf.id
-                                    ORDER BY word_id
-                                """)
-                    drop_tables(conn, 'word_frequencies')
+                LOG.info('Computing word frequencies')
+                drop_tables(conn, 'word_frequencies')
+                cur.execute("""
+                  CREATE TEMP TABLE word_frequencies AS
+                  WITH word_freq AS MATERIALIZED (
+                           SELECT unnest(name_vector) as id, count(*)
+                                 FROM search_name GROUP BY id),
+                       addr_freq AS MATERIALIZED (
+                           SELECT unnest(nameaddress_vector) as id, count(*)
+                                 FROM search_name GROUP BY id)
+                  SELECT coalesce(a.id, w.id) as id,
+                         (CASE WHEN w.count is null THEN '{}'::JSONB
+                              ELSE jsonb_build_object('count', w.count) END
+                          ||
+                          CASE WHEN a.count is null THEN '{}'::JSONB
+                              ELSE jsonb_build_object('addr_count', a.count) END) as info
+                  FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
+                  """)
+                cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
+                cur.execute('ANALYSE word_frequencies')
+                LOG.info('Update word table with recomputed frequencies')
+                drop_tables(conn, 'tmp_word')
+                cur.execute("""CREATE TABLE tmp_word AS
+                                SELECT word_id, word_token, type, word,
+                                       (CASE WHEN wf.info is null THEN word.info
+                                        ELSE coalesce(word.info, '{}'::jsonb) || wf.info
+                                        END) as info
+                                FROM word LEFT JOIN word_frequencies wf
+                                     ON word.word_id = wf.id
+                                ORDER BY word_id
+                            """)
+                drop_tables(conn, 'word_frequencies')
  
              with conn.cursor() as cur:
                  cur.execute('SET max_parallel_workers_per_gather TO 0')
  
              with conn.cursor() as cur:
                  cur.execute('SET max_parallel_workers_per_gather TO 0')
@@ -201,8 +154,6 @@ class ICUTokenizer(AbstractTokenizer):
          self._create_lookup_indices(config, 'tmp_word')
          self._move_temporary_word_table('tmp_word')
  
          self._create_lookup_indices(config, 'tmp_word')
          self._move_temporary_word_table('tmp_word')
  
-
-
      def _cleanup_housenumbers(self) -> None:
          """ Remove unused house numbers.
          """
      def _cleanup_housenumbers(self) -> None:
          """ Remove unused house numbers.
          """
@@ -236,8 +187,6 @@ class ICUTokenizer(AbstractTokenizer):
                                  (list(candidates.values()), ))
                  conn.commit()
  
                                  (list(candidates.values()), ))
                  conn.commit()
  
-
-
      def update_word_tokens(self) -> None:
          """ Remove unused tokens.
          """
      def update_word_tokens(self) -> None:
          """ Remove unused tokens.
          """
@@ -245,7 +194,6 @@ class ICUTokenizer(AbstractTokenizer):
          self._cleanup_housenumbers()
          LOG.warning("Tokenizer house-keeping done.")
  
          self._cleanup_housenumbers()
          LOG.warning("Tokenizer house-keeping done.")
  
-
      def name_analyzer(self) -> 'ICUNameAnalyzer':
          """ Create a new analyzer for tokenizing names and queries
              using this tokinzer. Analyzers are context managers and should
      def name_analyzer(self) -> 'ICUNameAnalyzer':
          """ Create a new analyzer for tokenizing names and queries
              using this tokinzer. Analyzers are context managers and should
@@ -265,7 +213,6 @@ class ICUTokenizer(AbstractTokenizer):
          return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                 self.loader.make_token_analysis())
  
          return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                 self.loader.make_token_analysis())
  
-
      def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
          """ Return a list of the `num` most frequent full words
              in the database.
      def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
          """ Return a list of the `num` most frequent full words
              in the database.
@@ -277,7 +224,6 @@ class ICUTokenizer(AbstractTokenizer):
                               ORDER BY count DESC LIMIT %s""", (num,))
              return list(s[0].split('@')[0] for s in cur)
  
                               ORDER BY count DESC LIMIT %s""", (num,))
              return list(s[0].split('@')[0] for s in cur)
  
-
      def _save_config(self) -> None:
          """ Save the configuration that needs to remain stable for the given
              database as database properties.
      def _save_config(self) -> None:
          """ Save the configuration that needs to remain stable for the given
              database as database properties.
@@ -286,7 +232,6 @@ class ICUTokenizer(AbstractTokenizer):
          with connect(self.dsn) as conn:
              self.loader.save_config_to_db(conn)
  
          with connect(self.dsn) as conn:
              self.loader.save_config_to_db(conn)
  
-
      def _setup_db_tables(self, config: Configuration) -> None:
          """ Set up the word table and fill it with pre-computed word
              frequencies.
      def _setup_db_tables(self, config: Configuration) -> None:
          """ Set up the word table and fill it with pre-computed word
              frequencies.
@@ -310,7 +255,6 @@ class ICUTokenizer(AbstractTokenizer):
              """)
              conn.commit()
  
              """)
              conn.commit()
  
-
      def _create_base_indices(self, config: Configuration, table_name: str) -> None:
          """ Set up the word table and fill it with pre-computed word
              frequencies.
      def _create_base_indices(self, config: Configuration, table_name: str) -> None:
          """ Set up the word table and fill it with pre-computed word
              frequencies.
@@ -331,21 +275,21 @@ class ICUTokenizer(AbstractTokenizer):
                                  column_type=ctype)
              conn.commit()
  
                                  column_type=ctype)
              conn.commit()
  
-
      def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
          """ Create additional indexes used when running the API.
          """
          with connect(self.dsn) as conn:
              sqlp = SQLPreprocessor(conn, config)
              # Index required for details lookup.
      def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
          """ Create additional indexes used when running the API.
          """
          with connect(self.dsn) as conn:
              sqlp = SQLPreprocessor(conn, config)
              # Index required for details lookup.
-            sqlp.run_string(conn, """
+            sqlp.run_string(
+                conn,
+                """
                  CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
                    ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
                  CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
                    ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
-            """,
-            table_name=table_name)
+                """,
+                table_name=table_name)
              conn.commit()
  
              conn.commit()
  
-
      def _move_temporary_word_table(self, old: str) -> None:
          """ Rename all tables and indexes used by the tokenizer.
          """
      def _move_temporary_word_table(self, old: str) -> None:
          """ Rename all tables and indexes used by the tokenizer.
          """
@@ -362,8 +306,6 @@ class ICUTokenizer(AbstractTokenizer):
              conn.commit()
  
  
              conn.commit()
  
  
-
-
  class ICUNameAnalyzer(AbstractAnalyzer):
      """ The ICU analyzer uses the ICU library for splitting names.
  
  class ICUNameAnalyzer(AbstractAnalyzer):
      """ The ICU analyzer uses the ICU library for splitting names.
  
@@ -380,7 +322,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):
  
          self._cache = _TokenCache()
  
  
          self._cache = _TokenCache()
  
-
      def close(self) -> None:
          """ Free all resources used by the analyzer.
          """
      def close(self) -> None:
          """ Free all resources used by the analyzer.
          """
@@ -388,20 +329,17 @@ class ICUNameAnalyzer(AbstractAnalyzer):
              self.conn.close()
              self.conn = None
  
              self.conn.close()
              self.conn = None
  
-
      def _search_normalized(self, name: str) -> str:
          """ Return the search token transliteration of the given name.
          """
          return cast(str, self.token_analysis.search.transliterate(name)).strip()
  
      def _search_normalized(self, name: str) -> str:
          """ Return the search token transliteration of the given name.
          """
          return cast(str, self.token_analysis.search.transliterate(name)).strip()
  
-
      def _normalized(self, name: str) -> str:
          """ Return the normalized version of the given name with all
              non-relevant information removed.
          """
          return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
  
      def _normalized(self, name: str) -> str:
          """ Return the normalized version of the given name with all
              non-relevant information removed.
          """
          return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
  
-
      def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
          """ Return token information for the given list of words.
              If a word starts with # it is assumed to be a full name
      def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
          """ Return token information for the given list of words.
              If a word starts with # it is assumed to be a full name
@@ -433,8 +371,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
              part_ids = {r[0]: r[1] for r in cur}
  
          return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
              part_ids = {r[0]: r[1] for r in cur}
  
          return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
-               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
-
+            + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
  
      def normalize_postcode(self, postcode: str) -> str:
          """ Convert the postcode to a standardized form.
  
      def normalize_postcode(self, postcode: str) -> str:
          """ Convert the postcode to a standardized form.
@@ -444,7 +381,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):
          """
          return postcode.strip().upper()
  
          """
          return postcode.strip().upper()
  
-
      def update_postcodes_from_db(self) -> None:
          """ Update postcode tokens in the word table from the location_postcode
              table.
      def update_postcodes_from_db(self) -> None:
          """ Update postcode tokens in the word table from the location_postcode
              table.
@@ -517,9 +453,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):
              with self.conn.cursor() as cur:
                  cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)
  
              with self.conn.cursor() as cur:
                  cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)
  
-
-
-
      def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                                 should_replace: bool) -> None:
          """ Replace the search index for special phrases with the new phrases.
      def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                                 should_replace: bool) -> None:
          """ Replace the search index for special phrases with the new phrases.
@@ -549,7 +482,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):
          LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                   len(norm_phrases), added, deleted)
  
          LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                   len(norm_phrases), added, deleted)
  
-
      def _add_special_phrases(self, cursor: Cursor,
                               new_phrases: Set[Tuple[str, str, str, str]],
                               existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
      def _add_special_phrases(self, cursor: Cursor,
                               new_phrases: Set[Tuple[str, str, str, str]],
                               existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
@@ -569,10 +501,9 @@ class ICUNameAnalyzer(AbstractAnalyzer):
  
          return added
  
  
          return added
  
-
      def _remove_special_phrases(self, cursor: Cursor,
      def _remove_special_phrases(self, cursor: Cursor,
-                             new_phrases: Set[Tuple[str, str, str, str]],
-                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
+                                new_phrases: Set[Tuple[str, str, str, str]],
+                                existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
          """ Remove all phrases from the database that are no longer in the
              new phrase list.
          """
          """ Remove all phrases from the database that are no longer in the
              new phrase list.
          """
@@ -588,7 +519,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):
  
          return len(to_delete)
  
  
          return len(to_delete)
  
-
      def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
          """ Add default names for the given country to the search index.
          """
      def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
          """ Add default names for the given country to the search index.
          """
@@ -600,7 +530,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                                       self.sanitizer.process_names(info)[0],
                                       internal=True)
  
                                       self.sanitizer.process_names(info)[0],
                                       internal=True)
  
-
      def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                  internal: bool = False) -> None:
          """ Add names for the given country from an already sanitized
      def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                  internal: bool = False) -> None:
          """ Add names for the given country from an already sanitized
@@ -652,7 +581,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                            """
                  cur.execute(sql, (country_code, list(new_tokens)))
  
                            """
                  cur.execute(sql, (country_code, list(new_tokens)))
  
-
      def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
          """ Determine tokenizer information about the given place.
  
      def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
          """ Determine tokenizer information about the given place.
  
@@ -675,7 +603,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):
  
          return token_info.to_dict()
  
  
          return token_info.to_dict()
  
-
      def _process_place_address(self, token_info: '_TokenInfo',
                                 address: Sequence[PlaceName]) -> None:
          for item in address:
      def _process_place_address(self, token_info: '_TokenInfo',
                                 address: Sequence[PlaceName]) -> None:
          for item in address:
@@ -688,12 +615,11 @@ class ICUNameAnalyzer(AbstractAnalyzer):
              elif item.kind == 'place':
                  if not item.suffix:
                      token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
              elif item.kind == 'place':
                  if not item.suffix:
                      token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
-            elif not item.kind.startswith('_') and not item.suffix and \
-                 item.kind not in ('country', 'full', 'inclusion'):
+            elif (not item.kind.startswith('_') and not item.suffix and
+                  item.kind not in ('country', 'full', 'inclusion')):
                  token_info.add_address_term(item.kind,
                                              itertools.chain(*self._compute_name_tokens([item])))
  
                  token_info.add_address_term(item.kind,
                                              itertools.chain(*self._compute_name_tokens([item])))
  
-
      def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
          """ Normalize the housenumber and return the word token and the
              canonical form.
      def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
          """ Normalize the housenumber and return the word token and the
              canonical form.
@@ -729,7 +655,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):
  
          return result
  
  
          return result
  
-
      def _retrieve_full_tokens(self, name: str) -> List[int]:
          """ Get the full name token for the given name, if it exists.
              The name is only retrieved for the standard analyser.
      def _retrieve_full_tokens(self, name: str) -> List[int]:
          """ Get the full name token for the given name, if it exists.
              The name is only retrieved for the standard analyser.
@@ -750,7 +675,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):
  
          return full
  
  
          return full
  
-
      def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
          """ Computes the full name and partial name tokens for the given
              dictionary of names.
      def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
          """ Computes the full name and partial name tokens for the given
              dictionary of names.
@@ -788,7 +712,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):
  
          return full_tokens, partial_tokens
  
  
          return full_tokens, partial_tokens
  
-
      def _add_postcode(self, item: PlaceName) -> Optional[str]:
          """ Make sure the normalized postcode is present in the word table.
          """
      def _add_postcode(self, item: PlaceName) -> Optional[str]:
          """ Make sure the normalized postcode is present in the word table.
          """
@@ -836,11 +759,9 @@ class _TokenInfo:
          self.address_tokens: Dict[str, str] = {}
          self.postcode: Optional[str] = None
  
          self.address_tokens: Dict[str, str] = {}
          self.postcode: Optional[str] = None
  
-
      def _mk_array(self, tokens: Iterable[Any]) -> str:
          return f"{{{','.join((str(s) for s in tokens))}}}"
  
      def _mk_array(self, tokens: Iterable[Any]) -> str:
          return f"{{{','.join((str(s) for s in tokens))}}}"
  
-
      def to_dict(self) -> Dict[str, Any]:
          """ Return the token information in database importable format.
          """
      def to_dict(self) -> Dict[str, Any]:
          """ Return the token information in database importable format.
          """
@@ -867,13 +788,11 @@ class _TokenInfo:
  
          return out
  
  
          return out
  
-
      def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
          """ Adds token information for the normalised names.
          """
          self.names = self._mk_array(itertools.chain(fulls, partials))
  
      def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
          """ Adds token information for the normalised names.
          """
          self.names = self._mk_array(itertools.chain(fulls, partials))
  
-
      def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
          """ Extract housenumber information from a list of normalised
              housenumbers.
      def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
          """ Extract housenumber information from a list of normalised
              housenumbers.
@@ -883,7 +802,6 @@ class _TokenInfo:
              self.housenumbers.add(hnr)
              self.housenumber_tokens.add(token)
  
              self.housenumbers.add(hnr)
              self.housenumber_tokens.add(token)
  
-
      def add_street(self, tokens: Iterable[int]) -> None:
          """ Add addr:street match terms.
          """
      def add_street(self, tokens: Iterable[int]) -> None:
          """ Add addr:street match terms.
          """
@@ -891,13 +809,11 @@ class _TokenInfo:
              self.street_tokens = set()
          self.street_tokens.update(tokens)
  
              self.street_tokens = set()
          self.street_tokens.update(tokens)
  
-
      def add_place(self, tokens: Iterable[int]) -> None:
          """ Add addr:place search and match terms.
          """
          self.place_tokens.update(tokens)
  
      def add_place(self, tokens: Iterable[int]) -> None:
          """ Add addr:place search and match terms.
          """
          self.place_tokens.update(tokens)
  
-
      def add_address_term(self, key: str, partials: Iterable[int]) -> None:
          """ Add additional address terms.
          """
      def add_address_term(self, key: str, partials: Iterable[int]) -> None:
          """ Add additional address terms.
          """