git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tools/tiger_data.py
reintroduce cutoffs when searching for very frequent words
[nominatim.git] / nominatim / tools / tiger_data.py
index 9903ea2bb8e68c43ff25533dfcc9d1d0905c74da..70cecae58648997f0bc6a81a863fee7b420432f5 100644 (file)
@@ -7,17 +7,23 @@
 """
 Functions for importing tiger data and handling tarball and directory files
 """
+from typing import Any, TextIO, List, Union, cast
 import csv
 import io
 import logging
 import os
 import tarfile
 
+from psycopg2.extras import Json
+
+from nominatim.config import Configuration
 from nominatim.db.connection import connect
 from nominatim.db.async_connection import WorkerPool
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.errors import UsageError
 from nominatim.data.place_info import PlaceInfo
+from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
+from nominatim.tools import freeze
 
 LOG = logging.getLogger()
 
@@ -26,9 +32,9 @@ class TigerInput:
         either be in a directory or gzipped together in a tar file.
     """
 
-    def __init__(self, data_dir):
+    def __init__(self, data_dir: str) -> None:
         self.tar_handle = None
-        self.files = []
+        self.files: List[Union[str, tarfile.TarInfo]] = []
 
         if data_dir.endswith('.tar.gz'):
             try:
@@ -48,33 +54,36 @@ class TigerInput:
             LOG.warning("Tiger data import selected but no files found at %s", data_dir)
 
 
-    def __enter__(self):
+    def __enter__(self) -> 'TigerInput':
         return self
 
 
-    def __exit__(self, exc_type, exc_val, exc_tb):
+    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
         if self.tar_handle:
             self.tar_handle.close()
             self.tar_handle = None
 
 
-    def next_file(self):
+    def next_file(self) -> TextIO:
         """ Return a file handle to the next file to be processed.
             Raises an IndexError if there is no file left.
         """
         fname = self.files.pop(0)
 
         if self.tar_handle is not None:
-            return io.TextIOWrapper(self.tar_handle.extractfile(fname))
+            extracted = self.tar_handle.extractfile(fname)
+            assert extracted is not None
+            return io.TextIOWrapper(extracted)
 
-        return open(fname, encoding='utf-8')
+        return open(cast(str, fname), encoding='utf-8')
 
 
-    def __len__(self):
+    def __len__(self) -> int:
         return len(self.files)
 
 
-def handle_threaded_sql_statements(pool, fd, analyzer):
+def handle_threaded_sql_statements(pool: WorkerPool, fd: TextIO,
+                                   analyzer: AbstractAnalyzer) -> None:
     """ Handles sql statement with multiplexing
     """
     lines = 0
@@ -87,7 +96,7 @@ def handle_threaded_sql_statements(pool, fd, analyzer):
             address = dict(street=row['street'], postcode=row['postcode'])
             args = ('SRID=4326;' + row['geometry'],
                     int(row['from']), int(row['to']), row['interpolation'],
-                    PlaceInfo({'address': address}).analyze(analyzer),
+                    Json(analyzer.process_place(PlaceInfo({'address': address}))),
                     analyzer.normalize_postcode(row['postcode']))
         except ValueError:
             continue
@@ -99,14 +108,22 @@ def handle_threaded_sql_statements(pool, fd, analyzer):
             lines = 0
 
 
-def add_tiger_data(data_dir, config, threads, tokenizer):
+def add_tiger_data(data_dir: str, config: Configuration, threads: int,
+                   tokenizer: AbstractTokenizer) -> int:
     """ Import tiger data from directory or tar file `data dir`.
     """
     dsn = config.get_libpq_dsn()
 
+    with connect(dsn) as conn:
+        is_frozen = freeze.is_frozen(conn)
+        conn.close()
+
+        if is_frozen:
+            raise UsageError("Tiger cannot be imported when database frozen (Github issue #3048)")
+
     with TigerInput(data_dir) as tar:
         if not tar:
-            return
+            return 1
 
         with connect(dsn) as conn:
             sql = SQLPreprocessor(conn, config)
@@ -128,3 +145,5 @@ def add_tiger_data(data_dir, config, threads, tokenizer):
     with connect(dsn) as conn:
         sql = SQLPreprocessor(conn, config)
         sql.run_sql_file(conn, 'tiger_import_finish.sql')
+
+    return 0