]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tools/postcodes.py
reintroduce cutoffs when searching for very frequent words
[nominatim.git] / nominatim / tools / postcodes.py
index 26b96099a9f8f5a767525e843905a24b282935c1..7171e25d169d0af7b625430fafefdfba183c4df4 100644 (file)
@@ -8,7 +8,9 @@
 Functions for importing, updating and otherwise maintaining the table
 of artificial postcode centroids.
 """
+from typing import Optional, Tuple, Dict, List, TextIO
 from collections import defaultdict
+from pathlib import Path
 import csv
 import gzip
 import logging
@@ -16,39 +18,41 @@ from math import isfinite
 
 from psycopg2 import sql as pysql
 
-from nominatim.db.connection import connect
+from nominatim.db.connection import connect, Connection
 from nominatim.utils.centroid import PointsCentroid
-from nominatim.data.postcode_format import PostcodeFormatter
+from nominatim.data.postcode_format import PostcodeFormatter, CountryPostcodeMatcher
+from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
 LOG = logging.getLogger()
 
-def _to_float(num, max_value):
+def _to_float(numstr: str, max_value: float) -> float:
     """ Convert the number in string into a float. The number is expected
         to be in the range of [-max_value, max_value]. Otherwise rises a
         ValueError.
     """
-    num = float(num)
+    num = float(numstr)
     if not isfinite(num) or num <= -max_value or num >= max_value:
         raise ValueError()
 
     return num
 
-class _CountryPostcodesCollector:
+class _PostcodeCollector:
     """ Collector for postcodes of a single country.
     """
 
-    def __init__(self, country, matcher):
+    def __init__(self, country: str, matcher: Optional[CountryPostcodeMatcher]):
         self.country = country
         self.matcher = matcher
-        self.collected = defaultdict(PointsCentroid)
-        self.normalization_cache = None
+        self.collected: Dict[str, PointsCentroid] = defaultdict(PointsCentroid)
+        self.normalization_cache: Optional[Tuple[str, Optional[str]]] = None
 
 
-    def add(self, postcode, x, y):
+    def add(self, postcode: str, x: float, y: float) -> None:
         """ Add the given postcode to the collection cache. If the postcode
             already existed, it is overwritten with the new centroid.
         """
         if self.matcher is not None:
+            normalized: Optional[str]
             if self.normalization_cache and self.normalization_cache[0] == postcode:
                 normalized = self.normalization_cache[1]
             else:
@@ -60,7 +64,7 @@ class _CountryPostcodesCollector:
                 self.collected[normalized] += (x, y)
 
 
-    def commit(self, conn, analyzer, project_dir):
+    def commit(self, conn: Connection, analyzer: AbstractAnalyzer, project_dir: Path) -> None:
         """ Update postcodes for the country from the postcodes selected so far
             as well as any externally supplied postcodes.
         """
@@ -94,7 +98,8 @@ class _CountryPostcodesCollector:
                               """).format(pysql.Literal(self.country)), to_update)
 
 
-    def _compute_changes(self, conn):
+    def _compute_changes(self, conn: Connection) \
+          -> Tuple[List[Tuple[str, float, float]], List[str], List[Tuple[str, float, float]]]:
         """ Compute which postcodes from the collected postcodes have to be
             added or modified and which from the location_postcode table
             have to be deleted.
@@ -116,12 +121,12 @@ class _CountryPostcodesCollector:
                     to_delete.append(postcode)
 
         to_add = [(k, *v.centroid()) for k, v in self.collected.items()]
-        self.collected = None
+        self.collected = defaultdict(PointsCentroid)
 
         return to_add, to_delete, to_update
 
 
-    def _update_from_external(self, analyzer, project_dir):
+    def _update_from_external(self, analyzer: AbstractAnalyzer, project_dir: Path) -> None:
         """ Look for an external postcode file for the active country in
             the project directory and add missing postcodes when found.
         """
@@ -151,7 +156,7 @@ class _CountryPostcodesCollector:
             csvfile.close()
 
 
-    def _open_external(self, project_dir):
+    def _open_external(self, project_dir: Path) -> Optional[TextIO]:
         fname = project_dir / f'{self.country}_postcodes.csv'
 
         if fname.is_file():
@@ -167,7 +172,7 @@ class _CountryPostcodesCollector:
         return None
 
 
-def update_postcodes(dsn, project_dir, tokenizer):
+def update_postcodes(dsn: str, project_dir: Path, tokenizer: AbstractTokenizer) -> None:
     """ Update the table of artificial postcodes.
 
         Computes artificial postcode centroids from the placex table,
@@ -186,17 +191,17 @@ def update_postcodes(dsn, project_dir, tokenizer):
             # Recompute the list of valid postcodes from placex.
             with conn.cursor(name="placex_postcodes") as cur:
                 cur.execute("""
-                SELECT cc as country_code, pc, ST_X(centroid), ST_Y(centroid)
+                SELECT cc, pc, ST_X(centroid), ST_Y(centroid)
                 FROM (SELECT
                         COALESCE(plx.country_code,
                                  get_country_code(ST_Centroid(pl.geometry))) as cc,
-                        token_normalized_postcode(pl.address->'postcode') as pc,
+                        pl.address->'postcode' as pc,
                         COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid
                       FROM place AS pl LEFT OUTER JOIN placex AS plx
                              ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
                     WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx
                 WHERE pc IS NOT null AND cc IS NOT null
-                ORDER BY country_code, pc""")
+                ORDER BY cc, pc""")
 
                 collector = None
 
@@ -204,7 +209,7 @@ def update_postcodes(dsn, project_dir, tokenizer):
                     if collector is None or country != collector.country:
                         if collector is not None:
                             collector.commit(conn, analyzer, project_dir)
-                        collector = _CountryPostcodesCollector(country, matcher.get_matcher(country))
+                        collector = _PostcodeCollector(country, matcher.get_matcher(country))
                         todo_countries.discard(country)
                     collector.add(postcode, x, y)
 
@@ -213,13 +218,14 @@ def update_postcodes(dsn, project_dir, tokenizer):
 
             # Now handle any countries that are only in the postcode table.
             for country in todo_countries:
-                _CountryPostcodesCollector(country, matcher.get_matcher(country)).commit(conn, analyzer, project_dir)
+                fmt = matcher.get_matcher(country)
+                _PostcodeCollector(country, fmt).commit(conn, analyzer, project_dir)
 
             conn.commit()
 
         analyzer.update_postcodes_from_db()
 
-def can_compute(dsn):
+def can_compute(dsn: str) -> bool:
     """
         Check that the place table exists so that
         postcodes can be computed.