]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tools/postcodes.py
Merge pull request #2761 from lonvia/repair-index-analysis
[nominatim.git] / nominatim / tools / postcodes.py
index 7a0a0e07b4056a85f114ae5e3985584de39c8995..9c66719b5fe1ce55573985f8a653876c093102c6 100644 (file)
@@ -1,31 +1,63 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
 """
 Functions for importing, updating and otherwise maintaining the table
 of artificial postcode centroids.
 """
 """
 Functions for importing, updating and otherwise maintaining the table
 of artificial postcode centroids.
 """
+from collections import defaultdict
 import csv
 import gzip
 import logging
 import csv
 import gzip
 import logging
+from math import isfinite
 
 
-from psycopg2.extras import execute_values
+from psycopg2 import sql as pysql
 
 from nominatim.db.connection import connect
 
 from nominatim.db.connection import connect
+from nominatim.utils.centroid import PointsCentroid
+from nominatim.data.postcode_format import PostcodeFormatter
 
 LOG = logging.getLogger()
 
 
 LOG = logging.getLogger()
 
-class _CountryPostcodesCollector:
+def _to_float(num, max_value):
+    """ Convert the number in string into a float. The number is expected
+        to be in the range of [-max_value, max_value]. Otherwise rises a
+        ValueError.
+    """
+    num = float(num)
+    if not isfinite(num) or num <= -max_value or num >= max_value:
+        raise ValueError()
+
+    return num
+
+class _PostcodeCollector:
     """ Collector for postcodes of a single country.
     """
 
     """ Collector for postcodes of a single country.
     """
 
-    def __init__(self, country):
+    def __init__(self, country, matcher):
         self.country = country
         self.country = country
-        self.collected = dict()
+        self.matcher = matcher
+        self.collected = defaultdict(PointsCentroid)
+        self.normalization_cache = None
 
 
     def add(self, postcode, x, y):
         """ Add the given postcode to the collection cache. If the postcode
             already existed, it is overwritten with the new centroid.
         """
 
 
     def add(self, postcode, x, y):
         """ Add the given postcode to the collection cache. If the postcode
             already existed, it is overwritten with the new centroid.
         """
-        self.collected[postcode] = (x, y)
+        if self.matcher is not None:
+            if self.normalization_cache and self.normalization_cache[0] == postcode:
+                normalized = self.normalization_cache[1]
+            else:
+                match = self.matcher.match(postcode)
+                normalized = self.matcher.normalize(match) if match else None
+                self.normalization_cache = (postcode, normalized)
+
+            if normalized:
+                self.collected[normalized] += (x, y)
 
 
     def commit(self, conn, analyzer, project_dir):
 
 
     def commit(self, conn, analyzer, project_dir):
@@ -35,54 +67,56 @@ class _CountryPostcodesCollector:
         self._update_from_external(analyzer, project_dir)
         to_add, to_delete, to_update = self._compute_changes(conn)
 
         self._update_from_external(analyzer, project_dir)
         to_add, to_delete, to_update = self._compute_changes(conn)
 
+        LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).",
+                 self.country, len(to_add), len(to_delete), len(to_update))
+
         with conn.cursor() as cur:
             if to_add:
         with conn.cursor() as cur:
             if to_add:
-                execute_values(cur,
-                               """INSERT INTO location_postcodes
-                                      (place_id, indexed_status, countrycode,
-                                       postcode, geometry) VALUES %s""",
-                               to_add,
-                               template="""(nextval('seq_place'), 1, '{}',
-                                           %s, 'SRID=4326;POINT(%s %s)')
-                                        """.format(self.country))
+                cur.execute_values(
+                    """INSERT INTO location_postcode
+                         (place_id, indexed_status, country_code,
+                          postcode, geometry) VALUES %s""",
+                    to_add,
+                    template=pysql.SQL("""(nextval('seq_place'), 1, {},
+                                          %s, 'SRID=4326;POINT(%s %s)')
+                                       """).format(pysql.Literal(self.country)))
             if to_delete:
             if to_delete:
-                cur.execute("""DELETE FROM location_postcodes
+                cur.execute("""DELETE FROM location_postcode
                                WHERE country_code = %s and postcode = any(%s)
                             """, (self.country, to_delete))
             if to_update:
                                WHERE country_code = %s and postcode = any(%s)
                             """, (self.country, to_delete))
             if to_update:
-                execute_values(cur,
-                               """UPDATE location_postcodes
-                                  SET indexed_status = 2,
-                                      geometry = ST_SetSRID(ST_Point(v.x, v.y), 4326)
-                                  FROM (VALUES %s) AS v (pc, x, y)
-                                  WHERE country_code = '{}' and postcode = pc
-                               """.format(self.country),
-                               to_update)
+                cur.execute_values(
+                    pysql.SQL("""UPDATE location_postcode
+                                 SET indexed_status = 2,
+                                     geometry = ST_SetSRID(ST_Point(v.x, v.y), 4326)
+                                 FROM (VALUES %s) AS v (pc, x, y)
+                                 WHERE country_code = {} and postcode = pc
+                              """).format(pysql.Literal(self.country)), to_update)
 
 
     def _compute_changes(self, conn):
         """ Compute which postcodes from the collected postcodes have to be
 
 
     def _compute_changes(self, conn):
         """ Compute which postcodes from the collected postcodes have to be
-            added or modified and which from the location_postcodes table
+            added or modified and which from the location_postcode table
             have to be deleted.
         """
         to_update = []
         to_delete = []
         with conn.cursor() as cur:
             cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
             have to be deleted.
         """
         to_update = []
         to_delete = []
         with conn.cursor() as cur:
             cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
-                           FROM location_postcodes
+                           FROM location_postcode
                            WHERE country_code = %s""",
                         (self.country, ))
             for postcode, x, y in cur:
                            WHERE country_code = %s""",
                         (self.country, ))
             for postcode, x, y in cur:
-                oldx, oldy = self.collected.pop(postcode, (None, None))
-                if oldx is not None:
-                    dist = (x - oldx)**2 + (y - oldy)**2
-                    if dist > 0.000001:
-                        to_update.append(postcode, x, y)
+                pcobj = self.collected.pop(postcode, None)
+                if pcobj:
+                    newx, newy = pcobj.centroid()
+                    if (x - newx) > 0.0000001 or (y - newy) > 0.0000001:
+                        to_update.append((postcode, newx, newy))
                 else:
                     to_delete.append(postcode)
 
                 else:
                     to_delete.append(postcode)
 
-        to_add = [(k, v[0], v[1]) for k, v in self.collected.items()]
-        self.collected = []
+        to_add = [(k, *v.centroid()) for k, v in self.collected.items()]
+        self.collected = None
 
         return to_add, to_delete, to_update
 
 
         return to_add, to_delete, to_update
 
@@ -105,7 +139,10 @@ class _CountryPostcodesCollector:
                 postcode = analyzer.normalize_postcode(row['postcode'])
                 if postcode not in self.collected:
                     try:
                 postcode = analyzer.normalize_postcode(row['postcode'])
                 if postcode not in self.collected:
                     try:
-                        self.collected[postcode] = (float(row['lon'], float(row['lat'])))
+                        # Do float conversation separately, it might throw
+                        centroid = (_to_float(row['lon'], 180),
+                                    _to_float(row['lat'], 90))
+                        self.collected[postcode] += centroid
                     except ValueError:
                         LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
                                     row['lat'], row['lon'], self.country)
                     except ValueError:
                         LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
                                     row['lat'], row['lon'], self.country)
@@ -115,13 +152,13 @@ class _CountryPostcodesCollector:
 
 
     def _open_external(self, project_dir):
 
 
     def _open_external(self, project_dir):
-        fname = project_dir / '{}_postcodes.csv'.format(self.country)
+        fname = project_dir / f'{self.country}_postcodes.csv'
 
         if fname.is_file():
             LOG.info("Using external postcode file '%s'.", fname)
 
         if fname.is_file():
             LOG.info("Using external postcode file '%s'.", fname)
-            return open(fname, 'r')
+            return open(fname, 'r', encoding='utf-8')
 
 
-        fname = project_dir / '{}_postcodes.csv.gz'.format(self.country)
+        fname = project_dir / f'{self.country}_postcodes.csv.gz'
 
         if fname.is_file():
             LOG.info("Using external postcode file '%s'.", fname)
 
         if fname.is_file():
             LOG.info("Using external postcode file '%s'.", fname)
@@ -137,19 +174,29 @@ def update_postcodes(dsn, project_dir, tokenizer):
         potentially enhances it with external data and then updates the
         postcodes in the table 'location_postcode'.
     """
         potentially enhances it with external data and then updates the
         postcodes in the table 'location_postcode'.
     """
+    matcher = PostcodeFormatter()
     with tokenizer.name_analyzer() as analyzer:
         with connect(dsn) as conn:
     with tokenizer.name_analyzer() as analyzer:
         with connect(dsn) as conn:
-            with conn.cursor("placex_postcodes") as cur:
-                cur.execute("""SELECT country_code, pc, ST_X(centroid), ST_Y(centroid)
-                               FROM (
-                                 SELECT country_code,
-                                        token_normalized_postcode(address->'postcode') as pc,
-                                        ST_Centroid(ST_Collect(ST_Centroid(geometry))) as centroid
-                                 FROM placex
-                                 WHERE address ? 'postcode' and geometry IS NOT null
-                                 GROUP BY country_code, pc) xx
-                               WHERE pc is not null
-                               ORDER BY country_code, pc""")
+            # First get the list of countries that currently have postcodes.
+            # (Doing this before starting to insert, so it is fast on import.)
+            with conn.cursor() as cur:
+                cur.execute("SELECT DISTINCT country_code FROM location_postcode")
+                todo_countries = set((row[0] for row in cur))
+
+            # Recompute the list of valid postcodes from placex.
+            with conn.cursor(name="placex_postcodes") as cur:
+                cur.execute("""
+                SELECT cc, pc, ST_X(centroid), ST_Y(centroid)
+                FROM (SELECT
+                        COALESCE(plx.country_code,
+                                 get_country_code(ST_Centroid(pl.geometry))) as cc,
+                        pl.address->'postcode' as pc,
+                        COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid
+                      FROM place AS pl LEFT OUTER JOIN placex AS plx
+                             ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
+                    WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx
+                WHERE pc IS NOT null AND cc IS NOT null
+                ORDER BY cc, pc""")
 
                 collector = None
 
 
                 collector = None
 
@@ -157,12 +204,26 @@ def update_postcodes(dsn, project_dir, tokenizer):
                     if collector is None or country != collector.country:
                         if collector is not None:
                             collector.commit(conn, analyzer, project_dir)
                     if collector is None or country != collector.country:
                         if collector is not None:
                             collector.commit(conn, analyzer, project_dir)
-                        collector = _CountryPostcodesCollector(country)
+                        collector = _PostcodeCollector(country, matcher.get_matcher(country))
+                        todo_countries.discard(country)
                     collector.add(postcode, x, y)
 
                 if collector is not None:
                     collector.commit(conn, analyzer, project_dir)
 
                     collector.add(postcode, x, y)
 
                 if collector is not None:
                     collector.commit(conn, analyzer, project_dir)
 
+            # Now handle any countries that are only in the postcode table.
+            for country in todo_countries:
+                fmt = matcher.get_matcher(country)
+                _PostcodeCollector(country, fmt).commit(conn, analyzer, project_dir)
+
             conn.commit()
 
             conn.commit()
 
-        analyzer.add_postcodes_from_db()
+        analyzer.update_postcodes_from_db()
+
+def can_compute(dsn):
+    """
+        Check that the place table exists so that
+        postcodes can be computed.
+    """
+    with connect(dsn) as conn:
+        return conn.table_exists('place')