1 # SPDX-License-Identifier: GPL-2.0-only
3 # This file is part of Nominatim. (https://nominatim.org)
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
8 Functions for importing, updating and otherwise maintaining the table
9 of artificial postcode centroids.
11 from collections import defaultdict
15 from math import isfinite
17 from psycopg2 import sql as pysql
19 from nominatim.db.connection import connect
20 from nominatim.utils.centroid import PointsCentroid
21 from nominatim.data.postcode_format import PostcodeFormatter
23 LOG = logging.getLogger()
25 def _to_float(num, max_value):
26 """ Convert the number in string into a float. The number is expected
27 to be in the range of [-max_value, max_value]. Otherwise rises a
31 if not isfinite(num) or num <= -max_value or num >= max_value:
36 class _CountryPostcodesCollector:
37 """ Collector for postcodes of a single country.
40 def __init__(self, country):
41 self.country = country
42 self.collected = defaultdict(PointsCentroid)
45 def add(self, postcode, x, y):
46 """ Add the given postcode to the collection cache. If the postcode
47 already existed, it is overwritten with the new centroid.
49 self.collected[postcode] += (x, y)
52 def commit(self, conn, analyzer, project_dir):
53 """ Update postcodes for the country from the postcodes selected so far
54 as well as any externally supplied postcodes.
56 self._update_from_external(analyzer, project_dir)
57 to_add, to_delete, to_update = self._compute_changes(conn)
59 LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).",
60 self.country, len(to_add), len(to_delete), len(to_update))
62 with conn.cursor() as cur:
65 """INSERT INTO location_postcode
66 (place_id, indexed_status, country_code,
67 postcode, geometry) VALUES %s""",
69 template=pysql.SQL("""(nextval('seq_place'), 1, {},
70 %s, 'SRID=4326;POINT(%s %s)')
71 """).format(pysql.Literal(self.country)))
73 cur.execute("""DELETE FROM location_postcode
74 WHERE country_code = %s and postcode = any(%s)
75 """, (self.country, to_delete))
78 pysql.SQL("""UPDATE location_postcode
79 SET indexed_status = 2,
80 geometry = ST_SetSRID(ST_Point(v.x, v.y), 4326)
81 FROM (VALUES %s) AS v (pc, x, y)
82 WHERE country_code = {} and postcode = pc
83 """).format(pysql.Literal(self.country)), to_update)
86 def _compute_changes(self, conn):
87 """ Compute which postcodes from the collected postcodes have to be
88 added or modified and which from the location_postcode table
93 with conn.cursor() as cur:
94 cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
95 FROM location_postcode
96 WHERE country_code = %s""",
98 for postcode, x, y in cur:
99 pcobj = self.collected.pop(postcode, None)
101 newx, newy = pcobj.centroid()
102 if (x - newx) > 0.0000001 or (y - newy) > 0.0000001:
103 to_update.append((postcode, newx, newy))
105 to_delete.append(postcode)
107 to_add = [(k, *v.centroid()) for k, v in self.collected.items()]
108 self.collected = None
110 return to_add, to_delete, to_update
113 def _update_from_external(self, analyzer, project_dir):
114 """ Look for an external postcode file for the active country in
115 the project directory and add missing postcodes when found.
117 csvfile = self._open_external(project_dir)
122 reader = csv.DictReader(csvfile)
124 if 'postcode' not in row or 'lat' not in row or 'lon' not in row:
125 LOG.warning("Bad format for external postcode file for country '%s'."
126 " Ignored.", self.country)
128 postcode = analyzer.normalize_postcode(row['postcode'])
129 if postcode not in self.collected:
131 # Do float conversation separately, it might throw
132 centroid = (_to_float(row['lon'], 180),
133 _to_float(row['lat'], 90))
134 self.collected[postcode] += centroid
136 LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
137 row['lat'], row['lon'], self.country)
143 def _open_external(self, project_dir):
144 fname = project_dir / f'{self.country}_postcodes.csv'
147 LOG.info("Using external postcode file '%s'.", fname)
148 return open(fname, 'r', encoding='utf-8')
150 fname = project_dir / f'{self.country}_postcodes.csv.gz'
153 LOG.info("Using external postcode file '%s'.", fname)
154 return gzip.open(fname, 'rt')
159 def update_postcodes(dsn, project_dir, tokenizer):
160 """ Update the table of artificial postcodes.
162 Computes artificial postcode centroids from the placex table,
163 potentially enhances it with external data and then updates the
164 postcodes in the table 'location_postcode'.
166 matcher = PostcodeFormatter()
167 with tokenizer.name_analyzer() as analyzer:
168 with connect(dsn) as conn:
169 # First get the list of countries that currently have postcodes.
170 # (Doing this before starting to insert, so it is fast on import.)
171 with conn.cursor() as cur:
172 cur.execute("SELECT DISTINCT country_code FROM location_postcode")
173 todo_countries = set((row[0] for row in cur))
175 # Recompute the list of valid postcodes from placex.
176 with conn.cursor(name="placex_postcodes") as cur:
178 SELECT cc as country_code, pc, ST_X(centroid), ST_Y(centroid)
180 COALESCE(plx.country_code,
181 get_country_code(ST_Centroid(pl.geometry))) as cc,
182 token_normalized_postcode(pl.address->'postcode') as pc,
183 COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid
184 FROM place AS pl LEFT OUTER JOIN placex AS plx
185 ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
186 WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx
187 WHERE pc IS NOT null AND cc IS NOT null
188 ORDER BY country_code, pc""")
192 for country, postcode, x, y in cur:
193 if collector is None or country != collector.country:
194 if collector is not None:
195 collector.commit(conn, analyzer, project_dir)
196 collector = _CountryPostcodesCollector(country)
197 todo_countries.discard(country)
198 match = matcher.match(country, postcode)
200 collector.add(matcher.normalize(country, match), x, y)
202 if collector is not None:
203 collector.commit(conn, analyzer, project_dir)
205 # Now handle any countries that are only in the postcode table.
206 for country in todo_countries:
207 _CountryPostcodesCollector(country).commit(conn, analyzer, project_dir)
211 analyzer.update_postcodes_from_db()
213 def can_compute(dsn):
215 Check that the place table exists so that
216 postcodes can be computed.
218 with connect(dsn) as conn:
219 return conn.table_exists('place')