1 # SPDX-License-Identifier: GPL-2.0-only
3 # This file is part of Nominatim. (https://nominatim.org)
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
8 Functions for importing, updating and otherwise maintaining the table
9 of artificial postcode centroids.
11 from collections import defaultdict
15 from math import isfinite
17 from psycopg2 import sql as pysql
19 from nominatim.db.connection import connect
20 from nominatim.utils.centroid import PointsCentroid
21 from nominatim.data.postcode_format import PostcodeFormatter
23 LOG = logging.getLogger()
25 def _to_float(num, max_value):
26 """ Convert the number in string into a float. The number is expected
27 to be in the range of [-max_value, max_value]. Otherwise rises a
31 if not isfinite(num) or num <= -max_value or num >= max_value:
36 class _PostcodeCollector:
37 """ Collector for postcodes of a single country.
40 def __init__(self, country, matcher):
41 self.country = country
42 self.matcher = matcher
43 self.collected = defaultdict(PointsCentroid)
44 self.normalization_cache = None
47 def add(self, postcode, x, y):
48 """ Add the given postcode to the collection cache. If the postcode
49 already existed, it is overwritten with the new centroid.
51 if self.matcher is not None:
52 if self.normalization_cache and self.normalization_cache[0] == postcode:
53 normalized = self.normalization_cache[1]
55 match = self.matcher.match(postcode)
56 normalized = self.matcher.normalize(match) if match else None
57 self.normalization_cache = (postcode, normalized)
60 self.collected[normalized] += (x, y)
63 def commit(self, conn, analyzer, project_dir):
64 """ Update postcodes for the country from the postcodes selected so far
65 as well as any externally supplied postcodes.
67 self._update_from_external(analyzer, project_dir)
68 to_add, to_delete, to_update = self._compute_changes(conn)
70 LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).",
71 self.country, len(to_add), len(to_delete), len(to_update))
73 with conn.cursor() as cur:
76 """INSERT INTO location_postcode
77 (place_id, indexed_status, country_code,
78 postcode, geometry) VALUES %s""",
80 template=pysql.SQL("""(nextval('seq_place'), 1, {},
81 %s, 'SRID=4326;POINT(%s %s)')
82 """).format(pysql.Literal(self.country)))
84 cur.execute("""DELETE FROM location_postcode
85 WHERE country_code = %s and postcode = any(%s)
86 """, (self.country, to_delete))
89 pysql.SQL("""UPDATE location_postcode
90 SET indexed_status = 2,
91 geometry = ST_SetSRID(ST_Point(v.x, v.y), 4326)
92 FROM (VALUES %s) AS v (pc, x, y)
93 WHERE country_code = {} and postcode = pc
94 """).format(pysql.Literal(self.country)), to_update)
97 def _compute_changes(self, conn):
98 """ Compute which postcodes from the collected postcodes have to be
99 added or modified and which from the location_postcode table
104 with conn.cursor() as cur:
105 cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
106 FROM location_postcode
107 WHERE country_code = %s""",
109 for postcode, x, y in cur:
110 pcobj = self.collected.pop(postcode, None)
112 newx, newy = pcobj.centroid()
113 if (x - newx) > 0.0000001 or (y - newy) > 0.0000001:
114 to_update.append((postcode, newx, newy))
116 to_delete.append(postcode)
118 to_add = [(k, *v.centroid()) for k, v in self.collected.items()]
119 self.collected = None
121 return to_add, to_delete, to_update
124 def _update_from_external(self, analyzer, project_dir):
125 """ Look for an external postcode file for the active country in
126 the project directory and add missing postcodes when found.
128 csvfile = self._open_external(project_dir)
133 reader = csv.DictReader(csvfile)
135 if 'postcode' not in row or 'lat' not in row or 'lon' not in row:
136 LOG.warning("Bad format for external postcode file for country '%s'."
137 " Ignored.", self.country)
139 postcode = analyzer.normalize_postcode(row['postcode'])
140 if postcode not in self.collected:
142 # Do float conversation separately, it might throw
143 centroid = (_to_float(row['lon'], 180),
144 _to_float(row['lat'], 90))
145 self.collected[postcode] += centroid
147 LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
148 row['lat'], row['lon'], self.country)
154 def _open_external(self, project_dir):
155 fname = project_dir / f'{self.country}_postcodes.csv'
158 LOG.info("Using external postcode file '%s'.", fname)
159 return open(fname, 'r', encoding='utf-8')
161 fname = project_dir / f'{self.country}_postcodes.csv.gz'
164 LOG.info("Using external postcode file '%s'.", fname)
165 return gzip.open(fname, 'rt')
170 def update_postcodes(dsn, project_dir, tokenizer):
171 """ Update the table of artificial postcodes.
173 Computes artificial postcode centroids from the placex table,
174 potentially enhances it with external data and then updates the
175 postcodes in the table 'location_postcode'.
177 matcher = PostcodeFormatter()
178 with tokenizer.name_analyzer() as analyzer:
179 with connect(dsn) as conn:
180 # First get the list of countries that currently have postcodes.
181 # (Doing this before starting to insert, so it is fast on import.)
182 with conn.cursor() as cur:
183 cur.execute("SELECT DISTINCT country_code FROM location_postcode")
184 todo_countries = set((row[0] for row in cur))
186 # Recompute the list of valid postcodes from placex.
187 with conn.cursor(name="placex_postcodes") as cur:
189 SELECT cc, pc, ST_X(centroid), ST_Y(centroid)
191 COALESCE(plx.country_code,
192 get_country_code(ST_Centroid(pl.geometry))) as cc,
193 pl.address->'postcode' as pc,
194 COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid
195 FROM place AS pl LEFT OUTER JOIN placex AS plx
196 ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
197 WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx
198 WHERE pc IS NOT null AND cc IS NOT null
203 for country, postcode, x, y in cur:
204 if collector is None or country != collector.country:
205 if collector is not None:
206 collector.commit(conn, analyzer, project_dir)
207 collector = _PostcodeCollector(country, matcher.get_matcher(country))
208 todo_countries.discard(country)
209 collector.add(postcode, x, y)
211 if collector is not None:
212 collector.commit(conn, analyzer, project_dir)
214 # Now handle any countries that are only in the postcode table.
215 for country in todo_countries:
216 fmt = matcher.get_matcher(country)
217 _PostcodeCollector(country, fmt).commit(conn, analyzer, project_dir)
221 analyzer.update_postcodes_from_db()
223 def can_compute(dsn):
225 Check that the place table exists so that
226 postcodes can be computed.
228 with connect(dsn) as conn:
229 return conn.table_exists('place')