2 Functions for importing, updating and otherwise maintaining the table
3 of artificial postcode centroids.
8 from math import isfinite
10 from psycopg2.extras import execute_values
12 from nominatim.db.connection import connect
14 LOG = logging.getLogger()
16 def _to_float(num, max_value):
17 """ Convert the number in string into a float. The number is expected
18 to be in the range of [-max_value, max_value]. Otherwise rises a
22 if not isfinite(num) or num <= -max_value or num >= max_value:
27 class _CountryPostcodesCollector:
28 """ Collector for postcodes of a single country.
31 def __init__(self, country):
32 self.country = country
33 self.collected = dict()
36 def add(self, postcode, x, y):
37 """ Add the given postcode to the collection cache. If the postcode
38 already existed, it is overwritten with the new centroid.
40 self.collected[postcode] = (x, y)
43 def commit(self, conn, analyzer, project_dir):
44 """ Update postcodes for the country from the postcodes selected so far
45 as well as any externally supplied postcodes.
47 self._update_from_external(analyzer, project_dir)
48 to_add, to_delete, to_update = self._compute_changes(conn)
50 LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).",
51 self.country, len(to_add), len(to_delete), len(to_update))
53 with conn.cursor() as cur:
56 """INSERT INTO location_postcode
57 (place_id, indexed_status, country_code,
58 postcode, geometry) VALUES %s""",
60 template="""(nextval('seq_place'), 1, '{}',
61 %s, 'SRID=4326;POINT(%s %s)')
62 """.format(self.country))
64 cur.execute("""DELETE FROM location_postcode
65 WHERE country_code = %s and postcode = any(%s)
66 """, (self.country, to_delete))
69 """UPDATE location_postcode
70 SET indexed_status = 2,
71 geometry = ST_SetSRID(ST_Point(v.x, v.y), 4326)
72 FROM (VALUES %s) AS v (pc, x, y)
73 WHERE country_code = '{}' and postcode = pc
74 """.format(self.country),
78 def _compute_changes(self, conn):
79 """ Compute which postcodes from the collected postcodes have to be
80 added or modified and which from the location_postcode table
85 with conn.cursor() as cur:
86 cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
87 FROM location_postcode
88 WHERE country_code = %s""",
90 for postcode, x, y in cur:
91 newx, newy = self.collected.pop(postcode, (None, None))
93 dist = (x - newx)**2 + (y - newy)**2
95 to_update.append((postcode, newx, newy))
97 to_delete.append(postcode)
99 to_add = [(k, v[0], v[1]) for k, v in self.collected.items()]
102 return to_add, to_delete, to_update
105 def _update_from_external(self, analyzer, project_dir):
106 """ Look for an external postcode file for the active country in
107 the project directory and add missing postcodes when found.
109 csvfile = self._open_external(project_dir)
114 reader = csv.DictReader(csvfile)
116 if 'postcode' not in row or 'lat' not in row or 'lon' not in row:
117 LOG.warning("Bad format for external postcode file for country '%s'."
118 " Ignored.", self.country)
120 postcode = analyzer.normalize_postcode(row['postcode'])
121 if postcode not in self.collected:
123 self.collected[postcode] = (_to_float(row['lon'], 180),
124 _to_float(row['lat'], 90))
126 LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
127 row['lat'], row['lon'], self.country)
133 def _open_external(self, project_dir):
134 fname = project_dir / '{}_postcodes.csv'.format(self.country)
137 LOG.info("Using external postcode file '%s'.", fname)
138 return open(fname, 'r')
140 fname = project_dir / '{}_postcodes.csv.gz'.format(self.country)
143 LOG.info("Using external postcode file '%s'.", fname)
144 return gzip.open(fname, 'rt')
149 def update_postcodes(dsn, project_dir, tokenizer):
150 """ Update the table of artificial postcodes.
152 Computes artificial postcode centroids from the placex table,
153 potentially enhances it with external data and then updates the
154 postcodes in the table 'location_postcode'.
156 with tokenizer.name_analyzer() as analyzer:
157 with connect(dsn) as conn:
158 # First get the list of countries that currently have postcodes.
159 # (Doing this before starting to insert, so it is fast on import.)
160 with conn.cursor() as cur:
161 cur.execute("SELECT DISTINCT country_code FROM location_postcode")
162 todo_countries = set((row[0] for row in cur))
164 # Recompute the list of valid postcodes from placex.
165 with conn.cursor(name="placex_postcodes") as cur:
166 cur.execute("""SELECT country_code, pc, ST_X(centroid), ST_Y(centroid)
169 token_normalized_postcode(address->'postcode') as pc,
170 ST_Centroid(ST_Collect(ST_Centroid(geometry))) as centroid
172 WHERE address ? 'postcode' and geometry IS NOT null
173 and country_code is not null
174 GROUP BY country_code, pc) xx
176 ORDER BY country_code, pc""")
180 for country, postcode, x, y in cur:
181 if collector is None or country != collector.country:
182 if collector is not None:
183 collector.commit(conn, analyzer, project_dir)
184 collector = _CountryPostcodesCollector(country)
185 todo_countries.discard(country)
186 collector.add(postcode, x, y)
188 if collector is not None:
189 collector.commit(conn, analyzer, project_dir)
191 # Now handle any countries that are only in the postcode table.
192 for country in todo_countries:
193 _CountryPostcodesCollector(country).commit(conn, analyzer, project_dir)
197 analyzer.update_postcodes_from_db()