2 Functions for importing, updating and otherwise maintaining the table
3 of artificial postcode centroids.
8 from math import isfinite
10 from psycopg2 import sql as pysql
12 from nominatim.db.connection import connect
14 LOG = logging.getLogger()
16 def _to_float(num, max_value):
17 """ Convert the number in string into a float. The number is expected
18 to be in the range of [-max_value, max_value]. Otherwise rises a
22 if not isfinite(num) or num <= -max_value or num >= max_value:
27 class _CountryPostcodesCollector:
28 """ Collector for postcodes of a single country.
31 def __init__(self, country):
32 self.country = country
33 self.collected = dict()
36 def add(self, postcode, x, y):
37 """ Add the given postcode to the collection cache. If the postcode
38 already existed, it is overwritten with the new centroid.
40 self.collected[postcode] = (x, y)
43 def commit(self, conn, analyzer, project_dir):
44 """ Update postcodes for the country from the postcodes selected so far
45 as well as any externally supplied postcodes.
47 self._update_from_external(analyzer, project_dir)
48 to_add, to_delete, to_update = self._compute_changes(conn)
50 LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).",
51 self.country, len(to_add), len(to_delete), len(to_update))
53 with conn.cursor() as cur:
56 """INSERT INTO location_postcode
57 (place_id, indexed_status, country_code,
58 postcode, geometry) VALUES %s""",
60 template=pysql.SQL("""(nextval('seq_place'), 1, {},
61 %s, 'SRID=4326;POINT(%s %s)')
62 """).format(pysql.Literal(self.country)))
64 cur.execute("""DELETE FROM location_postcode
65 WHERE country_code = %s and postcode = any(%s)
66 """, (self.country, to_delete))
69 pysql.SQL("""UPDATE location_postcode
70 SET indexed_status = 2,
71 geometry = ST_SetSRID(ST_Point(v.x, v.y), 4326)
72 FROM (VALUES %s) AS v (pc, x, y)
73 WHERE country_code = {} and postcode = pc
74 """).format(pysql.Literal(self.country)), to_update)
77 def _compute_changes(self, conn):
78 """ Compute which postcodes from the collected postcodes have to be
79 added or modified and which from the location_postcode table
84 with conn.cursor() as cur:
85 cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
86 FROM location_postcode
87 WHERE country_code = %s""",
89 for postcode, x, y in cur:
90 newx, newy = self.collected.pop(postcode, (None, None))
92 dist = (x - newx)**2 + (y - newy)**2
94 to_update.append((postcode, newx, newy))
96 to_delete.append(postcode)
98 to_add = [(k, v[0], v[1]) for k, v in self.collected.items()]
101 return to_add, to_delete, to_update
104 def _update_from_external(self, analyzer, project_dir):
105 """ Look for an external postcode file for the active country in
106 the project directory and add missing postcodes when found.
108 csvfile = self._open_external(project_dir)
113 reader = csv.DictReader(csvfile)
115 if 'postcode' not in row or 'lat' not in row or 'lon' not in row:
116 LOG.warning("Bad format for external postcode file for country '%s'."
117 " Ignored.", self.country)
119 postcode = analyzer.normalize_postcode(row['postcode'])
120 if postcode not in self.collected:
122 self.collected[postcode] = (_to_float(row['lon'], 180),
123 _to_float(row['lat'], 90))
125 LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
126 row['lat'], row['lon'], self.country)
132 def _open_external(self, project_dir):
133 fname = project_dir / '{}_postcodes.csv'.format(self.country)
136 LOG.info("Using external postcode file '%s'.", fname)
137 return open(fname, 'r')
139 fname = project_dir / '{}_postcodes.csv.gz'.format(self.country)
142 LOG.info("Using external postcode file '%s'.", fname)
143 return gzip.open(fname, 'rt')
148 def update_postcodes(dsn, project_dir, tokenizer):
149 """ Update the table of artificial postcodes.
151 Computes artificial postcode centroids from the placex table,
152 potentially enhances it with external data and then updates the
153 postcodes in the table 'location_postcode'.
155 with tokenizer.name_analyzer() as analyzer:
156 with connect(dsn) as conn:
157 # First get the list of countries that currently have postcodes.
158 # (Doing this before starting to insert, so it is fast on import.)
159 with conn.cursor() as cur:
160 cur.execute("SELECT DISTINCT country_code FROM location_postcode")
161 todo_countries = set((row[0] for row in cur))
163 # Recompute the list of valid postcodes from placex.
164 with conn.cursor(name="placex_postcodes") as cur:
166 SELECT cc as country_code, pc, ST_X(centroid), ST_Y(centroid)
168 COALESCE(plx.country_code,
169 get_country_code(ST_Centroid(pl.geometry))) as cc,
170 token_normalized_postcode(pl.address->'postcode') as pc,
171 ST_Centroid(ST_Collect(COALESCE(plx.centroid,
172 ST_Centroid(pl.geometry)))) as centroid
173 FROM place AS pl LEFT OUTER JOIN placex AS plx
174 ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
175 WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null
177 WHERE pc IS NOT null AND cc IS NOT null
178 ORDER BY country_code, pc""")
182 for country, postcode, x, y in cur:
183 if collector is None or country != collector.country:
184 if collector is not None:
185 collector.commit(conn, analyzer, project_dir)
186 collector = _CountryPostcodesCollector(country)
187 todo_countries.discard(country)
188 collector.add(postcode, x, y)
190 if collector is not None:
191 collector.commit(conn, analyzer, project_dir)
193 # Now handle any countries that are only in the postcode table.
194 for country in todo_countries:
195 _CountryPostcodesCollector(country).commit(conn, analyzer, project_dir)
199 analyzer.update_postcodes_from_db()
201 def can_compute(dsn):
203 Check that the place table exists so that
204 postcodes can be computed.
206 with connect(dsn) as conn:
207 return conn.table_exists('place')