2 Functions for importing, updating and otherwise maintaining the table
3 of artificial postcode centroids.
9 from psycopg2.extras import execute_values
11 from nominatim.db.connection import connect
13 LOG = logging.getLogger()
15 class _CountryPostcodesCollector:
16 """ Collector for postcodes of a single country.
19 def __init__(self, country):
20 self.country = country
21 self.collected = dict()
24 def add(self, postcode, x, y):
25 """ Add the given postcode to the collection cache. If the postcode
26 already existed, it is overwritten with the new centroid.
28 self.collected[postcode] = (x, y)
31 def commit(self, conn, analyzer, project_dir):
32 """ Update postcodes for the country from the postcodes selected so far
33 as well as any externally supplied postcodes.
35 self._update_from_external(analyzer, project_dir)
36 to_add, to_delete, to_update = self._compute_changes(conn)
38 LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).",
39 self.country, len(to_add), len(to_delete), len(to_update))
41 with conn.cursor() as cur:
44 """INSERT INTO location_postcode
45 (place_id, indexed_status, country_code,
46 postcode, geometry) VALUES %s""",
48 template="""(nextval('seq_place'), 1, '{}',
49 %s, 'SRID=4326;POINT(%s %s)')
50 """.format(self.country))
52 cur.execute("""DELETE FROM location_postcode
53 WHERE country_code = %s and postcode = any(%s)
54 """, (self.country, to_delete))
57 """UPDATE location_postcode
58 SET indexed_status = 2,
59 geometry = ST_SetSRID(ST_Point(v.x, v.y), 4326)
60 FROM (VALUES %s) AS v (pc, x, y)
61 WHERE country_code = '{}' and postcode = pc
62 """.format(self.country),
66 def _compute_changes(self, conn):
67 """ Compute which postcodes from the collected postcodes have to be
68 added or modified and which from the location_postcode table
73 with conn.cursor() as cur:
74 cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
75 FROM location_postcode
76 WHERE country_code = %s""",
78 for postcode, x, y in cur:
79 newx, newy = self.collected.pop(postcode, (None, None))
81 dist = (x - newx)**2 + (y - newy)**2
83 to_update.append((postcode, newx, newy))
85 to_delete.append(postcode)
87 to_add = [(k, v[0], v[1]) for k, v in self.collected.items()]
90 return to_add, to_delete, to_update
93 def _update_from_external(self, analyzer, project_dir):
94 """ Look for an external postcode file for the active country in
95 the project directory and add missing postcodes when found.
97 csvfile = self._open_external(project_dir)
102 reader = csv.DictReader(csvfile)
104 if 'postcode' not in row or 'lat' not in row or 'lon' not in row:
105 LOG.warning("Bad format for external postcode file for country '%s'."
106 " Ignored.", self.country)
108 postcode = analyzer.normalize_postcode(row['postcode'])
109 if postcode not in self.collected:
111 self.collected[postcode] = (float(row['lon']), float(row['lat']))
113 LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
114 row['lat'], row['lon'], self.country)
120 def _open_external(self, project_dir):
121 fname = project_dir / '{}_postcodes.csv'.format(self.country)
124 LOG.info("Using external postcode file '%s'.", fname)
125 return open(fname, 'r')
127 fname = project_dir / '{}_postcodes.csv.gz'.format(self.country)
130 LOG.info("Using external postcode file '%s'.", fname)
131 return gzip.open(fname, 'rt')
136 def update_postcodes(dsn, project_dir, tokenizer):
137 """ Update the table of artificial postcodes.
139 Computes artificial postcode centroids from the placex table,
140 potentially enhances it with external data and then updates the
141 postcodes in the table 'location_postcode'.
143 with tokenizer.name_analyzer() as analyzer:
144 with connect(dsn) as conn:
145 # First get the list of countries that currently have postcodes.
146 # (Doing this before starting to insert, so it is fast on import.)
147 with conn.cursor() as cur:
148 cur.execute("SELECT DISTINCT country_code FROM location_postcode")
149 todo_countries = set((row[0] for row in cur))
151 # Recompute the list of valid postcodes from placex.
152 with conn.cursor(name="placex_postcodes") as cur:
153 cur.execute("""SELECT country_code, pc, ST_X(centroid), ST_Y(centroid)
156 token_normalized_postcode(address->'postcode') as pc,
157 ST_Centroid(ST_Collect(ST_Centroid(geometry))) as centroid
159 WHERE address ? 'postcode' and geometry IS NOT null
160 GROUP BY country_code, pc) xx
162 ORDER BY country_code, pc""")
166 for country, postcode, x, y in cur:
167 if collector is None or country != collector.country:
168 if collector is not None:
169 collector.commit(conn, analyzer, project_dir)
170 collector = _CountryPostcodesCollector(country)
171 todo_countries.discard(country)
172 collector.add(postcode, x, y)
174 if collector is not None:
175 collector.commit(conn, analyzer, project_dir)
177 # Now handle any countries that are only in the postcode table.
178 for country in todo_countries:
179 _CountryPostcodesCollector(country).commit(conn, analyzer, project_dir)
183 analyzer.update_postcodes_from_db()