]> git.openstreetmap.org Git - nominatim.git/blob - nominatim/tools/postcodes.py
dad1edff7f9264b3e9fa5f676545b2f4aa95bfce
[nominatim.git] / nominatim / tools / postcodes.py
1 # SPDX-License-Identifier: GPL-2.0-only
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Functions for importing, updating and otherwise maintaining the table
9 of artificial postcode centroids.
10 """
11 from collections import defaultdict
12 import csv
13 import gzip
14 import logging
15 from math import isfinite
16
17 from psycopg2 import sql as pysql
18
19 from nominatim.db.connection import connect
20 from nominatim.utils.centroid import PointsCentroid
21 from nominatim.data.postcode_format import PostcodeFormatter
22
23 LOG = logging.getLogger()
24
25 def _to_float(num, max_value):
26     """ Convert the number in string into a float. The number is expected
27         to be in the range of [-max_value, max_value]. Otherwise rises a
28         ValueError.
29     """
30     num = float(num)
31     if not isfinite(num) or num <= -max_value or num >= max_value:
32         raise ValueError()
33
34     return num
35
36 class _CountryPostcodesCollector:
37     """ Collector for postcodes of a single country.
38     """
39
40     def __init__(self, country):
41         self.country = country
42         self.collected = defaultdict(PointsCentroid)
43
44
45     def add(self, postcode, x, y):
46         """ Add the given postcode to the collection cache. If the postcode
47             already existed, it is overwritten with the new centroid.
48         """
49         self.collected[postcode] += (x, y)
50
51
52     def commit(self, conn, analyzer, project_dir):
53         """ Update postcodes for the country from the postcodes selected so far
54             as well as any externally supplied postcodes.
55         """
56         self._update_from_external(analyzer, project_dir)
57         to_add, to_delete, to_update = self._compute_changes(conn)
58
59         LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).",
60                  self.country, len(to_add), len(to_delete), len(to_update))
61
62         with conn.cursor() as cur:
63             if to_add:
64                 cur.execute_values(
65                     """INSERT INTO location_postcode
66                          (place_id, indexed_status, country_code,
67                           postcode, geometry) VALUES %s""",
68                     to_add,
69                     template=pysql.SQL("""(nextval('seq_place'), 1, {},
70                                           %s, 'SRID=4326;POINT(%s %s)')
71                                        """).format(pysql.Literal(self.country)))
72             if to_delete:
73                 cur.execute("""DELETE FROM location_postcode
74                                WHERE country_code = %s and postcode = any(%s)
75                             """, (self.country, to_delete))
76             if to_update:
77                 cur.execute_values(
78                     pysql.SQL("""UPDATE location_postcode
79                                  SET indexed_status = 2,
80                                      geometry = ST_SetSRID(ST_Point(v.x, v.y), 4326)
81                                  FROM (VALUES %s) AS v (pc, x, y)
82                                  WHERE country_code = {} and postcode = pc
83                               """).format(pysql.Literal(self.country)), to_update)
84
85
86     def _compute_changes(self, conn):
87         """ Compute which postcodes from the collected postcodes have to be
88             added or modified and which from the location_postcode table
89             have to be deleted.
90         """
91         to_update = []
92         to_delete = []
93         with conn.cursor() as cur:
94             cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
95                            FROM location_postcode
96                            WHERE country_code = %s""",
97                         (self.country, ))
98             for postcode, x, y in cur:
99                 pcobj = self.collected.pop(postcode, None)
100                 if pcobj:
101                     newx, newy = pcobj.centroid()
102                     if (x - newx) > 0.0000001 or (y - newy) > 0.0000001:
103                         to_update.append((postcode, newx, newy))
104                 else:
105                     to_delete.append(postcode)
106
107         to_add = [(k, *v.centroid()) for k, v in self.collected.items()]
108         self.collected = None
109
110         return to_add, to_delete, to_update
111
112
113     def _update_from_external(self, analyzer, project_dir):
114         """ Look for an external postcode file for the active country in
115             the project directory and add missing postcodes when found.
116         """
117         csvfile = self._open_external(project_dir)
118         if csvfile is None:
119             return
120
121         try:
122             reader = csv.DictReader(csvfile)
123             for row in reader:
124                 if 'postcode' not in row or 'lat' not in row or 'lon' not in row:
125                     LOG.warning("Bad format for external postcode file for country '%s'."
126                                 " Ignored.", self.country)
127                     return
128                 postcode = analyzer.normalize_postcode(row['postcode'])
129                 if postcode not in self.collected:
130                     try:
131                         # Do float conversation separately, it might throw
132                         centroid = (_to_float(row['lon'], 180),
133                                     _to_float(row['lat'], 90))
134                         self.collected[postcode] += centroid
135                     except ValueError:
136                         LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
137                                     row['lat'], row['lon'], self.country)
138
139         finally:
140             csvfile.close()
141
142
143     def _open_external(self, project_dir):
144         fname = project_dir / f'{self.country}_postcodes.csv'
145
146         if fname.is_file():
147             LOG.info("Using external postcode file '%s'.", fname)
148             return open(fname, 'r', encoding='utf-8')
149
150         fname = project_dir / f'{self.country}_postcodes.csv.gz'
151
152         if fname.is_file():
153             LOG.info("Using external postcode file '%s'.", fname)
154             return gzip.open(fname, 'rt')
155
156         return None
157
158
159 def update_postcodes(dsn, project_dir, tokenizer):
160     """ Update the table of artificial postcodes.
161
162         Computes artificial postcode centroids from the placex table,
163         potentially enhances it with external data and then updates the
164         postcodes in the table 'location_postcode'.
165     """
166     matcher = PostcodeFormatter()
167     with tokenizer.name_analyzer() as analyzer:
168         with connect(dsn) as conn:
169             # First get the list of countries that currently have postcodes.
170             # (Doing this before starting to insert, so it is fast on import.)
171             with conn.cursor() as cur:
172                 cur.execute("SELECT DISTINCT country_code FROM location_postcode")
173                 todo_countries = set((row[0] for row in cur))
174
175             # Recompute the list of valid postcodes from placex.
176             with conn.cursor(name="placex_postcodes") as cur:
177                 cur.execute("""
178                 SELECT cc as country_code, pc, ST_X(centroid), ST_Y(centroid)
179                 FROM (SELECT
180                         COALESCE(plx.country_code,
181                                  get_country_code(ST_Centroid(pl.geometry))) as cc,
182                         token_normalized_postcode(pl.address->'postcode') as pc,
183                         COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid
184                       FROM place AS pl LEFT OUTER JOIN placex AS plx
185                              ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
186                     WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx
187                 WHERE pc IS NOT null AND cc IS NOT null
188                 ORDER BY country_code, pc""")
189
190                 collector = None
191
192                 for country, postcode, x, y in cur:
193                     if collector is None or country != collector.country:
194                         if collector is not None:
195                             collector.commit(conn, analyzer, project_dir)
196                         collector = _CountryPostcodesCollector(country)
197                         todo_countries.discard(country)
198                     match = matcher.match(country, postcode)
199                     if match:
200                         collector.add(matcher.normalize(country, match), x, y)
201
202                 if collector is not None:
203                     collector.commit(conn, analyzer, project_dir)
204
205             # Now handle any countries that are only in the postcode table.
206             for country in todo_countries:
207                 _CountryPostcodesCollector(country).commit(conn, analyzer, project_dir)
208
209             conn.commit()
210
211         analyzer.update_postcodes_from_db()
212
213 def can_compute(dsn):
214     """
215         Check that the place table exists so that
216         postcodes can be computed.
217     """
218     with connect(dsn) as conn:
219         return conn.table_exists('place')