]> git.openstreetmap.org Git - nominatim.git/blob - nominatim/tools/postcodes.py
Merge pull request #2732 from lonvia/fix-ordering-address-parts
[nominatim.git] / nominatim / tools / postcodes.py
1 # SPDX-License-Identifier: GPL-2.0-only
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Functions for importing, updating and otherwise maintaining the table
9 of artificial postcode centroids.
10 """
11 import csv
12 import gzip
13 import logging
14 from math import isfinite
15
16 from psycopg2 import sql as pysql
17
18 from nominatim.db.connection import connect
19
20 LOG = logging.getLogger()
21
22 def _to_float(num, max_value):
23     """ Convert the number in string into a float. The number is expected
24         to be in the range of [-max_value, max_value]. Otherwise rises a
25         ValueError.
26     """
27     num = float(num)
28     if not isfinite(num) or num <= -max_value or num >= max_value:
29         raise ValueError()
30
31     return num
32
33 class _CountryPostcodesCollector:
34     """ Collector for postcodes of a single country.
35     """
36
37     def __init__(self, country):
38         self.country = country
39         self.collected = {}
40
41
42     def add(self, postcode, x, y):
43         """ Add the given postcode to the collection cache. If the postcode
44             already existed, it is overwritten with the new centroid.
45         """
46         self.collected[postcode] = (x, y)
47
48
49     def commit(self, conn, analyzer, project_dir):
50         """ Update postcodes for the country from the postcodes selected so far
51             as well as any externally supplied postcodes.
52         """
53         self._update_from_external(analyzer, project_dir)
54         to_add, to_delete, to_update = self._compute_changes(conn)
55
56         LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).",
57                  self.country, len(to_add), len(to_delete), len(to_update))
58
59         with conn.cursor() as cur:
60             if to_add:
61                 cur.execute_values(
62                     """INSERT INTO location_postcode
63                          (place_id, indexed_status, country_code,
64                           postcode, geometry) VALUES %s""",
65                     to_add,
66                     template=pysql.SQL("""(nextval('seq_place'), 1, {},
67                                           %s, 'SRID=4326;POINT(%s %s)')
68                                        """).format(pysql.Literal(self.country)))
69             if to_delete:
70                 cur.execute("""DELETE FROM location_postcode
71                                WHERE country_code = %s and postcode = any(%s)
72                             """, (self.country, to_delete))
73             if to_update:
74                 cur.execute_values(
75                     pysql.SQL("""UPDATE location_postcode
76                                  SET indexed_status = 2,
77                                      geometry = ST_SetSRID(ST_Point(v.x, v.y), 4326)
78                                  FROM (VALUES %s) AS v (pc, x, y)
79                                  WHERE country_code = {} and postcode = pc
80                               """).format(pysql.Literal(self.country)), to_update)
81
82
83     def _compute_changes(self, conn):
84         """ Compute which postcodes from the collected postcodes have to be
85             added or modified and which from the location_postcode table
86             have to be deleted.
87         """
88         to_update = []
89         to_delete = []
90         with conn.cursor() as cur:
91             cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
92                            FROM location_postcode
93                            WHERE country_code = %s""",
94                         (self.country, ))
95             for postcode, x, y in cur:
96                 newx, newy = self.collected.pop(postcode, (None, None))
97                 if newx is not None:
98                     dist = (x - newx)**2 + (y - newy)**2
99                     if dist > 0.0000001:
100                         to_update.append((postcode, newx, newy))
101                 else:
102                     to_delete.append(postcode)
103
104         to_add = [(k, v[0], v[1]) for k, v in self.collected.items()]
105         self.collected = []
106
107         return to_add, to_delete, to_update
108
109
110     def _update_from_external(self, analyzer, project_dir):
111         """ Look for an external postcode file for the active country in
112             the project directory and add missing postcodes when found.
113         """
114         csvfile = self._open_external(project_dir)
115         if csvfile is None:
116             return
117
118         try:
119             reader = csv.DictReader(csvfile)
120             for row in reader:
121                 if 'postcode' not in row or 'lat' not in row or 'lon' not in row:
122                     LOG.warning("Bad format for external postcode file for country '%s'."
123                                 " Ignored.", self.country)
124                     return
125                 postcode = analyzer.normalize_postcode(row['postcode'])
126                 if postcode not in self.collected:
127                     try:
128                         self.collected[postcode] = (_to_float(row['lon'], 180),
129                                                     _to_float(row['lat'], 90))
130                     except ValueError:
131                         LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
132                                     row['lat'], row['lon'], self.country)
133
134         finally:
135             csvfile.close()
136
137
138     def _open_external(self, project_dir):
139         fname = project_dir / f'{self.country}_postcodes.csv'
140
141         if fname.is_file():
142             LOG.info("Using external postcode file '%s'.", fname)
143             return open(fname, 'r', encoding='utf-8')
144
145         fname = project_dir / f'{self.country}_postcodes.csv.gz'
146
147         if fname.is_file():
148             LOG.info("Using external postcode file '%s'.", fname)
149             return gzip.open(fname, 'rt')
150
151         return None
152
153
154 def update_postcodes(dsn, project_dir, tokenizer):
155     """ Update the table of artificial postcodes.
156
157         Computes artificial postcode centroids from the placex table,
158         potentially enhances it with external data and then updates the
159         postcodes in the table 'location_postcode'.
160     """
161     with tokenizer.name_analyzer() as analyzer:
162         with connect(dsn) as conn:
163             # First get the list of countries that currently have postcodes.
164             # (Doing this before starting to insert, so it is fast on import.)
165             with conn.cursor() as cur:
166                 cur.execute("SELECT DISTINCT country_code FROM location_postcode")
167                 todo_countries = set((row[0] for row in cur))
168
169             # Recompute the list of valid postcodes from placex.
170             with conn.cursor(name="placex_postcodes") as cur:
171                 cur.execute("""
172                 SELECT cc as country_code, pc, ST_X(centroid), ST_Y(centroid)
173                 FROM (SELECT
174                         COALESCE(plx.country_code,
175                                  get_country_code(ST_Centroid(pl.geometry))) as cc,
176                         token_normalized_postcode(pl.address->'postcode') as pc,
177                         ST_Centroid(ST_Collect(COALESCE(plx.centroid,
178                                                         ST_Centroid(pl.geometry)))) as centroid
179                       FROM place AS pl LEFT OUTER JOIN placex AS plx
180                              ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
181                     WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null
182                     GROUP BY cc, pc) xx
183                 WHERE pc IS NOT null AND cc IS NOT null
184                 ORDER BY country_code, pc""")
185
186                 collector = None
187
188                 for country, postcode, x, y in cur:
189                     if collector is None or country != collector.country:
190                         if collector is not None:
191                             collector.commit(conn, analyzer, project_dir)
192                         collector = _CountryPostcodesCollector(country)
193                         todo_countries.discard(country)
194                     collector.add(postcode, x, y)
195
196                 if collector is not None:
197                     collector.commit(conn, analyzer, project_dir)
198
199             # Now handle any countries that are only in the postcode table.
200             for country in todo_countries:
201                 _CountryPostcodesCollector(country).commit(conn, analyzer, project_dir)
202
203             conn.commit()
204
205         analyzer.update_postcodes_from_db()
206
207 def can_compute(dsn):
208     """
209         Check that the place table exists so that
210         postcodes can be computed.
211     """
212     with connect(dsn) as conn:
213         return conn.table_exists('place')