]> git.openstreetmap.org Git - nominatim.git/blob - nominatim/tools/postcodes.py
icu tokenizer: switch to matching against partial names
[nominatim.git] / nominatim / tools / postcodes.py
1 """
2 Functions for importing, updating and otherwise maintaining the table
3 of artificial postcode centroids.
4 """
5 import csv
6 import gzip
7 import logging
8 from math import isfinite
9
10 from psycopg2 import sql as pysql
11
12 from nominatim.db.connection import connect
13
14 LOG = logging.getLogger()
15
16 def _to_float(num, max_value):
17     """ Convert the number in string into a float. The number is expected
18         to be in the range of [-max_value, max_value]. Otherwise rises a
19         ValueError.
20     """
21     num = float(num)
22     if not isfinite(num) or num <= -max_value or num >= max_value:
23         raise ValueError()
24
25     return num
26
27 class _CountryPostcodesCollector:
28     """ Collector for postcodes of a single country.
29     """
30
31     def __init__(self, country):
32         self.country = country
33         self.collected = dict()
34
35
36     def add(self, postcode, x, y):
37         """ Add the given postcode to the collection cache. If the postcode
38             already existed, it is overwritten with the new centroid.
39         """
40         self.collected[postcode] = (x, y)
41
42
43     def commit(self, conn, analyzer, project_dir):
44         """ Update postcodes for the country from the postcodes selected so far
45             as well as any externally supplied postcodes.
46         """
47         self._update_from_external(analyzer, project_dir)
48         to_add, to_delete, to_update = self._compute_changes(conn)
49
50         LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).",
51                  self.country, len(to_add), len(to_delete), len(to_update))
52
53         with conn.cursor() as cur:
54             if to_add:
55                 cur.execute_values(
56                     """INSERT INTO location_postcode
57                          (place_id, indexed_status, country_code,
58                           postcode, geometry) VALUES %s""",
59                     to_add,
60                     template=pysql.SQL("""(nextval('seq_place'), 1, {},
61                                           %s, 'SRID=4326;POINT(%s %s)')
62                                        """).format(pysql.Literal(self.country)))
63             if to_delete:
64                 cur.execute("""DELETE FROM location_postcode
65                                WHERE country_code = %s and postcode = any(%s)
66                             """, (self.country, to_delete))
67             if to_update:
68                 cur.execute_values(
69                     pysql.SQL("""UPDATE location_postcode
70                                  SET indexed_status = 2,
71                                      geometry = ST_SetSRID(ST_Point(v.x, v.y), 4326)
72                                  FROM (VALUES %s) AS v (pc, x, y)
73                                  WHERE country_code = {} and postcode = pc
74                               """).format(pysql.Literal(self.country)), to_update)
75
76
77     def _compute_changes(self, conn):
78         """ Compute which postcodes from the collected postcodes have to be
79             added or modified and which from the location_postcode table
80             have to be deleted.
81         """
82         to_update = []
83         to_delete = []
84         with conn.cursor() as cur:
85             cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
86                            FROM location_postcode
87                            WHERE country_code = %s""",
88                         (self.country, ))
89             for postcode, x, y in cur:
90                 newx, newy = self.collected.pop(postcode, (None, None))
91                 if newx is not None:
92                     dist = (x - newx)**2 + (y - newy)**2
93                     if dist > 0.0000001:
94                         to_update.append((postcode, newx, newy))
95                 else:
96                     to_delete.append(postcode)
97
98         to_add = [(k, v[0], v[1]) for k, v in self.collected.items()]
99         self.collected = []
100
101         return to_add, to_delete, to_update
102
103
104     def _update_from_external(self, analyzer, project_dir):
105         """ Look for an external postcode file for the active country in
106             the project directory and add missing postcodes when found.
107         """
108         csvfile = self._open_external(project_dir)
109         if csvfile is None:
110             return
111
112         try:
113             reader = csv.DictReader(csvfile)
114             for row in reader:
115                 if 'postcode' not in row or 'lat' not in row or 'lon' not in row:
116                     LOG.warning("Bad format for external postcode file for country '%s'."
117                                 " Ignored.", self.country)
118                     return
119                 postcode = analyzer.normalize_postcode(row['postcode'])
120                 if postcode not in self.collected:
121                     try:
122                         self.collected[postcode] = (_to_float(row['lon'], 180),
123                                                     _to_float(row['lat'], 90))
124                     except ValueError:
125                         LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
126                                     row['lat'], row['lon'], self.country)
127
128         finally:
129             csvfile.close()
130
131
132     def _open_external(self, project_dir):
133         fname = project_dir / '{}_postcodes.csv'.format(self.country)
134
135         if fname.is_file():
136             LOG.info("Using external postcode file '%s'.", fname)
137             return open(fname, 'r')
138
139         fname = project_dir / '{}_postcodes.csv.gz'.format(self.country)
140
141         if fname.is_file():
142             LOG.info("Using external postcode file '%s'.", fname)
143             return gzip.open(fname, 'rt')
144
145         return None
146
147
148 def update_postcodes(dsn, project_dir, tokenizer):
149     """ Update the table of artificial postcodes.
150
151         Computes artificial postcode centroids from the placex table,
152         potentially enhances it with external data and then updates the
153         postcodes in the table 'location_postcode'.
154     """
155     with tokenizer.name_analyzer() as analyzer:
156         with connect(dsn) as conn:
157             # First get the list of countries that currently have postcodes.
158             # (Doing this before starting to insert, so it is fast on import.)
159             with conn.cursor() as cur:
160                 cur.execute("SELECT DISTINCT country_code FROM location_postcode")
161                 todo_countries = set((row[0] for row in cur))
162
163             # Recompute the list of valid postcodes from placex.
164             with conn.cursor(name="placex_postcodes") as cur:
165                 cur.execute("""
166                 SELECT cc as country_code, pc, ST_X(centroid), ST_Y(centroid)
167                 FROM (SELECT
168                         COALESCE(plx.country_code,
169                                  get_country_code(ST_Centroid(pl.geometry))) as cc,
170                         token_normalized_postcode(pl.address->'postcode') as pc,
171                         ST_Centroid(ST_Collect(COALESCE(plx.centroid,
172                                                         ST_Centroid(pl.geometry)))) as centroid
173                       FROM place AS pl LEFT OUTER JOIN placex AS plx
174                              ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
175                     WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null
176                     GROUP BY cc, pc) xx
177                 WHERE pc IS NOT null AND cc IS NOT null
178                 ORDER BY country_code, pc""")
179
180                 collector = None
181
182                 for country, postcode, x, y in cur:
183                     if collector is None or country != collector.country:
184                         if collector is not None:
185                             collector.commit(conn, analyzer, project_dir)
186                         collector = _CountryPostcodesCollector(country)
187                         todo_countries.discard(country)
188                     collector.add(postcode, x, y)
189
190                 if collector is not None:
191                     collector.commit(conn, analyzer, project_dir)
192
193             # Now handle any countries that are only in the postcode table.
194             for country in todo_countries:
195                 _CountryPostcodesCollector(country).commit(conn, analyzer, project_dir)
196
197             conn.commit()
198
199         analyzer.update_postcodes_from_db()
200
201 def can_compute(dsn):
202     """
203         Check that the place table exists so that
204         postcodes can be computed.
205     """
206     with connect(dsn) as conn:
207         return conn.table_exists('place')