git.openstreetmap.org Git - nominatim.git/blob - nominatim/api/results.py
filter duplicate results after DB query
[nominatim.git] / nominatim / api / results.py
1 # SPDX-License-Identifier: GPL-3.0-or-later
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2023 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Dataclasses for search results and helper functions to fill them.
9
10 Data classes are part of the public API while the functions are for
11 internal use only. That's why they are implemented as free-standing functions
12 instead of member functions.
13 """
14 from typing import Optional, Tuple, Dict, Sequence, TypeVar, Type, List, Any, Union
15 import enum
16 import dataclasses
17 import datetime as dt
18
19 import sqlalchemy as sa
20
21 from nominatim.typing import SaSelect, SaRow, SaColumn
22 from nominatim.api.types import Point, Bbox, LookupDetails
23 from nominatim.api.connection import SearchConnection
24 from nominatim.api.logging import log
25 from nominatim.api.localization import Locales
26
27 # This file defines complex result data classes.
28 # pylint: disable=too-many-instance-attributes
29
def _mingle_name_tags(names: Optional[Dict[str, str]]) -> Optional[Dict[str, str]]:
    """ Merge the names of linked places into the regular names.

        Keys carrying the '_place_' prefix are renamed to their unprefixed
        form, unless a name with the unprefixed key already exists. In that
        case the prefixed key is kept as it is. All other keys are copied
        unchanged. Returns None when there are no names.
    """
    if not names:
        return None

    prefix = '_place_'
    mingled: Dict[str, str] = {}
    for key, value in names.items():
        plain_key = key[len(prefix):] if key.startswith(prefix) else key
        # Never let a linked name replace an existing standard name.
        target = key if plain_key in names else plain_key
        mingled[target] = value

    return mingled
46
47
class SourceTable(enum.Enum):
    """ The `SourceTable` type lists the possible sources a result can have.
    """
    PLACEX = 1
    """ The placex table is the main source for results, usually containing
        OSM data.
    """
    OSMLINE = 2
    """ The osmline table contains address interpolations from OSM data.
        Interpolation addresses are always approximate. The OSM id in the
        result refers to the OSM way with the interpolation line object.
    """
    TIGER = 3
    """ TIGER address data contains US addresses imported on the side,
        see [Installing TIGER data](../customize/Tiger.md).
        TIGER addresses are also interpolations. The addresses always refer
        to a street from OSM data. The OSM id in the result refers to
        that street.
    """
    POSTCODE = 4
    """ The postcode table contains artificial centroids for postcodes,
        computed from the postcodes available with address points. Results
        are always approximate.
    """
    COUNTRY = 5
    """ The country table provides a fallback when country data is missing
        in the OSM data.
    """
76
77
@dataclasses.dataclass
class AddressLine:
    """ The `AddressLine` may contain the following fields about a related place
        and its function as an address object. Most fields are optional.
        Their presence depends on the kind and function of the address part.
    """
    place_id: Optional[int]
    """ Internal ID of the place.
    """
    osm_object: Optional[Tuple[str, int]]
    """ OSM type and ID of the place, if such an object exists.
    """
    category: Tuple[str, str]
    """ Main category of the place, described by a key-value pair.
    """
    names: Dict[str, str]
    """ All available names for the place including references, alternative
        names and translations.
    """
    extratags: Optional[Dict[str, str]]
    """ Any extra information available about the place. This is a dictionary
        that usually contains OSM tag key-value pairs.
    """

    admin_level: Optional[int]
    """ The administrative level of a boundary as tagged in the input data.
        This field is only meaningful for places of the category
        (boundary, administrative).
    """
    fromarea: bool
    """ If true, then the exact area of the place is known. Without area
        information, Nominatim has to make an educated guess whether an
        address belongs to one place or another.
    """
    isaddress: bool
    """ If true, this place should be considered for the final address display.
        Nominatim will sometimes include more than one candidate for
        the address in the list when it cannot reliably determine where the
        place belongs. It will consider names of all candidates when searching
        but when displaying the result, only the most likely candidate should
        be shown.
    """
    rank_address: int
    """ [Address rank](../customize/Ranking.md#address-rank) of the place.
    """
    distance: float
    """ Distance in degrees between the result place and this address part.
    """

    local_name: Optional[str] = None
    """ Placeholder for the localized name of this address part. See
        [Localization](#localization) below.
    """
131
132
class AddressLines(List[AddressLine]):
    """ Sequence of address lines ordered in descending order by their rank.
    """

    def localize(self, locales: Locales) -> List[str]:
        """ Set the local name of the address parts according to the chosen
            locale and return the list of local names.

            Only address parts that are marked as isaddress and that have
            names are localized and returned. A local name is left out of
            the returned list when it equals the name directly preceding
            it in the list.
        """
        labels: List[str] = []

        for part in self:
            if not (part.isaddress and part.names):
                continue

            part.local_name = locales.display_name(part.names)
            # Only collapse repetitions that directly follow each other.
            if labels and labels[-1] == part.local_name:
                continue
            labels.append(part.local_name)

        return labels
153
154
155
@dataclasses.dataclass
class WordInfo:
    """ Each entry in the list of search terms contains the
        following detailed information.
    """
    word_id: int
    """ Internal identifier for the word.
    """
    word_token: str
    """ Normalised and transliterated form of the word.
        This form is used for searching.
    """
    word: Optional[str] = None
    """ Untransliterated form of the word, if available.
    """
171
172
# Type alias for a sequence of search term descriptions.
WordInfos = Sequence[WordInfo]
174
175
@dataclasses.dataclass
class BaseResult:
    """ Base data class holding the information that is shared by
        all types of search results.
    """
    # Fields without default, must be given on construction.
    source_table: SourceTable
    category: Tuple[str, str]
    centroid: Point

    place_id: Optional[int] = None
    osm_object: Optional[Tuple[str, int]] = None

    # Both names are filled in by localize().
    locale_name: Optional[str] = None
    display_name: Optional[str] = None

    names: Optional[Dict[str, str]] = None
    address: Optional[Dict[str, str]] = None
    extratags: Optional[Dict[str, str]] = None

    housenumber: Optional[str] = None
    postcode: Optional[str] = None
    wikipedia: Optional[str] = None

    rank_address: int = 30
    rank_search: int = 30
    importance: Optional[float] = None

    country_code: Optional[str] = None

    address_rows: Optional[AddressLines] = None
    linked_rows: Optional[AddressLines] = None
    parented_rows: Optional[AddressLines] = None
    name_keywords: Optional[WordInfos] = None
    address_keywords: Optional[WordInfos] = None

    geometry: Dict[str, str] = dataclasses.field(default_factory=dict)

    @property
    def lat(self) -> float:
        """ Latitude (y coordinate) of the centroid of the place.
        """
        latitude = self.centroid[1]
        return latitude


    @property
    def lon(self) -> float:
        """ Longitude (x coordinate) of the centroid of the place.
        """
        longitude = self.centroid[0]
        return longitude


    def calculated_importance(self) -> float:
        """ Return an importance value that is always valid: the stored
            importance when it is set and not zero, otherwise an
            artificial value derived from the search rank of the place.
        """
        if self.importance:
            return self.importance

        return 0.7500001 - (self.rank_search/40.0)


    def localize(self, locales: Locales) -> None:
        """ Set the locale_name from the names of the place and the
            display_name from the localized address rows. Without address
            rows the display_name is the same as the locale_name.
        """
        self.locale_name = locales.display_name(self.names)
        if not self.address_rows:
            self.display_name = self.locale_name
            return

        self.display_name = ', '.join(self.address_rows.localize(locales))
244
245
246
# Type variable standing for BaseResult or any class derived from it.
BaseResultT = TypeVar('BaseResultT', bound=BaseResult)
248
@dataclasses.dataclass
class DetailedResult(BaseResult):
    """ A search result with more internal information from the database
        added. On top of the fields of BaseResult it carries the ids of
        the parent and the linked place, the admin level and the date
        of indexing.
    """
    parent_place_id: Optional[int] = None
    linked_place_id: Optional[int] = None
    admin_level: int = 15
    indexed_date: Optional[dt.datetime] = None
258
259
@dataclasses.dataclass
class ReverseResult(BaseResult):
    """ A search result for reverse geocoding. Extends BaseResult
        with an optional distance and an optional bounding box.
    """
    distance: Optional[float] = None
    bbox: Optional[Bbox] = None
266
267
class ReverseResults(List[ReverseResult]):
    """ Sequence of reverse lookup results ordered by distance.
        May be empty when no result was found. The class adds no
        behaviour of its own to the list.
    """
272
273
@dataclasses.dataclass
class SearchResult(BaseResult):
    """ A search result for forward geocoding.
    """
    bbox: Optional[Bbox] = None
    accuracy: float = 0.0


    @property
    def ranking(self) -> float:
        """ Combined measure of accuracy and importance: the accuracy
            (or 1 when the accuracy is None) minus the calculated
            importance of the result.
        """
        accuracy = 1 if self.accuracy is None else self.accuracy
        return accuracy - self.calculated_importance()
288
289
class SearchResults(List[SearchResult]):
    """ Sequence of forward lookup results ordered by relevance.
        May be empty when no result was found. The class adds no
        behaviour of its own to the list.
    """
294
295
def _filter_geometries(row: SaRow) -> Dict[str, str]:
    """ Collect all columns of the row whose name starts with 'geometry_'
        into a dictionary, keyed by the column name without that prefix.
    """
    geometries: Dict[str, str] = {}
    for column, value in row._mapping.items(): # pylint: disable=W0212
        if column.startswith('geometry_'):
            geometries[column[9:]] = value

    return geometries
299
300
def create_from_placex_row(row: Optional[SaRow],
                           class_type: Type[BaseResultT]) -> Optional[BaseResultT]:
    """ Create a result of type 'class_type' and fill it with the data
        of the given row from the placex table.

        Returns None when no row is given.
    """
    if row is None:
        return None

    osm_object = (row.osm_type, row.osm_id)
    category = (row.class_, row.type)

    return class_type(source_table=SourceTable.PLACEX,
                      category=category,
                      centroid=Point.from_wkb(row.centroid),
                      place_id=row.place_id,
                      osm_object=osm_object,
                      names=_mingle_name_tags(row.name),
                      address=row.address,
                      extratags=row.extratags,
                      housenumber=row.housenumber,
                      postcode=row.postcode,
                      wikipedia=row.wikipedia,
                      rank_address=row.rank_address,
                      rank_search=row.rank_search,
                      importance=row.importance,
                      country_code=row.country_code,
                      geometry=_filter_geometries(row))
326
327
def create_from_osmline_row(row: Optional[SaRow],
                            class_type: Type[BaseResultT]) -> Optional[BaseResultT]:
    """ Create a result of type 'class_type' and fill it with the data
        of the given row from the address interpolation table osmline.

        Returns None when no row is given.

        A row with a housenumber yields a result for a single house with
        that housenumber. Without a housenumber, the result describes the
        interpolation as a whole and carries start number, end number and
        step in its extratags.
    """
    if row is None:
        return None

    housenumber = getattr(row, 'housenumber', None)
    whole_line = housenumber is None

    result = class_type(source_table=SourceTable.OSMLINE,
                        place_id=row.place_id,
                        osm_object=('W', row.osm_id),
                        category=('place', 'houses' if whole_line else 'house'),
                        address=row.address,
                        postcode=row.postcode,
                        country_code=row.country_code,
                        centroid=Point.from_wkb(row.centroid),
                        geometry=_filter_geometries(row))

    if whole_line:
        result.extratags = {'startnumber': str(row.startnumber),
                            'endnumber': str(row.endnumber),
                            'step': str(row.step)}
    else:
        result.housenumber = str(housenumber)

    return result
360
361
def create_from_tiger_row(row: Optional[SaRow],
                          class_type: Type[BaseResultT],
                          osm_type: Optional[str] = None,
                          osm_id: Optional[int] = None) -> Optional[BaseResultT]:
    """ Create a result of type 'class_type' and fill it with the data
        of the given row from the Tiger data interpolation table.

        Returns None when no row is given.

        'osm_type' and 'osm_id' take precedence over the values from the
        row when they are set. The country code is always 'us'.

        A row with a housenumber yields a result for a single house with
        that housenumber. Without a housenumber, the result describes the
        interpolation as a whole and carries start number, end number and
        step in its extratags.
    """
    if row is None:
        return None

    housenumber = getattr(row, 'housenumber', None)
    whole_line = housenumber is None

    result = class_type(source_table=SourceTable.TIGER,
                        place_id=row.place_id,
                        # The row is only consulted when no explicit value was given.
                        osm_object=(osm_type or row.osm_type, osm_id or row.osm_id),
                        category=('place', 'houses' if whole_line else 'house'),
                        postcode=row.postcode,
                        country_code='us',
                        centroid=Point.from_wkb(row.centroid),
                        geometry=_filter_geometries(row))

    if whole_line:
        result.extratags = {'startnumber': str(row.startnumber),
                            'endnumber': str(row.endnumber),
                            'step': str(row.step)}
    else:
        result.housenumber = str(housenumber)

    return result
395
396
def create_from_postcode_row(row: Optional[SaRow],
                             class_type: Type[BaseResultT]) -> Optional[BaseResultT]:
    """ Create a result of type 'class_type' and fill it with the data
        of the given row from the postcode table. The postcode is saved
        as the 'ref' name of the result.

        Returns None when no row is given.
    """
    if row is not None:
        return class_type(source_table=SourceTable.POSTCODE,
                          category=('place', 'postcode'),
                          centroid=Point.from_wkb(row.centroid),
                          place_id=row.place_id,
                          names={'ref': row.postcode},
                          rank_search=row.rank_search,
                          rank_address=row.rank_address,
                          country_code=row.country_code,
                          geometry=_filter_geometries(row))

    return None
415
416
def create_from_country_row(row: Optional[SaRow],
                            class_type: Type[BaseResultT]) -> Optional[BaseResultT]:
    """ Create a result of type 'class_type' and fill it with the data
        of the given row from the fallback country tables. Address rank
        and search rank of the result are both set to 4.

        Returns None when no row is given.
    """
    if row is not None:
        return class_type(source_table=SourceTable.COUNTRY,
                          category=('place', 'country'),
                          centroid=Point.from_wkb(row.centroid),
                          names=row.name,
                          rank_address=4,
                          rank_search=4,
                          country_code=row.country_code)

    return None
432
433
async def add_result_details(conn: SearchConnection, results: List[BaseResultT],
                             details: LookupDetails) -> None:
    """ Add to the results the additional information requested in
        'details' by querying the database, then localize every result
        with the locales from 'details'.

        Does nothing when the list of results is empty.
    """
    if not results:
        return

    log().section('Query details for result')

    # Address details are fetched for all results at once.
    if details.address_details:
        log().comment('Query address details')
        await complete_address_details(conn, results)

    # The remaining details are fetched result by result.
    per_result_steps = (
        (details.linked_places, 'Query linked places', complete_linked_places),
        (details.parented_places, 'Query parent places', complete_parented_places),
        (details.keywords, 'Query keywords', complete_keywords),
    )
    for requested, comment, complete_func in per_result_steps:
        if requested:
            log().comment(comment)
            for result in results:
                await complete_func(conn, result)

    for result in results:
        result.localize(details.locales)
458
459
def _result_row_to_address_row(row: SaRow) -> AddressLine:
    """ Create a new AddressLine from the results of a database query.

        The columns 'extratags', 'place_type', 'housenumber' and
        'isaddress' are optional in the row. A place type is saved in the
        extratags under the key 'place', a housenumber is added to the
        names under the key 'housenumber'. Rows without an 'isaddress'
        column are always considered part of the address.
    """
    # The column may be missing from the row or may be NULL. Work on a copy,
    # so that the dictionary owned by the row is never modified.
    extratags: Dict[str, str] = dict(getattr(row, 'extratags', None) or {})
    place_type = getattr(row, 'place_type', None)
    if place_type:
        extratags['place'] = place_type

    # _mingle_name_tags() hands out a new dictionary, so it is safe to extend.
    names = _mingle_name_tags(row.name) or {}
    housenumber = getattr(row, 'housenumber', None)
    if housenumber is not None:
        names['housenumber'] = housenumber

    return AddressLine(place_id=row.place_id,
                       osm_object=None if row.osm_type is None else (row.osm_type, row.osm_id),
                       category=(getattr(row, 'class'), row.type),
                       names=names,
                       extratags=extratags,
                       admin_level=row.admin_level,
                       fromarea=row.fromarea,
                       isaddress=getattr(row, 'isaddress', True),
                       rank_address=row.rank_address,
                       distance=row.distance)
481
482
def _get_housenumber_details(results: List[BaseResultT]) -> Tuple[List[int], List[int]]:
    """ Return two lists of the same length with the place ids and the
        housenumbers of all results that have a place id.

        The housenumber is -1 except for results from the TIGER and
        osmline tables. For those it is the housenumber of the result or,
        when there is none, the 'startnumber' from the extratags.
    """
    interpolation_tables = (SourceTable.TIGER, SourceTable.OSMLINE)
    place_ids: List[int] = []
    housenumbers: List[int] = []

    for res in results:
        if not res.place_id:
            continue

        hnr = -1
        if res.source_table in interpolation_tables:
            if res.housenumber is not None:
                hnr = int(res.housenumber)
            elif res.extratags is not None and 'startnumber' in res.extratags:
                # details requests do not come with a specific house number
                hnr = int(res.extratags['startnumber'])

        place_ids.append(res.place_id)
        housenumbers.append(hnr)

    return place_ids, housenumbers
499
500
async def complete_address_details(conn: SearchConnection, results: List[BaseResultT]) -> None:
    """ Retrieve information about places that make up the address of the result.

        The address parts are computed by the database function
        get_addressdata() and saved in the 'address_rows' field of each
        result. Results without a place id are skipped. Nothing is
        queried when no result has a place id.
    """
    places, hnrs = _get_housenumber_details(results)

    if not places:
        return

    def _get_addressdata(place_id: Union[int, SaColumn], hnr: Union[int, SaColumn]) -> Any:
        """ Wrap a call to the SQL function get_addressdata() into a
            table-valued expression with the columns of an address row.
        """
        return sa.func.get_addressdata(place_id, hnr)\
                    .table_valued( # type: ignore[no-untyped-call]
                        sa.column('place_id', type_=sa.Integer),
                        'osm_type',
                        sa.column('osm_id', type_=sa.BigInteger),
                        sa.column('name', type_=conn.t.types.Composite),
                        'class', 'type', 'place_type',
                        sa.column('admin_level', type_=sa.Integer),
                        sa.column('fromarea', type_=sa.Boolean),
                        sa.column('isaddress', type_=sa.Boolean),
                        sa.column('rank_address', type_=sa.SmallInteger),
                        sa.column('distance', type_=sa.Float),
                        joins_implicitly=True)


    if len(places) == 1:
        # Optimized case for exactly one result (reverse)
        sql = sa.select(_get_addressdata(places[0], hnrs[0]))\
                .order_by(sa.column('rank_address').desc(),
                          sa.column('isaddress').desc())

        alines = AddressLines()
        for row in await conn.execute(sql):
            alines.append(_result_row_to_address_row(row))

        # Attach the lines to the first result with the queried place id.
        # Should no result match, execution continues with the general
        # case below.
        for result in results:
            if result.place_id == places[0]:
                result.address_rows = alines
                return


    # General case: hand in place ids and housenumbers as two parallel
    # arrays and compute the address data for each pair.
    darray = sa.func.unnest(conn.t.types.to_array(places), conn.t.types.to_array(hnrs))\
                    .table_valued( # type: ignore[no-untyped-call]
                       sa.column('place_id', type_= sa.Integer),
                       sa.column('housenumber', type_= sa.Integer)
                    ).render_derived()

    sfn = _get_addressdata(darray.c.place_id, darray.c.housenumber)

    # Ordering by place id first makes the address rows of one place
    # arrive consecutively.
    sql = sa.select(darray.c.place_id.label('result_place_id'), sfn)\
            .order_by(darray.c.place_id,
                      sa.column('rank_address').desc(),
                      sa.column('isaddress').desc())

    current_result = None
    for row in await conn.execute(sql):
        # When the place id changes, switch to the first result with the
        # new place id and start a fresh list of address lines for it.
        if current_result is None or row.result_place_id != current_result.place_id:
            for result in results:
                if result.place_id == row.result_place_id:
                    current_result = result
                    break
            else:
                # Only reached when a row has a place id that no result has.
                assert False
            current_result.address_rows = AddressLines()
        current_result.address_rows.append(_result_row_to_address_row(row))
565
566
# pylint: disable=consider-using-f-string
def _placex_select_address_row(conn: SearchConnection,
                               centroid: Point) -> SaSelect:
    """ Build the basic select on the placex table for the columns needed
        to create an address row.

        In addition to the plain columns, the select has a boolean column
        'fromarea', which states if the geometry is a polygon or a
        multipolygon, and a column 'distance' with the result of
        ST_DistanceSpheroid() between the geometry and 'centroid'.

        The select comes without a where clause.
    """
    t = conn.t.placex
    # The two '%f' placeholders are filled from the two values of the centroid.
    return sa.select(t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name,
                     t.c.class_.label('class'), t.c.type,
                     t.c.admin_level, t.c.housenumber,
                     sa.literal_column("""ST_GeometryType(geometry) in
                                        ('ST_Polygon','ST_MultiPolygon')""").label('fromarea'),
                     t.c.rank_address,
                     sa.literal_column(
                         """ST_DistanceSpheroid(geometry, 'SRID=4326;POINT(%f %f)'::geometry,
                              'SPHEROID["WGS 84",6378137,298.257223563, AUTHORITY["EPSG","7030"]]')
                         """ % centroid).label('distance'))
581
582
async def complete_linked_places(conn: SearchConnection, result: BaseResult) -> None:
    """ Fill the 'linked_rows' of the result with the places that are
        linked to it.

        Only results from the placex table can have linked places. The
        list remains empty for results from other sources.
    """
    linked = AddressLines()
    result.linked_rows = linked

    if result.source_table == SourceTable.PLACEX:
        placex = conn.t.placex
        sql = _placex_select_address_row(conn, result.centroid)\
                .where(placex.c.linked_place_id == result.place_id)

        for row in await conn.execute(sql):
            linked.append(_result_row_to_address_row(row))
595
596
async def complete_keywords(conn: SearchConnection, result: BaseResult) -> None:
    """ Fill 'name_keywords' and 'address_keywords' of the result with
        the search terms that are saved for the place.

        Requires that the query analyzer was initialised to get access to
        the word table.
    """
    search_name = conn.t.search_name
    token_sql = sa.select(search_name.c.name_vector, search_name.c.nameaddress_vector)\
                  .where(search_name.c.place_id == result.place_id)

    result.name_keywords = []
    result.address_keywords = []

    word = conn.t.meta.tables['word']
    word_sql = sa.select(word.c.word_id, word.c.word_token, word.c.word)

    for name_tokens, address_tokens in await conn.execute(token_sql):
        # Resolve the name tokens first, then the address tokens.
        for tokens, keywords in ((name_tokens, result.name_keywords),
                                 (address_tokens, result.address_keywords)):
            for row in await conn.execute(word_sql.where(word.c.word_id == sa.any_(tokens))):
                keywords.append(WordInfo(*row))
619
620
async def complete_parented_places(conn: SearchConnection, result: BaseResult) -> None:
    """ Fill the 'parented_rows' of the result with the places of
        search rank 30 that have the result as their parent.

        Only results from the placex table can be parents. The list
        remains empty for results from other sources.
    """
    parented = AddressLines()
    result.parented_rows = parented

    if result.source_table == SourceTable.PLACEX:
        placex = conn.t.placex
        sql = _placex_select_address_row(conn, result.centroid)\
                .where(placex.c.parent_place_id == result.place_id)\
                .where(placex.c.rank_search == 30)

        for row in await conn.execute(sql):
            parented.append(_result_row_to_address_row(row))