]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/api/reverse.py
add penalty for single words that look like stop words
[nominatim.git] / nominatim / api / reverse.py
index 00605d45b3841ad5742e25287bb2d956fd1f10ad..e16742cfa34170f6a8afbd0da75b6c945d7635d7 100644 (file)
@@ -7,11 +7,13 @@
 """
 Implementation of reverse geocoding.
 """
-from typing import Optional, List, Callable, Type, Tuple, Dict, Any
+from typing import Optional, List, Callable, Type, Tuple, Dict, Any, cast, Union
+import functools
 
 import sqlalchemy as sa
 
-from nominatim.typing import SaColumn, SaSelect, SaFromClause, SaLabel, SaRow, SaBind
+from nominatim.typing import SaColumn, SaSelect, SaFromClause, SaLabel, SaRow,\
+                             SaBind, SaLambdaSelect
 from nominatim.api.connection import SearchConnection
 import nominatim.api.results as nres
 from nominatim.api.logging import log
@@ -27,6 +29,13 @@ RowFunc = Callable[[Optional[SaRow], Type[nres.ReverseResult]], Optional[nres.Re
 WKT_PARAM: SaBind = sa.bindparam('wkt', type_=Geometry)
 MAX_RANK_PARAM: SaBind = sa.bindparam('max_rank')
 
+def no_index(expr: SaColumn) -> SaColumn:
+    """ Wrap the given expression, so that the query planner will
+        refrain from using the expression for index lookup.
+    """
+    return sa.func.coalesce(sa.null(), expr) # pylint: disable=not-callable
+
+
 def _select_from_placex(t: SaFromClause, use_wkt: bool = True) -> SaSelect:
     """ Create a select statement with the columns relevant for reverse
         results.
@@ -47,6 +56,7 @@ def _select_from_placex(t: SaFromClause, use_wkt: bool = True) -> SaSelect:
                      t.c.importance, t.c.wikipedia,
                      t.c.parent_place_id, t.c.rank_address, t.c.rank_search,
                      centroid,
+                     t.c.linked_place_id, t.c.admin_level,
                      distance.label('distance'),
                      t.c.geometry.ST_Expand(0).label('bbox'))
 
@@ -74,12 +84,6 @@ def _locate_interpolation(table: SaFromClause) -> SaLabel:
                    else_=0).label('position')
 
 
-def _is_address_point(table: SaFromClause) -> SaColumn:
-    return sa.and_(table.c.rank_address == 30,
-                   sa.or_(table.c.housenumber != None,
-                          table.c.name.has_key('housename')))
-
-
 def _get_closest(*rows: Optional[SaRow]) -> Optional[SaRow]:
     return min(rows, key=lambda row: 1000 if row is None else row.distance)
 
@@ -89,9 +93,11 @@ class ReverseGeocoder:
         coordinate.
     """
 
-    def __init__(self, conn: SearchConnection, params: ReverseDetails) -> None:
+    def __init__(self, conn: SearchConnection, params: ReverseDetails,
+                 restrict_to_country_areas: bool = False) -> None:
         self.conn = conn
         self.params = params
+        self.restrict_to_country_areas = restrict_to_country_areas
 
         self.bind_params: Dict[str, Any] = {'max_rank': params.max_rank}
 
@@ -127,23 +133,20 @@ class ReverseGeocoder:
         return self.layer_enabled(DataLayer.RAILWAY, DataLayer.MANMADE, DataLayer.NATURAL)
 
 
-    def _add_geometry_columns(self, sql: SaSelect, col: SaColumn) -> SaSelect:
-        if not self.has_geometries():
-            return sql
-
+    def _add_geometry_columns(self, sql: SaLambdaSelect, col: SaColumn) -> SaSelect:
         out = []
 
         if self.params.geometry_simplification > 0.0:
             col = sa.func.ST_SimplifyPreserveTopology(col, self.params.geometry_simplification)
 
         if self.params.geometry_output & GeometryFormat.GEOJSON:
-            out.append(sa.func.ST_AsGeoJSON(col).label('geometry_geojson'))
+            out.append(sa.func.ST_AsGeoJSON(col, 7).label('geometry_geojson'))
         if self.params.geometry_output & GeometryFormat.TEXT:
             out.append(sa.func.ST_AsText(col).label('geometry_text'))
         if self.params.geometry_output & GeometryFormat.KML:
-            out.append(sa.func.ST_AsKML(col).label('geometry_kml'))
+            out.append(sa.func.ST_AsKML(col, 7).label('geometry_kml'))
         if self.params.geometry_output & GeometryFormat.SVG:
-            out.append(sa.func.ST_AsSVG(col).label('geometry_svg'))
+            out.append(sa.func.ST_AsSVG(col, 0, 7).label('geometry_svg'))
 
         return sql.add_columns(*out)
 
@@ -171,31 +174,37 @@ class ReverseGeocoder:
         """
         t = self.conn.t.placex
 
-        sql = _select_from_placex(t)\
-                .where(t.c.geometry.ST_DWithin(WKT_PARAM, distance))\
-                .where(t.c.indexed_status == 0)\
-                .where(t.c.linked_place_id == None)\
+        # PostgreSQL must not get the distance as a parameter because
+        # there is a danger it won't be able to properly estimate index use
+        # when used with prepared statements
+        diststr = sa.text(f"{distance}")
+
+        sql: SaLambdaSelect = sa.lambda_stmt(lambda: _select_from_placex(t)
+                .where(t.c.geometry.within_distance(WKT_PARAM, diststr))
+                .where(t.c.indexed_status == 0)
+                .where(t.c.linked_place_id == None)
                 .where(sa.or_(sa.not_(t.c.geometry.is_area()),
-                              t.c.centroid.ST_Distance(WKT_PARAM) < distance))\
-                .order_by('distance')\
-                .limit(1)
+                              t.c.centroid.ST_Distance(WKT_PARAM) < diststr))
+                .order_by('distance')
+                .limit(1))
 
-        sql = self._add_geometry_columns(sql, t.c.geometry)
+        if self.has_geometries():
+            sql = self._add_geometry_columns(sql, t.c.geometry)
 
-        restrict: List[SaColumn] = []
+        restrict: List[Union[SaColumn, Callable[[], SaColumn]]] = []
 
         if self.layer_enabled(DataLayer.ADDRESS):
-            restrict.append(sa.and_(t.c.rank_address >= 26,
-                                    t.c.rank_address <= min(29, self.max_rank)))
+            max_rank = min(29, self.max_rank)
+            restrict.append(lambda: no_index(t.c.rank_address).between(26, max_rank))
             if self.max_rank == 30:
-                restrict.append(_is_address_point(t))
+                restrict.append(lambda: sa.func.IsAddressPoint(t))
         if self.layer_enabled(DataLayer.POI) and self.max_rank == 30:
-            restrict.append(sa.and_(t.c.rank_search == 30,
-                                    t.c.class_.not_in(('place', 'building')),
-                                    sa.not_(t.c.geometry.is_line_like())))
+            restrict.append(lambda: sa.and_(no_index(t.c.rank_search) == 30,
+                                            t.c.class_.not_in(('place', 'building')),
+                                            sa.not_(t.c.geometry.is_line_like())))
         if self.has_feature_layers():
-            restrict.append(sa.and_(t.c.rank_search.between(26, MAX_RANK_PARAM),
-                                    t.c.rank_address == 0,
+            restrict.append(sa.and_(no_index(t.c.rank_search).between(26, MAX_RANK_PARAM),
+                                    no_index(t.c.rank_address) == 0,
                                     self._filter_by_layer(t)))
 
         if not restrict:
@@ -209,16 +218,21 @@ class ReverseGeocoder:
     async def _find_housenumber_for_street(self, parent_place_id: int) -> Optional[SaRow]:
         t = self.conn.t.placex
 
-        sql = _select_from_placex(t)\
-                .where(t.c.geometry.ST_DWithin(WKT_PARAM, 0.001))\
+        def _base_query() -> SaSelect:
+            return _select_from_placex(t)\
+                .where(t.c.geometry.within_distance(WKT_PARAM, 0.001))\
                 .where(t.c.parent_place_id == parent_place_id)\
-                .where(_is_address_point(t))\
+                .where(sa.func.IsAddressPoint(t))\
                 .where(t.c.indexed_status == 0)\
                 .where(t.c.linked_place_id == None)\
                 .order_by('distance')\
                 .limit(1)
 
-        sql = self._add_geometry_columns(sql, t.c.geometry)
+        sql: SaLambdaSelect
+        if self.has_geometries():
+            sql = self._add_geometry_columns(_base_query(), t.c.geometry)
+        else:
+            sql = sa.lambda_stmt(_base_query)
 
         return (await self.conn.execute(sql, self.bind_params)).one_or_none()
 
@@ -230,7 +244,7 @@ class ReverseGeocoder:
         sql = sa.select(t,
                         t.c.linegeo.ST_Distance(WKT_PARAM).label('distance'),
                         _locate_interpolation(t))\
-                .where(t.c.linegeo.ST_DWithin(WKT_PARAM, distance))\
+                .where(t.c.linegeo.within_distance(WKT_PARAM, distance))\
                 .where(t.c.startnumber != None)\
                 .order_by('distance')\
                 .limit(1)
@@ -241,11 +255,11 @@ class ReverseGeocoder:
         inner = sql.subquery('ipol')
 
         sql = sa.select(inner.c.place_id, inner.c.osm_id,
-                        inner.c.parent_place_id, inner.c.address,
-                        _interpolated_housenumber(inner),
-                        _interpolated_position(inner),
-                        inner.c.postcode, inner.c.country_code,
-                        inner.c.distance)
+                             inner.c.parent_place_id, inner.c.address,
+                             _interpolated_housenumber(inner),
+                             _interpolated_position(inner),
+                             inner.c.postcode, inner.c.country_code,
+                             inner.c.distance)
 
         if self.has_geometries():
             sub = sql.subquery('geom')
@@ -254,32 +268,32 @@ class ReverseGeocoder:
         return (await self.conn.execute(sql, self.bind_params)).one_or_none()
 
 
-    async def _find_tiger_number_for_street(self, parent_place_id: int,
-                                            parent_type: str,
-                                            parent_id: int) -> Optional[SaRow]:
+    async def _find_tiger_number_for_street(self, parent_place_id: int) -> Optional[SaRow]:
         t = self.conn.t.tiger
 
-        inner = sa.select(t,
-                          t.c.linegeo.ST_Distance(WKT_PARAM).label('distance'),
-                          _locate_interpolation(t))\
-                  .where(t.c.linegeo.ST_DWithin(WKT_PARAM, 0.001))\
-                  .where(t.c.parent_place_id == parent_place_id)\
-                  .order_by('distance')\
-                  .limit(1)\
-                  .subquery('tiger')
-
-        sql = sa.select(inner.c.place_id,
-                        inner.c.parent_place_id,
-                        sa.literal(parent_type).label('osm_type'),
-                        sa.literal(parent_id).label('osm_id'),
-                        _interpolated_housenumber(inner),
-                        _interpolated_position(inner),
-                        inner.c.postcode,
-                        inner.c.distance)
+        def _base_query() -> SaSelect:
+            inner = sa.select(t,
+                              t.c.linegeo.ST_Distance(WKT_PARAM).label('distance'),
+                              _locate_interpolation(t))\
+                      .where(t.c.linegeo.within_distance(WKT_PARAM, 0.001))\
+                      .where(t.c.parent_place_id == parent_place_id)\
+                      .order_by('distance')\
+                      .limit(1)\
+                      .subquery('tiger')
 
+            return sa.select(inner.c.place_id,
+                             inner.c.parent_place_id,
+                             _interpolated_housenumber(inner),
+                             _interpolated_position(inner),
+                             inner.c.postcode,
+                             inner.c.distance)
+
+        sql: SaLambdaSelect
         if self.has_geometries():
-            sub = sql.subquery('geom')
+            sub = _base_query().subquery('geom')
             sql = self._add_geometry_columns(sa.select(sub), sub.c.centroid)
+        else:
+            sql = sa.lambda_stmt(_base_query)
 
         return (await self.conn.execute(sql, self.bind_params)).one_or_none()
 
@@ -313,14 +327,15 @@ class ReverseGeocoder:
                     distance = addr_row.distance
                 elif row.country_code == 'us' and parent_place_id is not None:
                     log().comment('Find TIGER housenumber for street')
-                    addr_row = await self._find_tiger_number_for_street(parent_place_id,
-                                                                        row.osm_type,
-                                                                        row.osm_id)
+                    addr_row = await self._find_tiger_number_for_street(parent_place_id)
                     log().var_dump('Result (street Tiger housenumber)', addr_row)
 
                     if addr_row is not None:
+                        row_func = cast(RowFunc,
+                                        functools.partial(nres.create_from_tiger_row,
+                                                          osm_type=row.osm_type,
+                                                          osm_id=row.osm_id))
                         row = addr_row
-                        row_func = nres.create_from_tiger_row
             else:
                 distance = row.distance
 
@@ -344,59 +359,60 @@ class ReverseGeocoder:
         log().comment('Reverse lookup by larger address area features')
         t = self.conn.t.placex
 
-        # The inner SQL brings results in the right order, so that
-        # later only a minimum of results needs to be checked with ST_Contains.
-        inner = sa.select(t, sa.literal(0.0).label('distance'))\
-                  .where(t.c.rank_search.between(5, MAX_RANK_PARAM))\
-                  .where(t.c.rank_address.between(5, 25))\
-                  .where(t.c.geometry.is_area())\
-                  .where(t.c.geometry.intersects(WKT_PARAM))\
-                  .where(t.c.name != None)\
-                  .where(t.c.indexed_status == 0)\
-                  .where(t.c.linked_place_id == None)\
-                  .where(t.c.type != 'postcode')\
-                  .order_by(sa.desc(t.c.rank_search))\
-                  .limit(50)\
-                  .subquery('area')
+        def _base_query() -> SaSelect:
+            # The inner SQL brings results in the right order, so that
+            # later only a minimum of results needs to be checked with ST_Contains.
+            inner = sa.select(t, sa.literal(0.0).label('distance'))\
+                      .where(t.c.rank_search.between(5, MAX_RANK_PARAM))\
+                      .where(t.c.geometry.intersects(WKT_PARAM))\
+                      .where(sa.func.PlacexGeometryReverseLookuppolygon())\
+                      .order_by(sa.desc(t.c.rank_search))\
+                      .limit(50)\
+                      .subquery('area')
 
-        sql = _select_from_placex(inner, False)\
-                  .where(inner.c.geometry.ST_Contains(WKT_PARAM))\
-                  .order_by(sa.desc(inner.c.rank_search))\
-                  .limit(1)
+            return _select_from_placex(inner, False)\
+                      .where(inner.c.geometry.ST_Contains(WKT_PARAM))\
+                      .order_by(sa.desc(inner.c.rank_search))\
+                      .limit(1)
 
-        sql = self._add_geometry_columns(sql, inner.c.geometry)
+        sql: SaLambdaSelect = sa.lambda_stmt(_base_query)
+        if self.has_geometries():
+            sql = self._add_geometry_columns(sql, sa.literal_column('area.geometry'))
 
         address_row = (await self.conn.execute(sql, self.bind_params)).one_or_none()
         log().var_dump('Result (area)', address_row)
 
         if address_row is not None and address_row.rank_search < self.max_rank:
             log().comment('Search for better matching place nodes inside the area')
-            inner = sa.select(t,
+
+            address_rank = address_row.rank_search
+            address_id = address_row.place_id
+
+            def _place_inside_area_query() -> SaSelect:
+                inner = \
+                    sa.select(t,
                               t.c.geometry.ST_Distance(WKT_PARAM).label('distance'))\
-                      .where(t.c.osm_type == 'N')\
-                      .where(t.c.rank_search > address_row.rank_search)\
+                      .where(t.c.rank_search > address_rank)\
                       .where(t.c.rank_search <= MAX_RANK_PARAM)\
-                      .where(t.c.rank_address.between(5, 25))\
-                      .where(t.c.name != None)\
                       .where(t.c.indexed_status == 0)\
-                      .where(t.c.linked_place_id == None)\
-                      .where(t.c.type != 'postcode')\
-                      .where(t.c.geometry
-                                .ST_Buffer(sa.func.reverse_place_diameter(t.c.rank_search))
-                                .intersects(WKT_PARAM))\
+                      .where(sa.func.IntersectsReverseDistance(t, WKT_PARAM))\
                       .order_by(sa.desc(t.c.rank_search))\
                       .limit(50)\
                       .subquery('places')
 
-            touter = self.conn.t.placex.alias('outer')
-            sql = _select_from_placex(inner, False)\
-                  .join(touter, touter.c.geometry.ST_Contains(inner.c.geometry))\
-                  .where(touter.c.place_id == address_row.place_id)\
-                  .where(inner.c.distance < sa.func.reverse_place_diameter(inner.c.rank_search))\
-                  .order_by(sa.desc(inner.c.rank_search), inner.c.distance)\
-                  .limit(1)
-
-            sql = self._add_geometry_columns(sql, inner.c.geometry)
+                touter = t.alias('outer')
+                return _select_from_placex(inner, False)\
+                    .join(touter, touter.c.geometry.ST_Contains(inner.c.geometry))\
+                    .where(touter.c.place_id == address_id)\
+                    .where(sa.func.IsBelowReverseDistance(inner.c.distance, inner.c.rank_search))\
+                    .order_by(sa.desc(inner.c.rank_search), inner.c.distance)\
+                    .limit(1)
+
+            if self.has_geometries():
+                sql = self._add_geometry_columns(_place_inside_area_query(),
+                                                 sa.literal_column('places.geometry'))
+            else:
+                sql = sa.lambda_stmt(_place_inside_area_query)
 
             place_address_row = (await self.conn.execute(sql, self.bind_params)).one_or_none()
             log().var_dump('Result (place node)', place_address_row)
@@ -417,10 +433,9 @@ class ReverseGeocoder:
                   .where(t.c.indexed_status == 0)\
                   .where(t.c.linked_place_id == None)\
                   .where(self._filter_by_layer(t))\
-                  .where(t.c.geometry
-                                .ST_Buffer(sa.func.reverse_place_diameter(t.c.rank_search))
-                                .intersects(WKT_PARAM))\
+                  .where(t.c.geometry.intersects(sa.func.ST_Expand(WKT_PARAM, 0.007)))\
                   .order_by(sa.desc(t.c.rank_search))\
+                  .order_by('distance')\
                   .limit(50)\
                   .subquery()
 
@@ -430,7 +445,8 @@ class ReverseGeocoder:
                   .order_by(sa.desc(inner.c.rank_search), inner.c.distance)\
                   .limit(1)
 
-        sql = self._add_geometry_columns(sql, inner.c.geometry)
+        if self.has_geometries():
+            sql = self._add_geometry_columns(sql, inner.c.geometry)
 
         row = (await self.conn.execute(sql, self.bind_params)).one_or_none()
         log().var_dump('Result (non-address feature)', row)
@@ -456,7 +472,7 @@ class ReverseGeocoder:
         return _get_closest(address_row, other_row)
 
 
-    async def lookup_country(self) -> Optional[SaRow]:
+    async def lookup_country_codes(self) -> List[str]:
         """ Lookup the country for the current search.
         """
         log().section('Reverse lookup by country code')
@@ -464,8 +480,16 @@ class ReverseGeocoder:
         sql = sa.select(t.c.country_code).distinct()\
                 .where(t.c.geometry.ST_Contains(WKT_PARAM))
 
-        ccodes = tuple((r[0] for r in await self.conn.execute(sql, self.bind_params)))
+        ccodes = [cast(str, r[0]) for r in await self.conn.execute(sql, self.bind_params)]
         log().var_dump('Country codes', ccodes)
+        return ccodes
+
+
+    async def lookup_country(self, ccodes: List[str]) -> Optional[SaRow]:
+        """ Lookup the country for the current search.
+        """
+        if not ccodes:
+            ccodes = await self.lookup_country_codes()
 
         if not ccodes:
             return None
@@ -474,30 +498,30 @@ class ReverseGeocoder:
         if self.max_rank > 4:
             log().comment('Search for place nodes in country')
 
-            inner = sa.select(t,
+            def _base_query() -> SaSelect:
+                inner = \
+                    sa.select(t,
                               t.c.geometry.ST_Distance(WKT_PARAM).label('distance'))\
-                      .where(t.c.osm_type == 'N')\
                       .where(t.c.rank_search > 4)\
                       .where(t.c.rank_search <= MAX_RANK_PARAM)\
-                      .where(t.c.rank_address.between(5, 25))\
-                      .where(t.c.name != None)\
                       .where(t.c.indexed_status == 0)\
-                      .where(t.c.linked_place_id == None)\
-                      .where(t.c.type != 'postcode')\
                       .where(t.c.country_code.in_(ccodes))\
-                      .where(t.c.geometry
-                                .ST_Buffer(sa.func.reverse_place_diameter(t.c.rank_search))
-                                .intersects(WKT_PARAM))\
+                      .where(sa.func.IntersectsReverseDistance(t, WKT_PARAM))\
                       .order_by(sa.desc(t.c.rank_search))\
                       .limit(50)\
-                      .subquery()
+                      .subquery('area')
 
-            sql = _select_from_placex(inner, False)\
-                  .where(inner.c.distance < sa.func.reverse_place_diameter(inner.c.rank_search))\
-                  .order_by(sa.desc(inner.c.rank_search), inner.c.distance)\
-                  .limit(1)
+                return _select_from_placex(inner, False)\
+                    .where(sa.func.IsBelowReverseDistance(inner.c.distance, inner.c.rank_search))\
+                    .order_by(sa.desc(inner.c.rank_search), inner.c.distance)\
+                    .limit(1)
 
-            sql = self._add_geometry_columns(sql, inner.c.geometry)
+            sql: SaLambdaSelect
+            if self.has_geometries():
+                sql = self._add_geometry_columns(_base_query(),
+                                                 sa.literal_column('area.geometry'))
+            else:
+                sql = sa.lambda_stmt(_base_query)
 
             address_row = (await self.conn.execute(sql, self.bind_params)).one_or_none()
             log().var_dump('Result (addressable place node)', address_row)
@@ -506,15 +530,19 @@ class ReverseGeocoder:
 
         if address_row is None:
             # Still nothing, then return a country with the appropriate country code.
-            sql = _select_from_placex(t)\
-                      .where(t.c.country_code.in_(ccodes))\
-                      .where(t.c.rank_address == 4)\
-                      .where(t.c.rank_search == 4)\
-                      .where(t.c.linked_place_id == None)\
-                      .order_by('distance')\
-                      .limit(1)
-
-            sql = self._add_geometry_columns(sql, t.c.geometry)
+            def _country_base_query() -> SaSelect:
+                return _select_from_placex(t)\
+                         .where(t.c.country_code.in_(ccodes))\
+                         .where(t.c.rank_address == 4)\
+                         .where(t.c.rank_search == 4)\
+                         .where(t.c.linked_place_id == None)\
+                         .order_by('distance')\
+                         .limit(1)
+
+            if self.has_geometries():
+                sql = self._add_geometry_columns(_country_base_query(), t.c.geometry)
+            else:
+                sql = sa.lambda_stmt(_country_base_query)
 
             address_row = (await self.conn.execute(sql, self.bind_params)).one_or_none()
 
@@ -537,10 +565,19 @@ class ReverseGeocoder:
             row, tmp_row_func = await self.lookup_street_poi()
             if row is not None:
                 row_func = tmp_row_func
-        if row is None and self.max_rank > 4:
-            row = await self.lookup_area()
-        if row is None and self.layer_enabled(DataLayer.ADDRESS):
-            row = await self.lookup_country()
+
+        if row is None:
+            if self.restrict_to_country_areas:
+                ccodes = await self.lookup_country_codes()
+                if not ccodes:
+                    return None
+            else:
+                ccodes = []
+
+            if self.max_rank > 4:
+                row = await self.lookup_area()
+            if row is None and self.layer_enabled(DataLayer.ADDRESS):
+                row = await self.lookup_country(ccodes)
 
         result = row_func(row, nres.ReverseResult)
         if result is not None: