From: Sarah Hoffmann Date: Tue, 9 Apr 2024 12:58:53 +0000 (+0200) Subject: Merge pull request #3389 from mtmail/cli-autodiscover-valid-formats X-Git-Tag: deploy~9^2~1 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/1a0f851d0dadb9cc2067acf4baba733dc70b4e3b?hp=28444d94350287d6c27675e740ec8fa64a5403ae Merge pull request #3389 from mtmail/cli-autodiscover-valid-formats CLI: get valid --format values via autodiscover --- diff --git a/.pylintrc b/.pylintrc index c1384c00..27214bae 100644 --- a/.pylintrc +++ b/.pylintrc @@ -13,6 +13,6 @@ ignored-classes=NominatimArgs,closing # 'too-many-ancestors' is triggered already by deriving from UserDict # 'not-context-manager' disabled because it causes false positives once # typed Python is enabled. See also https://github.com/PyCQA/pylint/issues/5273 -disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use,not-context-manager,use-dict-literal,chained-comparison,attribute-defined-outside-init +disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use,not-context-manager,use-dict-literal,chained-comparison,attribute-defined-outside-init,too-many-boolean-expressions good-names=i,j,x,y,m,t,fd,db,cc,x1,x2,y1,y2,pt,k,v,nr diff --git a/docs/admin/Import.md b/docs/admin/Import.md index 884dd44d..7b227410 100644 --- a/docs/admin/Import.md +++ b/docs/admin/Import.md @@ -153,7 +153,7 @@ if you plan to use the installation only for exports to a [photon](https://photon.komoot.io/) database, then you can set up a database without search indexes. Add `--reverse-only` to your setup command above. -This saves about 5% of disk space. +This saves about 5% of disk space, but import time won't be significantly faster.
### Filtering Imported Data diff --git a/nominatim/api/search/db_search_builder.py b/nominatim/api/search/db_search_builder.py index e27a24d6..c2f98c47 100644 --- a/nominatim/api/search/db_search_builder.py +++ b/nominatim/api/search/db_search_builder.py @@ -227,8 +227,6 @@ class SearchBuilder: name_fulls = self.query.get_tokens(name, TokenType.WORD) if name_fulls: fulls_count = sum(t.count for t in name_fulls) - if len(name_partials) == 1: - penalty += min(0.5, max(0, (exp_count - 50 * fulls_count) / (2000 * fulls_count))) if partials_indexed: penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed) diff --git a/nominatim/api/search/db_searches.py b/nominatim/api/search/db_searches.py index 3f294de7..d74812e6 100644 --- a/nominatim/api/search/db_searches.py +++ b/nominatim/api/search/db_searches.py @@ -645,97 +645,145 @@ class PlaceSearch(AbstractSearch): self.expected_count = expected_count - async def lookup(self, conn: SearchConnection, - details: SearchDetails) -> nres.SearchResults: - """ Find results for the search in the database. + def _inner_search_name_cte(self, conn: SearchConnection, + details: SearchDetails) -> 'sa.CTE': + """ Create a subquery that preselects the rows in the search_name + table. 
""" - t = conn.t.placex - tsearch = conn.t.search_name - - sql: SaLambdaSelect = sa.lambda_stmt(lambda: - _select_placex(t).where(t.c.place_id == tsearch.c.place_id)) - - - if details.geometry_output: - sql = _add_geometry_columns(sql, t.c.geometry, details) + t = conn.t.search_name penalty: SaExpression = sa.literal(self.penalty) for ranking in self.rankings: - penalty += ranking.sql_penalty(tsearch) + penalty += ranking.sql_penalty(t) + + sql = sa.select(t.c.place_id, t.c.search_rank, t.c.address_rank, + t.c.country_code, t.c.centroid, + t.c.name_vector, t.c.nameaddress_vector, + sa.case((t.c.importance > 0, t.c.importance), + else_=0.40001-(sa.cast(t.c.search_rank, sa.Float())/75)) + .label('importance'), + penalty.label('penalty')) for lookup in self.lookups: - sql = sql.where(lookup.sql_condition(tsearch)) + sql = sql.where(lookup.sql_condition(t)) if self.countries: - sql = sql.where(tsearch.c.country_code.in_(self.countries.values)) + sql = sql.where(t.c.country_code.in_(self.countries.values)) if self.postcodes: # if a postcode is given, don't search for state or country level objects - sql = sql.where(tsearch.c.address_rank > 9) - tpc = conn.t.postcode - pcs = self.postcodes.values - if self.expected_count > 5000: + sql = sql.where(t.c.address_rank > 9) + if self.expected_count > 10000: # Many results expected. Restrict by postcode. 
+ tpc = conn.t.postcode sql = sql.where(sa.select(tpc.c.postcode) - .where(tpc.c.postcode.in_(pcs)) - .where(tsearch.c.centroid.within_distance(tpc.c.geometry, 0.12)) + .where(tpc.c.postcode.in_(self.postcodes.values)) + .where(t.c.centroid.within_distance(tpc.c.geometry, 0.4)) .exists()) - # Less results, only have a preference for close postcodes - pc_near = sa.select(sa.func.min(tpc.c.geometry.ST_Distance(tsearch.c.centroid)))\ + if details.viewbox is not None: + if details.bounded_viewbox: + sql = sql.where(t.c.centroid + .intersects(VIEWBOX_PARAM, + use_index=details.viewbox.area < 0.2)) + elif not self.postcodes and not self.housenumbers and self.expected_count >= 10000: + sql = sql.where(t.c.centroid + .intersects(VIEWBOX2_PARAM, + use_index=details.viewbox.area < 0.5)) + + if details.near is not None and details.near_radius is not None: + if details.near_radius < 0.1: + sql = sql.where(t.c.centroid.within_distance(NEAR_PARAM, + NEAR_RADIUS_PARAM)) + else: + sql = sql.where(t.c.centroid + .ST_Distance(NEAR_PARAM) < NEAR_RADIUS_PARAM) + + if self.housenumbers: + sql = sql.where(t.c.address_rank.between(16, 30)) + else: + if details.excluded: + sql = sql.where(_exclude_places(t)) + if details.min_rank > 0: + sql = sql.where(sa.or_(t.c.address_rank >= MIN_RANK_PARAM, + t.c.search_rank >= MIN_RANK_PARAM)) + if details.max_rank < 30: + sql = sql.where(sa.or_(t.c.address_rank <= MAX_RANK_PARAM, + t.c.search_rank <= MAX_RANK_PARAM)) + + inner = sql.limit(10000).order_by(sa.desc(sa.text('importance'))).subquery() + + sql = sa.select(inner.c.place_id, inner.c.search_rank, inner.c.address_rank, + inner.c.country_code, inner.c.centroid, inner.c.importance, + inner.c.penalty) + + # If the query is not an address search or has a geographic preference, + # preselect most important items to restrict the number of places + # that need to be looked up in placex. 
+ if not self.housenumbers\ + and (details.viewbox is None or details.bounded_viewbox)\ + and (details.near is None or details.near_radius is not None)\ + and not self.qualifiers: + sql = sql.add_columns(sa.func.first_value(inner.c.penalty - inner.c.importance) + .over(order_by=inner.c.penalty - inner.c.importance) + .label('min_penalty')) + + inner = sql.subquery() + + sql = sa.select(inner.c.place_id, inner.c.search_rank, inner.c.address_rank, + inner.c.country_code, inner.c.centroid, inner.c.importance, + inner.c.penalty)\ + .where(inner.c.penalty - inner.c.importance < inner.c.min_penalty + 0.5) + + return sql.cte('searches') + + + async def lookup(self, conn: SearchConnection, + details: SearchDetails) -> nres.SearchResults: + """ Find results for the search in the database. + """ + t = conn.t.placex + tsearch = self._inner_search_name_cte(conn, details) + + sql = _select_placex(t).join(tsearch, t.c.place_id == tsearch.c.place_id) + + if details.geometry_output: + sql = _add_geometry_columns(sql, t.c.geometry, details) + + penalty: SaExpression = tsearch.c.penalty + + if self.postcodes: + tpc = conn.t.postcode + pcs = self.postcodes.values + + pc_near = sa.select(sa.func.min(tpc.c.geometry.ST_Distance(t.c.centroid)))\ .where(tpc.c.postcode.in_(pcs))\ .scalar_subquery() penalty += sa.case((t.c.postcode.in_(pcs), 0.0), else_=sa.func.coalesce(pc_near, cast(SaColumn, 2.0))) - if details.viewbox is not None: - if details.bounded_viewbox: - sql = sql.where(tsearch.c.centroid - .intersects(VIEWBOX_PARAM, - use_index=details.viewbox.area < 0.2)) - elif not self.postcodes and not self.housenumbers and self.expected_count >= 10000: - sql = sql.where(tsearch.c.centroid - .intersects(VIEWBOX2_PARAM, - use_index=details.viewbox.area < 0.5)) - else: - penalty += sa.case((t.c.geometry.intersects(VIEWBOX_PARAM, use_index=False), 0.0), - (t.c.geometry.intersects(VIEWBOX2_PARAM, use_index=False), 0.5), - else_=1.0) + if details.viewbox is not None and not 
details.bounded_viewbox: + penalty += sa.case((t.c.geometry.intersects(VIEWBOX_PARAM, use_index=False), 0.0), + (t.c.geometry.intersects(VIEWBOX2_PARAM, use_index=False), 0.5), + else_=1.0) if details.near is not None: - if details.near_radius is not None: - if details.near_radius < 0.1: - sql = sql.where(tsearch.c.centroid.within_distance(NEAR_PARAM, - NEAR_RADIUS_PARAM)) - else: - sql = sql.where(tsearch.c.centroid - .ST_Distance(NEAR_PARAM) < NEAR_RADIUS_PARAM) sql = sql.add_columns((-tsearch.c.centroid.ST_Distance(NEAR_PARAM)) .label('importance')) sql = sql.order_by(sa.desc(sa.text('importance'))) else: - if self.expected_count < 10000\ - or (details.viewbox is not None and details.viewbox.area < 0.5): - sql = sql.order_by( - penalty - sa.case((tsearch.c.importance > 0, tsearch.c.importance), - else_=0.40001-(sa.cast(tsearch.c.search_rank, sa.Float())/75))) - sql = sql.add_columns(t.c.importance) - + sql = sql.order_by(penalty - tsearch.c.importance) + sql = sql.add_columns(tsearch.c.importance) - sql = sql.add_columns(penalty.label('accuracy')) - if self.expected_count < 10000: - sql = sql.order_by(sa.text('accuracy')) + sql = sql.add_columns(penalty.label('accuracy'))\ + .order_by(sa.text('accuracy')) if self.housenumbers: hnr_list = '|'.join(self.housenumbers.values) - sql = sql.where(tsearch.c.address_rank.between(16, 30))\ - .where(sa.or_(tsearch.c.address_rank < 30, - sa.func.RegexpWord(hnr_list, t.c.housenumber))) - - # Cross check for housenumbers, need to do that on a rather large - # set. Worst case there are 40.000 main streets in OSM. 
- inner = sql.limit(10000).subquery() + inner = sql.where(sa.or_(tsearch.c.address_rank < 30, + sa.func.RegexpWord(hnr_list, t.c.housenumber)))\ + .subquery() # Housenumbers from placex thnr = conn.t.placex.alias('hnr') @@ -783,14 +831,6 @@ class PlaceSearch(AbstractSearch): .where(t.c.indexed_status == 0) if self.qualifiers: sql = sql.where(self.qualifiers.sql_restrict(t)) - if details.excluded: - sql = sql.where(_exclude_places(tsearch)) - if details.min_rank > 0: - sql = sql.where(sa.or_(tsearch.c.address_rank >= MIN_RANK_PARAM, - tsearch.c.search_rank >= MIN_RANK_PARAM)) - if details.max_rank < 30: - sql = sql.where(sa.or_(tsearch.c.address_rank <= MAX_RANK_PARAM, - tsearch.c.search_rank <= MAX_RANK_PARAM)) if details.layers is not None: sql = sql.where(_filter_by_layer(t, details.layers)) diff --git a/nominatim/api/search/token_assignment.py b/nominatim/api/search/token_assignment.py index ca907b79..95eb7f70 100644 --- a/nominatim/api/search/token_assignment.py +++ b/nominatim/api/search/token_assignment.py @@ -225,13 +225,14 @@ class _TokenSequence: def _adapt_penalty_from_priors(self, priors: int, new_dir: int) -> bool: - if priors == 2: - self.penalty += 1.0 - elif priors > 2: + if priors >= 2: if self.direction == 0: self.direction = new_dir else: - return False + if priors == 2: + self.penalty += 0.8 + else: + return False return True diff --git a/nominatim/api/v1/format_json.py b/nominatim/api/v1/format_json.py index 80560c95..1c17a032 100644 --- a/nominatim/api/v1/format_json.py +++ b/nominatim/api/v1/format_json.py @@ -247,7 +247,8 @@ def format_base_geocodejson(results: Union[napi.ReverseResults, napi.SearchResul out.key('admin').start_object() if result.address_rows: for line in result.address_rows: - if line.isaddress and (line.admin_level or 15) < 15 and line.local_name: + if line.isaddress and (line.admin_level or 15) < 15 and line.local_name \ + and line.category[0] == 'boundary' and line.category[1] == 'administrative': 
out.keyval(f"level{line.admin_level}", line.local_name) out.end_object().next() diff --git a/test/python/api/search/test_search_places.py b/test/python/api/search/test_search_places.py index c446a35f..5e06776d 100644 --- a/test/python/api/search/test_search_places.py +++ b/test/python/api/search/test_search_places.py @@ -68,7 +68,7 @@ class TestNameOnlySearches: ([20], [101, 100])]) def test_lookup_all_match(self, apiobj, frontend, lookup_type, rank, res): lookup = FieldLookup('name_vector', [1,2], lookup_type) - ranking = FieldRanking('name_vector', 0.9, [RankedTokens(0.0, rank)]) + ranking = FieldRanking('name_vector', 0.4, [RankedTokens(0.0, rank)]) results = run_search(apiobj, frontend, 0.1, [lookup], [ranking]) @@ -78,7 +78,7 @@ class TestNameOnlySearches: @pytest.mark.parametrize('lookup_type', [LookupAll, Restrict]) def test_lookup_all_partial_match(self, apiobj, frontend, lookup_type): lookup = FieldLookup('name_vector', [1,20], lookup_type) - ranking = FieldRanking('name_vector', 0.9, [RankedTokens(0.0, [21])]) + ranking = FieldRanking('name_vector', 0.4, [RankedTokens(0.0, [21])]) results = run_search(apiobj, frontend, 0.1, [lookup], [ranking]) @@ -89,7 +89,7 @@ class TestNameOnlySearches: ([20], [101, 100])]) def test_lookup_any_match(self, apiobj, frontend, rank, res): lookup = FieldLookup('name_vector', [11,21], LookupAny) - ranking = FieldRanking('name_vector', 0.9, [RankedTokens(0.0, rank)]) + ranking = FieldRanking('name_vector', 0.4, [RankedTokens(0.0, rank)]) results = run_search(apiobj, frontend, 0.1, [lookup], [ranking]) @@ -98,7 +98,7 @@ class TestNameOnlySearches: def test_lookup_any_partial_match(self, apiobj, frontend): lookup = FieldLookup('name_vector', [20], LookupAll) - ranking = FieldRanking('name_vector', 0.9, [RankedTokens(0.0, [21])]) + ranking = FieldRanking('name_vector', 0.4, [RankedTokens(0.0, [21])]) results = run_search(apiobj, frontend, 0.1, [lookup], [ranking]) @@ -109,7 +109,7 @@ class TestNameOnlySearches: 
@pytest.mark.parametrize('cc,res', [('us', 100), ('mx', 101)]) def test_lookup_restrict_country(self, apiobj, frontend, cc, res): lookup = FieldLookup('name_vector', [1,2], LookupAll) - ranking = FieldRanking('name_vector', 0.9, [RankedTokens(0.0, [10])]) + ranking = FieldRanking('name_vector', 0.4, [RankedTokens(0.0, [10])]) results = run_search(apiobj, frontend, 0.1, [lookup], [ranking], ccodes=[cc]) @@ -118,7 +118,7 @@ class TestNameOnlySearches: def test_lookup_restrict_placeid(self, apiobj, frontend): lookup = FieldLookup('name_vector', [1,2], LookupAll) - ranking = FieldRanking('name_vector', 0.9, [RankedTokens(0.0, [10])]) + ranking = FieldRanking('name_vector', 0.4, [RankedTokens(0.0, [10])]) results = run_search(apiobj, frontend, 0.1, [lookup], [ranking], details=SearchDetails(excluded=[101])) @@ -132,7 +132,7 @@ class TestNameOnlySearches: napi.GeometryFormat.TEXT]) def test_return_geometries(self, apiobj, frontend, geom): lookup = FieldLookup('name_vector', [20], LookupAll) - ranking = FieldRanking('name_vector', 0.9, [RankedTokens(0.0, [21])]) + ranking = FieldRanking('name_vector', 0.4, [RankedTokens(0.0, [21])]) results = run_search(apiobj, frontend, 0.1, [lookup], [ranking], details=SearchDetails(geometry_output=geom)) @@ -149,7 +149,7 @@ class TestNameOnlySearches: centroid=(5.6, 4.3)) lookup = FieldLookup('name_vector', [55], LookupAll) - ranking = FieldRanking('name_vector', 0.9, [RankedTokens(0.0, [21])]) + ranking = FieldRanking('name_vector', 0.4, [RankedTokens(0.0, [21])]) results = run_search(apiobj, frontend, 0.1, [lookup], [ranking], details=SearchDetails(geometry_output=napi.GeometryFormat.GEOJSON, @@ -191,7 +191,7 @@ class TestNameOnlySearches: def test_prefer_near(self, apiobj, frontend): lookup = FieldLookup('name_vector', [1, 2], LookupAll) - ranking = FieldRanking('name_vector', 0.9, [RankedTokens(0.0, [21])]) + ranking = FieldRanking('name_vector', 0.4, [RankedTokens(0.0, [21])]) api = frontend(apiobj, options=APIOPTIONS) results = 
run_search(api, None, 0.1, [lookup], [ranking]) @@ -368,9 +368,9 @@ def test_name_and_postcode(apiobj, frontend, wcount, rids): apiobj.add_placex(place_id=991, class_='highway', type='service', rank_search=27, rank_address=27, postcode='11221', - centroid=(10.1, 10.1), - geometry='LINESTRING(9.995 10.1, 10.005 10.1)') - apiobj.add_search_name(991, names=[111], centroid=(10.1, 10.1), + centroid=(10.3, 10.3), + geometry='LINESTRING(9.995 10.3, 10.005 10.3)') + apiobj.add_search_name(991, names=[111], centroid=(10.3, 10.3), search_rank=27, address_rank=27) apiobj.add_postcode(place_id=100, country_code='ch', postcode='11225', geometry='POINT(10 10)')