From: Sarah Hoffmann Date: Wed, 2 Apr 2025 10:01:50 +0000 (+0200) Subject: release 5.1.0post2 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/HEAD?ds=inline;hp=0a7624039bc4189fd999eb23ea5a175a1c4b2dfb release 5.1.0post2 --- diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 311414fe..6c90cd3c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -113,3 +113,5 @@ Checklist for releases: * run `nominatim --version` to confirm correct version * [ ] tag new release and add a release on github.com * [ ] build pip packages and upload to pypi + * `make build` + * `twine upload dist/*` diff --git a/ChangeLog b/ChangeLog index 9ffe4038..dff198eb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,22 @@ +5.1.0 + * replace datrie with simple internal trie implementation + * add pattern-based postcode parser for queries, + postcodes no longer need to be present in OSM to be found + * take variants into account when computing token similarity + * add extratags output to geocodejson format + * fix default layer setting used for structured queries + * update abbreviation lists for Russian and English + (thanks @shoorick, @IvanShift, @mhsrn21) + * fix variant generation for Norwegian + * fix normalization around space-like characters + * improve postcode search and handling of postcodes in queries + * reorganise internal query structure and get rid of slow enums + * enable code linting for tests + * various code modernisations in test code (thanks @eumiro) + * remove setting osm2pgsql location via config.lib_dir + * make SQL functions parallel safe as far as possible (thanks @otbutz) + * various fixes and improvements to documentation (thanks @TuringVerified) + 5.0.0 * increase required versions for PostgreSQL (12+), PostGIS (3.0+) * remove installation via cmake and debundle osm2pgsql diff --git a/SECURITY.md b/SECURITY.md index e3660bcd..98295e1f 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -9,7 +9,8 @@ versions. 
| Version | End of support for security updates | | ------- | ----------------------------------- | -| 5.0.x | 2027-02-06 +| 5.1.x | 2027-04-01 | +| 5.0.x | 2027-02-06 | | 4.5.x | 2026-09-12 | | 4.4.x | 2026-03-07 | | 4.3.x | 2025-09-07 | diff --git a/docs/customize/Settings.md b/docs/customize/Settings.md index 94726ca7..edf2241b 100644 --- a/docs/customize/Settings.md +++ b/docs/customize/Settings.md @@ -602,6 +602,43 @@ results gathered so far. Note that under high load you may observe that users receive different results than usual without seeing an error. This may cause some confusion. +#### NOMINATIM_OUTPUT_NAMES + +| Summary | | +| -------------- | --------------------------------------------------- | +| **Description:** | Specifies order of name tags | +| **Format:** | string: comma-separated list of tag names | +| **Default:** | name:XX,name,brand,official_name:XX,short_name:XX,official_name,short_name,ref | + +Specifies the order in which different name tags are used. +The values in this list determine the preferred order of name variants, +including language-specific names (in OSM: the name tag with and without any language suffix). + +Comma-separated list, where :XX stands for language suffix +(e.g. name:en) and no :XX stands for general tags (e.g. name). + +See also [NOMINATIM_DEFAULT_LANGUAGE](#nominatim_default_language). + +!!! note + If NOMINATIM_OUTPUT_NAMES = `name:XX,name,short_name:XX,short_name` the search follows + + ``` + 'name', 'short_name' + ``` + + if we have no preferred language order for showing search results. + + For languages ['en', 'es'] the search follows + + ``` + 'name:en', 'name:es', + 'name', + 'short_name:en', 'short_name:es', + 'short_name' + ``` + + For those familiar with the internal implementation, the `_place_*` expansion is added, but to simplify, it is not included in this example. 
+ ### Logging Settings #### NOMINATIM_LOG_DB diff --git a/docs/customize/Tokenizers.md b/docs/customize/Tokenizers.md index d290c148..23db34c9 100644 --- a/docs/customize/Tokenizers.md +++ b/docs/customize/Tokenizers.md @@ -67,7 +67,13 @@ Here is an example configuration file: ``` yaml query-preprocessing: - - normalize + - step: split_japanese_phrases + - step: regex_replace + replacements: + - pattern: https?://[^\s]* # Filter URLs starting with http or https + replace: '' + - step: normalize + normalization: - ":: lower ()" - "ß > 'ss'" # German szet is unambiguously equal to double ss @@ -88,8 +94,8 @@ token-analysis: replacements: ['ä', 'ae'] ``` -The configuration file contains four sections: -`normalization`, `transliteration`, `sanitizers` and `token-analysis`. +The configuration file contains five sections: +`query-preprocessing`, `normalization`, `transliteration`, `sanitizers` and `token-analysis`. #### Query preprocessing @@ -106,6 +112,19 @@ The following is a list of preprocessors that are shipped with Nominatim. 
heading_level: 6 docstring_section_style: spacy +##### regex-replace + +::: nominatim_api.query_preprocessing.regex_replace + options: + members: False + heading_level: 6 + docstring_section_style: spacy + description: + This option runs any given regex pattern on the input and replaces values accordingly + replacements: + - pattern: regex pattern + replace: string to replace with + #### Normalization and Transliteration diff --git a/docs/develop/Development-Environment.md b/docs/develop/Development-Environment.md index 9ade7916..709f9b7d 100644 --- a/docs/develop/Development-Environment.md +++ b/docs/develop/Development-Environment.md @@ -69,9 +69,9 @@ To set up the virtual environment with all necessary packages run: ```sh virtualenv ~/nominatim-dev-venv ~/nominatim-dev-venv/bin/pip install\ - psutil psycopg[binary] PyICU SQLAlchemy \ + psutil 'psycopg[binary]' PyICU SQLAlchemy \ python-dotenv jinja2 pyYAML behave \ - mkdocs mkdocstrings mkdocs-gen-files pytest pytest-asyncio flake8 \ + mkdocs 'mkdocstrings[python]' mkdocs-gen-files pytest pytest-asyncio flake8 \ types-jinja2 types-markupsafe types-psutil types-psycopg2 \ types-pygments types-pyyaml types-requests types-ujson \ types-urllib3 typing-extensions unicorn falcon starlette \ diff --git a/lib-sql/functions.sql b/lib-sql/functions.sql index 158969d9..737a3f21 100644 --- a/lib-sql/functions.sql +++ b/lib-sql/functions.sql @@ -8,7 +8,6 @@ {% include('functions/utils.sql') %} {% include('functions/ranking.sql') %} {% include('functions/importance.sql') %} -{% include('functions/address_lookup.sql') %} {% include('functions/interpolation.sql') %} {% if 'place' in db.tables %} diff --git a/lib-sql/functions/address_lookup.sql b/lib-sql/functions/address_lookup.sql deleted file mode 100644 index b59b7656..00000000 --- a/lib-sql/functions/address_lookup.sql +++ /dev/null @@ -1,334 +0,0 @@ --- SPDX-License-Identifier: GPL-2.0-only --- --- This file is part of Nominatim. 
(https://nominatim.org) --- --- Copyright (C) 2022 by the Nominatim developer community. --- For a full list of authors see the git log. - --- Functions for returning address information for a place. - -DROP TYPE IF EXISTS addressline CASCADE; -CREATE TYPE addressline as ( - place_id BIGINT, - osm_type CHAR(1), - osm_id BIGINT, - name HSTORE, - class TEXT, - type TEXT, - place_type TEXT, - admin_level INTEGER, - fromarea BOOLEAN, - isaddress BOOLEAN, - rank_address INTEGER, - distance FLOAT -); - - -CREATE OR REPLACE FUNCTION get_name_by_language(name hstore, languagepref TEXT[]) - RETURNS TEXT - AS $$ -DECLARE - result TEXT; -BEGIN - IF name is null THEN - RETURN null; - END IF; - - FOR j IN 1..array_upper(languagepref,1) LOOP - IF name ? languagepref[j] THEN - result := trim(name->languagepref[j]); - IF result != '' THEN - return result; - END IF; - END IF; - END LOOP; - - -- as a fallback - take the last element since it is the default name - RETURN trim((avals(name))[array_length(avals(name), 1)]); -END; -$$ -LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; - - ---housenumber only needed for tiger data -CREATE OR REPLACE FUNCTION get_address_by_language(for_place_id BIGINT, - housenumber INTEGER, - languagepref TEXT[]) - RETURNS TEXT - AS $$ -DECLARE - result TEXT[]; - currresult TEXT; - prevresult TEXT; - location RECORD; -BEGIN - - result := '{}'; - prevresult := ''; - - FOR location IN - SELECT name, - CASE WHEN place_id = for_place_id THEN 99 ELSE rank_address END as rank_address - FROM get_addressdata(for_place_id, housenumber) - WHERE isaddress order by rank_address desc - LOOP - currresult := trim(get_name_by_language(location.name, languagepref)); - IF currresult != prevresult AND currresult IS NOT NULL - AND result[(100 - location.rank_address)] IS NULL - THEN - result[(100 - location.rank_address)] := currresult; - prevresult := currresult; - END IF; - END LOOP; - - RETURN array_to_string(result,', '); -END; -$$ -LANGUAGE plpgsql STABLE PARALLEL SAFE; - 
-DROP TYPE IF EXISTS addressdata_place; -CREATE TYPE addressdata_place AS ( - place_id BIGINT, - country_code VARCHAR(2), - housenumber TEXT, - postcode TEXT, - class TEXT, - type TEXT, - name HSTORE, - address HSTORE, - centroid GEOMETRY -); - --- Compute the list of address parts for the given place. --- --- If in_housenumber is greator or equal 0, look for an interpolation. -CREATE OR REPLACE FUNCTION get_addressdata(in_place_id BIGINT, in_housenumber INTEGER) - RETURNS setof addressline - AS $$ -DECLARE - place addressdata_place; - location RECORD; - country RECORD; - current_rank_address INTEGER; - location_isaddress BOOLEAN; -BEGIN - -- The place in question might not have a direct entry in place_addressline. - -- Look for the parent of such places then and save it in place. - - -- first query osmline (interpolation lines) - IF in_housenumber >= 0 THEN - SELECT parent_place_id as place_id, country_code, - in_housenumber as housenumber, postcode, - 'place' as class, 'house' as type, - null as name, null as address, - ST_Centroid(linegeo) as centroid - INTO place - FROM location_property_osmline - WHERE place_id = in_place_id - AND in_housenumber between startnumber and endnumber; - END IF; - - --then query tiger data - {% if config.get_bool('USE_US_TIGER_DATA') %} - IF place IS NULL AND in_housenumber >= 0 THEN - SELECT parent_place_id as place_id, 'us' as country_code, - in_housenumber as housenumber, postcode, - 'place' as class, 'house' as type, - null as name, null as address, - ST_Centroid(linegeo) as centroid - INTO place - FROM location_property_tiger - WHERE place_id = in_place_id - AND in_housenumber between startnumber and endnumber; - END IF; - {% endif %} - - -- postcode table - IF place IS NULL THEN - SELECT parent_place_id as place_id, country_code, - null::text as housenumber, postcode, - 'place' as class, 'postcode' as type, - null as name, null as address, - null as centroid - INTO place - FROM location_postcode - WHERE place_id = in_place_id; 
- END IF; - - -- POI objects in the placex table - IF place IS NULL THEN - SELECT parent_place_id as place_id, country_code, - coalesce(address->'housenumber', - address->'streetnumber', - address->'conscriptionnumber')::text as housenumber, - postcode, - class, type, - name, address, - centroid - INTO place - FROM placex - WHERE place_id = in_place_id and rank_search > 27; - END IF; - - -- If place is still NULL at this point then the object has its own - -- entry in place_address line. However, still check if there is not linked - -- place we should be using instead. - IF place IS NULL THEN - select coalesce(linked_place_id, place_id) as place_id, country_code, - null::text as housenumber, postcode, - class, type, - null as name, address, - null as centroid - INTO place - FROM placex where place_id = in_place_id; - END IF; - ---RAISE WARNING '% % % %',searchcountrycode, searchhousenumber, searchpostcode; - - -- --- Return the record for the base entry. - - current_rank_address := 1000; - FOR location IN - SELECT placex.place_id, osm_type, osm_id, name, - coalesce(extratags->'linked_place', extratags->'place') as place_type, - class, type, admin_level, - CASE WHEN rank_address = 0 THEN 100 - WHEN rank_address = 11 THEN 5 - ELSE rank_address END as rank_address, - country_code - FROM placex - WHERE place_id = place.place_id - LOOP ---RAISE WARNING '%',location; - -- mix in default names for countries - IF location.rank_address = 4 and place.country_code is not NULL THEN - FOR country IN - SELECT coalesce(name, ''::hstore) as name FROM country_name - WHERE country_code = place.country_code LIMIT 1 - LOOP - place.name := country.name || place.name; - END LOOP; - END IF; - - IF location.rank_address < 4 THEN - -- no country locations for ranks higher than country - place.country_code := NULL::varchar(2); - ELSEIF place.country_code IS NULL AND location.country_code IS NOT NULL THEN - place.country_code := location.country_code; - END IF; - - RETURN NEXT 
ROW(location.place_id, location.osm_type, location.osm_id, - location.name, location.class, location.type, - location.place_type, - location.admin_level, true, - location.type not in ('postcode', 'postal_code'), - location.rank_address, 0)::addressline; - - current_rank_address := location.rank_address; - END LOOP; - - -- --- Return records for address parts. - - FOR location IN - SELECT placex.place_id, osm_type, osm_id, name, class, type, - coalesce(extratags->'linked_place', extratags->'place') as place_type, - admin_level, fromarea, isaddress and linked_place_id is NULL as isaddress, - CASE WHEN rank_address = 11 THEN 5 ELSE rank_address END as rank_address, - distance, country_code, postcode - FROM place_addressline join placex on (address_place_id = placex.place_id) - WHERE place_addressline.place_id IN (place.place_id, in_place_id) - AND linked_place_id is null - AND (placex.country_code IS NULL OR place.country_code IS NULL - OR placex.country_code = place.country_code) - ORDER BY rank_address desc, - (place_addressline.place_id = in_place_id) desc, - (CASE WHEN coalesce((avals(name) && avals(place.address)), False) THEN 2 - WHEN isaddress THEN 0 - WHEN fromarea - and place.centroid is not null - and ST_Contains(geometry, place.centroid) THEN 1 - ELSE -1 END) desc, - fromarea desc, distance asc, rank_search desc - LOOP - -- RAISE WARNING '%',location; - location_isaddress := location.rank_address != current_rank_address; - - IF place.country_code IS NULL AND location.country_code IS NOT NULL THEN - place.country_code := location.country_code; - END IF; - IF location.type in ('postcode', 'postal_code') - AND place.postcode is not null - THEN - -- If the place had a postcode assigned, take this one only - -- into consideration when it is an area and the place does not have - -- a postcode itself. - IF location.fromarea AND location_isaddress - AND (place.address is null or not place.address ? 
'postcode') - THEN - place.postcode := null; -- remove the less exact postcode - ELSE - location_isaddress := false; - END IF; - END IF; - RETURN NEXT ROW(location.place_id, location.osm_type, location.osm_id, - location.name, location.class, location.type, - location.place_type, - location.admin_level, location.fromarea, - location_isaddress, - location.rank_address, - location.distance)::addressline; - - current_rank_address := location.rank_address; - END LOOP; - - -- If no country was included yet, add the name information from country_name. - IF current_rank_address > 4 THEN - FOR location IN - SELECT name || coalesce(derived_name, ''::hstore) as name FROM country_name - WHERE country_code = place.country_code LIMIT 1 - LOOP ---RAISE WARNING '% % %',current_rank_address,searchcountrycode,countryname; - RETURN NEXT ROW(null, null, null, location.name, 'place', 'country', NULL, - null, true, true, 4, 0)::addressline; - END LOOP; - END IF; - - -- Finally add some artificial rows. - IF place.country_code IS NOT NULL THEN - location := ROW(null, null, null, hstore('ref', place.country_code), - 'place', 'country_code', null, null, true, false, 4, 0)::addressline; - RETURN NEXT location; - END IF; - - IF place.name IS NOT NULL THEN - location := ROW(in_place_id, null, null, place.name, place.class, - place.type, null, null, true, true, 29, 0)::addressline; - RETURN NEXT location; - END IF; - - IF place.housenumber IS NOT NULL THEN - location := ROW(null, null, null, hstore('ref', place.housenumber), - 'place', 'house_number', null, null, true, true, 28, 0)::addressline; - RETURN NEXT location; - END IF; - - IF place.address is not null and place.address ? 
'_unlisted_place' THEN - RETURN NEXT ROW(null, null, null, hstore('name', place.address->'_unlisted_place'), - 'place', 'locality', null, null, true, true, 25, 0)::addressline; - END IF; - - IF place.postcode is not null THEN - location := ROW(null, null, null, hstore('ref', place.postcode), 'place', - 'postcode', null, null, false, true, 5, 0)::addressline; - RETURN NEXT location; - ELSEIF place.address is not null and place.address ? 'postcode' - and not place.address->'postcode' SIMILAR TO '%(,|;)%' THEN - location := ROW(null, null, null, hstore('ref', place.address->'postcode'), 'place', - 'postcode', null, null, false, true, 5, 0)::addressline; - RETURN NEXT location; - END IF; - - RETURN; -END; -$$ -LANGUAGE plpgsql STABLE PARALLEL SAFE; diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql index f0c30f1b..8cf13120 100644 --- a/lib-sql/tokenizer/icu_tokenizer.sql +++ b/lib-sql/tokenizer/icu_tokenizer.sql @@ -128,16 +128,14 @@ DECLARE partial_terms TEXT[] = '{}'::TEXT[]; term TEXT; term_id INTEGER; - term_count INTEGER; BEGIN SELECT min(word_id) INTO full_token FROM word WHERE word = norm_term and type = 'W'; IF full_token IS NULL THEN full_token := nextval('seq_word'); - INSERT INTO word (word_id, word_token, type, word, info) - SELECT full_token, lookup_term, 'W', norm_term, - json_build_object('count', 0) + INSERT INTO word (word_id, word_token, type, word) + SELECT full_token, lookup_term, 'W', norm_term FROM unnest(lookup_terms) as lookup_term; END IF; @@ -150,14 +148,67 @@ BEGIN partial_tokens := '{}'::INT[]; FOR term IN SELECT unnest(partial_terms) LOOP - SELECT min(word_id), max(info->>'count') INTO term_id, term_count + SELECT min(word_id) INTO term_id + FROM word WHERE word_token = term and type = 'w'; + + IF term_id IS NULL THEN + term_id := nextval('seq_word'); + INSERT INTO word (word_id, word_token, type) + VALUES (term_id, term, 'w'); + END IF; + + partial_tokens := array_merge(partial_tokens, ARRAY[term_id]); + 
END LOOP; +END; +$$ +LANGUAGE plpgsql; + + +CREATE OR REPLACE FUNCTION getorcreate_full_word(norm_term TEXT, + lookup_terms TEXT[], + lookup_norm_terms TEXT[], + OUT full_token INT, + OUT partial_tokens INT[]) + AS $$ +DECLARE + partial_terms TEXT[] = '{}'::TEXT[]; + term TEXT; + term_id INTEGER; +BEGIN + SELECT min(word_id) INTO full_token + FROM word WHERE word = norm_term and type = 'W'; + + IF full_token IS NULL THEN + full_token := nextval('seq_word'); + IF lookup_norm_terms IS NULL THEN + INSERT INTO word (word_id, word_token, type, word) + SELECT full_token, lookup_term, 'W', norm_term + FROM unnest(lookup_terms) as lookup_term; + ELSE + INSERT INTO word (word_id, word_token, type, word, info) + SELECT full_token, t.lookup, 'W', norm_term, + CASE WHEN norm_term = t.norm THEN null + ELSE json_build_object('lookup', t.norm) END + FROM unnest(lookup_terms, lookup_norm_terms) as t(lookup, norm); + END IF; + END IF; + + FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP + term := trim(term); + IF NOT (ARRAY[term] <@ partial_terms) THEN + partial_terms := partial_terms || term; + END IF; + END LOOP; + + partial_tokens := '{}'::INT[]; + FOR term IN SELECT unnest(partial_terms) LOOP + SELECT min(word_id) INTO term_id FROM word WHERE word_token = term and type = 'w'; IF term_id IS NULL THEN term_id := nextval('seq_word'); - term_count := 0; - INSERT INTO word (word_id, word_token, type, info) - VALUES (term_id, term, 'w', json_build_object('count', term_count)); + INSERT INTO word (word_id, word_token, type) + VALUES (term_id, term, 'w'); END IF; partial_tokens := array_merge(partial_tokens, ARRAY[term_id]); diff --git a/packaging/nominatim-api/pyproject.toml b/packaging/nominatim-api/pyproject.toml index 601029ca..ab1ed080 100644 --- a/packaging/nominatim-api/pyproject.toml +++ b/packaging/nominatim-api/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "nominatim-api" -version = "5.0.0.post7" +version = "5.1.0.post2" description = "A tool for 
building a database of OpenStreetMap for geocoding and for searching the database. Search library." readme = "README.md" requires-python = ">=3.7" @@ -16,7 +16,7 @@ classifiers = [ "Operating System :: OS Independent", ] dependencies = [ - "SQLAlchemy==2.0.39", + "SQLAlchemy==2.0.40", "falcon==4.0.2", "uvicorn==0.34.0", "gunicorn==23.0.0" diff --git a/packaging/nominatim-db/pyproject.toml b/packaging/nominatim-db/pyproject.toml index 0c9a7055..88ac096b 100644 --- a/packaging/nominatim-db/pyproject.toml +++ b/packaging/nominatim-db/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "nominatim-db" -version = "5.0.0.post7" +version = "5.1.0.post2" description = "A tool for building a database of OpenStreetMap for geocoding and for searching the database. Database backend." readme = "README.md" requires-python = ">=3.7" @@ -17,11 +17,11 @@ classifiers = [ ] dependencies = [ "psycopg[binary]==3.2.6", - "python-dotenv==1.0.1", + "python-dotenv==1.1.0", "jinja2==3.1.6", "pyYAML==6.0.2", "psutil==7.0.0", - "PyICU==2.14", + "PyICU==2.15", "osmium==4.0.2", ] diff --git a/settings/env.defaults b/settings/env.defaults index b8c66667..3ebb288f 100644 --- a/settings/env.defaults +++ b/settings/env.defaults @@ -192,6 +192,13 @@ NOMINATIM_REQUEST_TIMEOUT=60 # to geocode" instead. NOMINATIM_SEARCH_WITHIN_COUNTRIES=False +# Specifies the order in which different name tags are used. +# The values in this list determine the preferred order of name variants, +# including language-specific names. +# Comma-separated list, where :XX stands for language-specific tags +# (e.g. name:en) and no :XX stands for general tags (e.g. name). +NOMINATIM_OUTPUT_NAMES=name:XX,name,brand,official_name:XX,short_name:XX,official_name,short_name,ref + ### Log settings # # The following options allow to enable logging of API requests. 
diff --git a/settings/icu-rules/variants-en.yaml b/settings/icu-rules/variants-en.yaml index 99cd6da6..54a7b475 100644 --- a/settings/icu-rules/variants-en.yaml +++ b/settings/icu-rules/variants-en.yaml @@ -6,25 +6,19 @@ - Air Force Base -> AFB - Air National Guard Base -> ANGB - Airport -> Aprt - - Alley -> Al - - Alley -> All - - Alley -> Ally - - Alley -> Aly + - Alley -> Al,All,Ally,Aly - Alleyway -> Alwy - Amble -> Ambl - Anex -> Anx - Apartments -> Apts - - Approach -> Apch - - Approach -> App + - Approach -> Apch,App - Arcade -> Arc - Arterial -> Artl - Artery -> Arty - - Avenue -> Av - - Avenue -> Ave + - Avenue -> Av,Ave - Back -> Bk - Banan -> Ba - - Basin -> Basn - - Basin -> Bsn + - Basin -> Basn,Bsn - Bayou -> Byu - Beach -> Bch - Bend -> Bnd @@ -33,71 +27,51 @@ - Bluffs -> Blfs - Boardwalk -> Bwlk - Bottom -> Btm - - Boulevard -> Blvd - - Boulevard -> Bvd + - Boulevard -> Blvd,Bvd - Boundary -> Bdy - Bowl -> Bl - Brace -> Br - Brae -> Br - Branch -> Br - Break -> Brk - - Bridge -> Bdge - - Bridge -> Br - - Bridge -> Brdg - - Bridge -> Brg - - Bridge -> Bri - - Broadway -> Bdwy - - Broadway -> Bway - - Broadway -> Bwy + - Bridge$ -> Bdge,Br,Brdg,Brg,Bri + - Broadway -> Bdwy,Bway,Bwy - Brook -> Brk - Brooks -> Brks - Brow -> Brw - - Buildings -> Bldgs - - Buildings -> Bldngs + - Buildings -> Bldgs,Bldngs - Business -> Bus - Burg -> Bg - Burgs -> Bgs - - Bypass -> Bps - - Bypass -> Byp - - Bypass -> Bypa + - Bypass -> Bps,Byp,Bypa - Byway -> Bywy - Camp -> Cp - Canyon -> Cyn - Cape -> Cpe - Caravan -> Cvn - - Causeway -> Caus - - Causeway -> Cswy - - Causeway -> Cway - - Center -> Cen - - Center -> Ctr + - Causeway -> Caus,Cswy,Cway + - Center,Centre -> Cen,Ctr - Centers -> Ctrs - Central -> Ctrl - - Centre -> Cen - - Centre -> Ctr - Centreway -> Cnwy - Chase -> Ch - Church -> Ch - Circle -> Cir - Circles -> Cirs - - Circuit -> Cct - - Circuit -> Ci - - Circus -> Crc - - Circus -> Crcs + - Circuit -> Cct,Ci + - Circus -> Crc,Crcs - City -> Cty - Cliff -> 
Clf - Cliffs -> Clfs - Close -> Cl - Club -> Clb - - Common -> Cmn - - Common -> Comm + - Common -> Cmn,Comm - Commons -> Cmns - Community -> Comm - Concourse -> Cnc - Concourse -> Con - Copse -> Cps - - Corner -> Cor - - Corner -> Cnr - - Corner -> Crn + - Corner -> Cor,Cnr,Crn - Corners -> Cors - Corso -> Cso - Cottages -> Cotts @@ -105,36 +79,24 @@ - County Road -> CR - County Route -> CR - Course -> Crse - - Court -> Crt - - Court -> Ct + - Court -> Crt,Ct - Courts -> Cts - Courtyard -> Cyd - Courtyard -> Ctyd - - Cove -> Ce - - Cove -> Cov - - Cove -> Cv + - Cove$ -> Ce,Cov,Cv - Coves -> Cvs - - Creek -> Ck - - Creek -> Cr - - Creek -> Crk + - Creek$ -> Ck,Cr,Crk - Crescent -> Cr - Crescent -> Cres - - Crest -> Crst - - Crest -> Cst + - Crest -> Crst,Cst - Croft -> Cft - - Cross -> Cs - - Cross -> Crss - - Crossing -> Crsg - - Crossing -> Csg - - Crossing -> Xing - - Crossroad -> Crd - - Crossroad -> Xrd + - Cross -> Cs,Crss + - Crossing -> Crsg,Csg,Xing + - Crossroad -> Crd,Xrd - Crossroads -> Xrds - Crossway -> Cowy - - Cul-de-sac -> Cds - - Cul-de-sac -> Csac - - Curve -> Cve - - Curve -> Curv + - Cul-de-sac -> Cds,Csac + - Curve -> Cve,Curv - Cutting -> Cutt - Dale -> Dle - Dam -> Dm @@ -143,14 +105,10 @@ - Divide -> Dv - Down -> Dn - Downs -> Dn - - Drive -> Dr - - Drive -> Drv - - Drive -> Dv + - Drive -> Dr,Drv,Dv - Drives -> Drs - Drive-In => Drive-In # prevent abbreviation here - - Driveway -> Drwy - - Driveway -> Dvwy - - Driveway -> Dwy + - Driveway -> Drwy,Dvwy,Dwy - East -> E - Edge -> Edg - Elbow -> Elb @@ -158,25 +116,18 @@ - Esplanade -> Esp - Estate -> Est - Estates -> Ests - - Expressway -> Exp - - Expressway -> Expy - - Expressway -> Expwy - - Expressway -> Xway + - Expressway -> Exp,Expy,Expwy,Xway - Extension -> Ex - Extensions -> Exts - - Fairway -> Fawy - - Fairway -> Fy + - Fairway -> Fawy,Fy - Falls -> Fls - Father -> Fr - - Ferry -> Fy - - Ferry -> Fry - - Field -> Fd - - Field -> Fld + - Ferry -> Fy,Fry + - Field -> Fd,Fld - Fields 
-> Flds - Fire Track -> Ftrk - Firetrail -> Fit - - Flat -> Fl - - Flat -> Flt + - Flat -> Fl,Flt - Flats -> Flts - Follow -> Folw - Footway -> Ftwy @@ -191,67 +142,47 @@ - Fork -> Frk - Forks -> Frks - Fort -> Ft - - Freeway -> Frwy - - Freeway -> Fwy + - Freeway -> Frwy,Fwy - Front -> Frnt - - Frontage -> Fr - - Frontage -> Frtg + - Frontage -> Fr,Frtg - Garden -> Gdn - - Gardens -> Gdn - - Gardens -> Gdns - - Gate -> Ga - - Gate -> Gte - - Gates -> Ga - - Gates -> Gte - - Gateway -> Gwy - - Gateway -> Gtwy + - Gardens -> Gdn,Gdns + - Gate,Gates -> Ga,Gte + - Gateway -> Gwy,Gtwy - George -> Geo - - Glade -> Gl - - Glade -> Gld - - Glade -> Glde + - Glade$ -> Gl,Gld,Glde - Glen -> Gln - Glens -> Glns - Grange -> Gra - - Green -> Gn - - Green -> Grn + - Green -> Gn,Grn - Greens -> Grns - Ground -> Grnd - - Grove -> Gr - - Grove -> Gro - - Grove -> Grv + - Grove$ -> Gr,Gro,Grv - Groves -> Grvs - Grovet -> Gr - Gully -> Gly - - Harbor -> Hbr + - Harbor -> Hbr,Harbour - Harbors -> Hbrs - - Harbour -> Hbr + - Harbour -> Hbr,Harbor - Haven -> Hvn - Head -> Hd - Heads -> Hd - - Heights -> Hgts - - Heights -> Ht - - Heights -> Hts + - Heights -> Hgts,Ht,Hts - High School -> HS - - Highroad -> Hird - - Highroad -> Hrd + - Highroad -> Hird,Hrd - Highway -> Hwy - Hill -> Hl - - Hills -> Hl - - Hills -> Hls + - Hills -> Hl,Hls - Hollow -> Holw - Hospital -> Hosp - - House -> Ho - - House -> Hse + - House -> Ho,Hse - Industrial -> Ind - Inlet -> Inlt - Interchange -> Intg - International -> Intl - - Island -> I - - Island -> Is + - Island -> I,Is - Islands -> Iss - - Junction -> Jct - - Junction -> Jctn - - Junction -> Jnc + - Junction -> Jct,Jctn,Jnc - Junctions -> Jcts - Junior -> Jr - Key -> Ky @@ -260,40 +191,31 @@ - Knolls -> Knls - Lagoon -> Lgn - Lake -> Lk - - Lakes -> L - - Lakes -> Lks - - Landing -> Ldg - - Landing -> Lndg - - Lane -> La - - Lane -> Ln + - Lakes -> L,Lks + - Landing -> Ldg,Lndg + - Lane -> La,Ln - Laneway -> Lnwy - Light -> Lgt - Lights -> Lgts - 
Line -> Ln - Link -> Lk - - Little -> Lit - - Little -> Lt + - Little -> Lit,Lt - Loaf -> Lf - Lock -> Lck - Locks -> Lcks - Lodge -> Ldg - Lookout -> Lkt - Loop -> Lp - - Lower -> Low - - Lower -> Lr - - Lower -> Lwr + - Lower -> Low,Lr,Lwr - Mall -> Ml - Manor -> Mnr - Manors -> Mnrs - Mansions -> Mans - Market -> Mkt - Meadow -> Mdw - - Meadows -> Mdw - - Meadows -> Mdws + - Meadows -> Mdw,Mdws - Mead -> Md - - Meander -> Mdr - - Meander -> Mndr - - Meander -> Mr + - Meander -> Mdr,Mndr,Mr - Medical -> Med - Memorial -> Mem - Mews -> Mw @@ -304,12 +226,10 @@ - Mill -> Ml - Mills -> Mls - Mission -> Msn - - Motorway -> Mtwy - - Motorway -> Mwy + - Motorway -> Mtwy,Mwy - Mount -> Mt - Mountain -> Mtn - - Mountains -> Mtn - - Mountains -> Mtns + - Mountains$ -> Mtn,Mtns - Municipal -> Mun - Museum -> Mus - National Park -> NP @@ -321,50 +241,37 @@ - Northeast -> NE - Northwest -> NW - Orchard -> Orch - - Outlook -> Out - - Outlook -> Otlk + - Outlook -> Out,Otlk - Overpass -> Opas - Parade -> Pde - Paradise -> Pdse - Park -> Pk - Parklands -> Pkld - - Parkway -> Pkwy - - Parkway -> Pky - - Parkway -> Pwy + - Parkway -> Pkwy,Pky,Pwy - Parkways -> Pkwy - Pass -> Ps - Passage -> Psge - - Pathway -> Phwy - - Pathway -> Pway - - Pathway -> Pwy + - Pathway -> Phwy,Pway,Pwy - Piazza -> Piaz - Pike -> Pk - Pine -> Pne - Pines -> Pnes - Place -> Pl - - Plain -> Pl - - Plain -> Pln - - Plains -> Pl - - Plains -> Plns + - Plain -> Pl,Pln + - Plains -> Pl,Plns - Plateau -> Plat - - Plaza -> Pl - - Plaza -> Plz - - Plaza -> Plza + - Plaza -> Pl,Plz,Plza - Pocket -> Pkt - - Point -> Pnt - - Point -> Pt + - Point -> Pnt,Pt - Points -> Pts - - Port -> Prt - - Port -> Pt + - Port -> Prt,Pt - Ports -> Prts - Post Office -> PO - Prairie -> Pr - Precinct -> Pct - - Promenade -> Prm - - Promenade -> Prom + - Promenade -> Prm,Prom - Quadrangle -> Qdgl - - Quadrant -> Qdrt - - Quadrant -> Qd + - Quadrant -> Qdrt,Qd - Quay -> Qy - Quays -> Qy - Quays -> Qys @@ -372,8 +279,7 @@ - Ramble -> 
Ra - Ramble -> Rmbl - Ranch -> Rnch - - Range -> Rge - - Range -> Rnge + - Range -> Rge,Rnge - Rapid -> Rpd - Rapids -> Rpds - Reach -> Rch @@ -381,37 +287,31 @@ - Reserve -> Res - Reservoir -> Res - Rest -> Rst - - Retreat -> Rt - - Retreat -> Rtt + - Retreat -> Rt,Rtt - Return -> Rtn - - Ridge -> Rdg - - Ridge -> Rdge + - Ridge -> Rdg,Rdge - Ridges -> Rdgs - Ridgeway -> Rgwy - Right of Way -> Rowy - Rise -> Ri - - River -> R - - River -> Riv - - River -> Rvr + - ^River -> R,Riv,Rvr + - River$ -> R,Riv,Rvr - Riverway -> Rvwy - Riviera -> Rvra - Road -> Rd - Roads -> Rds - Roadside -> Rdsd - - Roadway -> Rdwy - - Roadway -> Rdy + - Roadway -> Rdwy,Rdy - Rocks -> Rks - Ronde -> Rnde - Rosebowl -> Rsbl - Rotary -> Rty - Round -> Rnd - - Route -> Rt - - Route -> Rte + - Route -> Rt,Rte - Saint -> St - Saints -> SS - Senior -> Sr - - Serviceway -> Swy - - Serviceway -> Svwy + - Serviceway -> Swy,Svwy - Shoal -> Shl - Shore -> Shr - Shores -> Shrs @@ -421,8 +321,7 @@ - Skyway -> Skwy - Slope -> Slpe - Sound -> Snd - - South -> S - - South -> Sth + - South -> S,Sth - Southeast -> SE - Southwest -> SW - Spring -> Spg @@ -431,13 +330,10 @@ - Square -> Sq - Squares -> Sqs - Stairway -> Strwy - - State Highway -> SH - - State Highway -> SHwy + - State Highway -> SH,SHwy - State Route -> SR - - Station -> Sta - - Station -> Stn - - Strand -> Sd - - Strand -> Stra + - Station -> Sta,Stn + - Strand -> Sd,Stra - Stravenue -> Stra - Stream -> Strm - Street -> St @@ -447,61 +343,43 @@ - Summit -> Smt - Tarn -> Tn - Terminal -> Term - - Terrace -> Tce - - Terrace -> Ter - - Terrace -> Terr - - Thoroughfare -> Thfr - - Thoroughfare -> Thor + - Terrace -> Tce,Ter,Terr + - Thoroughfare -> Thfr,Thor - Throughway -> Trwy - - Tollway -> Tlwy - - Tollway -> Twy + - Tollway -> Tlwy,Twy - Towers -> Twrs - Township -> Twp - Trace -> Trce - - Track -> Tr - - Track -> Trak - - Track -> Trk + - Track -> Tr,Trak,Trk - Trafficway -> Trfy - Trail -> Trl - Trailer -> Trlr - Triangle -> Tri - 
Trunkway -> Tkwy - - Tunnel -> Tun - - Tunnel -> Tunl - - Turn -> Tn - - Turn -> Trn - - Turnpike -> Tpk - - Turnpike -> Tpke - - Underpass -> Upas - - Underpass -> Ups + - Tunnel -> Tun,Tunl + - Turn -> Tn,Trn + - Turnpike -> Tpk,Tpke + - Underpass -> Upas,Ups - Union -> Un - Unions -> Uns - - University -> Uni - - University -> Univ + - University -> Uni,Univ - Upper -> Up - Upper -> Upr - Vale -> Va - Valley -> Vly - Valley -> Vy - Valleys -> Vlys - - Viaduct -> Vdct - - Viaduct -> Via - - Viaduct -> Viad + - Viaduct$ -> Vdct,Via,Viad - View -> Vw - Views -> Vws - - Village -> Vill - - Village -> Vlg + - Village -> Vill,Vlg - Villages -> Vlgs - Villas -> Vlls - Ville -> Vl - - Vista -> Vis - - Vista -> Vst - - Vista -> Vsta - - Walk -> Wk - - Walk -> Wlk + - Vista -> Vis,Vst,Vsta + - Walk -> Wk,Wlk - Walks -> Walk - - Walkway -> Wkwy - - Walkway -> Wky + - Walkway -> Wkwy,Wky - Waters -> Wtr - Way -> Wy - Well -> Wl diff --git a/src/nominatim_api/localization.py b/src/nominatim_api/localization.py index bbf9225b..3414286e 100644 --- a/src/nominatim_api/localization.py +++ b/src/nominatim_api/localization.py @@ -8,6 +8,7 @@ Helper functions for localizing names of results. """ from typing import Mapping, List, Optional +from .config import Configuration import re @@ -20,14 +21,18 @@ class Locales: """ def __init__(self, langs: Optional[List[str]] = None): + self.config = Configuration(None) self.languages = langs or [] self.name_tags: List[str] = [] - # Build the list of supported tags. It is currently hard-coded. 
- self._add_lang_tags('name') - self._add_tags('name', 'brand') - self._add_lang_tags('official_name', 'short_name') - self._add_tags('official_name', 'short_name', 'ref') + parts = self.config.OUTPUT_NAMES.split(',') + + for part in parts: + part = part.strip() + if part.endswith(":XX"): + self._add_lang_tags(part[:-3]) + else: + self._add_tags(part) def __bool__(self) -> bool: return len(self.languages) > 0 diff --git a/src/nominatim_api/query_preprocessing/regex_replace.py b/src/nominatim_api/query_preprocessing/regex_replace.py new file mode 100644 index 00000000..b3a02495 --- /dev/null +++ b/src/nominatim_api/query_preprocessing/regex_replace.py @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2025 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +This preprocessor replaces values in a given input based on pre-defined regex rules. + +Arguments: + pattern: Regex pattern to be applied on the input + replace: The string that it is to be replaced with +""" +from typing import List +import re + +from .config import QueryConfig +from .base import QueryProcessingFunc +from ..search.query import Phrase + + +class _GenericPreprocessing: + """Perform replacements to input phrases using custom regex patterns.""" + + def __init__(self, config: QueryConfig) -> None: + """Initialise the _GenericPreprocessing class with patterns from the ICU config file.""" + self.config = config + + match_patterns = self.config.get('replacements', 'Key not found') + self.compiled_patterns = [ + (re.compile(item['pattern']), item['replace']) for item in match_patterns + ] + + def split_phrase(self, phrase: Phrase) -> Phrase: + """This function performs replacements on the given text using regex patterns.""" + for item in self.compiled_patterns: + phrase.text = item[0].sub(item[1], phrase.text) + + return phrase + + def __call__(self, phrases: List[Phrase]) 
-> List[Phrase]: + """ + Return the final Phrase list. + Returns an empty list if there is nothing left after split_phrase. + """ + result = [p for p in map(self.split_phrase, phrases) if p.text.strip()] + return result + + +def create(config: QueryConfig) -> QueryProcessingFunc: + """ Create a function for generic preprocessing.""" + return _GenericPreprocessing(config) diff --git a/src/nominatim_api/search/db_search_builder.py b/src/nominatim_api/search/db_search_builder.py index c63803d2..0292335e 100644 --- a/src/nominatim_api/search/db_search_builder.py +++ b/src/nominatim_api/search/db_search_builder.py @@ -208,7 +208,7 @@ class SearchBuilder: addr_partials = [t for r in address for t in self.query.get_partials_list(r)] addr_tokens = list({t.token for t in addr_partials}) - exp_count = min(t.count for t in name_partials.values()) / (2**(len(name_partials) - 1)) + exp_count = min(t.count for t in name_partials.values()) / (3**(len(name_partials) - 1)) if (len(name_partials) > 3 or exp_count < 8000): yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens) @@ -264,8 +264,6 @@ class SearchBuilder: address lookups will use the index, when the occurrences are not too many. """ - # At this point drop unindexed partials from the address. - # This might yield wrong results, nothing we can do about that. 
if use_lookup: addr_restrict_tokens = [] addr_lookup_tokens = [t.token for t in addr_partials] diff --git a/src/nominatim_api/sql/sqlalchemy_functions.py b/src/nominatim_api/sql/sqlalchemy_functions.py index 81fc83d6..00830f33 100644 --- a/src/nominatim_api/sql/sqlalchemy_functions.py +++ b/src/nominatim_api/sql/sqlalchemy_functions.py @@ -122,15 +122,18 @@ class IsAddressPoint(sa.sql.functions.GenericFunction[Any]): def __init__(self, table: sa.Table) -> None: super().__init__(table.c.rank_address, - table.c.housenumber, table.c.name) + table.c.housenumber, table.c.name, table.c.address) @compiles(IsAddressPoint) def default_is_address_point(element: IsAddressPoint, compiler: 'sa.Compiled', **kw: Any) -> str: - rank, hnr, name = list(element.clauses) - return "(%s = 30 AND (%s IS NOT NULL OR %s ? 'addr:housename'))" % ( + rank, hnr, name, address = list(element.clauses) + return "(%s = 30 AND (%s IS NULL OR NOT %s ? '_inherited')" \ + " AND (%s IS NOT NULL OR %s ? 'addr:housename'))" % ( compiler.process(rank, **kw), + compiler.process(address, **kw), + compiler.process(address, **kw), compiler.process(hnr, **kw), compiler.process(name, **kw)) @@ -138,9 +141,11 @@ def default_is_address_point(element: IsAddressPoint, @compiles(IsAddressPoint, 'sqlite') def sqlite_is_address_point(element: IsAddressPoint, compiler: 'sa.Compiled', **kw: Any) -> str: - rank, hnr, name = list(element.clauses) - return "(%s = 30 AND coalesce(%s, json_extract(%s, '$.addr:housename')) IS NOT NULL)" % ( + rank, hnr, name, address = list(element.clauses) + return "(%s = 30 AND json_extract(%s, '$._inherited') IS NULL" \ + " AND coalesce(%s, json_extract(%s, '$.addr:housename')) IS NOT NULL)" % ( compiler.process(rank, **kw), + compiler.process(address, **kw), compiler.process(hnr, **kw), compiler.process(name, **kw)) diff --git a/src/nominatim_api/sql/sqlalchemy_types/geometry.py b/src/nominatim_api/sql/sqlalchemy_types/geometry.py index 90adcce8..583568c4 100644 --- 
a/src/nominatim_api/sql/sqlalchemy_types/geometry.py +++ b/src/nominatim_api/sql/sqlalchemy_types/geometry.py @@ -173,7 +173,7 @@ class Geometry(types.UserDefinedType): # type: ignore[type-arg] def __init__(self, subtype: str = 'Geometry'): self.subtype = subtype - def get_col_spec(self) -> str: + def get_col_spec(self, **_: Any) -> str: return f'GEOMETRY({self.subtype}, 4326)' def bind_processor(self, dialect: 'sa.Dialect') -> Callable[[Any], str]: diff --git a/src/nominatim_api/version.py b/src/nominatim_api/version.py index fc401248..3c98435d 100644 --- a/src/nominatim_api/version.py +++ b/src/nominatim_api/version.py @@ -8,4 +8,4 @@ Version information for the Nominatim API. """ -NOMINATIM_API_VERSION = '5.0.0' +NOMINATIM_API_VERSION = '5.1.0' diff --git a/src/nominatim_db/tokenizer/icu_tokenizer.py b/src/nominatim_db/tokenizer/icu_tokenizer.py index 858cb64c..19b83863 100644 --- a/src/nominatim_db/tokenizer/icu_tokenizer.py +++ b/src/nominatim_db/tokenizer/icu_tokenizer.py @@ -121,10 +121,10 @@ class ICUTokenizer(AbstractTokenizer): SELECT unnest(nameaddress_vector) as id, count(*) FROM search_name GROUP BY id) SELECT coalesce(a.id, w.id) as id, - (CASE WHEN w.count is null THEN '{}'::JSONB + (CASE WHEN w.count is null or w.count <= 1 THEN '{}'::JSONB ELSE jsonb_build_object('count', w.count) END || - CASE WHEN a.count is null THEN '{}'::JSONB + CASE WHEN a.count is null or a.count <= 1 THEN '{}'::JSONB ELSE jsonb_build_object('addr_count', a.count) END) as info FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id; """) @@ -134,9 +134,10 @@ class ICUTokenizer(AbstractTokenizer): drop_tables(conn, 'tmp_word') cur.execute("""CREATE TABLE tmp_word AS SELECT word_id, word_token, type, word, - (CASE WHEN wf.info is null THEN word.info - ELSE coalesce(word.info, '{}'::jsonb) || wf.info - END) as info + coalesce(word.info, '{}'::jsonb) + - 'count' - 'addr_count' || + coalesce(wf.info, '{}'::jsonb) + as info FROM word LEFT JOIN word_frequencies wf ON word.word_id = 
wf.id ORDER BY word_id @@ -585,10 +586,14 @@ class ICUNameAnalyzer(AbstractAnalyzer): if word_id: result = self._cache.housenumbers.get(word_id, result) if result[0] is None: - variants = analyzer.compute_variants(word_id) + varout = analyzer.compute_variants(word_id) + if isinstance(varout, tuple): + variants = varout[0] + else: + variants = varout if variants: hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)", - (word_id, list(variants))) + (word_id, variants)) result = hid, variants[0] self._cache.housenumbers[word_id] = result @@ -633,13 +638,17 @@ class ICUNameAnalyzer(AbstractAnalyzer): full, part = self._cache.names.get(token_id, (None, None)) if full is None: - variants = analyzer.compute_variants(word_id) + varset = analyzer.compute_variants(word_id) + if isinstance(varset, tuple): + variants, lookups = varset + else: + variants, lookups = varset, None if not variants: continue with self.conn.cursor() as cur: - cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)", - (token_id, variants)) + cur.execute("SELECT * FROM getorcreate_full_word(%s, %s, %s)", + (token_id, variants, lookups)) full, part = cast(Tuple[int, List[int]], cur.fetchone()) self._cache.names[token_id] = (full, part) diff --git a/src/nominatim_db/tokenizer/token_analysis/base.py b/src/nominatim_db/tokenizer/token_analysis/base.py index 52ee8013..186f1d3e 100644 --- a/src/nominatim_db/tokenizer/token_analysis/base.py +++ b/src/nominatim_db/tokenizer/token_analysis/base.py @@ -7,7 +7,7 @@ """ Common data types and protocols for analysers. """ -from typing import Mapping, List, Any +from typing import Mapping, List, Any, Union, Tuple from ...typing import Protocol from ...data.place_name import PlaceName @@ -33,7 +33,7 @@ class Analyzer(Protocol): for example because the character set in use does not match. 
""" - def compute_variants(self, canonical_id: str) -> List[str]: + def compute_variants(self, canonical_id: str) -> Union[List[str], Tuple[List[str], List[str]]]: """ Compute the transliterated spelling variants for the given canonical ID. diff --git a/src/nominatim_db/tokenizer/token_analysis/generic.py b/src/nominatim_db/tokenizer/token_analysis/generic.py index fa9dc4df..b01cebf7 100644 --- a/src/nominatim_db/tokenizer/token_analysis/generic.py +++ b/src/nominatim_db/tokenizer/token_analysis/generic.py @@ -7,7 +7,7 @@ """ Generic processor for names that creates abbreviation variants. """ -from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast +from typing import Mapping, Dict, Any, Iterable, Optional, List, cast, Tuple import itertools from ...errors import UsageError @@ -78,7 +78,7 @@ class GenericTokenAnalysis: """ return cast(str, self.norm.transliterate(name.name)).strip() - def compute_variants(self, norm_name: str) -> List[str]: + def compute_variants(self, norm_name: str) -> Tuple[List[str], List[str]]: """ Compute the spelling variants for the given normalized name and transliterate the result. 
""" @@ -87,18 +87,20 @@ class GenericTokenAnalysis: for mutation in self.mutations: variants = mutation.generate(variants) - return [name for name in self._transliterate_unique_list(norm_name, variants) if name] - - def _transliterate_unique_list(self, norm_name: str, - iterable: Iterable[str]) -> Iterator[Optional[str]]: - seen = set() + varset = set(map(str.strip, variants)) if self.variant_only: - seen.add(norm_name) + varset.discard(norm_name) + + trans = [] + norm = [] + + for var in varset: + t = self.to_ascii.transliterate(var).strip() + if t: + trans.append(t) + norm.append(var) - for variant in map(str.strip, iterable): - if variant not in seen: - seen.add(variant) - yield self.to_ascii.transliterate(variant).strip() + return trans, norm def _generate_word_variants(self, norm_name: str) -> Iterable[str]: baseform = '^ ' + norm_name + ' ^' diff --git a/src/nominatim_db/version.py b/src/nominatim_db/version.py index 26856498..070417e3 100644 --- a/src/nominatim_db/version.py +++ b/src/nominatim_db/version.py @@ -55,7 +55,7 @@ def parse_version(version: str) -> NominatimVersion: return NominatimVersion(*[int(x) for x in parts[:2] + parts[2].split('-')]) -NOMINATIM_VERSION = parse_version('5.0.0-0') +NOMINATIM_VERSION = parse_version('5.1.0-0') POSTGRESQL_REQUIRED_VERSION = (12, 0) POSTGIS_REQUIRED_VERSION = (3, 0) diff --git a/test/python/api/query_processing/test_regex_replace.py b/test/python/api/query_processing/test_regex_replace.py new file mode 100644 index 00000000..ef759ba1 --- /dev/null +++ b/test/python/api/query_processing/test_regex_replace.py @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2025 by the Nominatim developer community. +# For a full list of authors see the git log. +''' +Tests for replacing values in an input using custom regex. 
+''' +import pytest + +import nominatim_api.search.query as qmod +from nominatim_api.query_preprocessing.config import QueryConfig +from nominatim_api.query_preprocessing import regex_replace + + +def run_preprocessor_on(query): + config = QueryConfig() + config.set_normalizer(None) + + config['replacements'] = [ + {'pattern': r'\b(?:\d{1,3}\.){3}\d{1,3}\b', 'replace': ''}, # IPv4 + {'pattern': r'https?://\S+', 'replace': ''} # HTTP/HTTPS URLs + ] + + proc = regex_replace.create(config) + return proc(query) + + +@pytest.mark.parametrize('inp,outp', [ + (['45.67.89.101'], []), + (['198.51.100.23'], []), + (['203.0.113.255'], []), + (['http://www.openstreetmap.org'], []), + (['https://www.openstreetmap.org/edit'], []), + (['http://osm.org'], []), + (['https://www.openstreetmap.org/user/abc'], []), + (['https://tile.openstreetmap.org/12/2048/2048.png'], []), + (['Check the map at https://www.openstreetmap.org'], ['Check the map at ']), + (['Use 203.0.113.255 for routing'], ['Use for routing']), + (['Find maps at https://osm.org and http://openstreetmap.org'], ['Find maps at and ']), + (['203.0.113.255', 'Some Address'], ['Some Address']), + (['https://osm.org', 'Another Place'], ['Another Place']), +]) +def test_split_phrases(inp, outp): + query = [qmod.Phrase(qmod.PHRASE_ANY, text) for text in inp] + + out = run_preprocessor_on(query) + assert out == [qmod.Phrase(qmod.PHRASE_ANY, text) for text in outp] diff --git a/test/python/api/test_api_reverse.py b/test/python/api/test_api_reverse.py index 91074ecb..d7d3ba7e 100644 --- a/test/python/api/test_api_reverse.py +++ b/test/python/api/test_api_reverse.py @@ -68,7 +68,8 @@ def test_reverse_ignore_unindexed(apiobj, frontend): (0.7, napi.DataLayer.NATURAL, 227), (0.70003, napi.DataLayer.MANMADE | napi.DataLayer.RAILWAY, 225), (0.70003, napi.DataLayer.MANMADE | napi.DataLayer.NATURAL, 225), - (5, napi.DataLayer.ADDRESS, 229)]) + (5, napi.DataLayer.ADDRESS, 229), + (5.0001, napi.DataLayer.ADDRESS, 229)]) def 
test_reverse_rank_30_layers(apiobj, frontend, y, layer, place_id): apiobj.add_placex(place_id=223, osm_type='N', class_='place', type='house', housenumber='1', @@ -96,6 +97,12 @@ def test_reverse_rank_30_layers(apiobj, frontend, y, layer, place_id): rank_address=30, rank_search=30, centroid=(1.3, 5)) + apiobj.add_placex(place_id=230, class_='place', type='house', + housenumber='2', + address={'_inherited': ''}, + rank_address=30, + rank_search=30, + centroid=(1.3, 5.0001)) api = frontend(apiobj, options=API_OPTIONS) assert api.reverse((1.3, y), layers=layer).place_id == place_id diff --git a/test/python/api/test_localization.py b/test/python/api/test_localization.py index 0a30cdc1..c3e02596 100644 --- a/test/python/api/test_localization.py +++ b/test/python/api/test_localization.py @@ -27,6 +27,62 @@ def test_display_name_none_localized(): assert loc.display_name({'ref': '34', 'name:de': 'DE'}) == '34' +def test_output_names_none_localized(): + loc = Locales() + + expected_tags = [ + 'name', '_place_name', 'brand', '_place_brand', 'official_name', '_place_official_name', + 'short_name', '_place_short_name', 'ref', '_place_ref' + ] + + assert loc.name_tags == expected_tags, f'Expected {expected_tags}, but got {loc.name_tags}' + + +def test_output_names_none_localized_and_custom_output_names(monkeypatch): + monkeypatch.setenv( + 'NOMINATIM_OUTPUT_NAMES', + 'name:XX,entrance:XX,name,brand,test_tag,' + 'official_name:XX,short_name:XX,alt_name:XX' + ) + loc = Locales() + + expected_tags = [ + 'name', '_place_name', 'brand', '_place_brand', 'test_tag', '_place_test_tag' + ] + + assert loc.name_tags == expected_tags, f'Expected {expected_tags}, but got {loc.name_tags}' + + +def test_output_names_none_localized_and_custom_output_names_more_than_two_changes(monkeypatch): + monkeypatch.setenv( + 'NOMINATIM_OUTPUT_NAMES', + 'name:XX,brand,test_tag:XX,official_name,short_name:XX,' + 'alt_name,another_tag_with:XX,another_tag_withoutXX' + ) + loc = Locales() + + expected_tags = 
[ + 'brand', '_place_brand', 'official_name', '_place_official_name', 'alt_name', + '_place_alt_name', 'another_tag_withoutXX', '_place_another_tag_withoutXX' + ] + + assert loc.name_tags == expected_tags, f'Expected {expected_tags}, but got {loc.name_tags}' + + +def test_output_names_none_localized_and_custom_output_names_including_space(monkeypatch): + monkeypatch.setenv( + 'NOMINATIM_OUTPUT_NAMES', + 'name:XX,name ,short_name:XX, short_name' + ) + loc = Locales() + + expected_tags = [ + 'name', '_place_name', 'short_name', '_place_short_name' + ] + + assert loc.name_tags == expected_tags, f'Expected {expected_tags}, but got {loc.name_tags}' + + def test_display_name_localized(): loc = Locales(['en', 'de']) @@ -35,6 +91,146 @@ def test_display_name_localized(): assert loc.display_name({'ref': '34', 'name:de': 'DE'}) == 'DE' +def test_output_names_localized(): + loc = Locales(['en', 'es']) + + expected_tags = [ + 'name:en', '_place_name:en', 'name:es', '_place_name:es', 'name', '_place_name', 'brand', + '_place_brand', 'official_name:en', '_place_official_name:en', 'official_name:es', + '_place_official_name:es', 'short_name:en', '_place_short_name:en', 'short_name:es', + '_place_short_name:es', 'official_name', '_place_official_name', 'short_name', + '_place_short_name', 'ref', '_place_ref' + ] + + assert loc.name_tags == expected_tags, f'Expected {expected_tags}, but got {loc.name_tags}' + + +def test_output_names_localized_and_custom_output_names_including_space(monkeypatch): + monkeypatch.setenv( + 'NOMINATIM_OUTPUT_NAMES', + 'name:XX,name ,short_name:XX, short_name' + ) + loc = Locales(['en', 'es']) + + expected_tags = [ + 'name:en', '_place_name:en', 'name:es', '_place_name:es', + 'name', '_place_name', + 'short_name:en', '_place_short_name:en', 'short_name:es', '_place_short_name:es', + 'short_name', '_place_short_name' + ] + + assert loc.name_tags == expected_tags, f'Expected {expected_tags}, but got {loc.name_tags}' + + +def 
test_output_names_localized_and_custom_output_names(monkeypatch): + monkeypatch.setenv( + 'NOMINATIM_OUTPUT_NAMES', + 'name:XX,entrance:XX,name,brand,test_tag,official_name:XX,short_name:XX,alt_name:XX' + ) + loc = Locales(['en', 'es']) + + expected_tags = [ + 'name:en', '_place_name:en', 'name:es', '_place_name:es', 'entrance:en', + '_place_entrance:en', 'entrance:es', '_place_entrance:es', 'name', '_place_name', + 'brand', '_place_brand', 'test_tag', '_place_test_tag', 'official_name:en', + '_place_official_name:en', 'official_name:es', '_place_official_name:es', + 'short_name:en', '_place_short_name:en', 'short_name:es', '_place_short_name:es', + 'alt_name:en', '_place_alt_name:en', 'alt_name:es', '_place_alt_name:es' + ] + + assert loc.name_tags == expected_tags, f'Expected {expected_tags}, but got {loc.name_tags}' + + +def test_output_names_localized_and_custom_output_names_start_with_tag_that_has_no_XX(monkeypatch): + monkeypatch.setenv( + 'NOMINATIM_OUTPUT_NAMES', + 'name,brand,test_tag,official_name:XX,short_name:XX,alt_name:XX' + ) + loc = Locales(['en', 'es']) + + expected_tags = [ + 'name', '_place_name', 'brand', '_place_brand', 'test_tag', '_place_test_tag', + 'official_name:en', '_place_official_name:en', 'official_name:es', + '_place_official_name:es', 'short_name:en', '_place_short_name:en', 'short_name:es', + '_place_short_name:es', 'alt_name:en', '_place_alt_name:en', 'alt_name:es', + '_place_alt_name:es' + ] + + assert loc.name_tags == expected_tags, f'Expected {expected_tags}, but got {loc.name_tags}' + + +def test_output_names_localized_and_custom_output_names_no_named_tags(monkeypatch): + monkeypatch.setenv( + 'NOMINATIM_OUTPUT_NAMES', + 'name,brand,test_tag' + ) + loc = Locales(['en', 'es']) + + expected_tags = [ + 'name', '_place_name', 'brand', '_place_brand', 'test_tag', '_place_test_tag' + ] + + assert loc.name_tags == expected_tags, f'Expected {expected_tags}, but got {loc.name_tags}' + + +def 
test_output_names_localized_and_custom_output_names_only_named_tags(monkeypatch): + monkeypatch.setenv( + 'NOMINATIM_OUTPUT_NAMES', + 'name:XX,entrance:XX,official_name:XX,short_name:XX,alt_name:XX' + ) + loc = Locales(['en', 'es']) + + expected_tags = [ + 'name:en', '_place_name:en', 'name:es', '_place_name:es', 'entrance:en', + '_place_entrance:en', 'entrance:es', '_place_entrance:es', 'official_name:en', + '_place_official_name:en', 'official_name:es', '_place_official_name:es', + 'short_name:en', '_place_short_name:en', 'short_name:es', '_place_short_name:es', + 'alt_name:en', '_place_alt_name:en', 'alt_name:es', '_place_alt_name:es' + ] + + assert loc.name_tags == expected_tags, f'Expected {expected_tags}, but got {loc.name_tags}' + + +def test_output_names_localized_and_custom_output_names_more_than_two_changes(monkeypatch): + monkeypatch.setenv( + 'NOMINATIM_OUTPUT_NAMES', + 'name:XX,brand,test_tag:XX,official_name,short_name:XX,' + 'alt_name,another_tag_with:XX,another_tag_withoutXX' + ) + loc = Locales(['en', 'es']) + + expected_tags = [ + 'name:en', '_place_name:en', 'name:es', '_place_name:es', 'brand', '_place_brand', + 'test_tag:en', '_place_test_tag:en', 'test_tag:es', '_place_test_tag:es', 'official_name', + '_place_official_name', 'short_name:en', '_place_short_name:en', 'short_name:es', + '_place_short_name:es', 'alt_name', '_place_alt_name', 'another_tag_with:en', + '_place_another_tag_with:en', 'another_tag_with:es', '_place_another_tag_with:es', + 'another_tag_withoutXX', '_place_another_tag_withoutXX' + ] + + assert loc.name_tags == expected_tags, f'Expected {expected_tags}, but got {loc.name_tags}' + + +def test_output_names_localized_and_custom_output_names_XX_in_the_middle(monkeypatch): + monkeypatch.setenv( + 'NOMINATIM_OUTPUT_NAMES', + 'name:XX,br:XXand,test_tag:XX,official_name,sh:XXort_name:XX,' + 'alt_name,another_tag_with:XX,another_tag_withoutXX' + ) + loc = Locales(['en', 'es']) + + expected_tags = [ + 'name:en', '_place_name:en', 
'name:es', '_place_name:es', 'br:XXand', '_place_br:XXand', + 'test_tag:en', '_place_test_tag:en', 'test_tag:es', '_place_test_tag:es', 'official_name', + '_place_official_name', 'sh:XXort_name:en', '_place_sh:XXort_name:en', 'sh:XXort_name:es', + '_place_sh:XXort_name:es', 'alt_name', '_place_alt_name', 'another_tag_with:en', + '_place_another_tag_with:en', 'another_tag_with:es', '_place_another_tag_with:es', + 'another_tag_withoutXX', '_place_another_tag_withoutXX' + ] + + assert loc.name_tags == expected_tags, f'Expected {expected_tags}, but got {loc.name_tags}' + + def test_display_name_preference(): loc = Locales(['en', 'de']) diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py index ce00281c..12cef894 100644 --- a/test/python/tokenizer/test_icu.py +++ b/test/python/tokenizer/test_icu.py @@ -230,19 +230,20 @@ def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_factory, test_config): word_table.add_full_word(1000, 'hello') word_table.add_full_word(1001, 'bye') + word_table.add_full_word(1002, 'town') table_factory('search_name', 'place_id BIGINT, name_vector INT[], nameaddress_vector INT[]', - [(12, [1000], [1001])]) + [(12, [1000], [1001]), (13, [1001], [1002]), (14, [1000, 1001], [1002])]) tok = tokenizer_factory() tok.update_statistics(test_config) - assert temp_db_cursor.scalar("""SELECT count(*) FROM word - WHERE type = 'W' and word_id = 1000 and - (info->>'count')::int > 0""") == 1 - assert temp_db_cursor.scalar("""SELECT count(*) FROM word - WHERE type = 'W' and word_id = 1001 and - (info->>'addr_count')::int > 0""") == 1 + assert temp_db_cursor.row_set("""SELECT word_id, + (info->>'count')::int, + (info->>'addr_count')::int + FROM word + WHERE type = 'W'""") == \ + {(1000, 2, None), (1001, 2, None), (1002, None, 2)} def test_normalize_postcode(analyzer): diff --git a/test/python/tokenizer/token_analysis/test_generic.py b/test/python/tokenizer/token_analysis/test_generic.py index 
02870f24..48f2483b 100644 --- a/test/python/tokenizer/token_analysis/test_generic.py +++ b/test/python/tokenizer/token_analysis/test_generic.py @@ -40,7 +40,7 @@ def make_analyser(*variants, variant_only=False): def get_normalized_variants(proc, name): norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION) - return proc.compute_variants(norm.transliterate(name).strip()) + return proc.compute_variants(norm.transliterate(name).strip())[0] def test_no_variants(): diff --git a/test/python/tokenizer/token_analysis/test_generic_mutation.py b/test/python/tokenizer/token_analysis/test_generic_mutation.py index 2ce2236a..e0507e4c 100644 --- a/test/python/tokenizer/token_analysis/test_generic_mutation.py +++ b/test/python/tokenizer/token_analysis/test_generic_mutation.py @@ -40,7 +40,7 @@ class TestMutationNoVariants: def variants(self, name): norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION) - return set(self.analysis.compute_variants(norm.transliterate(name).strip())) + return set(self.analysis.compute_variants(norm.transliterate(name).strip())[0]) @pytest.mark.parametrize('pattern', ('(capture)', ['a list'])) def test_bad_pattern(self, pattern):