From 2535780282ee3635df6e4857a864301624f1dadd Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Mon, 16 Dec 2024 10:44:37 +0100
Subject: [PATCH 01/16] exclude more tourism=information types

---
 lib-lua/themes/nominatim/presets.lua   | 7 +++++--
 test/bdd/osm2pgsql/import/tags.feature | 6 +++++-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/lib-lua/themes/nominatim/presets.lua b/lib-lua/themes/nominatim/presets.lua
index 7afb204a..aa51ac14 100644
--- a/lib-lua/themes/nominatim/presets.lua
+++ b/lib-lua/themes/nominatim/presets.lua
@@ -190,7 +190,10 @@ module.MAIN_TAGS_POIS = function (group)
         historic = {'always',
                     yes = group,
                     no = group},
-        information = {include_when_tag_present('tourism', 'information')},
+        information = {include_when_tag_present('tourism', 'information'),
+                       yes = 'delete',
+                       route_marker = 'never',
+                       trail_blaze = 'never'},
         junction = {'fallback',
                     no = group},
         leisure = {'always',
@@ -228,7 +231,7 @@ module.MAIN_TAGS_POIS = function (group)
         tourism = {'always',
                    no = group,
                    yes = group,
-                   information = 'fallback'},
+                   information = exclude_when_key_present('information')},
         tunnel = {'named_with_key',
                   no = group}
     }
 end
diff --git a/test/bdd/osm2pgsql/import/tags.feature b/test/bdd/osm2pgsql/import/tags.feature
index f4ebe7ad..69238e79 100644
--- a/test/bdd/osm2pgsql/import/tags.feature
+++ b/test/bdd/osm2pgsql/import/tags.feature
@@ -232,15 +232,19 @@ Feature: Tag evaluation
             n101 Ttourism=information,name=Generic
             n102 Ttourism=information,information=guidepost
             n103 Thighway=information,information=house
+            n104 Ttourism=information,information=yes,name=Something
+            n105 Ttourism=information,information=route_marker,name=3
             """
         Then place contains exactly
             | object           | type        |
+            | N100:tourism     | information |
             | N101:tourism     | information |
             | N102:information | guidepost   |
             | N103:highway     | information |
+            | N104:tourism     | information |

-    Scenario: Water feautures
+    Scenario: Water features
         When loading osm data
             """
             n20 Tnatural=water
-- 
2.39.5


From df6f70d223e8fb3129be03662fa90dfeb561309e Mon Sep 17 00:00:00 2001
From: mtmail
Date: Mon, 16 Dec 2024 23:38:18 +0100
Subject: [PATCH 02/16] fix typo in Tokenizers.md

---
 docs/develop/Tokenizers.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/develop/Tokenizers.md b/docs/develop/Tokenizers.md
index 8ce21343..a1dae78b 100644
--- a/docs/develop/Tokenizers.md
+++ b/docs/develop/Tokenizers.md
@@ -95,7 +95,7 @@ Nominatim expects two files containing the Python part of the implementation:
 * `src/nominatim_db/tokenizer/<NAME>_tokenizer.py` contains the tokenizer
   code used during import and
-* `src/nominatim_api/search/NAME>_tokenizer.py` has the code used during
+* `src/nominatim_api/search/<NAME>_tokenizer.py` has the code used during
   query time.
 
 `<NAME>` is a unique name for the tokenizer consisting of only lower-case
-- 
2.39.5


From 71fceb6854b760f1877ca73b782beec471491537 Mon Sep 17 00:00:00 2001
From: marc tobias
Date: Tue, 17 Dec 2024 01:28:02 +0100
Subject: [PATCH 03/16] Slight wording changes for Import-Styles.md

---
 docs/customize/Import-Styles.md | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/docs/customize/Import-Styles.md b/docs/customize/Import-Styles.md
index 59589a0a..67fe7482 100644
--- a/docs/customize/Import-Styles.md
+++ b/docs/customize/Import-Styles.md
@@ -60,7 +60,7 @@ The _main tags_ classify what kind of place the OSM object represents.
 One OSM object can have more than one main tag. In such case one database
 entry is created for each main tag.
 
 _Name tags_ represent searchable names of the place.
 _Address tags_ are used to compute the address hierarchy of the place.
-Address are used for searching and for creating a display name of the place.
+Address tags are used for searching and for creating a display name of the place.
 
 _Extra tags_ are any tags that are not directly related to search but
 contain interesting additional information.
@@ -76,7 +76,7 @@ in which category.
 
 The flex style offers a number of functions to set the classification of
 each OSM tag. Most of these functions can also take a preset string instead
-of a tag descriptions. These presets describe common configurations that
+of a tag description. These presets describe common configurations that
 are also used in the definition of the predefined styles.
 
 This section lists the configuration functions and the accepted presets.
@@ -95,7 +95,7 @@ Any other string is matched exactly against tag keys.
 takes a lua table parameter which defines for keys and key/value
 combinations, how they are classified.
 
-The following classifications are recognised:
+The following classifications are recognized:
 
 | classification  | meaning |
 | :-------------- | :------ |
@@ -133,7 +133,7 @@ the same.
     In this example an object with a `boundary` tag will only be included
     when it has a value of `administrative`. Objects with `highway` tags are
     always included with two exceptions: the troll tag `highway=no` is
-    deleted on the spot and when the value is `street_lamp` then the object
+    deleted on the spot. And when the value is `street_lamp` then the object
     must have a name, too. Finally, if a `landuse` tag is present then it
     will be used independently of the concrete value when neither boundary
     nor highway tags were found and the object is named.
@@ -143,7 +143,7 @@
 | Name | Description |
 | :----- | :---------- |
 | admin | Basic tag set collecting places and administrative boundaries. This set is needed also to ensure proper address computation and should therefore always be present. You can disable selected place types like `place=locality` after adding this set, if they are not relevant for your use case. |
-| all_boundaries | Extends the set of recognised boundaries and places to all available ones. |
+| all_boundaries | Extends the set of recognized boundaries and places to all available ones. |
 | natural | Tags for natural features like rivers and mountain peaks. |
 | street/default | Tags for streets. Major streets are always included, minor ones only when they have a name. |
 | street/car | Tags for all streets that can be used by a motor vehicle. |
@@ -229,7 +229,7 @@ in turn take precedence over prefix matches.
 
 | Name | Description |
 | :----- | :---------- |
 | metatags | Tags with meta information about the OSM tag like source, notes and import sources. |
-| name | Non-names that describe in fact properties or name parts. These names can throw off search and should always be removed. |
+| name | Non-names that actually describe properties or name parts. These names can throw off search and should always be removed. |
 | address | Extra `addr:*` tags that are not useful for Nominatim. |
@@ -305,7 +305,7 @@ the database independently of the presence of other main tags.
 
 `set_name_tags()` overwrites the current configuration, while
 `modify_name_tags()` replaces the fields that are given. (Be aware that
 the fields are replaced as a whole. `main = {'foo_name'}` will cause
-`foo_name` to become the only recognised primary name. Any previously
+`foo_name` to become the only recognized primary name. Any previously
 defined primary names are forgotten.)
 
 !!! example
     ``` lua
     local flex = require('flex-base')
 
     flex.set_name_tags{main = {'name', 'name:*'},
                        extra = {'ref'}
                       }
     ```
 
#### Presets
 
 | Name | Description |
 | :----- | :---------- |
-| core | Basic set of recognised names for all places. |
+| core | Basic set of recogniced names for all places. |
 | address | Additional names useful when indexing full addresses. |
-| poi | Extended set of recognised names for pois. Use on top of the core set. |
+| poi | Extended set of recognized names for pois. Use on top of the core set. |
 
 ### Address tags
 
@@ -376,8 +376,8 @@ the fields are replaced as a whole.)
 
 | Name | Description |
 | :----- | :---------- |
-| core | Basic set of tags needed to recognise address relationship for any place. Always include this. |
-| houses | Additional set of tags needed to recognise proper addresses |
+| core | Basic set of tags needed to recognize address relationship for any place. Always include this. |
+| houses | Additional set of tags needed to recognize proper addresses |
 
 ### Handling of unclassified tags
 
@@ -514,7 +514,7 @@ Themepark topics offer two configuration options:
 
 The customization functions described in the
 [Changing recognized tags](#changing-the-recognized-tags) section
-are available from the theme. To access the theme you need to explicitly initialise it.
+are available from the theme. To access the theme you need to explicitly initialize it.
 
 !!! Example
     ``` lua
@@ -568,7 +568,7 @@ gazetteer output.
 
 ## Changing the style of existing databases
 
-There is normally no issue changing the style of a database that is already
+There is usually no issue changing the style of a database that is already
 imported and now kept up-to-date with change files. Just be aware that any
 change in the style applies to updates only. If you want to change the data
 that is already in the database, then a reimport is necessary.
-- 
2.39.5


From 501e13483efb2799591c425870bb6c646067335f Mon Sep 17 00:00:00 2001
From: mtmail
Date: Wed, 18 Dec 2024 21:58:51 +0100
Subject: [PATCH 04/16] Settings.md - one setting was repeated

---
 docs/customize/Settings.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/customize/Settings.md b/docs/customize/Settings.md
index b00d04cf..94726ca7 100644
--- a/docs/customize/Settings.md
+++ b/docs/customize/Settings.md
@@ -336,7 +336,7 @@ NOMINATIM_TABLESPACE_SEARCH_INDEX
 NOMINATIM_TABLESPACE_OSM_DATA
 : Raw OSM data cache used for import and updates.
 
-NOMINATIM_TABLESPACE_OSM_DATA
+NOMINATIM_TABLESPACE_OSM_INDEX
 : Indexes on the raw OSM data cache.
 
 NOMINATIM_TABLESPACE_PLACE_DATA
-- 
2.39.5


From f76dbb0a167fa49839c30da1a4e1d15439070e2a Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Fri, 20 Dec 2024 11:27:45 +0100
Subject: [PATCH 05/16] docs: update Update docs for virtualenv use

---
 docs/admin/Update.md            | 78 ++++++++++-----------------------
 docs/customize/Import-Styles.md |  2 +-
 docs/customize/Tokenizers.md    |  2 +-
 3 files changed, 26 insertions(+), 56 deletions(-)

diff --git a/docs/admin/Update.md b/docs/admin/Update.md
index 5d1324d0..cdb79cae 100644
--- a/docs/admin/Update.md
+++ b/docs/admin/Update.md
@@ -68,10 +68,10 @@ the update interval no new data has been published yet, it will go to sleep
 until the next expected update and only then attempt to download the next batch.
 
 The one-time mode is particularly useful if you want to run updates continuously
-but need to schedule other work in between updates. For example, the main
-service at osm.org uses it, to regularly recompute postcodes -- a process that
-must not be run while updates are in progress. Its update script
-looks like this:
+but need to schedule other work in between updates. For example, you might
+want to regularly recompute postcodes -- a process that
+must not be run while updates are in progress. An update script refreshing
+postcodes regularly might look like this:
 
 ```sh
 #!/bin/bash
@@ -109,17 +109,19 @@ Unit=nominatim-updates.service
 WantedBy=multi-user.target
 ```
 
-And then a similar service definition: `/etc/systemd/system/nominatim-updates.service`:
+`OnUnitActiveSec` defines how often the individual update command is run.
+
+Then add a service definition for the timer in
+`/etc/systemd/system/nominatim-updates.service`:
 
 ```
 [Unit]
 Description=Single updates of Nominatim
 
 [Service]
-WorkingDirectory=/srv/nominatim
-ExecStart=nominatim replication --once
-StandardOutput=append:/var/log/nominatim-updates.log
-StandardError=append:/var/log/nominatim-updates.error.log
+WorkingDirectory=/srv/nominatim-project
+ExecStart=/srv/nominatim-venv/bin/nominatim replication --once
+StandardOutput=journald
+StandardError=inherit
 User=nominatim
 Group=nominatim
 Type=simple
@@ -128,9 +130,9 @@ Type=simple
 WantedBy=multi-user.target
 ```
 
-Replace the `WorkingDirectory` with your project directory. Also adapt user and
-group names as required. `OnUnitActiveSec` defines how often the individual
-update command is run.
+Replace the `WorkingDirectory` with your project directory. `ExecStart` points
+to the nominatim binary that was installed in your virtualenv earlier.
+Finally, you might need to adapt user and group names as required.
 
 Now activate the service and start the updates:
 
@@ -140,12 +142,13 @@
 sudo systemctl enable nominatim-updates.timer
 sudo systemctl start nominatim-updates.timer
 ```
 
-You can stop future data updates, while allowing any current, in-progress
+You can stop future data updates while allowing any current, in-progress
 update steps to finish, by running `sudo systemctl stop
 nominatim-updates.timer` and waiting until `nominatim-updates.service` isn't
-running (`sudo systemctl is-active nominatim-updates.service`). Current output
-from the update can be seen like above (`systemctl status
-nominatim-updates.service`).
+running (`sudo systemctl is-active nominatim-updates.service`).
+
+To check the output from the update process, use journalctl: `journalctl -u
+nominatim-updates.service`
 
 #### Catch-up mode
 
@@ -155,13 +158,13 @@ all changes from the server until the database is up-to-date. The catch-up mode
 still respects the parameter `NOMINATIM_REPLICATION_MAX_DIFF`. It downloads and
 applies the changes in appropriate batches until all is done.
 
-The catch-up mode is foremost useful to bring the database up to speed after the
+The catch-up mode is foremost useful to bring the database up to date after the
 initial import. Give that the service usually is not in production at this
 point, you can temporarily be a bit more generous with the batch size and
 number of threads you use for the updates by running catch-up like this:
 
 ```
-cd /srv/nominatim
+cd /srv/nominatim-project
 NOMINATIM_REPLICATION_MAX_DIFF=5000 nominatim replication --catch-up --threads 15
 ```
 
@@ -173,13 +176,13 @@ replication catch-up at whatever interval you desire.
 !!! warning
 
     When running scheduled updates with catch-up, it is a good idea to choose
    a replication source with an update frequency that is an order of magnitude
    lower. For example, if you want to update once a day, use an hourly updated
-    source. This makes sure that you don't miss an entire day of updates when
+    source. This ensures that you don't miss an entire day of updates when
     the source is unexpectedly late to publish its update.
 
     If you want to use the source with the same update frequency (e.g. a
     daily updated source with daily updates), use the
-    continuous update mode. It ensures to re-request the newest update until it
-    is published.
+    once mode together with a frequently run systemd script as described above.
+    It ensures to re-request the newest update until they have been published.
 
 #### Continuous updates
 
@@ -197,36 +200,3 @@ parameters:
 
 The update application keeps running forever and retrieves and applies
 new updates from the server as they are published.
-
-You can run this command as a simple systemd service. Create a service
-description like that in `/etc/systemd/system/nominatim-updates.service`:
-
-```
-[Unit]
-Description=Continuous updates of Nominatim
-
-[Service]
-WorkingDirectory=/srv/nominatim
-ExecStart=nominatim replication
-StandardOutput=append:/var/log/nominatim-updates.log
-StandardError=append:/var/log/nominatim-updates.error.log
-User=nominatim
-Group=nominatim
-Type=simple
-
-[Install]
-WantedBy=multi-user.target
-```
-
-Replace the `WorkingDirectory` with your project directory. Also adapt user
-and group names as required.
-
-Now activate the service and start the updates:
-
-```
-sudo systemctl daemon-reload
-sudo systemctl enable nominatim-updates
-sudo systemctl start nominatim-updates
-```
-
-
diff --git a/docs/customize/Import-Styles.md b/docs/customize/Import-Styles.md
index 67fe7482..23778f77 100644
--- a/docs/customize/Import-Styles.md
+++ b/docs/customize/Import-Styles.md
@@ -326,7 +326,7 @@ defined primary names are forgotten.)
 
 | Name | Description |
 | :----- | :---------- |
-| core | Basic set of recogniced names for all places. |
+| core | Basic set of recognized names for all places. |
 | address | Additional names useful when indexing full addresses. |
 | poi | Extended set of recognized names for pois. Use on top of the core set. |
diff --git a/docs/customize/Tokenizers.md b/docs/customize/Tokenizers.md
index 3c29972d..d290c148 100644
--- a/docs/customize/Tokenizers.md
+++ b/docs/customize/Tokenizers.md
@@ -50,7 +50,7 @@ queries. This happens in two stages:
    as during the import process but may involve other processing like,
    for example, word break detection.
 2. The **token analysis** step breaks down the query parts into tokens,
-   looks them up in the database and assignes them possible functions and
+   looks them up in the database and assigns them possible functions and
    probabilities.
 
 Query processing can be further customized while the rest of the analysis
-- 
2.39.5


From cad44eb00c081ba32da3f97bcefaef49f1ac7f33 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Fri, 20 Dec 2024 22:59:02 +0100
Subject: [PATCH 06/16] remove farms and isolated dwellings from computed
 addresses

Farms and isolated dwellings are usually confined to a very small
area, so it makes no sense to use them automatically in the addresses
of surrounding features. They can still be used for parenting when
referenced explicitly via addr:place.
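For illustration (not part of the patch itself): two-element arrays in
address-levels.json are read as a pair of search rank and address rank.
A minimal Python sketch of that interpretation, using the new values:

```python
# Illustration only: how the two-element rank arrays in
# settings/address-levels.json are meant to be read. This is not
# Nominatim's actual loader code.
RANKS = {
    "isolated_dwelling": [22, 25],  # [rank_search, rank_address]
    "farm": [22, 25],
}

def ranks_for(place_type):
    """Return (rank_search, rank_address); a plain number sets both."""
    value = RANKS[place_type]
    return tuple(value) if isinstance(value, list) else (value, value)

# Address rank 25 keeps farms out of the automatically computed
# addresses of surrounding features; addr:place can still refer to them.
assert ranks_for("farm") == (22, 25)
```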
---
 settings/address-levels.json                |  4 ++--
 test/bdd/db/import/rank_computation.feature | 31 +++++++++++++++++++++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/settings/address-levels.json b/settings/address-levels.json
index a82133ef..322ff707 100644
--- a/settings/address-levels.json
+++ b/settings/address-levels.json
@@ -23,8 +23,8 @@
     "allotments" : 22,
     "neighbourhood" : [20, 22],
     "quarter" : [20, 22],
-    "isolated_dwelling" : [22, 20],
-    "farm" : [22, 20],
+    "isolated_dwelling" : [22, 25],
+    "farm" : [22, 25],
     "city_block" : 25,
     "mountain_pass" : 25,
     "square" : 25,
diff --git a/test/bdd/db/import/rank_computation.feature b/test/bdd/db/import/rank_computation.feature
index df01fd91..0fce3e71 100644
--- a/test/bdd/db/import/rank_computation.feature
+++ b/test/bdd/db/import/rank_computation.feature
@@ -267,3 +267,34 @@ Feature: Rank assignment
             | object     | rank_search | rank_address |
             | N23:amenity | 30         | 30           |
             | N23:place   | 16         | 16           |
+
+    Scenario: Address rank 25 is only used for addr:place
+        Given the grid
+            | 10 | 33 | 34 | 11 |
+        Given the places
+            | osm | class | type    | name |
+            | N10 | place | village | vil  |
+            | N11 | place | farm    | farm |
+        And the places
+            | osm | class   | type        | name | geometry |
+            | W1  | highway | residential | RD   | 33,11    |
+        And the places
+            | osm | class   | type        | name | addr+farm | geometry |
+            | W2  | highway | residential | RD2  | farm      | 34,11    |
+        And the places
+            | osm | class | type  | housenr |
+            | N33 | place | house | 23      |
+        And the places
+            | osm | class | type  | housenr | addr+place |
+            | N34 | place | house | 23      | farm       |
+        When importing
+        Then placex contains
+            | object | parent_place_id |
+            | N11    | N10             |
+            | N33    | W1              |
+            | N34    | N11             |
+        And place_addressline contains
+            | object | address |
+            | W1     | N10     |
+            | W2     | N10     |
+            | W2     | N11     |
-- 
2.39.5


From c8a0dc8af19ad5485aebadcb57f981e134306b5e Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Thu, 19 Dec 2024 18:08:56 +0100
Subject: [PATCH 07/16] more efficient belongs-to-address determination

---
 lib-sql/functions/partition-functions.sql | 29 +++++------------------
 1 file changed, 6 insertions(+), 23 deletions(-)

diff --git a/lib-sql/functions/partition-functions.sql b/lib-sql/functions/partition-functions.sql
index 20ec3da6..94ed2639 100644
--- a/lib-sql/functions/partition-functions.sql
+++ b/lib-sql/functions/partition-functions.sql
@@ -17,28 +17,6 @@ CREATE TYPE nearfeaturecentr AS (
   centroid GEOMETRY
 );
 
--- feature intersects geometry
--- for areas and linestrings they must touch at least along a line
-CREATE OR REPLACE FUNCTION is_relevant_geometry(de9im TEXT, geom_type TEXT)
-RETURNS BOOLEAN
-AS $$
-BEGIN
-  IF substring(de9im from 1 for 2) != 'FF' THEN
-    RETURN TRUE;
-  END IF;
-
-  IF geom_type = 'ST_Point' THEN
-    RETURN substring(de9im from 4 for 1) = '0';
-  END IF;
-
-  IF geom_type in ('ST_LineString', 'ST_MultiLineString') THEN
-    RETURN substring(de9im from 4 for 1) = '1';
-  END IF;
-
-  RETURN substring(de9im from 4 for 1) = '2';
-END
-$$ LANGUAGE plpgsql IMMUTABLE;
-
 CREATE OR REPLACE function getNearFeatures(in_partition INTEGER,
                                            feature GEOMETRY,
                                            feature_centroid GEOMETRY,
                                            maxrank INTEGER)
@@ -59,7 +37,12 @@ BEGIN
              isguess, postcode, centroid
         FROM location_area_large_{{ partition }}
        WHERE geometry && feature
-         AND is_relevant_geometry(ST_Relate(geometry, feature), ST_GeometryType(feature))
+         AND CASE WHEN ST_Dimension(feature) = 0
+                    THEN _ST_Covers(geometry, feature)
+                  WHEN ST_Dimension(feature) = 2
+                    THEN ST_Relate(geometry, feature, 'T********')
+                  ELSE ST_NPoints(ST_Intersection(geometry, feature)) > 1
+             END
          AND rank_address < maxrank
          -- Postcodes currently still use rank_search to define for which
          -- features they are relevant.
-- 
2.39.5


From 32d3eb46d5dfa4fd5486dc8abfb6afc1dcb0a360 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Thu, 19 Dec 2024 20:09:27 +0100
Subject: [PATCH 08/16] move geometry split into insertLocationAreaLarge()

This way, the insert function only needs to be called once.

---
 lib-sql/functions/partition-functions.sql |  6 ++++--
 lib-sql/functions/utils.sql               | 21 ++++++++++-----------
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/lib-sql/functions/partition-functions.sql b/lib-sql/functions/partition-functions.sql
index 94ed2639..595e4a61 100644
--- a/lib-sql/functions/partition-functions.sql
+++ b/lib-sql/functions/partition-functions.sql
@@ -125,14 +125,16 @@ BEGIN
   IF in_rank_search <= 4 and not in_estimate THEN
     INSERT INTO location_area_country (place_id, country_code, geometry)
-      values (in_place_id, in_country_code, in_geometry);
+      (SELECT in_place_id, in_country_code, geom
+         FROM split_geometry(in_geometry) as geom);
     RETURN TRUE;
   END IF;
 
 {% for partition in db.partitions %}
   IF in_partition = {{ partition }} THEN
     INSERT INTO location_area_large_{{ partition }} (partition, place_id, country_code, keywords, rank_search, rank_address, isguess, postcode, centroid, geometry)
-      values (in_partition, in_place_id, in_country_code, in_keywords, in_rank_search, in_rank_address, in_estimate, postcode, in_centroid, in_geometry);
+      (SELECT in_partition, in_place_id, in_country_code, in_keywords, in_rank_search, in_rank_address, in_estimate, postcode, in_centroid, geom
+         FROM split_geometry(in_geometry) as geom);
     RETURN TRUE;
   END IF;
 {% endfor %}
diff --git a/lib-sql/functions/utils.sql b/lib-sql/functions/utils.sql
index df00f916..6af2afd5 100644
--- a/lib-sql/functions/utils.sql
+++ b/lib-sql/functions/utils.sql
@@ -348,8 +348,6 @@ CREATE OR REPLACE FUNCTION add_location(place_id BIGINT, country_code varchar(2)
   RETURNS BOOLEAN
   AS $$
 DECLARE
-  locationid INTEGER;
-  secgeo GEOMETRY;
   postcode TEXT;
 BEGIN
   PERFORM deleteLocationArea(partition, place_id, rank_search);
@@ -360,18 +358,19 @@
     postcode := upper(trim (in_postcode));
   END IF;
 
-  IF ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon') THEN
-    FOR secgeo IN select split_geometry(geometry) AS geom LOOP
-      PERFORM insertLocationAreaLarge(partition, place_id, country_code, keywords, rank_search, rank_address, false, postcode, centroid, secgeo);
-    END LOOP;
-
-  ELSEIF ST_GeometryType(geometry) = 'ST_Point' THEN
-    secgeo := place_node_fuzzy_area(geometry, rank_search);
-    PERFORM insertLocationAreaLarge(partition, place_id, country_code, keywords, rank_search, rank_address, true, postcode, centroid, secgeo);
+  IF ST_Dimension(geometry) = 2 THEN
+    RETURN insertLocationAreaLarge(partition, place_id, country_code, keywords,
+                                   rank_search, rank_address, false, postcode,
+                                   centroid, geometry);
+  END IF;
 
+  IF ST_Dimension(geometry) = 0 THEN
+    RETURN insertLocationAreaLarge(partition, place_id, country_code, keywords,
+                                   rank_search, rank_address, true, postcode,
+                                   centroid, place_node_fuzzy_area(geometry, rank_search));
   END IF;
 
-  RETURN true;
+  RETURN false;
 END;
 $$
 LANGUAGE plpgsql;
-- 
2.39.5


From 499110f549cc2c369ea3a0fb55a79a11bdc0352f Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Fri, 20 Dec 2024 11:03:15 +0100
Subject: [PATCH 09/16] split up MultiPolygons before adding them to
 large_areas table

---
 lib-sql/functions/utils.sql | 48 ++++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 19 deletions(-)

diff --git a/lib-sql/functions/utils.sql b/lib-sql/functions/utils.sql
index 6af2afd5..534beb58 100644
--- a/lib-sql/functions/utils.sql
+++ b/lib-sql/functions/utils.sql
@@ -393,19 +393,21 @@
 DECLARE
   geo RECORD;
   area FLOAT;
   remainingdepth INTEGER;
-  added INTEGER;
 BEGIN
-  -- RAISE WARNING 'quad_split_geometry: maxarea=%, depth=%',maxarea,maxdepth;
-  IF (ST_GeometryType(geometry) not in ('ST_Polygon','ST_MultiPolygon') OR NOT ST_IsValid(geometry)) THEN
+  IF not ST_IsValid(geometry) THEN
+    RETURN;
+  END IF;
+
+  IF ST_Dimension(geometry) != 2 OR maxdepth <= 1 THEN
     RETURN NEXT geometry;
     RETURN;
   END IF;
 
   remainingdepth := maxdepth - 1;
   area := ST_AREA(geometry);
-  IF remainingdepth < 1 OR area < maxarea THEN
+  IF area < maxarea THEN
     RETURN NEXT geometry;
     RETURN;
   END IF;
@@ -425,7 +427,6 @@ BEGIN
   xmid := (xmin+xmax)/2;
   ymid := (ymin+ymax)/2;
 
-  added := 0;
   FOR seg IN 1..4 LOOP
 
     IF seg = 1 THEN
@@ -441,16 +442,13 @@
       secbox := ST_SetSRID(ST_MakeBox2D(ST_Point(xmid,ymid),ST_Point(xmax,ymax)),4326);
     END IF;
 
-    IF st_intersects(geometry, secbox) THEN
-      secgeo := st_intersection(geometry, secbox);
-      IF NOT ST_IsEmpty(secgeo) AND ST_GeometryType(secgeo) in ('ST_Polygon','ST_MultiPolygon') THEN
-        FOR geo IN select quad_split_geometry(secgeo, maxarea, remainingdepth) as geom LOOP
-          IF NOT ST_IsEmpty(geo.geom) AND ST_GeometryType(geo.geom) in ('ST_Polygon','ST_MultiPolygon') THEN
-            added := added + 1;
-            RETURN NEXT geo.geom;
-          END IF;
-        END LOOP;
-      END IF;
+    secgeo := st_intersection(geometry, secbox);
+    IF NOT ST_IsEmpty(secgeo) AND ST_Dimension(secgeo) = 2 THEN
+      FOR geo IN SELECT quad_split_geometry(secgeo, maxarea, remainingdepth) as geom LOOP
+        IF NOT ST_IsEmpty(geo.geom) AND ST_Dimension(geo.geom) = 2 THEN
+          RETURN NEXT geo.geom;
+        END IF;
+      END LOOP;
     END IF;
   END LOOP;
 
@@ -466,10 +464,22 @@ CREATE OR REPLACE FUNCTION split_geometry(geometry GEOMETRY)
 DECLARE
   geo RECORD;
 BEGIN
-  -- 10000000000 is ~~ 1x1 degree
-  FOR geo IN select quad_split_geometry(geometry, 0.25, 20) as geom LOOP
-    RETURN NEXT geo.geom;
-  END LOOP;
+  IF ST_GeometryType(geometry) = 'ST_MultiPolygon'
+     and ST_Area(geometry) * 10 > ST_Area(Box2D(geometry))
+  THEN
+    FOR geo IN
+      SELECT quad_split_geometry(g, 0.25, 20) as geom
+        FROM (SELECT (ST_Dump(geometry)).geom::geometry(Polygon, 4326) AS g) xx
+    LOOP
+      RETURN NEXT geo.geom;
+    END LOOP;
+  ELSE
+    FOR geo IN
+      SELECT quad_split_geometry(geometry, 0.25, 20) as geom
+    LOOP
+      RETURN NEXT geo.geom;
+    END LOOP;
+  END IF;
 
   RETURN;
 END;
 $$
-- 
2.39.5


From 501e13483efb2799591c425870bb6c646067335f Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Mon, 6 Jan 2025 17:10:24 +0100
Subject: [PATCH 10/16] add SOFT_PHRASE break and enable parsing

Also enables parsing of PART breaks.
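As a sketch of the intended behaviour (stand-alone illustration, the
helper below is not code from this patch): queries are now split on
`[ :-]` and each separator maps to a break type, with ':' becoming the
new SOFT_PHRASE break:

```python
# Sketch mirroring the re.split() call added to icu_tokenizer.py.
import re

BREAKS = {',': 'PHRASE', ':': 'SOFT_PHRASE', ' ': 'WORD', '-': 'PART'}

def show_breaks(phrase):
    """Pair each term with the break type that follows it."""
    parts = re.split('([ :-])', phrase)
    # With a capturing group, re.split() alternates term, break, term, ...
    # The final term is followed by the phrase break ','.
    pairs = zip(parts[::2], parts[1::2] + [','])
    return [(term, BREAKS[brk]) for term, brk in pairs if term]

print(show_breaks('main st:12-3'))
# [('main', 'WORD'), ('st', 'SOFT_PHRASE'), ('12', 'PART'), ('3', 'PHRASE')]
```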
---
 src/nominatim_api/search/db_search_builder.py |  1 +
 src/nominatim_api/search/icu_tokenizer.py     | 14 +++++++++++---
 src/nominatim_api/search/query.py             |  8 +++++++-
 src/nominatim_api/search/token_assignment.py  |  1 +
 4 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/src/nominatim_api/search/db_search_builder.py b/src/nominatim_api/search/db_search_builder.py
index 632270ef..a6335c13 100644
--- a/src/nominatim_api/search/db_search_builder.py
+++ b/src/nominatim_api/search/db_search_builder.py
@@ -433,6 +433,7 @@ PENALTY_WORDCHANGE = {
     BreakType.START: 0.0,
     BreakType.END: 0.0,
     BreakType.PHRASE: 0.0,
+    BreakType.SOFT_PHRASE: 0.0,
     BreakType.WORD: 0.1,
     BreakType.PART: 0.2,
     BreakType.TOKEN: 0.4
diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py
index 5976fbec..d52614fd 100644
--- a/src/nominatim_api/search/icu_tokenizer.py
+++ b/src/nominatim_api/search/icu_tokenizer.py
@@ -11,6 +11,8 @@ from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast
 from collections import defaultdict
 import dataclasses
 import difflib
+import re
+from itertools import zip_longest
 
 from icu import Transliterator
 
@@ -242,16 +244,22 @@
         wordnr = 0
         for phrase in query.source:
             query.nodes[-1].ptype = phrase.ptype
-            for word in phrase.text.split(' '):
+            phrase_split = re.split('([ :-])', phrase.text)
+            # The zip construct will give us the pairs of word/break from
+            # the regular expression split. As the split array ends on the
+            # final word, we simply use the fillvalue to even out the list and
+            # add the phrase break at the end.
+            for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue=','):
+                if not word:
+                    continue
                 trans = self.transliterator.transliterate(word)
                 if trans:
                     for term in trans.split(' '):
                         if term:
                             parts.append(QueryPart(term, word, wordnr))
                             query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
-                    query.nodes[-1].btype = qmod.BreakType.WORD
+                    query.nodes[-1].btype = qmod.BreakType(breakchar)
                 wordnr += 1
-            query.nodes[-1].btype = qmod.BreakType.PHRASE
 
         for word, wrange in yield_words(parts, phrase_start):
             words[word].append(wrange)
diff --git a/src/nominatim_api/search/query.py b/src/nominatim_api/search/query.py
index 02ebbb5b..b2e18337 100644
--- a/src/nominatim_api/search/query.py
+++ b/src/nominatim_api/search/query.py
@@ -21,7 +21,13 @@ class BreakType(enum.Enum):
     END = '>'
     """ End of the query. """
     PHRASE = ','
-    """ Break between two phrases. """
+    """ Hard break between two phrases. Address parts cannot cross hard
+        phrase boundaries."""
+    SOFT_PHRASE = ':'
+    """ Likely break between two phrases. Address parts should not cross soft
+        phrase boundaries. Soft breaks can be inserted by a preprocessor
+        that is analysing the input string.
+    """
     WORD = ' '
     """ Break between words. """
""" PART = '-' diff --git a/src/nominatim_api/search/token_assignment.py b/src/nominatim_api/search/token_assignment.py index a2e1804c..0983fd13 100644 --- a/src/nominatim_api/search/token_assignment.py +++ b/src/nominatim_api/search/token_assignment.py @@ -27,6 +27,7 @@ PENALTY_TOKENCHANGE = { qmod.BreakType.START: 0.0, qmod.BreakType.END: 0.0, qmod.BreakType.PHRASE: 0.0, + qmod.BreakType.SOFT_PHRASE: 0.0, qmod.BreakType.WORD: 0.1, qmod.BreakType.PART: 0.2, qmod.BreakType.TOKEN: 0.4 -- 2.39.5 From d984100e23d28253b3659e7b628cde2c8b436cf8 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 7 Jan 2025 20:40:58 +0100 Subject: [PATCH 11/16] add inner word break penalty --- src/nominatim_api/search/icu_tokenizer.py | 43 ++++++++++++++++------- src/nominatim_api/search/query.py | 1 + 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py index d52614fd..6f1dcf79 100644 --- a/src/nominatim_api/search/icu_tokenizer.py +++ b/src/nominatim_api/search/icu_tokenizer.py @@ -7,7 +7,7 @@ """ Implementation of query analysis for the ICU tokenizer. """ -from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast +from typing import Tuple, Dict, List, Optional, Iterator, Any, cast from collections import defaultdict import dataclasses import difflib @@ -36,17 +36,30 @@ DB_TO_TOKEN_TYPE = { 'C': qmod.TokenType.COUNTRY } +PENALTY_IN_TOKEN_BREAK = { + qmod.BreakType.START: 0.5, + qmod.BreakType.END: 0.5, + qmod.BreakType.PHRASE: 0.5, + qmod.BreakType.SOFT_PHRASE: 0.5, + qmod.BreakType.WORD: 0.1, + qmod.BreakType.PART: 0.0, + qmod.BreakType.TOKEN: 0.0 +} + -class QueryPart(NamedTuple): +@dataclasses.dataclass +class QueryPart: """ Normalized and transliterated form of a single term in the query. When the term came out of a split during the transliteration, the normalized string is the full word before transliteration. The word number keeps track of the word before transliteration and can be used to identify partial transliterated terms. + Penalty is the break penalty for the break following the token. """ token: str normalized: str word_number: int + penalty: float QueryParts = List[QueryPart] @@ -60,10 +73,12 @@ def yield_words(terms: List[QueryPart], start: int) -> Iterator[Tuple[str, qmod. total = len(terms) for first in range(start, total): word = terms[first].token - yield word, qmod.TokenRange(first, first + 1) + penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType.WORD] + yield word, qmod.TokenRange(first, first + 1, penalty=penalty) for last in range(first + 1, min(first + 20, total)): word = ' '.join((word, terms[last].token)) - yield word, qmod.TokenRange(first, last + 1) + penalty += terms[last - 1].penalty + yield word, qmod.TokenRange(first, last + 1, penalty=penalty) @dataclasses.dataclass @@ -96,25 +111,25 @@ class ICUToken(qmod.Token): self.penalty += (distance/len(self.lookup_word)) @staticmethod - def from_db_row(row: SaRow) -> 'ICUToken': + def from_db_row(row: SaRow, base_penalty: float = 0.0) -> 'ICUToken': """ Create a ICUToken from the row of the word table. 
""" count = 1 if row.info is None else row.info.get('count', 1) addr_count = 1 if row.info is None else row.info.get('addr_count', 1) - penalty = 0.0 + penalty = base_penalty if row.type == 'w': - penalty = 0.3 + penalty += 0.3 elif row.type == 'W': if len(row.word_token) == 1 and row.word_token == row.word: - penalty = 0.2 if row.word.isdigit() else 0.3 + penalty += 0.2 if row.word.isdigit() else 0.3 elif row.type == 'H': - penalty = sum(0.1 for c in row.word_token if c != ' ' and not c.isdigit()) + penalty += sum(0.1 for c in row.word_token if c != ' ' and not c.isdigit()) if all(not c.isdigit() for c in row.word_token): penalty += 0.2 * (len(row.word_token) - 1) elif row.type == 'C': if len(row.word_token) == 1: - penalty = 0.3 + penalty += 0.3 if row.info is None: lookup_word = row.word @@ -204,7 +219,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): for row in await self.lookup_in_db(list(words.keys())): for trange in words[row.word_token]: - token = ICUToken.from_db_row(row) + token = ICUToken.from_db_row(row, trange.penalty or 0.0) if row.type == 'S': if row.info['op'] in ('in', 'near'): if trange.start == 0: @@ -256,9 +271,11 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): if trans: for term in trans.split(' '): if term: - parts.append(QueryPart(term, word, wordnr)) + parts.append(QueryPart(term, word, wordnr, + PENALTY_IN_TOKEN_BREAK[qmod.BreakType.TOKEN])) query.add_node(qmod.BreakType.TOKEN, phrase.ptype) query.nodes[-1].btype = qmod.BreakType(breakchar) + parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType(breakchar)] wordnr += 1 for word, wrange in yield_words(parts, phrase_start): @@ -280,7 +297,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): """ Add tokens to query that are not saved in the database. """ for part, node, i in zip(parts, query.nodes, range(1000)): - if len(part.token) <= 4 and part[0].isdigit()\ + if len(part.token) <= 4 and part.token.isdigit()\ and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER): query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER, ICUToken(penalty=0.5, token=0, diff --git a/src/nominatim_api/search/query.py b/src/nominatim_api/search/query.py index b2e18337..aa169431 100644 --- a/src/nominatim_api/search/query.py +++ b/src/nominatim_api/search/query.py @@ -122,6 +122,7 @@ class TokenRange: """ start: int end: int + penalty: Optional[float] = None def __lt__(self, other: 'TokenRange') -> bool: return self.end <= other.start -- 2.39.5 From 86ad9efa8abb1fb478b3be5b6c469877aad05a51 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 7 Jan 2025 21:32:32 +0100 Subject: [PATCH 12/16] keep break indicators [:-] during normalisation All punctuation will be converted to '-'. Soft breaks : may be added by preprocessors. The break signs are only used during query analysis and are ignored during import token analysis. 
---
 settings/icu_tokenizer.yaml                      | 7 ++++---
 src/nominatim_api/search/geocoder.py             | 4 ++--
 src/nominatim_db/tokenizer/icu_token_analysis.py | 2 ++
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml
index 530df1a6..437319fa 100644
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -9,16 +9,17 @@ normalization:
     - "'nº' > 'no'"
     - "ª > a"
     - "º > o"
-    - "[[:Punctuation:][:Symbol:]\u02bc] > ' '"
+    - "[[:Punctuation:][:Symbol:][\u02bc] - [-:]]+ > '-'"
     - "ß > 'ss'" # German szet is unambiguously equal to double ss
-    - "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:]] >"
+    - "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:] [-:]] >"
     - "[:Lm:] >"
     - ":: [[:Number:]] Latin ()"
     - ":: [[:Number:]] Ascii ();"
     - ":: [[:Number:]] NFD ();"
     - "[[:Nonspacing Mark:] [:Cf:]] >;"
-    - "[:Space:]+ > ' '"
+    - "[-:]?[:Space:]+[-:]? > ' '"
 transliteration:
+    - "[-:] > ' '"
     - ":: Latin ()"
     - !include icu-rules/extended-unicode-to-asccii.yaml
     - ":: Ascii ()"
diff --git a/src/nominatim_api/search/geocoder.py b/src/nominatim_api/search/geocoder.py
index efe5b721..69455d77 100644
--- a/src/nominatim_api/search/geocoder.py
+++ b/src/nominatim_api/search/geocoder.py
@@ -133,7 +133,7 @@ class ForwardGeocoder:
         """
         assert self.query_analyzer is not None
         qwords = [word for phrase in query.source
-                  for word in re.split('[, ]+', phrase.text) if word]
+                  for word in re.split('[-,: ]+', phrase.text) if word]
         if not qwords:
             return
 
@@ -146,7 +146,7 @@
             distance = 0.0
             norm = self.query_analyzer.normalize_text(' '.join((result.display_name,
                                                                 result.country_code or '')))
-            words = set((w for w in norm.split(' ') if w))
+            words = set((w for w in re.split('[-,: ]+', norm) if w))
             if not words:
                 continue
             for qword in qwords:
diff --git a/src/nominatim_db/tokenizer/icu_token_analysis.py b/src/nominatim_db/tokenizer/icu_token_analysis.py
index a3cdcb7a..c1ba106c 100644
--- a/src/nominatim_db/tokenizer/icu_token_analysis.py
+++ b/src/nominatim_db/tokenizer/icu_token_analysis.py
@@ -25,6 +25,8 @@ class ICUTokenAnalysis:
 
     def __init__(self, norm_rules: str, trans_rules: str,
                  analysis_rules: Mapping[Optional[str], 'TokenAnalyzerRule']):
+        # additional break signs are not relevant during name analysis
+        norm_rules += ";[[:Space:][-:]]+ > ' ';"
         self.normalizer = Transliterator.createFromRules("icu_normalization",
                                                          norm_rules)
         trans_rules += ";[:Space:]+ > ' '"
-- 
2.39.5


From efc09a5cfcfa85663db7d2a4deb86808a0a72de6 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Wed, 8 Jan 2025 19:43:25 +0100
Subject: [PATCH 13/16] add japanese phrase preprocessing

Code adapted from GSoC code by @miku.
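A stand-alone sketch of the splitting idea (character classes slightly
simplified from the new preprocessor): try the most specific pattern
first and join the captured groups with ':', the soft phrase break
introduced earlier in this series:

```python
import re

PATTERNS = [
    r'(...??[都道府県縣])(.+?[市区町村])(.+)',  # prefecture/municipality/rest
    r'(...??[都道府県縣])(.+)',                 # prefecture/rest
    r'(.+?[市区町村])(.+)',                     # municipality/rest
]

def split_japanese(text):
    """Insert ':' between the recognised parts of a Japanese address."""
    for pattern in PATTERNS:
        m = re.match(pattern, text)
        if m:
            return ':'.join(m.groups())
    return text

print(split_japanese('大阪府大阪市大阪'))  # 大阪府:大阪市:大阪
print(split_japanese('大阪市大阪'))        # 大阪市:大阪
```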
---
 settings/icu_tokenizer.yaml                         |  1 +
 .../query_preprocessing/split_japanese_phrases.py   | 61 +++++++++++++++++++
 .../query_processing/test_split_japanese_phrases.py | 34 ++++++++++
 3 files changed, 96 insertions(+)
 create mode 100644 src/nominatim_api/query_preprocessing/split_japanese_phrases.py
 create mode 100644 test/python/api/query_processing/test_split_japanese_phrases.py

diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml
index 437319fa..6cf30d59 100644
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -1,4 +1,5 @@
 query-preprocessing:
+    - step: split_japanese_phrases
     - step: normalize
 normalization:
     - ":: lower ()"
diff --git a/src/nominatim_api/query_preprocessing/split_japanese_phrases.py b/src/nominatim_api/query_preprocessing/split_japanese_phrases.py
new file mode 100644
index 00000000..7ab55b5f
--- /dev/null
+++ b/src/nominatim_api/query_preprocessing/split_japanese_phrases.py
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+This file divides Japanese addresses into three categories:
+prefecture, municipality, and other.
+The division is not strict but simple using these keywords.
+"""
+from typing import List
+import re
+
+from .config import QueryConfig
+from .base import QueryProcessingFunc
+from ..search.query import Phrase
+
+MATCH_PATTERNS = [
+    r'''
+        (...??[都都道府県縣])            # [group1] prefecture
+        (.+?[市区區町村])                # [group2] municipalities (city/wards/towns/villages)
+        (.+)                            # [group3] other words
+    ''',
+    r'''
+        (...??[都都道府県縣])            # [group1] prefecture
+        (.+)                            # [group3] other words
+    ''',
+    r'''
+        (.+?[市区區町村])                # [group2] municipalities (city/wards/towns/villages)
+        (.+)                            # [group3] other words
+    '''
+]
+
+
+class _JapanesePreprocessing:
+
+    def __init__(self, config: QueryConfig) -> None:
+        self.config = config
+
+    def split_phrase(self, phrase: Phrase) -> Phrase:
+        """
+        This function performs a division on the given text using a regular expression.
+        """
+        for pattern in MATCH_PATTERNS:
+            result = re.match(pattern, phrase.text, re.VERBOSE)
+            if result is not None:
+                return Phrase(phrase.ptype, ':'.join(result.groups()))
+
+        return phrase
+
+    def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
+        """Split a Japanese address using japanese_tokenizer.
+        """
+        return [self.split_phrase(p) for p in phrases]
+
+
+def create(config: QueryConfig) -> QueryProcessingFunc:
+    """ Create a function of japanese preprocessing.
+    """
+    return _JapanesePreprocessing(config)
diff --git a/test/python/api/query_processing/test_split_japanese_phrases.py b/test/python/api/query_processing/test_split_japanese_phrases.py
new file mode 100644
index 00000000..6055f9db
--- /dev/null
+++ b/test/python/api/query_processing/test_split_japanese_phrases.py
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for japanese phrase splitting.
+""" +from pathlib import Path + +import pytest + +from icu import Transliterator + +import nominatim_api.search.query as qmod +from nominatim_api.query_preprocessing.config import QueryConfig +from nominatim_api.query_preprocessing import split_japanese_phrases + +def run_preprocessor_on(query): + proc = split_japanese_phrases.create(QueryConfig().set_normalizer(None)) + + return proc(query) + + +@pytest.mark.parametrize('inp,outp', [('大阪府大阪市大阪', '大阪府:大阪市:大阪'), + ('大阪府大阪', '大阪府:大阪'), + ('大阪市大阪', '大阪市:大阪')]) +def test_split_phrases(inp, outp): + query = [qmod.Phrase(qmod.PhraseType.NONE, inp)] + + out = run_preprocessor_on(query) + + assert out == [qmod.Phrase(qmod.PhraseType.NONE, outp)] -- 2.39.5 From c2cb6722fe8c855b42514b74146295f8001044e2 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 9 Jan 2025 17:14:37 +0100 Subject: [PATCH 14/16] use autocommit when creating tables and indexes Might avoid some deadlock situations with autovacuum. --- src/nominatim_db/clicmd/setup.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/nominatim_db/clicmd/setup.py b/src/nominatim_db/clicmd/setup.py index 39cbe65a..fb193838 100644 --- a/src/nominatim_db/clicmd/setup.py +++ b/src/nominatim_db/clicmd/setup.py @@ -122,13 +122,16 @@ class SetupAll: LOG.warning('Post-process tables') with connect(args.config.get_libpq_dsn()) as conn: + conn.autocommit = True await database_import.create_search_indices(conn, args.config, drop=args.no_updates, threads=num_threads) LOG.warning('Create search index for default country names.') + conn.autocommit = False country_info.create_country_names(conn, tokenizer, args.config.get_str_list('LANGUAGES')) if args.no_updates: + conn.autocommit = True freeze.drop_update_tables(conn) tokenizer.finalize_import(args.config) @@ -183,6 +186,7 @@ class SetupAll: from ..tools import database_import, refresh with connect(config.get_libpq_dsn()) as conn: + conn.autocommit = True LOG.warning('Create functions (1st pass)') refresh.create_functions(conn, config, False, False) LOG.warning('Create tables') -- 2.39.5 From 0cf636a80ce668e5c2a0d0000208f63a33315071 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 10 Jan 2025 13:55:43 +0100 Subject: [PATCH 15/16] ignore overly long ways during import --- lib-lua/themes/nominatim/init.lua | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib-lua/themes/nominatim/init.lua b/lib-lua/themes/nominatim/init.lua index dacaaae8..fef86f91 100644 --- a/lib-lua/themes/nominatim/init.lua +++ b/lib-lua/themes/nominatim/init.lua @@ -425,7 +425,7 @@ function Place:write_row(k, v) if self.geometry == nil then self.geometry = self.geom_func(self.object) end - if self.geometry:is_null() then + if self.geometry == nil or self.geometry:is_null() then return 0 end @@ -608,6 +608,9 @@ function module.process_way(object) if geom:is_null() then geom = o:as_linestring() + if geom:is_null() or geom:length() > 30 then + return nil + end end return geom -- 2.39.5 From 343b3c0f1c5d72e248ac73f52ad26ce2c84a8e53 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 11 Jan 2025 10:31:34 +0100 Subject: [PATCH 16/16] release 4.5.0.post9 --- packaging/nominatim-api/pyproject.toml | 6 +++--- packaging/nominatim-db/pyproject.toml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/packaging/nominatim-api/pyproject.toml b/packaging/nominatim-api/pyproject.toml index b86b346b..e3974143 100644 --- a/packaging/nominatim-api/pyproject.toml +++ b/packaging/nominatim-api/pyproject.toml @@ -1,6 +1,6 @@ [project] name = 
"nominatim-api" -version = "4.5.0.post8" +version = "4.5.0.post9" description = "A tool for building a database of OpenStreetMap for geocoding and for searching the database. Search library." readme = "README.md" requires-python = ">=3.7" @@ -18,11 +18,11 @@ classifiers = [ dependencies = [ "python-dotenv==1.0.1", "pyYAML==6.0.2", - "SQLAlchemy==2.0.36", + "SQLAlchemy==2.0.37", "psycopg[binary]==3.2.3", "PyICU==2.14", "falcon==4.0.2", - "uvicorn==0.32.1", + "uvicorn==0.34.0", "gunicorn==23.0.0" ] diff --git a/packaging/nominatim-db/pyproject.toml b/packaging/nominatim-db/pyproject.toml index f35880f5..79dcfb6f 100644 --- a/packaging/nominatim-db/pyproject.toml +++ b/packaging/nominatim-db/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "nominatim-db" -version = "4.5.0.post8" +version = "4.5.0.post9" description = "A tool for building a database of OpenStreetMap for geocoding and for searching the database. Database backend." readme = "README.md" requires-python = ">=3.7" @@ -18,10 +18,10 @@ classifiers = [ dependencies = [ "psycopg[binary]==3.2.3", "python-dotenv==1.0.1", - "jinja2==3.1.4", + "jinja2==3.1.5", "pyYAML==6.0.2", "datrie==0.8.2", - "psutil==6.1.0", + "psutil==6.1.1", "PyICU==2.14", "osmium==4.0.2", ] -- 2.39.5