git.openstreetmap.org Git - nominatim.git/commitdiff
Merge pull request #2450 from mtmail/tiger-data-2021
author Sarah Hoffmann <lonvia@denofr.de>
Mon, 11 Oct 2021 17:22:15 +0000 (19:22 +0200)
committer GitHub <noreply@github.com>
Mon, 11 Oct 2021 17:22:15 +0000 (19:22 +0200)
US TIGER data 2021 released

56 files changed:
docs/admin/Installation.md
docs/admin/Tokenizers.md
docs/develop/Tokenizers.md
lib-php/SearchDescription.php
lib-php/admin/country_languages.php [deleted file]
lib-sql/functions/interpolation.sql
lib-sql/functions/partition-functions.sql
lib-sql/functions/place_triggers.sql
lib-sql/functions/placex_triggers.sql
lib-sql/functions/utils.sql
lib-sql/tables.sql
lib-sql/tiger_import_start.sql
lib-sql/tokenizer/icu_tokenizer.sql
lib-sql/tokenizer/legacy_tokenizer.sql
nominatim/config.py
nominatim/db/sql_preprocessor.py
nominatim/indexer/place_info.py [new file with mode: 0644]
nominatim/indexer/runners.py
nominatim/tokenizer/base.py
nominatim/tokenizer/factory.py
nominatim/tokenizer/icu_name_processor.py [deleted file]
nominatim/tokenizer/icu_rule_loader.py
nominatim/tokenizer/icu_token_analysis.py [new file with mode: 0644]
nominatim/tokenizer/icu_tokenizer.py
nominatim/tokenizer/icu_variants.py [deleted file]
nominatim/tokenizer/legacy_tokenizer.py
nominatim/tokenizer/place_sanitizer.py [new file with mode: 0644]
nominatim/tokenizer/sanitizers/__init__.py [new file with mode: 0644]
nominatim/tokenizer/sanitizers/split_name_list.py [new file with mode: 0644]
nominatim/tokenizer/sanitizers/strip_brace_terms.py [new file with mode: 0644]
nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py [new file with mode: 0644]
nominatim/tokenizer/token_analysis/__init__.py [new file with mode: 0644]
nominatim/tokenizer/token_analysis/generic.py [new file with mode: 0644]
nominatim/tools/check_database.py
nominatim/tools/country_info.py
nominatim/tools/tiger_data.py
settings/address-levels.json
settings/country_settings.yaml
settings/icu_tokenizer.yaml
test/bdd/db/import/search_name.feature
test/bdd/db/query/normalization.feature
test/bdd/db/update/parenting.feature
test/python/conftest.py
test/python/dummy_tokenizer.py
test/python/test_db_connection.py
test/python/test_indexing.py
test/python/test_tokenizer_icu.py
test/python/test_tokenizer_icu_name_processor.py [deleted file]
test/python/test_tokenizer_icu_rule_loader.py
test/python/test_tokenizer_legacy.py
test/python/test_tools_check_database.py
test/python/tokenizer/sanitizers/test_split_name_list.py [new file with mode: 0644]
test/python/tokenizer/sanitizers/test_strip_brace_terms.py [new file with mode: 0644]
test/python/tokenizer/sanitizers/test_tag_analyzer_by_language.py [new file with mode: 0644]
test/python/tokenizer/test_place_sanitizer.py [new file with mode: 0644]
test/python/tokenizer/token_analysis/test_generic.py [new file with mode: 0644]

index 0258b8ff86290de9d6adf1907bfc808b4929db82..2ae594baadc4e8cf4a36a1ab8e09baf44dbc5239 100644 (file)
@@ -24,6 +24,10 @@ and can't offer support.
 
 ### Software
 
+!!! Warning
+    For larger installations you **must have** PostgreSQL 11+ and PostGIS 3+,
+    otherwise import and queries will be slow to the point of being unusable.
+
 For compiling:
 
   * [cmake](https://cmake.org/)
@@ -39,7 +43,7 @@ For compiling:
 For running Nominatim:
 
   * [PostgreSQL](https://www.postgresql.org) (9.5+ will work, 11+ strongly recommended)
-  * [PostGIS](https://postgis.net) (2.2+)
+  * [PostGIS](https://postgis.net) (2.2+ will work, 3.0+ strongly recommended)
   * [Python 3](https://www.python.org/) (3.6+)
   * [Psycopg2](https://www.psycopg.org) (2.7+)
   * [Python Dotenv](https://github.com/theskumar/python-dotenv)
index 6f8898c8ee70690d88aabd63661b758c9ed37b38..90d0fb5e03332702a5c551f3fc69dab317a675c3 100644 (file)
@@ -60,22 +60,23 @@ NOMINATIM_TOKENIZER=icu
 
 ### How it works
 
-On import the tokenizer processes names in the following four stages:
-
-1. The **Normalization** part removes all non-relevant information from the
-   input.
-2. Incoming names are now converted to **full names**. This process is currently
-   hard coded and mostly serves to handle name tags from OSM that contain
-   multiple names (e.g. [Biel/Bienne](https://www.openstreetmap.org/node/240097197)).
-3. Next the tokenizer creates **variants** from the full names. These variants
-   cover decomposition and abbreviation handling. Variants are saved to the
-   database, so that it is not necessary to create the variants for a search
-   query.
-4. The final **Tokenization** step converts the names to a simple ASCII form,
-   potentially removing further spelling variants for better matching.
-
-At query time only stage 1) and 4) are used. The query is normalized and
-tokenized and the resulting string used for searching in the database.
+On import the tokenizer processes names in the following three stages:
+
+1. During the **Sanitizer step** incoming names are cleaned up and converted to
+   **full names**. This step can be used to regularize spelling, split multi-name
+   tags into their parts and tag names with additional attributes. See the
+   [Sanitizers section](#sanitizers) below for available cleaning routines.
+2. The **Normalization** part removes all information from the full names
+   that is not relevant for search.
+3. The **Token analysis** step takes the normalized full names and creates
+   all transliterated variants under which the name should be searchable.
+   See the [Token analysis](#token-analysis) section below for more
+   information.
+
+During query time, only normalization and transliteration are relevant.
+An incoming query is first split into name chunks (this usually means splitting
+the string at the commas) and each part is normalised and transliterated.
+The result is used to look up places in the search index.
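
A minimal sketch of this query-time path, with plain lower-casing and ASCII
folding standing in for the configured ICU rules described in the
Configuration section below:

```python
# Simplified query handling: split at commas, then normalise and
# transliterate each chunk. The two helpers are stand-ins for the
# configured normalization and transliteration rules.
import unicodedata

def normalise(text):
    return ' '.join(text.lower().split())

def transliterate(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode()

query = "Rue de l'Église, Montréal"
print([transliterate(normalise(part)) for part in query.split(',')])
# ["rue de l'eglise", 'montreal']
```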
 
 ### Configuration
 
@@ -93,21 +94,36 @@ normalization:
 transliteration:
     - !include /etc/nominatim/icu-rules/extended-unicode-to-asccii.yaml
     - ":: Ascii ()"
-variants:
-    - language: de
-      words:
-        - ~haus => haus
-        - ~strasse -> str
-    - language: en
-      words: 
-        - road -> rd
-        - bridge -> bdge,br,brdg,bri,brg
+sanitizers:
+    - step: split-name-list
+token-analysis:
+    - analyzer: generic
+      variants:
+          - !include icu-rules/variants-ca.yaml
+          - words:
+              - road -> rd
+              - bridge -> bdge,br,brdg,bri,brg
 ```
 
-The configuration file contains three sections:
-`normalization`, `transliteration`, `variants`.
+The configuration file contains four sections:
+`normalization`, `transliteration`, `sanitizers` and `token-analysis`.
 
-The normalization and transliteration sections each must contain a list of
+#### Normalization and Transliteration
+
+The normalization and transliteration sections each define a set of
+ICU rules that are applied to the names.
+
+The **normalisation** rules are applied after the sanitizer step. They should
+remove any information that is not relevant for search at all. Usual rules to
+be applied here are: lower-casing, removal of special characters and cleanup
+of spaces.
+
+The **transliteration** rules are applied at the end of the tokenization
+process to transform the name into an ASCII representation. Transliteration can
+be useful to allow for further fuzzy matching, especially between different
+scripts.
+
+Each section must contain a list of
 [ICU transformation rules](https://unicode-org.github.io/icu/userguide/transforms/general/rules.html).
 The rules are applied in the order in which they appear in the file.
 You can also include additional rules from an external yaml file using the
@@ -119,6 +135,85 @@ and may again include other files.
     YAML syntax. You should therefore always enclose the ICU rules in
     double-quotes.
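
To illustrate how the two rule sets are applied, here is a small sketch using
PyICU's `Transliterator.createFromRules` (the API the ICU tokenizer uses
internally); the rules themselves are minimal examples, not the shipped
defaults:

```python
from icu import Transliterator

# Illustrative rules: normalization lower-cases, transliteration folds to ASCII.
normalizer = Transliterator.createFromRules("norm", ":: Lower (); :: NFC ();")
to_ascii = Transliterator.createFromRules("trans", ":: Latin-ASCII ();")

name = "Łódź"
normalized = normalizer.transliterate(name)   # applied right after the sanitizer step
print(to_ascii.transliterate(normalized))     # ASCII form used for matching, e.g. 'lodz'
```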
 
+#### Sanitizers
+
+The sanitizers section defines an ordered list of functions that are applied
+to the name and address tags before they are further processed by the tokenizer.
+They allow cleaning up the tagging and bringing it into a standardized form
+more suitable for building the search index.
+
+!!! hint
+    Sanitizers only have an effect on how the search index is built. They
+    do not change the information about each place that is saved in the
+    database. In particular, they have no influence on how the results are
+    displayed. The returned results always show the original information as
+    stored in the OpenStreetMap database.
+
+Each entry describes one sanitizer to be applied. It has a
+mandatory parameter `step` which gives the name of the sanitizer. Depending
+on the type, it may have additional parameters to configure its operation.
+
+The order of the list matters. The sanitizers are applied exactly in the order
+that is configured. Each sanitizer works on the results of the previous one.
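
The following sketch shows the effect of the ordering. The step functions are
simplified stand-ins for the shipped sanitizers, not Nominatim's actual
sanitizer interface:

```python
# Hypothetical sanitizer chain: each step receives the name list produced
# by the previous step and returns a new list.
def split_name_list(names):
    return [part.strip() for name in names for part in name.split(';')]

def strip_brace_terms(names):
    out = []
    for name in names:
        out.append(name)
        if name.endswith(')') and ' (' in name:
            out.append(name[:name.rindex(' (')])   # variant without the brace part
    return out

pipeline = [split_name_list, strip_brace_terms]    # order as configured
names = ['Wien; Vienna (city)']
for step in pipeline:
    names = step(names)
print(names)   # ['Wien', 'Vienna (city)', 'Vienna']
```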
+
+The following is a list of sanitizers that are shipped with Nominatim.
+
+##### split-name-list
+
+::: nominatim.tokenizer.sanitizers.split_name_list
+    selection:
+        members: False
+    rendering:
+        heading_level: 6
+
+##### strip-brace-terms
+
+::: nominatim.tokenizer.sanitizers.strip_brace_terms
+    selection:
+        members: False
+    rendering:
+        heading_level: 6
+
+##### tag-analyzer-by-language
+
+::: nominatim.tokenizer.sanitizers.tag_analyzer_by_language
+    selection:
+        members: False
+    rendering:
+        heading_level: 6
+
+
+
+#### Token Analysis
+
+Token analyzers take a full name and transform it into one or more normalized
+forms that are then saved in the search index. In its simplest form, the
+analyzer only applies the transliteration rules. More complex analyzers
+create additional spelling variants of a name. This is useful to handle
+decomposition and abbreviation.
+
+The ICU tokenizer may use different analyzers for different names. To select
+the analyzer to be used, the name must be tagged with the `analyzer` attribute
+by a sanitizer (see for example the
+[tag-analyzer-by-language sanitizer](#tag-analyzer-by-language)).
+
+The token-analysis section contains the list of configured analyzers. Each
+analyzer must have an `id` parameter that uniquely identifies the analyzer.
+The only exception is the default analyzer that is used when no special
+analyzer was selected.
+
+Different analyzer implementations may exist. To select the implementation,
+the `analyzer` parameter must be set. Currently there is only one implementation,
+`generic`, which is described below.
+
+##### Generic token analyzer
+
+The generic analyzer is able to create variants from a list of given
+abbreviation and decomposition replacements. It takes one optional parameter
+`variants` which lists the replacements to apply. If the section is
+omitted, then the generic analyzer becomes a simple analyzer that only
+applies the transliteration.
+
 The variants section defines lists of replacements which create alternative
 spellings of a name. To create the variants, a name is scanned from left to
 right and the longest matching replacement is applied until the end of the
@@ -144,7 +239,7 @@ term.
     words in the configuration because then it is possible to change the
     rules for normalization later without having to adapt the variant rules.
 
-#### Decomposition
+###### Decomposition
 
 In its standard form, only full words match against the source. There
 is a special notation to match the prefix and suffix of a word:
@@ -171,7 +266,7 @@ To avoid automatic decomposition, use the '|' notation:
 
 simply changes "hauptstrasse" to "hauptstr" and "rote strasse" to "rote str".
 
-#### Initial and final terms
+###### Initial and final terms
 
 It is also possible to restrict replacements to the beginning and end of a
 name:
@@ -184,7 +279,7 @@ name:
 So the first example would trigger a replacement for "south 45th street" but
 not for "the south beach restaurant".
 
-#### Replacements vs. variants
+###### Replacements vs. variants
 
 The replacement syntax `source => target` works as a pure replacement. It changes
 the name instead of creating a variant. To create an additional version, you'd
index 529315e4431dd1b2d08097ef7ed92491989c2de1..5282db1ae73a1ad375232fedc647df130ad71a9e 100644 (file)
@@ -190,22 +190,21 @@ be listed with a semicolon as delimiter. Must be NULL when the place has no
 house numbers.
 
 ```sql
-FUNCTION token_addr_street_match_tokens(info JSONB) RETURNS INTEGER[]
+FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[]) RETURNS BOOLEAN
 ```
 
-Return the match token IDs by which to search a matching street from the
-`addr:street` tag. These IDs will be matched against the IDs supplied by
-`token_get_name_match_tokens`. Must be NULL when the place has no `addr:street`
-tag.
+Check if the given tokens (previously saved from `token_get_name_match_tokens()`)
+match against the `addr:street` tag name. Must return either NULL or FALSE
+when the place has no `addr:street` tag.
 
 ```sql
-FUNCTION token_addr_place_match_tokens(info JSONB) RETURNS INTEGER[]
+FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[]) RETURNS BOOLEAN
 ```
 
-Return the match token IDs by which to search a matching place from the
-`addr:place` tag. These IDs will be matched against the IDs supplied by
-`token_get_name_match_tokens`. Must be NULL when the place has no `addr:place`
-tag.
+Check if the given tokens (previously saved from `token_get_name_match_tokens()`)
+match against the `addr:place` tag name. Must return either NULL or FALSE
+when the place has no `addr:place` tag.
+
 
 ```sql
 FUNCTION token_addr_place_search_tokens(info JSONB) RETURNS INTEGER[]
@@ -216,26 +215,34 @@ are used for searches by address when no matching place can be found in the
 database. Must be NULL when the place has no `addr:place` tag.
 
 ```sql
-CREATE TYPE token_addresstoken AS (
-  key TEXT,
-  match_tokens INT[],
-  search_tokens INT[]
-);
+FUNCTION token_get_address_keys(info JSONB) RETURNS SETOF TEXT
+```
+
+Return the set of keys for which address information is provided. This
+should correspond to the list of (relevant) `addr:*` tags with the `addr:`
+prefix removed or the keys used in the `address` dictionary of the place info.
 
-FUNCTION token_get_address_tokens(info JSONB) RETURNS SETOF token_addresstoken
+```sql
+FUNCTION token_get_address_search_tokens(info JSONB, key TEXT) RETURNS INTEGER[]
 ```
 
-Return the match and search token IDs for explicit `addr:*` tags for the place
-other than `addr:street` and `addr:place`. For each address item there are
-three pieces of information returned:
-
- * _key_ contains the type of address item (city, county, etc.). This is the
-   key handed in with the `address` dictionary.
- * *match_tokens* is the list of token IDs used to find the corresponding
-   place object for the address part. The list is matched against the IDs
-   from `token_get_name_match_tokens`.
- * *search_tokens* is the list of token IDs under which to search the address
-   item. It is used when no corresponding place object was found.
+Return the array of search tokens for the given address part. `key` can be
+expected to be one of those returned with `token_get_address_keys()`. The
+search tokens are added to the address search vector of the place, when no
+corresponding OSM object could be found for the given address part from which
+to copy the name information.
+
+```sql
+FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
+```
+
+Check if the given tokens match against the address part `key`.
+
+__Warning:__ the tokens that are handed in are the lists previously saved
+from `token_get_name_search_tokens()`, _not_ from the match token list. This
+is an historical oddity which will be fixed at some point in the future.
+Currently, tokenizers are encouraged to make sure that matching works against
+both the search token list and the match token list.
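
For orientation, the following sketch shows roughly what a `token_info`
payload looks like for the ICU tokenizer whose SQL functions appear further
down in this commit. The layout is private to each tokenizer, the token IDs
are invented, and the integer arrays are stored as Postgres array literals
inside JSON strings so that casts such as `(info->>'street')::INTEGER[]` work:

```python
# Hypothetical token_info contents matching the ICU tokenizer functions
# in lib-sql/tokenizer/icu_tokenizer.sql below.
token_info = {
    "street": "{201,202}",      # tokens for addr:street matching
    "place": "{301}",           # tokens for addr:place
    "addr": {                   # remaining addr:* parts: key -> search tokens
        "city": "{401,402}",
        "suburb": "{403}",
    },
}
```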
 
 ```sql
 FUNCTION token_normalized_postcode(postcode TEXT) RETURNS TEXT
index 4d944bfb1ee498835c2d3178b2ea04d2439ab75d..ee8bbc0c9be81304c9531760aac3751366a6a689 100644 (file)
@@ -624,7 +624,7 @@ class SearchDescription
             $aOrder[] = $this->oContext->distanceSQL('centroid');
         } elseif ($this->sPostcode) {
             if (empty($this->aAddress)) {
-                $aTerms[] = "EXISTS(SELECT place_id FROM location_postcode p WHERE p.postcode = '".$this->sPostcode."' AND ST_DWithin(search_name.centroid, p.geometry, 0.1))";
+                $aTerms[] = "EXISTS(SELECT place_id FROM location_postcode p WHERE p.postcode = '".$this->sPostcode."' AND ST_DWithin(search_name.centroid, p.geometry, 0.12))";
             } else {
                 $aOrder[] = "(SELECT min(ST_Distance(search_name.centroid, p.geometry)) FROM location_postcode p WHERE p.postcode = '".$this->sPostcode."')";
             }
diff --git a/lib-php/admin/country_languages.php b/lib-php/admin/country_languages.php
deleted file mode 100644 (file)
index 95043d2..0000000
+++ /dev/null
@@ -1,34 +0,0 @@
-<?php
-@define('CONST_LibDir', dirname(dirname(__FILE__)));
-
-require_once(CONST_LibDir.'/init-cmd.php');
-
-ini_set('memory_limit', '800M');
-ini_set('display_errors', 'stderr');
-
-$aCMDOptions
- = array(
-    'Import country language data from osm wiki',
-    array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
-    array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
-    array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),
-    array('project-dir', '', 0, 1, 1, 1, 'realpath', 'Base directory of the Nominatim installation (default: .)'),
-   );
-getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
-
-loadSettings($aCMDResult['project-dir'] ?? getcwd());
-setupHTTPProxy();
-
-if (true) {
-    $sURL = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Country_Codes';
-    $sWikiPageXML = file_get_contents($sURL);
-    if (preg_match_all('#\\| ([a-z]{2}) \\|\\| [^|]+\\|\\| ([a-z,]+)#', $sWikiPageXML, $aMatches, PREG_SET_ORDER)) {
-        foreach ($aMatches as $aMatch) {
-            $aLanguages = explode(',', $aMatch[2]);
-            foreach ($aLanguages as $i => $s) {
-                $aLanguages[$i] = '"'.pg_escape_string($s).'"';
-            }
-            echo "UPDATE country_name set country_default_language_codes = '{".join(',', $aLanguages)."}' where country_code = '".pg_escape_string($aMatch[1])."';\n";
-        }
-    }
-}
index 55e44dfd646e05f497658ba401207b0fd2194b67..4ef36f4f635e50b821eb0105b803f67302f9570e 100644 (file)
@@ -43,7 +43,7 @@ LANGUAGE plpgsql STABLE;
 
 
 -- find the parent road of the cut road parts
-CREATE OR REPLACE FUNCTION get_interpolation_parent(street INTEGER[], place INTEGER[],
+CREATE OR REPLACE FUNCTION get_interpolation_parent(token_info JSONB,
                                                     partition SMALLINT,
                                                     centroid GEOMETRY, geom GEOMETRY)
   RETURNS BIGINT
@@ -52,7 +52,7 @@ DECLARE
   parent_place_id BIGINT;
   location RECORD;
 BEGIN
-  parent_place_id := find_parent_for_address(street, place, partition, centroid);
+  parent_place_id := find_parent_for_address(token_info, partition, centroid);
 
   IF parent_place_id is null THEN
     FOR location IN SELECT place_id FROM placex
@@ -155,9 +155,8 @@ BEGIN
   NEW.interpolationtype = NEW.address->'interpolation';
 
   place_centroid := ST_PointOnSurface(NEW.linegeo);
-  NEW.parent_place_id = get_interpolation_parent(token_addr_street_match_tokens(NEW.token_info),
-                                                 token_addr_place_match_tokens(NEW.token_info),
-                                                 NEW.partition, place_centroid, NEW.linegeo);
+  NEW.parent_place_id = get_interpolation_parent(NEW.token_info, NEW.partition,
+                                                 place_centroid, NEW.linegeo);
 
   interpol_postcode := token_normalized_postcode(NEW.address->'postcode');
 
index 53aba22c90a3e15290c76ad125c0c7f11982ae52..97afec1560f8d6f6de2ab5091aaf083eaf482ca8 100644 (file)
@@ -66,7 +66,7 @@ LANGUAGE plpgsql STABLE;
 
 CREATE OR REPLACE FUNCTION get_address_place(in_partition SMALLINT, feature GEOMETRY,
                                              from_rank SMALLINT, to_rank SMALLINT,
-                                             extent FLOAT, tokens INT[])
+                                             extent FLOAT, token_info JSONB, key TEXT)
   RETURNS nearfeaturecentr
   AS $$
 DECLARE
@@ -80,7 +80,7 @@ BEGIN
         FROM location_area_large_{{ partition }}
         WHERE geometry && ST_Expand(feature, extent)
               AND rank_address between from_rank and to_rank
-              AND tokens && keywords
+              AND token_matches_address(token_info, key, keywords)
         GROUP BY place_id, keywords, rank_address, rank_search, isguess, postcode, centroid
         ORDER BY bool_or(ST_Intersects(geometry, feature)), distance LIMIT 1;
       RETURN r;
@@ -148,18 +148,21 @@ LANGUAGE plpgsql;
 
 CREATE OR REPLACE FUNCTION getNearestNamedRoadPlaceId(in_partition INTEGER,
                                                       point GEOMETRY,
-                                                      isin_token INTEGER[])
+                                                      token_info JSONB)
   RETURNS BIGINT
   AS $$
 DECLARE
   parent BIGINT;
 BEGIN
+  IF not token_has_addr_street(token_info) THEN
+    RETURN NULL;
+  END IF;
 
 {% for partition in db.partitions %}
   IF in_partition = {{ partition }} THEN
     SELECT place_id FROM search_name_{{ partition }}
       INTO parent
-      WHERE name_vector && isin_token
+      WHERE token_matches_street(token_info, name_vector)
             AND centroid && ST_Expand(point, 0.015)
             AND address_rank between 26 and 27
       ORDER BY ST_Distance(centroid, point) ASC limit 1;
@@ -174,19 +177,22 @@ LANGUAGE plpgsql STABLE;
 
 CREATE OR REPLACE FUNCTION getNearestNamedPlacePlaceId(in_partition INTEGER,
                                                        point GEOMETRY,
-                                                       isin_token INTEGER[])
+                                                       token_info JSONB)
   RETURNS BIGINT
   AS $$
 DECLARE
   parent BIGINT;
 BEGIN
+  IF not token_has_addr_place(token_info) THEN
+    RETURN NULL;
+  END IF;
 
 {% for partition in db.partitions %}
   IF in_partition = {{ partition }} THEN
     SELECT place_id
       INTO parent
       FROM search_name_{{ partition }}
-      WHERE name_vector && isin_token
+      WHERE token_matches_place(token_info, name_vector)
             AND centroid && ST_Expand(point, 0.04)
             AND address_rank between 16 and 25
       ORDER BY ST_Distance(centroid, point) ASC limit 1;
index 014c8cd75ea4502373ce12575a99819aa857ceae..ca16871a89fa4d4acb41fb67d9abd61ea6a2602c 100644 (file)
@@ -247,6 +247,7 @@ BEGIN
         indexed_status = 2,
         geometry = NEW.geometry
         where place_id = existingplacex.place_id;
+
       -- if a node(=>house), which is part of a interpolation line, changes (e.g. the street attribute) => mark this line for reparenting 
       -- (already here, because interpolation lines are reindexed before nodes, so in the second call it would be too late)
       IF NEW.osm_type='N'
@@ -270,6 +271,26 @@ BEGIN
               and x.class = p.class;
       END IF;
 
+      IF coalesce(existing.name::text, '') != coalesce(NEW.name::text, '')
+      THEN
+        IF existingplacex.rank_address between 26 and 27 THEN
+          -- When streets change their name, this may have an effect on POI objects
+          -- with addr:street tags.
+          UPDATE placex SET indexed_status = 2
+          WHERE indexed_status = 0 and address ? 'street'
+                and parent_place_id = existingplacex.place_id;
+          UPDATE placex SET indexed_status = 2
+          WHERE indexed_status = 0 and rank_search = 30 and address ? 'street'
+                and ST_DWithin(NEW.geometry, geometry, 0.002);
+        ELSEIF existingplacex.rank_address between 16 and 25 THEN
+          -- When places change their name, this may have an effect on POI objects
+          -- with addr:place tags.
+          UPDATE placex SET indexed_status = 2
+          WHERE indexed_status = 0 and address ? 'place' and rank_search = 30
+                and parent_place_id = existingplacex.place_id;
+          -- No update of surrounding objects, potentially too expensive.
+        END IF;
+      END IF;
     END IF;
 
     -- Abort the add (we modified the existing place instead)
index fa7156ec904c396a8376b780d03c09a37045fb43..8ae8cf39c0d9fb2e54649ffc821164c26e5de0e6 100644 (file)
@@ -1,30 +1,33 @@
 -- Trigger functions for the placex table.
 
+-- Information returned by update preparation.
+DROP TYPE IF EXISTS prepare_update_info CASCADE;
+CREATE TYPE prepare_update_info AS (
+  name HSTORE,
+  address HSTORE,
+  rank_address SMALLINT,
+  country_code TEXT,
+  class TEXT,
+  type TEXT,
+  linked_place_id BIGINT
+);
+
 -- Retrieve the data needed by the indexer for updating the place.
---
--- Return parameters:
---  name            list of names
---  address         list of address tags, either from the object or a surrounding
---                  building
---  country_feature If the place is a country feature, this contains the
---                  country code, otherwise it is null.
-CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
-                                                 OUT name HSTORE,
-                                                 OUT address HSTORE,
-                                                 OUT country_feature VARCHAR,
-                                                 OUT linked_place_id BIGINT)
+CREATE OR REPLACE FUNCTION placex_indexing_prepare(p placex)
+  RETURNS prepare_update_info
   AS $$
 DECLARE
   location RECORD;
+  result prepare_update_info;
 BEGIN
   -- For POI nodes, check if the address should be derived from a surrounding
   -- building.
   IF p.rank_search < 30 OR p.osm_type != 'N' OR p.address is not null THEN
-    address := p.address;
+    result.address := p.address;
   ELSE
     -- The additional && condition works around the misguided query
     -- planner of postgis 3.0.
-    SELECT placex.address || hstore('_inherited', '') INTO address
+    SELECT placex.address || hstore('_inherited', '') INTO result.address
       FROM placex
      WHERE ST_Covers(geometry, p.centroid)
            and geometry && p.centroid
@@ -34,27 +37,26 @@ BEGIN
      LIMIT 1;
   END IF;
 
-  address := address - '_unlisted_place'::TEXT;
-  name := p.name;
+  result.address := result.address - '_unlisted_place'::TEXT;
+  result.name := p.name;
+  result.class := p.class;
+  result.type := p.type;
+  result.country_code := p.country_code;
+  result.rank_address := p.rank_address;
 
   -- Names of linked places need to be merged in, so search for a linkable
   -- place already here.
   SELECT * INTO location FROM find_linked_place(p);
 
   IF location.place_id is not NULL THEN
-    linked_place_id := location.place_id;
+    result.linked_place_id := location.place_id;
 
     IF NOT location.name IS NULL THEN
-      name := location.name || name;
+      result.name := location.name || result.name;
     END IF;
   END IF;
 
-  country_feature := CASE WHEN p.admin_level = 2
-                               and p.class = 'boundary' and p.type = 'administrative'
-                               and p.osm_type = 'R'
-                          THEN p.country_code
-                          ELSE null
-                     END;
+  RETURN result;
 END;
 $$
 LANGUAGE plpgsql STABLE;
@@ -104,8 +106,7 @@ CREATE OR REPLACE FUNCTION find_parent_for_poi(poi_osm_type CHAR(1),
                                                poi_osm_id BIGINT,
                                                poi_partition SMALLINT,
                                                bbox GEOMETRY,
-                                               addr_street INTEGER[],
-                                               addr_place INTEGER[],
+                                               token_info JSONB,
                                                is_place_addr BOOLEAN)
   RETURNS BIGINT
   AS $$
@@ -119,8 +120,7 @@ BEGIN
   parent_place_id := find_associated_street(poi_osm_type, poi_osm_id);
 
   IF parent_place_id is null THEN
-    parent_place_id := find_parent_for_address(addr_street, addr_place,
-                                               poi_partition, bbox);
+    parent_place_id := find_parent_for_address(token_info, poi_partition, bbox);
   END IF;
 
   IF parent_place_id is null and poi_osm_type = 'N' THEN
@@ -333,13 +333,14 @@ BEGIN
     WHERE s.place_id = parent_place_id;
 
   FOR addr_item IN
-    SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
-      FROM token_get_address_tokens(token_info)
-      WHERE not search_tokens <@ parent_address_vector
+    SELECT (get_addr_tag_rank(key, country)).*, key,
+           token_get_address_search_tokens(token_info, key) as search_tokens
+      FROM token_get_address_keys(token_info) as key
+      WHERE not token_get_address_search_tokens(token_info, key) <@ parent_address_vector
   LOOP
     addr_place := get_address_place(in_partition, geometry,
                                     addr_item.from_rank, addr_item.to_rank,
-                                    addr_item.extent, addr_item.match_tokens);
+                                    addr_item.extent, token_info, addr_item.key);
 
     IF addr_place is null THEN
       -- No place found in OSM that matches. Make it at least searchable.
@@ -447,14 +448,16 @@ BEGIN
 
   FOR location IN
     SELECT (get_address_place(partition, geometry, from_rank, to_rank,
-                              extent, match_tokens)).*, search_tokens
-      FROM (SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
-              FROM token_get_address_tokens(token_info)) x
+                              extent, token_info, key)).*, key
+      FROM (SELECT (get_addr_tag_rank(key, country)).*, key
+              FROM token_get_address_keys(token_info) as key) x
       ORDER BY rank_address, distance, isguess desc
   LOOP
     IF location.place_id is null THEN
       {% if not db.reverse_only %}
-      nameaddress_vector := array_merge(nameaddress_vector, location.search_tokens);
+      nameaddress_vector := array_merge(nameaddress_vector,
+                                        token_get_address_search_tokens(token_info,
+                                                                        location.key));
       {% endif %}
     ELSE
       {% if not db.reverse_only %}
@@ -689,9 +692,6 @@ DECLARE
   parent_address_level SMALLINT;
   place_address_level SMALLINT;
 
-  addr_street INTEGER[];
-  addr_place INTEGER[];
-
   max_rank SMALLINT;
 
   name_vector INTEGER[];
@@ -860,8 +860,6 @@ BEGIN
   END IF;
 
   NEW.housenumber := token_normalized_housenumber(NEW.token_info);
-  addr_street := token_addr_street_match_tokens(NEW.token_info);
-  addr_place := token_addr_place_match_tokens(NEW.token_info);
 
   NEW.postcode := null;
 
@@ -907,7 +905,7 @@ BEGIN
     NEW.parent_place_id := find_parent_for_poi(NEW.osm_type, NEW.osm_id,
                                                NEW.partition,
                                                ST_Envelope(NEW.geometry),
-                                               addr_street, addr_place,
+                                               NEW.token_info,
                                                is_place_address);
 
     -- If we found the road take a shortcut here.
index c308d0259b8505887d8c8fdc9a85630a20b3313f..f7d2093c9f6c8c32f4b79f1334a3b0e81f68628b 100644 (file)
@@ -215,13 +215,12 @@ LANGUAGE plpgsql STABLE;
 
 -- Find the parent of an address with addr:street/addr:place tag.
 --
--- \param street     Value of addr:street or NULL if tag is missing.
--- \param place      Value of addr:place or NULL if tag is missing.
+-- \param token_info Naming info with the address information.
 -- \param partition  Partition where to search the parent.
 -- \param centroid   Location of the address.
 --
 -- \return Place ID of the parent if one was found, NULL otherwise.
-CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEGER[],
+CREATE OR REPLACE FUNCTION find_parent_for_address(token_info JSONB,
                                                    partition SMALLINT,
                                                    centroid GEOMETRY)
   RETURNS BIGINT
@@ -229,30 +228,22 @@ CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEG
 DECLARE
   parent_place_id BIGINT;
 BEGIN
-  IF street is not null THEN
-    -- Check for addr:street attributes
-    -- Note that addr:street links can only be indexed, once the street itself is indexed
-    parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, street);
-    IF parent_place_id is not null THEN
-      {% if debug %}RAISE WARNING 'Get parent form addr:street: %', parent_place_id;{% endif %}
-      RETURN parent_place_id;
-    END IF;
+  -- Check for addr:street attributes
+  parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, token_info);
+  IF parent_place_id is not null THEN
+    {% if debug %}RAISE WARNING 'Get parent from addr:street: %', parent_place_id;{% endif %}
+    RETURN parent_place_id;
   END IF;
 
   -- Check for addr:place attributes.
-  IF place is not null THEN
-    parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, place);
-    IF parent_place_id is not null THEN
-      {% if debug %}RAISE WARNING 'Get parent form addr:place: %', parent_place_id;{% endif %}
-      RETURN parent_place_id;
-    END IF;
-  END IF;
-
-  RETURN NULL;
+  parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, token_info);
+  {% if debug %}RAISE WARNING 'Get parent from addr:place: %', parent_place_id;{% endif %}
+  RETURN parent_place_id;
 END;
 $$
 LANGUAGE plpgsql STABLE;
 
+
 CREATE OR REPLACE FUNCTION delete_location(OLD_place_id BIGINT)
   RETURNS BOOLEAN
   AS $$
index 9732c26cb3b82623e2fe5dd799d94fe24492b226..5008091b3e96f7b28a776a012115b24b4a8ffe51 100644 (file)
@@ -155,11 +155,11 @@ CREATE INDEX idx_placex_linked_place_id ON placex USING BTREE (linked_place_id)
 CREATE INDEX idx_placex_rank_search ON placex USING BTREE (rank_search, geometry_sector) {{db.tablespace.address_index}};
 CREATE INDEX idx_placex_geometry ON placex USING GIST (geometry) {{db.tablespace.search_index}};
 CREATE INDEX idx_placex_geometry_buildings ON placex
-  USING GIST (geometry) {{db.tablespace.search_index}}
+  USING {{postgres.spgist_geom}} (geometry) {{db.tablespace.search_index}}
   WHERE address is not null and rank_search = 30
         and ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon');
 CREATE INDEX idx_placex_geometry_placenode ON placex
-  USING GIST (geometry) {{db.tablespace.search_index}}
+  USING {{postgres.spgist_geom}} (geometry) {{db.tablespace.search_index}}
   WHERE osm_type = 'N' and rank_search < 26
         and class = 'place' and type != 'postcode' and linked_place_id is null;
 CREATE INDEX idx_placex_wikidata on placex USING BTREE ((extratags -> 'wikidata')) {{db.tablespace.address_index}} WHERE extratags ? 'wikidata' and class = 'place' and osm_type = 'N' and rank_search < 26;
index faa4efbb2abb76e408c7ef509164db03462c099a..f344e1745ac8da95a86fd3d8013a8916671f2f91 100644 (file)
@@ -14,7 +14,6 @@ DECLARE
   out_partition INTEGER;
   out_parent_place_id BIGINT;
   location RECORD;
-  address_street_word_ids INTEGER[];
 
 BEGIN
 
@@ -54,13 +53,9 @@ BEGIN
 
   place_centroid := ST_Centroid(linegeo);
   out_partition := get_partition('us');
-  out_parent_place_id := null;
 
-  address_street_word_ids := token_addr_street_match_tokens(token_info);
-  IF address_street_word_ids IS NOT NULL THEN
-    out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid,
-                                                      address_street_word_ids);
-  END IF;
+  out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid,
+                                                    token_info);
 
   IF out_parent_place_id IS NULL THEN
     SELECT getNearestParallelRoadFeature(out_partition, linegeo)
index ffe6648c38e959c6279efb2d1898d835514f32a7..6092319a0578d338915b6890902d71df7ec90b1f 100644 (file)
@@ -34,40 +34,59 @@ AS $$
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
-  RETURNS INTEGER[]
+CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB)
+  RETURNS BOOLEAN
+AS $$
+  SELECT info->>'street' is not null;
+$$ LANGUAGE SQL IMMUTABLE;
+
+
+CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB)
+  RETURNS BOOLEAN
 AS $$
-  SELECT (info->>'street')::INTEGER[]
+  SELECT info->>'place' is not null;
+$$ LANGUAGE SQL IMMUTABLE;
+
+
+CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[])
+  RETURNS BOOLEAN
+AS $$
+  SELECT (info->>'street')::INTEGER[] <@ street_tokens
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
-  RETURNS INTEGER[]
+CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[])
+  RETURNS BOOLEAN
 AS $$
-  SELECT (info->>'place_match')::INTEGER[]
+  SELECT (info->>'place')::INTEGER[] <@ place_tokens
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
 CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB)
   RETURNS INTEGER[]
 AS $$
-  SELECT (info->>'place_search')::INTEGER[]
+  SELECT (info->>'place')::INTEGER[]
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-DROP TYPE IF EXISTS token_addresstoken CASCADE;
-CREATE TYPE token_addresstoken AS (
-  key TEXT,
-  match_tokens INT[],
-  search_tokens INT[]
-);
+CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB)
+  RETURNS SETOF TEXT
+AS $$
+  SELECT * FROM jsonb_object_keys(info->'addr');
+$$ LANGUAGE SQL IMMUTABLE STRICT;
 
-CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
-  RETURNS SETOF token_addresstoken
+
+CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT)
+  RETURNS INTEGER[]
 AS $$
-  SELECT key, (value->>1)::int[] as match_tokens,
-         (value->>0)::int[] as search_tokens
-  FROM jsonb_each(info->'addr');
+  SELECT (info->'addr'->>key)::INTEGER[];
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
+  RETURNS BOOLEAN
+AS $$
+  SELECT (info->'addr'->>key)::INTEGER[] <@ tokens;
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
@@ -127,15 +146,34 @@ BEGIN
         VALUES (term_id, term, 'w', json_build_object('count', term_count));
     END IF;
 
-    IF term_count < {{ max_word_freq }} THEN
-      partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
-    END IF;
+    partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
   END LOOP;
 END;
 $$
 LANGUAGE plpgsql;
 
 
+CREATE OR REPLACE FUNCTION getorcreate_partial_word(partial TEXT)
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  token INTEGER;
+BEGIN
+  SELECT min(word_id) INTO token
+    FROM word WHERE word_token = partial and type = 'w';
+
+  IF token IS NULL THEN
+    token := nextval('seq_word');
+    INSERT INTO word (word_id, word_token, type, info)
+        VALUES (token, partial, 'w', json_build_object('count', 0));
+  END IF;
+
+  RETURN token;
+END;
+$$
+LANGUAGE plpgsql;
+
+
 CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT)
   RETURNS INTEGER
   AS $$
index a2c6b52073ec007e052b7775a148f6159fa1239d..2b734e6f2a95a5cfe97243e81e3b4c47485a3d92 100644 (file)
@@ -34,17 +34,31 @@ AS $$
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
-  RETURNS INTEGER[]
+CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB)
+  RETURNS BOOLEAN
+AS $$
+  SELECT info->>'street' is not null;
+$$ LANGUAGE SQL IMMUTABLE;
+
+
+CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB)
+  RETURNS BOOLEAN
 AS $$
-  SELECT (info->>'street')::INTEGER[]
+  SELECT info->>'place_match' is not null;
+$$ LANGUAGE SQL IMMUTABLE;
+
+
+CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[])
+  RETURNS BOOLEAN
+AS $$
+  SELECT (info->>'street')::INTEGER[] && street_tokens
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
-  RETURNS INTEGER[]
+CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[])
+  RETURNS BOOLEAN
 AS $$
-  SELECT (info->>'place_match')::INTEGER[]
+  SELECT (info->>'place_match')::INTEGER[] && place_tokens
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
@@ -55,19 +69,24 @@ AS $$
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-DROP TYPE IF EXISTS token_addresstoken CASCADE;
-CREATE TYPE token_addresstoken AS (
-  key TEXT,
-  match_tokens INT[],
-  search_tokens INT[]
-);
+CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB)
+  RETURNS SETOF TEXT
+AS $$
+  SELECT * FROM jsonb_object_keys(info->'addr');
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->'addr'->key->>0)::INTEGER[];
+$$ LANGUAGE SQL IMMUTABLE STRICT;
 
-CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
-  RETURNS SETOF token_addresstoken
+
+CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
+  RETURNS BOOLEAN
 AS $$
-  SELECT key, (value->>1)::int[] as match_tokens,
-         (value->>0)::int[] as search_tokens
-  FROM jsonb_each(info->'addr');
+  SELECT (info->'addr'->key->>1)::INTEGER[] && tokens;
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
index 64614bf14d7bd55f4a4c2a71f25cadc682ea5d65..f316280bb42bf1dfc71bb5b1e16704bbdfcafa5c 100644 (file)
@@ -12,6 +12,27 @@ from nominatim.errors import UsageError
 
 LOG = logging.getLogger()
 
+
+def flatten_config_list(content, section=''):
+    """ Flatten YAML configuration lists that contain include sections
+        which are lists themselves.
+    """
+    if not content:
+        return []
+
+    if not isinstance(content, list):
+        raise UsageError(f"List expected in section '{section}'.")
+
+    output = []
+    for ele in content:
+        if isinstance(ele, list):
+            output.extend(flatten_config_list(ele, section))
+        else:
+            output.append(ele)
+
+    return output
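
A short usage sketch of the helper above, imported the same way the tokenizer
code in this commit does:

```python
from nominatim.config import flatten_config_list

# Nested lists, as produced by !include sections, are flattened into one list;
# non-list content raises a UsageError naming the offending section.
rules = [['road -> rd'], 'bridge -> br', [['street -> st']]]
assert flatten_config_list(rules, 'variants') == ['road -> rd', 'bridge -> br', 'street -> st']
```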
+
+
 class Configuration:
     """ Load and manage the project configuration.
 
index 80b89c57b1dbfb969aeb10f6b05d8b5166507f23..a1bf5b7f1418a0d65abd1199264a892f3d74f918 100644 (file)
@@ -46,8 +46,10 @@ def _setup_postgresql_features(conn):
         depend on the database version.
     """
     pg_version = conn.server_version_tuple()
+    postgis_version = conn.postgis_version_tuple()
     return {
-        'has_index_non_key_column': pg_version >= (11, 0, 0)
+        'has_index_non_key_column': pg_version >= (11, 0, 0),
+        'spgist_geom' : 'SPGIST' if postgis_version >= (3, 0) else 'GIST'
     }
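
A rough sketch of how the new flag reaches the SQL, assuming the feature
dictionary is exposed to the Jinja2 templates under the `postgres` name, as
the `{{postgres.spgist_geom}}` placeholder in `tables.sql` below suggests:

```python
import jinja2

features = {'has_index_non_key_column': True, 'spgist_geom': 'SPGIST'}
template = jinja2.Template(
    "CREATE INDEX idx_placex_geometry_buildings ON placex"
    " USING {{postgres.spgist_geom}} (geometry);")
print(template.render(postgres=features))
# CREATE INDEX ... USING SPGIST (geometry);   (GIST on PostGIS < 3)
```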
 
 class SQLPreprocessor:
diff --git a/nominatim/indexer/place_info.py b/nominatim/indexer/place_info.py
new file mode 100644 (file)
index 0000000..06d730e
--- /dev/null
@@ -0,0 +1,68 @@
+"""
+Wrapper around place information the indexer gets from the database and hands to
+the tokenizer.
+"""
+
+import psycopg2.extras
+
+class PlaceInfo:
+    """ Data class containing all information the tokenizer gets about a
+        place it should process the names for.
+    """
+
+    def __init__(self, info):
+        self._info = info
+
+
+    def analyze(self, analyzer):
+        """ Process this place with the given tokenizer and return the
+            result in psycopg2-compatible Json.
+        """
+        return psycopg2.extras.Json(analyzer.process_place(self))
+
+
+    @property
+    def name(self):
+        """ A dictionary with the names of the place or None if the place
+            has no names.
+        """
+        return self._info.get('name')
+
+
+    @property
+    def address(self):
+        """ A dictionary with the address elements of the place
+            or None if no address information is available.
+        """
+        return self._info.get('address')
+
+
+    @property
+    def country_code(self):
+        """ The country code of the country the place is in. Guaranteed
+            to be a two-letter lower-case string or None, if no country
+            could be found.
+        """
+        return self._info.get('country_code')
+
+
+    @property
+    def rank_address(self):
+        """ The computed rank address before rank correction.
+        """
+        return self._info.get('rank_address')
+
+
+    def is_a(self, key, value):
+        """ Check if the place's primary tag corresponds to the given
+            key and value.
+        """
+        return self._info.get('class') == key and self._info.get('type') == value
+
+
+    def is_country(self):
+        """ Check if the place is a valid country boundary.
+        """
+        return self.rank_address == 4 \
+               and self.is_a('boundary', 'administrative') \
+               and self.country_code is not None
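
A small usage sketch of the new class; the dictionary keys mirror the columns
returned by `placex_indexing_prepare()`, the values are made up:

```python
from nominatim.indexer.place_info import PlaceInfo

place = PlaceInfo({'name': {'name': 'France'},
                   'class': 'boundary', 'type': 'administrative',
                   'rank_address': 4, 'country_code': 'fr'})

assert place.is_a('boundary', 'administrative')
assert place.is_country()
# place.analyze(analyzer) wraps analyzer.process_place(place) into a
# psycopg2-compatible Json value for storing in the database.
```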
index 29261ee50ebe59aac57a260e560a6016036e5139..70536a71db8b51f2e8b3792b63a224a8a646e046 100644 (file)
@@ -4,14 +4,16 @@ tasks.
 """
 import functools
 
-import psycopg2.extras
 from psycopg2 import sql as pysql
 
+from nominatim.indexer.place_info import PlaceInfo
+
 # pylint: disable=C0111
 
 def _mk_valuelist(template, num):
     return pysql.SQL(',').join([pysql.SQL(template)] * num)
 
+
 class AbstractPlacexRunner:
     """ Returns SQL commands for indexing of the placex table.
     """
@@ -37,7 +39,7 @@ class AbstractPlacexRunner:
 
     @staticmethod
     def get_place_details(worker, ids):
-        worker.perform("""SELECT place_id, (placex_prepare_update(placex)).*
+        worker.perform("""SELECT place_id, (placex_indexing_prepare(placex)).*
                           FROM placex WHERE place_id IN %s""",
                        (tuple((p[0] for p in ids)), ))
 
@@ -47,7 +49,7 @@ class AbstractPlacexRunner:
         for place in places:
             for field in ('place_id', 'name', 'address', 'linked_place_id'):
                 values.append(place[field])
-            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+            values.append(PlaceInfo(place).analyze(self.analyzer))
 
         worker.perform(self._index_sql(len(places)), values)
 
@@ -141,7 +143,7 @@ class InterpolationRunner:
         values = []
         for place in places:
             values.extend((place[x] for x in ('place_id', 'address')))
-            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+            values.append(PlaceInfo(place).analyze(self.analyzer))
 
         worker.perform(self._index_sql(len(places)), values)
 
index 00ecae447c5843eb0fc772960a421e271a82cb1d..02bc312f18dc5e0bbe65fcacfc7e3564f805d441 100644 (file)
@@ -6,6 +6,7 @@ from abc import ABC, abstractmethod
 from typing import List, Tuple, Dict, Any
 
 from nominatim.config import Configuration
+from nominatim.indexer.place_info import PlaceInfo
 
 # pylint: disable=unnecessary-pass
 
@@ -105,20 +106,13 @@ class AbstractAnalyzer(ABC):
 
 
     @abstractmethod
-    def process_place(self, place: Dict) -> Any:
+    def process_place(self, place: PlaceInfo) -> Any:
         """ Extract tokens for the given place and compute the
             information to be handed to the PL/pgSQL processor for building
             the search index.
 
             Arguments:
-                place: Dictionary with the information about the place. Currently
-                       the following fields may be present:
-
-                       - *name* is a dictionary of names for the place together
-                         with the designation of the name.
-                       - *address* is a dictionary of address terms.
-                       - *country_feature* is set to a country code when the
-                         place describes a country.
+                place: Place information retrieved from the database.
 
             Returns:
                 A JSON-serialisable structure that will be handed into
@@ -142,7 +136,7 @@ class AbstractTokenizer(ABC):
             the tokenizer remains stable over updates.
 
             Arguments:
-              config: Read-only object with configuration obtions.
+              config: Read-only object with configuration options.
 
               init_db: When set to False, then initialisation of database
                 tables should be skipped. This option is only required for
@@ -155,11 +149,14 @@ class AbstractTokenizer(ABC):
 
 
     @abstractmethod
-    def init_from_project(self) -> None:
+    def init_from_project(self, config: Configuration) -> None:
         """ Initialise the tokenizer from an existing database setup.
 
             The function should load all previously saved configuration from
             the project directory and/or the property table.
+
+            Arguments:
+              config: Read-only object with configuration options.
         """
         pass
 
@@ -172,7 +169,7 @@ class AbstractTokenizer(ABC):
             during query time.
 
             Arguments:
-              config: Read-only object with configuration obtions.
+              config: Read-only object with configuration options.
         """
         pass
 
@@ -187,22 +184,23 @@ class AbstractTokenizer(ABC):
             data structures or data itself must not be changed by this function.
 
             Arguments:
-              config: Read-only object with configuration obtions.
+              config: Read-only object with configuration options.
         """
         pass
 
 
     @abstractmethod
-    def check_database(self) -> str:
+    def check_database(self, config: Configuration) -> str:
         """ Check that the database is set up correctly and ready for being
             queried.
 
+            Arguments:
+              config: Read-only object with configuration options.
+
             Returns:
               If an issue was found, return an error message with the
               description of the issue as well as hints for the user on
-              how to resolve the issue.
-
-              Return `None`, if no issue was found.
+              how to resolve the issue. If everything is okay, return `None`.
         """
         pass
 
index 069672d4a1fd4d9b874943b5d44a367d4f2ef9e8..dc3e7411fa4e865c62e356b6187c910c4ea72b4c 100644 (file)
@@ -85,6 +85,6 @@ def get_tokenizer_for_db(config):
     tokenizer_module = _import_tokenizer(name)
 
     tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
-    tokenizer.init_from_project()
+    tokenizer.init_from_project(config)
 
     return tokenizer
diff --git a/nominatim/tokenizer/icu_name_processor.py b/nominatim/tokenizer/icu_name_processor.py
deleted file mode 100644 (file)
index 93d2b0f..0000000
+++ /dev/null
@@ -1,146 +0,0 @@
-"""
-Processor for names that are imported into the database based on the
-ICU library.
-"""
-from collections import defaultdict
-import itertools
-
-from icu import Transliterator
-import datrie
-
-from nominatim.db.properties import set_property, get_property
-from nominatim.tokenizer import icu_variants as variants
-
-DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
-DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
-DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements"
-DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization"
-
-
-class ICUNameProcessorRules:
-    """ Data object that saves the rules needed for the name processor.
-
-        The rules can either be initialised through an ICURuleLoader or
-        be loaded from a database when a connection is given.
-    """
-    def __init__(self, loader=None, conn=None):
-        if loader is not None:
-            self.norm_rules = loader.get_normalization_rules()
-            self.trans_rules = loader.get_transliteration_rules()
-            self.replacements = loader.get_replacement_pairs()
-            self.search_rules = loader.get_search_rules()
-        elif conn is not None:
-            self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
-            self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
-            self.replacements = \
-                variants.unpickle_variant_set(get_property(conn, DBCFG_IMPORT_REPLACEMENTS))
-            self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES)
-        else:
-            assert False, "Parameter loader or conn required."
-
-
-    def save_rules(self, conn):
-        """ Save the rules in the property table of the given database.
-            the rules can be loaded again by handing in a connection into
-            the constructor of the class.
-        """
-        set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules)
-        set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules)
-        set_property(conn, DBCFG_IMPORT_REPLACEMENTS,
-                     variants.pickle_variant_set(self.replacements))
-        set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules)
-
-
-class ICUNameProcessor:
-    """ Collects the different transformation rules for normalisation of names
-        and provides the functions to aply the transformations.
-    """
-
-    def __init__(self, rules):
-        self.normalizer = Transliterator.createFromRules("icu_normalization",
-                                                         rules.norm_rules)
-        self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
-                                                       rules.trans_rules +
-                                                       ";[:Space:]+ > ' '")
-        self.search = Transliterator.createFromRules("icu_search",
-                                                     rules.search_rules)
-
-        # Intermediate reorder by source. Also compute required character set.
-        immediate = defaultdict(list)
-        chars = set()
-        for variant in rules.replacements:
-            if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
-                replstr = variant.replacement[:-1]
-            else:
-                replstr = variant.replacement
-            immediate[variant.source].append(replstr)
-            chars.update(variant.source)
-        # Then copy to datrie
-        self.replacements = datrie.Trie(''.join(chars))
-        for src, repllist in immediate.items():
-            self.replacements[src] = repllist
-
-
-    def get_normalized(self, name):
-        """ Normalize the given name, i.e. remove all elements not relevant
-            for search.
-        """
-        return self.normalizer.transliterate(name).strip()
-
-    def get_variants_ascii(self, norm_name):
-        """ Compute the spelling variants for the given normalized name
-            and transliterate the result.
-        """
-        baseform = '^ ' + norm_name + ' ^'
-        partials = ['']
-
-        startpos = 0
-        pos = 0
-        force_space = False
-        while pos < len(baseform):
-            full, repl = self.replacements.longest_prefix_item(baseform[pos:],
-                                                               (None, None))
-            if full is not None:
-                done = baseform[startpos:pos]
-                partials = [v + done + r
-                            for v, r in itertools.product(partials, repl)
-                            if not force_space or r.startswith(' ')]
-                if len(partials) > 128:
-                    # If too many variants are produced, they are unlikely
-                    # to be helpful. Only use the original term.
-                    startpos = 0
-                    break
-                startpos = pos + len(full)
-                if full[-1] == ' ':
-                    startpos -= 1
-                    force_space = True
-                pos = startpos
-            else:
-                pos += 1
-                force_space = False
-
-        # No variants detected? Fast return.
-        if startpos == 0:
-            trans_name = self.to_ascii.transliterate(norm_name).strip()
-            return [trans_name] if trans_name else []
-
-        return self._compute_result_set(partials, baseform[startpos:])
-
-
-    def _compute_result_set(self, partials, prefix):
-        results = set()
-
-        for variant in partials:
-            vname = variant + prefix
-            trans_name = self.to_ascii.transliterate(vname[1:-1]).strip()
-            if trans_name:
-                results.add(trans_name)
-
-        return list(results)
-
-
-    def get_search_normalized(self, name):
-        """ Return the normalized version of the name (including transliteration)
-            to be applied at search time.
-        """
-        return self.search.transliterate(' ' + name + ' ').strip()
index 0e6e40b4c88dc3109e5aa9fa60cb27925458454b..b8551038aa42283dfccfc6bedaf1c0b89c0ba68b 100644 (file)
@@ -1,57 +1,86 @@
 """
 Helper class to create ICU rules from a configuration file.
 """
+import importlib
 import io
+import json
 import logging
-import itertools
-import re
-
-from icu import Transliterator
 
+from nominatim.config import flatten_config_list
+from nominatim.db.properties import set_property, get_property
 from nominatim.errors import UsageError
-import nominatim.tokenizer.icu_variants as variants
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
+import nominatim.tools.country_info
 
 LOG = logging.getLogger()
 
-def _flatten_config_list(content):
-    if not content:
-        return []
-
-    if not isinstance(content, list):
-        raise UsageError("List expected in ICU configuration.")
-
-    output = []
-    for ele in content:
-        if isinstance(ele, list):
-            output.extend(_flatten_config_list(ele))
-        else:
-            output.append(ele)
-
-    return output
-
+DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
+DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
+DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
 
-class VariantRule:
-    """ Saves a single variant expansion.
 
-        An expansion consists of the normalized replacement term and
-        a dicitonary of properties that describe when the expansion applies.
+def _get_section(rules, section):
+    """ Get the section named 'section' from the rules. If the section does
+        not exist, raise a usage error with a meaningful message.
     """
+    if section not in rules:
+        LOG.fatal("Section '%s' not found in tokenizer config.", section)
+        raise UsageError("Syntax error in tokenizer configuration file.")
 
-    def __init__(self, replacement, properties):
-        self.replacement = replacement
-        self.properties = properties or {}
+    return rules[section]
 
 
 class ICURuleLoader:
     """ Compiler for ICU rules from a tokenizer configuration file.
     """
 
-    def __init__(self, rules):
-        self.variants = set()
+    def __init__(self, config):
+        rules = config.load_sub_configuration('icu_tokenizer.yaml',
+                                              config='TOKENIZER_CONFIG')
+
+        # Make sure country information is available to analyzers and sanitizers.
+        nominatim.tools.country_info.setup_country_config(config)
 
         self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
         self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
-        self._parse_variant_list(self._get_section(rules, 'variants'))
+        self.analysis_rules = _get_section(rules, 'token-analysis')
+        self._setup_analysis()
+
+        # Load optional sanitizer rule set.
+        self.sanitizer_rules = rules.get('sanitizers', [])
+
+
+    def load_config_from_db(self, conn):
+        """ Get previously saved parts of the configuration from the
+            database.
+        """
+        self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
+        self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
+        self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
+        self._setup_analysis()
+
+
+    def save_config_to_db(self, conn):
+        """ Save the part of the configuration that cannot be changed into
+            the database.
+        """
+        set_property(conn, DBCFG_IMPORT_NORM_RULES, self.normalization_rules)
+        set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules)
+        set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
+
+
+    def make_sanitizer(self):
+        """ Create a place sanitizer from the configured rules.
+        """
+        return PlaceSanitizer(self.sanitizer_rules)
+
+
+    def make_token_analysis(self):
+        """ Create a token analyser from the reviouly loaded rules.
+        """
+        return ICUTokenAnalysis(self.normalization_rules,
+                                self.transliteration_rules, self.analysis)
 
 
     def get_search_rules(self):
@@ -66,157 +95,66 @@ class ICURuleLoader:
         rules.write(self.transliteration_rules)
         return rules.getvalue()
 
+
     def get_normalization_rules(self):
         """ Return rules for normalisation of a term.
         """
         return self.normalization_rules
 
+
     def get_transliteration_rules(self):
         """ Return the rules for converting a string into its asciii representation.
         """
         return self.transliteration_rules
 
-    def get_replacement_pairs(self):
-        """ Return the list of possible compound decompositions with
-            application of abbreviations included.
-            The result is a list of pairs: the first item is the sequence to
-            replace, the second is a list of replacements.
-        """
-        return self.variants
-
 
-    @staticmethod
-    def _get_section(rules, section):
-        """ Get the section named 'section' from the rules. If the section does
-            not exist, raise a usage error with a meaningful message.
+    def _setup_analysis(self):
+        """ Process the rules used for creating the various token analyzers.
         """
-        if section not in rules:
-            LOG.fatal("Section '%s' not found in tokenizer config.", section)
-            raise UsageError("Syntax error in tokenizer configuration file.")
+        self.analysis = {}
+
+        if not isinstance(self.analysis_rules, list):
+            raise UsageError("Configuration section 'token-analysis' must be a list.")
 
-        return rules[section]
+        for section in self.analysis_rules:
+            name = section.get('id', None)
+            if name in self.analysis:
+                if name is None:
+                    LOG.fatal("ICU tokenizer configuration has two default token analyzers.")
+                else:
+                    LOG.fatal("ICU tokenizer configuration has two token "
+                              "analyzers with id '%s'.", name)
+                raise UsageError("Syntax error in ICU tokenizer config.")
+            self.analysis[name] = TokenAnalyzerRule(section, self.normalization_rules)
 
 
-    def _cfg_to_icu_rules(self, rules, section):
+    @staticmethod
+    def _cfg_to_icu_rules(rules, section):
         """ Load an ICU ruleset from the given section. If the section is a
             simple string, it is interpreted as a file name and the rules are
             loaded verbatim from the given file. The filename is expected to be
             relative to the tokenizer rule file. If the section is a list then
             each line is assumed to be a rule. All rules are concatenated and returned.
         """
-        content = self._get_section(rules, section)
+        content = _get_section(rules, section)
 
         if content is None:
             return ''
 
-        return ';'.join(_flatten_config_list(content)) + ';'
-
+        return ';'.join(flatten_config_list(content, section)) + ';'
 
-    def _parse_variant_list(self, rules):
-        self.variants.clear()
 
-        if not rules:
-            return
-
-        rules = _flatten_config_list(rules)
-
-        vmaker = _VariantMaker(self.normalization_rules)
-
-        properties = []
-        for section in rules:
-            # Create the property field and deduplicate against existing
-            # instances.
-            props = variants.ICUVariantProperties.from_rules(section)
-            for existing in properties:
-                if existing == props:
-                    props = existing
-                    break
-            else:
-                properties.append(props)
-
-            for rule in (section.get('words') or []):
-                self.variants.update(vmaker.compute(rule, props))
-
-
-class _VariantMaker:
-    """ Generater for all necessary ICUVariants from a single variant rule.
-
-        All text in rules is normalized to make sure the variants match later.
+class TokenAnalyzerRule:
+    """ Factory for a single analysis module. The class saves the configuration
+        and creates a new token analyzer on request.
     """
 
-    def __init__(self, norm_rules):
-        self.norm = Transliterator.createFromRules("rule_loader_normalization",
-                                                   norm_rules)
+    def __init__(self, rules, normalization_rules):
+        # Find the analysis module
+        module_name = 'nominatim.tokenizer.token_analysis.' \
+                      + _get_section(rules, 'analyzer').replace('-', '_')
+        analysis_mod = importlib.import_module(module_name)
+        self.create = analysis_mod.create
 
-
-    def compute(self, rule, props):
-        """ Generator for all ICUVariant tuples from a single variant rule.
-        """
-        parts = re.split(r'(\|)?([=-])>', rule)
-        if len(parts) != 4:
-            raise UsageError("Syntax error in variant rule: " + rule)
-
-        decompose = parts[1] is None
-        src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
-        repl_terms = (self.norm.transliterate(t.strip()) for t in parts[3].split(','))
-
-        # If the source should be kept, add a 1:1 replacement
-        if parts[2] == '-':
-            for src in src_terms:
-                if src:
-                    for froms, tos in _create_variants(*src, src[0], decompose):
-                        yield variants.ICUVariant(froms, tos, props)
-
-        for src, repl in itertools.product(src_terms, repl_terms):
-            if src and repl:
-                for froms, tos in _create_variants(*src, repl, decompose):
-                    yield variants.ICUVariant(froms, tos, props)
-
-
-    def _parse_variant_word(self, name):
-        name = name.strip()
-        match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
-        if match is None or (match.group(1) == '~' and match.group(3) == '~'):
-            raise UsageError("Invalid variant word descriptor '{}'".format(name))
-        norm_name = self.norm.transliterate(match.group(2))
-        if not norm_name:
-            return None
-
-        return norm_name, match.group(1), match.group(3)
-
-
-_FLAG_MATCH = {'^': '^ ',
-               '$': ' ^',
-               '': ' '}
-
-
-def _create_variants(src, preflag, postflag, repl, decompose):
-    if preflag == '~':
-        postfix = _FLAG_MATCH[postflag]
-        # suffix decomposition
-        src = src + postfix
-        repl = repl + postfix
-
-        yield src, repl
-        yield ' ' + src, ' ' + repl
-
-        if decompose:
-            yield src, ' ' + repl
-            yield ' ' + src, repl
-    elif postflag == '~':
-        # prefix decomposition
-        prefix = _FLAG_MATCH[preflag]
-        src = prefix + src
-        repl = prefix + repl
-
-        yield src, repl
-        yield src + ' ', repl + ' '
-
-        if decompose:
-            yield src, repl + ' '
-            yield src + ' ', repl
-    else:
-        prefix = _FLAG_MATCH[preflag]
-        postfix = _FLAG_MATCH[postflag]
-
-        yield prefix + src + postfix, prefix + repl + postfix
+        # Load the configuration.
+        self.config = analysis_mod.configure(rules, normalization_rules)
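
Editor's note: for reference, a minimal `token-analysis` section as `_setup_analysis()` sees it after YAML loading could look like the sketch below. The values are illustrative only; the shipped rule set is the `settings/icu_tokenizer.yaml` changed further down in this commit.

# Hypothetical, minimal 'token-analysis' list as consumed by _setup_analysis().
# The entry without an 'id' becomes the default analyzer (stored under the
# key None); a duplicate id would raise a UsageError.
analysis_rules = [
    {'analyzer': 'generic'},                 # default analyzer
    {'id': 'de',                             # language-specific analyzer
     'analyzer': 'generic',
     'mode': 'variant-only',
     'variants': [{'words': ['~straße => str']}]},
]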
diff --git a/nominatim/tokenizer/icu_token_analysis.py b/nominatim/tokenizer/icu_token_analysis.py
new file mode 100644 (file)
index 0000000..f27a2fb
--- /dev/null
@@ -0,0 +1,23 @@
+"""
+Container class collecting all components required to transform an OSM name
+into a Nominatim token.
+"""
+
+from icu import Transliterator
+
+class ICUTokenAnalysis:
+    """ Container class collecting the transliterators and token analysis
+        modules for a single NameAnalyser instance.
+    """
+
+    def __init__(self, norm_rules, trans_rules, analysis_rules):
+        self.normalizer = Transliterator.createFromRules("icu_normalization",
+                                                         norm_rules)
+        trans_rules += ";[:Space:]+ > ' '"
+        self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
+                                                       trans_rules)
+        self.search = Transliterator.createFromRules("icu_search",
+                                                     norm_rules + trans_rules)
+
+        self.analysis = {name: arules.create(self.to_ascii, arules.config)
+                         for name, arules in analysis_rules.items()}
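
Editor's note: a small sketch of building the container by hand, assuming PyICU and the nominatim package are importable; in production the rules and the analyzer factories come from ICURuleLoader.

from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis

# Trivial stand-in for a TokenAnalyzerRule: exposes .config and
# .create(transliterator, config) just like the factory in the rule loader.
class _EchoRule:
    config = None
    @staticmethod
    def create(to_ascii, config):
        return to_ascii

tok = ICUTokenAnalysis(":: lower ();",        # normalization rules
                       ":: Latin ()",         # transliteration rules
                       {None: _EchoRule})     # analyzer id -> factory
print(tok.normalizer.transliterate("Halle (Saale)"))   # 'halle (saale)'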
index 61263678d811db87e90cc0ab8ed55b885d24a57c..12d1eccd15f1799b6b45af4df6b0b39ec6a93674 100644 (file)
@@ -13,11 +13,10 @@ from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.indexer.place_info import PlaceInfo
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
-DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
 
 LOG = logging.getLogger()
@@ -37,9 +36,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
     def __init__(self, dsn, data_dir):
         self.dsn = dsn
         self.data_dir = data_dir
-        self.naming_rules = None
+        self.loader = None
         self.term_normalization = None
-        self.max_word_frequency = None
 
 
     def init_new_db(self, config, init_db=True):
@@ -48,27 +46,26 @@ class LegacyICUTokenizer(AbstractTokenizer):
             This copies all necessary data in the project directory to make
             sure the tokenizer remains stable even over updates.
         """
-        loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
-                                                             config='TOKENIZER_CONFIG'))
-        self.naming_rules = ICUNameProcessorRules(loader=loader)
+        self.loader = ICURuleLoader(config)
+
         self.term_normalization = config.TERM_NORMALIZATION
-        self.max_word_frequency = config.MAX_WORD_FREQUENCY
 
         self._install_php(config.lib_dir.php)
-        self._save_config(config)
+        self._save_config()
 
         if init_db:
             self.update_sql_functions(config)
             self._init_db_tables(config)
 
 
-    def init_from_project(self):
+    def init_from_project(self, config):
         """ Initialise the tokenizer from the project directory.
         """
+        self.loader = ICURuleLoader(config)
+
         with connect(self.dsn) as conn:
-            self.naming_rules = ICUNameProcessorRules(conn=conn)
+            self.loader.load_config_from_db(conn)
             self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
-            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
 
 
     def finalize_import(self, _):
@@ -81,18 +78,16 @@ class LegacyICUTokenizer(AbstractTokenizer):
         """ Reimport the SQL functions for this tokenizer.
         """
         with connect(self.dsn) as conn:
-            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
             sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql',
-                              max_word_freq=max_word_freq)
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
 
 
-    def check_database(self):
+    def check_database(self, config):
         """ Check that the tokenizer is set up correctly.
         """
-        self.init_from_project()
+        self.init_from_project(config)
 
-        if self.naming_rules is None:
+        if self.term_normalization is None:
             return "Configuration for tokenizer 'icu' are missing."
 
         return None
@@ -113,7 +108,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
 
             Analyzers are not thread-safe. You need to instantiate one per thread.
         """
-        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
+        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
+                                     self.loader.make_token_analysis())
 
 
     def _install_php(self, phpdir):
@@ -122,20 +118,18 @@ class LegacyICUTokenizer(AbstractTokenizer):
         php_file = self.data_dir / "tokenizer.php"
         php_file.write_text(dedent(f"""\
             <?php
-            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
+            @define('CONST_Max_Word_Frequency', 10000000);
             @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
-            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
+            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
             require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
 
 
-    def _save_config(self, config):
+    def _save_config(self):
         """ Save the configuration that needs to remain stable for the given
             database as database properties.
         """
         with connect(self.dsn) as conn:
-            self.naming_rules.save_rules(conn)
-
-            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
+            self.loader.save_config_to_db(conn)
             set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
 
 
@@ -170,7 +164,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         """ Count the partial terms from the names in the place table.
         """
         words = Counter()
-        name_proc = ICUNameProcessor(self.naming_rules)
+        analysis = self.loader.make_token_analysis()
 
         with conn.cursor(name="words") as cur:
             cur.execute(""" SELECT v, count(*) FROM
@@ -178,12 +172,10 @@ class LegacyICUTokenizer(AbstractTokenizer):
                             WHERE length(v) < 75 GROUP BY v""")
 
             for name, cnt in cur:
-                terms = set()
-                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
-                    if ' ' in word:
-                        terms.update(word.split())
-                for term in terms:
-                    words[term] += cnt
+                word = analysis.search.transliterate(name)
+                if word and ' ' in word:
+                    for term in set(word.split()):
+                        words[term] += cnt
 
         return words
 
@@ -195,10 +187,11 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         normalization.
     """
 
-    def __init__(self, dsn, name_proc):
+    def __init__(self, dsn, sanitizer, token_analysis):
         self.conn = connect(dsn).connection
         self.conn.autocommit = True
-        self.name_processor = name_proc
+        self.sanitizer = sanitizer
+        self.token_analysis = token_analysis
 
         self._cache = _TokenCache()
 
@@ -211,6 +204,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
             self.conn = None
 
 
+    def _search_normalized(self, name):
+        """ Return the search token transliteration of the given name.
+        """
+        return self.token_analysis.search.transliterate(name).strip()
+
+
+    def _normalized(self, name):
+        """ Return the normalized version of the given name with all
+            non-relevant information removed.
+        """
+        return self.token_analysis.normalizer.transliterate(name).strip()
+
+
     def get_word_token_info(self, words):
         """ Return token information for the given list of words.
             If a word starts with # it is assumed to be a full name
@@ -226,9 +232,9 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         partial_tokens = {}
         for word in words:
             if word.startswith('#'):
-                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
+                full_tokens[word] = self._search_normalized(word[1:])
             else:
-                partial_tokens[word] = self.name_processor.get_search_normalized(word)
+                partial_tokens[word] = self._search_normalized(word)
 
         with self.conn.cursor() as cur:
             cur.execute("""SELECT word_token, word_id
@@ -259,7 +265,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
             This function takes minor shortcuts on transliteration.
         """
-        return self.name_processor.get_search_normalized(hnr)
+        return self._search_normalized(hnr)
 
     def update_postcodes_from_db(self):
         """ Update postcode tokens in the word table from the location_postcode
@@ -282,7 +288,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                     if postcode is None:
                         to_delete.append(word)
                     else:
-                        copystr.add(self.name_processor.get_search_normalized(postcode),
+                        copystr.add(self._search_normalized(postcode),
                                     'P', postcode)
 
                 if to_delete:
@@ -300,7 +306,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
             completely replaced. Otherwise the phrases are added to the
             already existing ones.
         """
-        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                             for p in phrases))
 
         with self.conn.cursor() as cur:
@@ -330,7 +336,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         added = 0
         with CopyBuffer() as copystr:
             for word, cls, typ, oper in to_add:
-                term = self.name_processor.get_search_normalized(word)
+                term = self._search_normalized(word)
                 if term:
                     copystr.add(term, 'S', word,
                                 json.dumps({'class': cls, 'type': typ,
@@ -364,9 +370,21 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def add_country_names(self, country_code, names):
         """ Add names for the given country to the search index.
         """
+        # Make sure any name preprocessing for country names applies.
+        info = PlaceInfo({'name': names, 'country_code': country_code,
+                          'rank_address': 4, 'class': 'boundary',
+                          'type': 'administrative'})
+        self._add_country_full_names(country_code,
+                                     self.sanitizer.process_names(info)[0])
+
+
+    def _add_country_full_names(self, country_code, names):
+        """ Add names for the given country from an already sanitized
+            name list.
+        """
         word_tokens = set()
-        for name in self._compute_full_names(names):
-            norm_name = self.name_processor.get_search_normalized(name)
+        for name in names:
+            norm_name = self._search_normalized(name.name)
             if norm_name:
                 word_tokens.add(norm_name)
 
@@ -392,23 +410,21 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def process_place(self, place):
         """ Determine tokenizer information about the given place.
 
-            Returns a JSON-serialisable structure that will be handed into
+            Returns a JSON-serializable structure that will be handed into
             the database via the token_info field.
         """
         token_info = _TokenInfo(self._cache)
 
-        names = place.get('name')
+        names, address = self.sanitizer.process_names(place)
 
         if names:
             fulls, partials = self._compute_name_tokens(names)
 
             token_info.add_names(fulls, partials)
 
-            country_feature = place.get('country_feature')
-            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                self.add_country_names(country_feature.lower(), names)
+            if place.is_country():
+                self._add_country_full_names(place.country_code, names)
 
-        address = place.get('address')
         if address:
             self._process_place_address(token_info, address)
 
@@ -418,18 +434,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def _process_place_address(self, token_info, address):
         hnrs = []
         addr_terms = []
-        for key, value in address.items():
-            if key == 'postcode':
-                self._add_postcode(value)
-            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                hnrs.append(value)
-            elif key == 'street':
-                token_info.add_street(*self._compute_name_tokens({'name': value}))
-            elif key == 'place':
-                token_info.add_place(*self._compute_name_tokens({'name': value}))
-            elif not key.startswith('_') and \
-                 key not in ('country', 'full'):
-                addr_terms.append((key, *self._compute_name_tokens({'name': value})))
+        for item in address:
+            if item.kind == 'postcode':
+                self._add_postcode(item.name)
+            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                hnrs.append(item.name)
+            elif item.kind == 'street':
+                token_info.add_street(self._compute_partial_tokens(item.name))
+            elif item.kind == 'place':
+                token_info.add_place(self._compute_partial_tokens(item.name))
+            elif not item.kind.startswith('_') and \
+                 item.kind not in ('country', 'full'):
+                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
 
         if hnrs:
             hnrs = self._split_housenumbers(hnrs)
@@ -439,28 +455,61 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
             token_info.add_address_terms(addr_terms)
 
 
+    def _compute_partial_tokens(self, name):
+        """ Normalize the given term, split it into partial words and return
+            the token list for them.
+        """
+        norm_name = self._search_normalized(name)
+
+        tokens = []
+        need_lookup = []
+        for partial in norm_name.split():
+            token = self._cache.partials.get(partial)
+            if token:
+                tokens.append(token)
+            else:
+                need_lookup.append(partial)
+
+        if need_lookup:
+            with self.conn.cursor() as cur:
+                cur.execute("""SELECT word, getorcreate_partial_word(word)
+                               FROM unnest(%s) word""",
+                            (need_lookup, ))
+
+                for partial, token in cur:
+                    tokens.append(token)
+                    self._cache.partials[partial] = token
+
+        return tokens
+
+
     def _compute_name_tokens(self, names):
         """ Computes the full name and partial name tokens for the given
             dictionary of names.
         """
-        full_names = self._compute_full_names(names)
         full_tokens = set()
         partial_tokens = set()
 
-        for name in full_names:
-            norm_name = self.name_processor.get_normalized(name)
-            full, part = self._cache.names.get(norm_name, (None, None))
+        for name in names:
+            analyzer_id = name.get_attr('analyzer')
+            norm_name = self._normalized(name.name)
+            if analyzer_id is None:
+                token_id = norm_name
+            else:
+                token_id = f'{norm_name}@{analyzer_id}'
+
+            full, part = self._cache.names.get(token_id, (None, None))
             if full is None:
-                variants = self.name_processor.get_variants_ascii(norm_name)
+                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
                 if not variants:
                     continue
 
                 with self.conn.cursor() as cur:
                     cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
-                                (norm_name, variants))
+                                (token_id, variants))
                     full, part = cur.fetchone()
 
-                self._cache.names[norm_name] = (full, part)
+                self._cache.names[token_id] = (full, part)
 
             full_tokens.add(full)
             partial_tokens.update(part)
@@ -468,23 +517,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         return full_tokens, partial_tokens
 
 
-    @staticmethod
-    def _compute_full_names(names):
-        """ Return the set of all full name word ids to be used with the
-            given dictionary of names.
-        """
-        full_names = set()
-        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
-            if name:
-                full_names.add(name)
-
-                brace_idx = name.find('(')
-                if brace_idx >= 0:
-                    full_names.add(name[:brace_idx].strip())
-
-        return full_names
-
-
     def _add_postcode(self, postcode):
         """ Make sure the normalized postcode is present in the word table.
         """
@@ -492,7 +524,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
             postcode = self.normalize_postcode(postcode)
 
             if postcode not in self._cache.postcodes:
-                term = self.name_processor.get_search_normalized(postcode)
+                term = self._search_normalized(postcode)
                 if not term:
                     return
 
@@ -551,30 +583,25 @@ class _TokenInfo:
         self.data['hnr'] = ';'.join(hnrs)
 
 
-    def add_street(self, fulls, _):
+    def add_street(self, tokens):
         """ Add addr:street match terms.
         """
-        if fulls:
-            self.data['street'] = self._mk_array(fulls)
+        if tokens:
+            self.data['street'] = self._mk_array(tokens)
 
 
-    def add_place(self, fulls, partials):
+    def add_place(self, tokens):
         """ Add addr:place search and match terms.
         """
-        if fulls:
-            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
-            self.data['place_match'] = self._mk_array(fulls)
+        if tokens:
+            self.data['place'] = self._mk_array(tokens)
 
 
     def add_address_terms(self, terms):
         """ Add additional address terms.
         """
-        tokens = {}
-
-        for key, fulls, partials in terms:
-            if fulls:
-                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
-                               self._mk_array(fulls)]
+        tokens = {key: self._mk_array(partials)
+                  for key, partials in terms if partials}
 
         if tokens:
             self.data['addr'] = tokens
@@ -588,6 +615,7 @@ class _TokenCache:
     """
     def __init__(self):
         self.names = {}
+        self.partials = {}
         self.postcodes = set()
         self.housenumbers = {}
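
Editor's note: `_compute_name_tokens()` above now keys its cache by the normalized name plus the analyzer id, so variants produced by different language analyzers cannot collide. A standalone sketch of that keying scheme:

# Sketch of the cache key used in _compute_name_tokens(): the default
# analyzer (analyzer_id is None) keys by the normalized name alone,
# language analyzers append '@<id>'.
def cache_key(norm_name, analyzer_id=None):
    return norm_name if analyzer_id is None else f'{norm_name}@{analyzer_id}'

assert cache_key('halle') == 'halle'
assert cache_key('halle', 'de') == 'halle@de'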
 
diff --git a/nominatim/tokenizer/icu_variants.py b/nominatim/tokenizer/icu_variants.py
deleted file mode 100644 (file)
index 9ebe368..0000000
+++ /dev/null
@@ -1,57 +0,0 @@
-"""
-Data structures for saving variant expansions for ICU tokenizer.
-"""
-from collections import namedtuple
-import json
-
-_ICU_VARIANT_PORPERTY_FIELDS = ['lang']
-
-
-class ICUVariantProperties(namedtuple('_ICUVariantProperties', _ICU_VARIANT_PORPERTY_FIELDS)):
-    """ Data container for saving properties that describe when a variant
-        should be applied.
-
-        Property instances are hashable.
-    """
-    @classmethod
-    def from_rules(cls, _):
-        """ Create a new property type from a generic dictionary.
-
-            The function only takes into account the properties that are
-            understood presently and ignores all others.
-        """
-        return cls(lang=None)
-
-
-ICUVariant = namedtuple('ICUVariant', ['source', 'replacement', 'properties'])
-
-
-def pickle_variant_set(variants):
-    """ Serializes an iterable of variant rules to a string.
-    """
-    # Create a list of property sets. So they don't need to be duplicated
-    properties = {}
-    pid = 1
-    for variant in variants:
-        if variant.properties not in properties:
-            properties[variant.properties] = pid
-            pid += 1
-
-    # Convert the variants into a simple list.
-    variants = [(v.source, v.replacement, properties[v.properties]) for v in variants]
-
-    # Convert everythin to json.
-    return json.dumps({'properties': {v: k._asdict() for k, v in properties.items()},
-                       'variants': variants})
-
-
-def unpickle_variant_set(variant_string):
-    """ Deserializes a variant string that was previously created with
-        pickle_variant_set() into a set of ICUVariants.
-    """
-    data = json.loads(variant_string)
-
-    properties = {int(k): ICUVariantProperties.from_rules(v)
-                  for k, v in data['properties'].items()}
-
-    return set((ICUVariant(src, repl, properties[pid]) for src, repl, pid in data['variants']))
index 8957426b353efa7ec17f572e754f7fe47f90022c..c935f20d4a9836e0f1c97ab74a5ce93a98b99ba1 100644 (file)
@@ -113,7 +113,7 @@ class LegacyTokenizer(AbstractTokenizer):
             self._init_db_tables(config)
 
 
-    def init_from_project(self):
+    def init_from_project(self, _):
         """ Initialise the tokenizer from the project directory.
         """
         with connect(self.dsn) as conn:
@@ -142,7 +142,7 @@ class LegacyTokenizer(AbstractTokenizer):
                               modulepath=modulepath)
 
 
-    def check_database(self):
+    def check_database(self, _):
         """ Check that the tokenizer is set up correctly.
         """
         hint = """\
@@ -405,16 +405,15 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
         """
         token_info = _TokenInfo(self._cache)
 
-        names = place.get('name')
+        names = place.name
 
         if names:
             token_info.add_names(self.conn, names)
 
-            country_feature = place.get('country_feature')
-            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                self.add_country_names(country_feature.lower(), names)
+            if place.is_country():
+                self.add_country_names(place.country_code, names)
 
-        address = place.get('address')
+        address = place.address
         if address:
             self._process_place_address(token_info, address)
 
diff --git a/nominatim/tokenizer/place_sanitizer.py b/nominatim/tokenizer/place_sanitizer.py
new file mode 100644 (file)
index 0000000..5961dcf
--- /dev/null
@@ -0,0 +1,127 @@
+"""
+Handler for cleaning name and address tags in place information before it
+is handed to the token analysis.
+"""
+import importlib
+
+from nominatim.errors import UsageError
+
+class PlaceName:
+    """ A searchable name for a place together with properties.
+        Every name object saves the name proper and two basic properties:
+        * 'kind' describes the name of the OSM key used without any suffixes
+          (i.e. the part after the colon removed)
+        * 'suffix' contains the suffix of the OSM tag, if any. The suffix
+          is the part of the key after the first colon.
+        In addition to that, the name may have arbitrary additional attributes.
+        Which attributes are used depends on the token analyser.
+    """
+
+    def __init__(self, name, kind, suffix):
+        self.name = name
+        self.kind = kind
+        self.suffix = suffix
+        self.attr = {}
+
+
+    def __repr__(self):
+        return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"
+
+
+    def clone(self, name=None, kind=None, suffix=None, attr=None):
+        """ Create a deep copy of the place name, optionally with the
+            given parameters replaced. In the attribute list only the given
+            keys are updated. The list is not replaced completely.
+            In particular, the function cannot be used to remove an
+            attribute from a place name.
+        """
+        newobj = PlaceName(name or self.name,
+                           kind or self.kind,
+                           suffix or self.suffix)
+
+        newobj.attr.update(self.attr)
+        if attr:
+            newobj.attr.update(attr)
+
+        return newobj
+
+
+    def set_attr(self, key, value):
+        """ Add the given property to the name. If the property was already
+            set, then the value is overwritten.
+        """
+        self.attr[key] = value
+
+
+    def get_attr(self, key, default=None):
+        """ Return the given property or the value of 'default' if it
+            is not set.
+        """
+        return self.attr.get(key, default)
+
+
+    def has_attr(self, key):
+        """ Check if the given attribute is set.
+        """
+        return key in self.attr
+
+
+class _ProcessInfo:
+    """ Container class for information handed into to handler functions.
+        The 'names' and 'address' members are mutable. A handler must change
+        them by either modifying the lists place or replacing the old content
+        with a new list.
+    """
+
+    def __init__(self, place):
+        self.place = place
+        self.names = self._convert_name_dict(place.name)
+        self.address = self._convert_name_dict(place.address)
+
+
+    @staticmethod
+    def _convert_name_dict(names):
+        """ Convert a dictionary of names into a list of PlaceNames.
+            The dictionary key is split into the primary part of the key
+            and the suffix (the part after an optional colon).
+        """
+        out = []
+
+        if names:
+            for key, value in names.items():
+                parts = key.split(':', 1)
+                out.append(PlaceName(value.strip(),
+                                     parts[0].strip(),
+                                     parts[1].strip() if len(parts) > 1 else None))
+
+        return out
+
+
+class PlaceSanitizer:
+    """ Controller class which applies sanitizer functions on the place
+        names and address before they are used by the token analysers.
+    """
+
+    def __init__(self, rules):
+        self.handlers = []
+
+        if rules:
+            for func in rules:
+                if 'step' not in func:
+                    raise UsageError("Sanitizer rule is missing the 'step' attribute.")
+                module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
+                handler_module = importlib.import_module(module_name)
+                self.handlers.append(handler_module.create(func))
+
+
+    def process_names(self, place):
+        """ Extract a sanitized list of names and address parts from the
+            given place. The function returns a tuple
+            (list of names, list of address names)
+        """
+        obj = _ProcessInfo(place)
+
+        for func in self.handlers:
+            func(obj)
+
+        return obj.names, obj.address
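
Editor's note: the kind/suffix split performed by `_ProcessInfo._convert_name_dict()` is a cut at the first colon of the OSM key; a standalone sketch:

# Standalone sketch of the key split in _convert_name_dict(): everything up
# to the first colon becomes the 'kind', the rest (if any) the 'suffix'.
def split_key(key):
    parts = key.split(':', 1)
    return parts[0].strip(), (parts[1].strip() if len(parts) > 1 else None)

assert split_key('name') == ('name', None)
assert split_key('name:de') == ('name', 'de')
assert split_key('alt_name:en') == ('alt_name', 'en')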
diff --git a/nominatim/tokenizer/sanitizers/__init__.py b/nominatim/tokenizer/sanitizers/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/nominatim/tokenizer/sanitizers/split_name_list.py b/nominatim/tokenizer/sanitizers/split_name_list.py
new file mode 100644 (file)
index 0000000..8638598
--- /dev/null
@@ -0,0 +1,36 @@
+"""
+Sanitizer that splits lists of names into their components.
+
+Arguments:
+    delimiters: Define the set of characters to be used for
+                splitting the list. (default: `,;`)
+"""
+import re
+
+from nominatim.errors import UsageError
+
+def create(func):
+    """ Create a name processing function that splits name values with
+        multiple values into their components.
+    """
+    delimiter_set = set(func.get('delimiters', ',;'))
+    if not delimiter_set:
+        raise UsageError("Set of delimiters in split-name-list sanitizer is empty.")
+
+    regexp = re.compile('\\s*[{}]\\s*'.format(''.join('\\' + d for d in delimiter_set)))
+
+    def _process(obj):
+        if not obj.names:
+            return
+
+        new_names = []
+        for name in obj.names:
+            split_names = regexp.split(name.name)
+            if len(split_names) == 1:
+                new_names.append(name)
+            else:
+                new_names.extend(name.clone(name=n) for n in split_names if n)
+
+        obj.names = new_names
+
+    return _process
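
Editor's note: a worked example of the splitting regex with the default delimiters `,;` (the input string is illustrative only):

import re

# Same construction as in create() above: each delimiter is escaped and
# surrounding whitespace is swallowed by the \s* groups.
delimiters = ',;'
regexp = re.compile('\\s*[{}]\\s*'.format(''.join('\\' + d for d in delimiters)))

assert regexp.split('Jungfernstieg;Ballindamm, Alstertor') == \
    ['Jungfernstieg', 'Ballindamm', 'Alstertor']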
diff --git a/nominatim/tokenizer/sanitizers/strip_brace_terms.py b/nominatim/tokenizer/sanitizers/strip_brace_terms.py
new file mode 100644 (file)
index 0000000..caadc81
--- /dev/null
@@ -0,0 +1,23 @@
+"""
+This sanitizer creates additional name variants for names that have
+addendums in brackets (e.g. "Halle (Saale)"). The additional variant contains
+only the main name part with the bracket part removed.
+"""
+
+def create(_):
+    """ Create a name processing function that creates additional name variants
+        for bracket addendums.
+    """
+    def _process(obj):
+        """ Add variants for names that have a bracket extension.
+        """
+        if obj.names:
+            new_names = []
+            for name in (n for n in obj.names if '(' in n.name):
+                new_name = name.name.split('(')[0].strip()
+                if new_name:
+                    new_names.append(name.clone(name=new_name))
+
+            obj.names.extend(new_names)
+
+    return _process
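
Editor's note: the added variant is simply everything before the first opening bracket; a quick check of the expression used in `_process()` above, with the example name from the module docstring:

# "Halle (Saale)" keeps its full form and additionally gets the variant
# "Halle" appended to the name list.
name = 'Halle (Saale)'
assert name.split('(')[0].strip() == 'Halle'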
diff --git a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
new file mode 100644 (file)
index 0000000..739e931
--- /dev/null
@@ -0,0 +1,103 @@
+"""
+This sanitizer sets the `analyzer` property depending on the
+language of the tag. The language is taken from the suffix of the name.
+If a name already has an analyzer tagged, it is kept.
+
+Arguments:
+
+    filter-kind: Restrict the names the sanitizer is applied to. The
+                 parameter expects a list of regular expressions which are
+                 matched against `kind`. Note that a match against the
+                 full string is expected.
+    whitelist: Restrict the set of languages that should be tagged.
+               Expects a list of acceptable suffixes. When unset,
+               all 2- and 3-letter lower-case codes are accepted.
+    use-defaults:  Configure what happens when the name has no suffix.
+                   When set to 'all', a variant is created for
+                   each of the default languages in the country
+                   the feature is in. When set to 'mono', a variant is
+                   only created when exactly one language is spoken
+                   in the country. The default is to do nothing with
+                   the default languages of a country.
+    mode: Define how the variants are created and may be 'replace' or
+          'append'. When set to 'append' the original name (without
+          any analyzer tagged) is retained. (default: replace)
+
+"""
+import re
+
+from nominatim.tools import country_info
+
+class _AnalyzerByLanguage:
+    """ Processor for tagging the language of names in a place.
+    """
+
+    def __init__(self, config):
+        if 'filter-kind' in config:
+            self.regexes = [re.compile(regex) for regex in config['filter-kind']]
+        else:
+            self.regexes = None
+
+        self.replace = config.get('mode', 'replace') != 'append'
+        self.whitelist = config.get('whitelist')
+
+        self.__compute_default_languages(config.get('use-defaults', 'no'))
+
+
+    def __compute_default_languages(self, use_defaults):
+        self.deflangs = {}
+
+        if use_defaults in ('mono', 'all'):
+            for ccode, prop in country_info.iterate():
+                clangs = prop['languages']
+                if len(clangs) == 1 or use_defaults == 'all':
+                    if self.whitelist:
+                        self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
+                    else:
+                        self.deflangs[ccode] = clangs
+
+
+    def _kind_matches(self, kind):
+        if self.regexes is None:
+            return True
+
+        return any(regex.fullmatch(kind) for regex in self.regexes)
+
+
+    def _suffix_matches(self, suffix):
+        if self.whitelist is None:
+            return len(suffix) in (2, 3) and suffix.islower()
+
+        return suffix in self.whitelist
+
+
+    def __call__(self, obj):
+        if not obj.names:
+            return
+
+        more_names = []
+
+        for name in (n for n in obj.names
+                     if not n.has_attr('analyzer') and self._kind_matches(n.kind)):
+            if name.suffix:
+                langs = [name.suffix] if self._suffix_matches(name.suffix) else None
+            else:
+                langs = self.deflangs.get(obj.place.country_code)
+
+
+            if langs:
+                if self.replace:
+                    name.set_attr('analyzer', langs[0])
+                else:
+                    more_names.append(name.clone(attr={'analyzer': langs[0]}))
+
+                more_names.extend(name.clone(attr={'analyzer': l}) for l in langs[1:])
+
+        obj.names.extend(more_names)
+
+
+def create(config):
+    """ Create a function that sets the analyzer property depending on the
+        language of the tag.
+    """
+    return _AnalyzerByLanguage(config)
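
Editor's note: a standalone sketch of the suffix check in `_AnalyzerByLanguage._suffix_matches()`: without a whitelist any two- or three-letter lower-case code passes, with a whitelist only the listed codes do.

def suffix_matches(suffix, whitelist=None):
    # Mirrors _suffix_matches() above.
    if whitelist is None:
        return len(suffix) in (2, 3) and suffix.islower()
    return suffix in whitelist

assert suffix_matches('de')                        # name:de -> analyzer 'de'
assert not suffix_matches('left')                  # name:left is not a language
assert not suffix_matches('de', whitelist=['en'])  # filtered by whitelist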
diff --git a/nominatim/tokenizer/token_analysis/__init__.py b/nominatim/tokenizer/token_analysis/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/nominatim/tokenizer/token_analysis/generic.py b/nominatim/tokenizer/token_analysis/generic.py
new file mode 100644 (file)
index 0000000..4b47889
--- /dev/null
@@ -0,0 +1,224 @@
+"""
+Generic processor for names that creates abbreviation variants.
+"""
+from collections import defaultdict, namedtuple
+import itertools
+import re
+
+from icu import Transliterator
+import datrie
+
+from nominatim.config import flatten_config_list
+from nominatim.errors import UsageError
+
+### Configuration section
+
+ICUVariant = namedtuple('ICUVariant', ['source', 'replacement'])
+
+def configure(rules, normalization_rules):
+    """ Extract and preprocess the configuration for this module.
+    """
+    config = {}
+
+    config['replacements'], config['chars'] = _get_variant_config(rules.get('variants'),
+                                                                  normalization_rules)
+    config['variant_only'] = rules.get('mode', '') == 'variant-only'
+
+    return config
+
+
+def _get_variant_config(rules, normalization_rules):
+    """ Convert the variant definition from the configuration into
+        replacement sets.
+    """
+    immediate = defaultdict(list)
+    chars = set()
+
+    if rules:
+        vset = set()
+        rules = flatten_config_list(rules, 'variants')
+
+        vmaker = _VariantMaker(normalization_rules)
+
+        for section in rules:
+            for rule in (section.get('words') or []):
+                vset.update(vmaker.compute(rule))
+
+        # Intermediate reorder by source. Also compute required character set.
+        for variant in vset:
+            if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
+                replstr = variant.replacement[:-1]
+            else:
+                replstr = variant.replacement
+            immediate[variant.source].append(replstr)
+            chars.update(variant.source)
+
+    return list(immediate.items()), ''.join(chars)
+
+
+class _VariantMaker:
+    """ Generater for all necessary ICUVariants from a single variant rule.
+
+        All text in rules is normalized to make sure the variants match later.
+    """
+
+    def __init__(self, norm_rules):
+        self.norm = Transliterator.createFromRules("rule_loader_normalization",
+                                                   norm_rules)
+
+
+    def compute(self, rule):
+        """ Generator for all ICUVariant tuples from a single variant rule.
+        """
+        parts = re.split(r'(\|)?([=-])>', rule)
+        if len(parts) != 4:
+            raise UsageError("Syntax error in variant rule: " + rule)
+
+        decompose = parts[1] is None
+        src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
+        repl_terms = (self.norm.transliterate(t).strip() for t in parts[3].split(','))
+
+        # If the source should be kept, add a 1:1 replacement
+        if parts[2] == '-':
+            for src in src_terms:
+                if src:
+                    for froms, tos in _create_variants(*src, src[0], decompose):
+                        yield ICUVariant(froms, tos)
+
+        for src, repl in itertools.product(src_terms, repl_terms):
+            if src and repl:
+                for froms, tos in _create_variants(*src, repl, decompose):
+                    yield ICUVariant(froms, tos)
+
+
+    def _parse_variant_word(self, name):
+        name = name.strip()
+        match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
+        if match is None or (match.group(1) == '~' and match.group(3) == '~'):
+            raise UsageError("Invalid variant word descriptor '{}'".format(name))
+        norm_name = self.norm.transliterate(match.group(2)).strip()
+        if not norm_name:
+            return None
+
+        return norm_name, match.group(1), match.group(3)
+
+
+_FLAG_MATCH = {'^': '^ ',
+               '$': ' ^',
+               '': ' '}
+
+
+def _create_variants(src, preflag, postflag, repl, decompose):
+    if preflag == '~':
+        postfix = _FLAG_MATCH[postflag]
+        # suffix decomposition
+        src = src + postfix
+        repl = repl + postfix
+
+        yield src, repl
+        yield ' ' + src, ' ' + repl
+
+        if decompose:
+            yield src, ' ' + repl
+            yield ' ' + src, repl
+    elif postflag == '~':
+        # prefix decomposition
+        prefix = _FLAG_MATCH[preflag]
+        src = prefix + src
+        repl = prefix + repl
+
+        yield src, repl
+        yield src + ' ', repl + ' '
+
+        if decompose:
+            yield src, repl + ' '
+            yield src + ' ', repl
+    else:
+        prefix = _FLAG_MATCH[preflag]
+        postfix = _FLAG_MATCH[postflag]
+
+        yield prefix + src + postfix, prefix + repl + postfix
+
+
+### Analysis section
+
+def create(transliterator, config):
+    """ Create a new token analysis instance for this module.
+    """
+    return GenericTokenAnalysis(transliterator, config)
+
+
+class GenericTokenAnalysis:
+    """ Collects the different transformation rules for normalisation of names
+        and provides the functions to apply the transformations.
+    """
+
+    def __init__(self, to_ascii, config):
+        self.to_ascii = to_ascii
+        self.variant_only = config['variant_only']
+
+        # Set up datrie
+        if config['replacements']:
+            self.replacements = datrie.Trie(config['chars'])
+            for src, repllist in config['replacements']:
+                self.replacements[src] = repllist
+        else:
+            self.replacements = None
+
+
+    def get_variants_ascii(self, norm_name):
+        """ Compute the spelling variants for the given normalized name
+            and transliterate the result.
+        """
+        baseform = '^ ' + norm_name + ' ^'
+        partials = ['']
+
+        startpos = 0
+        if self.replacements is not None:
+            pos = 0
+            force_space = False
+            while pos < len(baseform):
+                full, repl = self.replacements.longest_prefix_item(baseform[pos:],
+                                                                   (None, None))
+                if full is not None:
+                    done = baseform[startpos:pos]
+                    partials = [v + done + r
+                                for v, r in itertools.product(partials, repl)
+                                if not force_space or r.startswith(' ')]
+                    if len(partials) > 128:
+                        # If too many variants are produced, they are unlikely
+                        # to be helpful. Only use the original term.
+                        startpos = 0
+                        break
+                    startpos = pos + len(full)
+                    if full[-1] == ' ':
+                        startpos -= 1
+                        force_space = True
+                    pos = startpos
+                else:
+                    pos += 1
+                    force_space = False
+
+        # No variants detected? Fast return.
+        if startpos == 0:
+            if self.variant_only:
+                return []
+
+            trans_name = self.to_ascii.transliterate(norm_name).strip()
+            return [trans_name] if trans_name else []
+
+        return self._compute_result_set(partials, baseform[startpos:],
+                                        norm_name if self.variant_only else '')
+
+
+    def _compute_result_set(self, partials, prefix, exclude):
+        results = set()
+
+        for variant in partials:
+            vname = (variant + prefix)[1:-1].strip()
+            if vname != exclude:
+                trans_name = self.to_ascii.transliterate(vname).strip()
+                if trans_name:
+                    results.add(trans_name)
+
+        return list(results)
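
Editor's note: a worked example of the variant expansion, assuming the module above is importable. A suffix rule such as `~straße => str` (rule text illustrative) reaches `_create_variants()` with `preflag='~'`; with decomposition enabled both the attached and the detached spellings are generated.

from nominatim.tokenizer.token_analysis.generic import _create_variants

variants = list(_create_variants('straße', '~', '', 'str', True))
# Expected (source, replacement) pairs, with leading/trailing blanks acting
# as word boundaries:
#   ('straße ', 'str '), (' straße ', ' str '),
#   ('straße ', ' str '), (' straße ', 'str ')
assert ('straße ', 'str ') in variants
assert (' straße ', 'str ') in variants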
index d116554fea20f6e9b5e261adc2a48b0434fa5531..30b27d1f60accd64c5ece68d3d82d9841b6cc656 100644 (file)
@@ -166,7 +166,7 @@ def check_tokenizer(_, config):
         return CheckState.FAIL, dict(msg="""\
             Cannot load tokenizer. Did the import finish sucessfully?""")
 
-    result = tokenizer.check_database()
+    result = tokenizer.check_database(config)
 
     if result is None:
         return CheckState.OK
index e04a8693f116bccd6d7e609de0c463b74170e46a..635d15840a84b8197efb9f5cb358344a78a0c2b9 100644 (file)
@@ -13,12 +13,21 @@ class _CountryInfo:
     def __init__(self):
         self._info = {}
 
+
     def load(self, config):
         """ Load the country properties from the configuration files,
             if they are not loaded yet.
         """
         if not self._info:
             self._info = config.load_sub_configuration('country_settings.yaml')
+            # Convert languages into a list for simpler handling.
+            for prop in self._info.values():
+                if 'languages' not in prop:
+                    prop['languages'] = []
+                elif not isinstance(prop['languages'], list):
+                    prop['languages'] = [x.strip()
+                                         for x in prop['languages'].split(',')]
+
 
     def items(self):
         """ Return tuples of (country_code, property dict) as iterable.
@@ -36,6 +45,12 @@ def setup_country_config(config):
     _COUNTRY_INFO.load(config)
 
 
+def iterate():
+    """ Iterate over country code and properties.
+    """
+    return _COUNTRY_INFO.items()
+
+
 def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
     """ Create and populate the tables with basic static data that provides
         the background for geocoding. Data is assumed to not yet exist.
@@ -50,10 +65,7 @@ def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
                 partition = 0
             else:
                 partition = props.get('partition')
-            if ',' in (props.get('languages', ',') or ','):
-                lang = None
-            else:
-                lang = props['languages']
+            lang = props['languages'][0] if len(props['languages']) == 1 else None
             params.append((ccode, partition, lang))
 
     with connect(dsn) as conn:
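
Editor's note: the normalisation added to `_CountryInfo.load()` means `setup_country_tables()` can now rely on `languages` always being a list; a standalone sketch of the conversion:

# Mirrors the conversion in _CountryInfo.load(): comma-separated strings
# become lists, a missing entry becomes an empty list.
def normalize_languages(prop):
    if 'languages' not in prop:
        prop['languages'] = []
    elif not isinstance(prop['languages'], list):
        prop['languages'] = [x.strip() for x in prop['languages'].split(',')]
    return prop

assert normalize_languages({'languages': 'de, fr, it'})['languages'] == ['de', 'fr', 'it']
assert normalize_languages({'partition': 185})['languages'] == []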
index ff498f777e527a38adbc38a48e500f5b514bc684..19a1268253feaa7ff1e2e6de20be3c43f1d74025 100644 (file)
@@ -7,12 +7,11 @@ import logging
 import os
 import tarfile
 
-import psycopg2.extras
-
 from nominatim.db.connection import connect
 from nominatim.db.async_connection import WorkerPool
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.errors import UsageError
+from nominatim.indexer.place_info import PlaceInfo
 
 LOG = logging.getLogger()
 
@@ -58,7 +57,7 @@ def handle_threaded_sql_statements(pool, fd, analyzer):
             address = dict(street=row['street'], postcode=row['postcode'])
             args = ('SRID=4326;' + row['geometry'],
                     int(row['from']), int(row['to']), row['interpolation'],
-                    psycopg2.extras.Json(analyzer.process_place(dict(address=address))),
+                    PlaceInfo({'address': address}).analyze(analyzer),
                     analyzer.normalize_postcode(row['postcode']))
         except ValueError:
             continue
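
handle_threaded_sql_statements() now hands the tokenizer a PlaceInfo wrapper instead of wrapping analyzer.process_place() in a psycopg2 Json object itself. The implementation of analyze() is not part of this excerpt; a plausible sketch, consistent with the dropped psycopg2.extras import, is the thin wrapper below (assumed, not taken from nominatim/indexer/place_info.py):

    # Assumed sketch of PlaceInfo.analyze(); the real implementation lives in
    # nominatim/indexer/place_info.py, which is not shown in this excerpt.
    import psycopg2.extras

    class PlaceInfo:
        def __init__(self, info):
            self._info = info

        @property
        def address(self):
            return self._info.get('address')

        def analyze(self, analyzer):
            # Run the tokenizer's place processing and return the token info
            # in a form that can be passed directly as a query parameter.
            return psycopg2.extras.Json(analyzer.process_place(self))
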
index 09ba0d28e526c085027170a741c4a55fee8d8afc..fc31a6542e3f6927ee5edde542a78f3f4b84d9b8 100644 (file)
           "administrative10" : 22
       }
   }
+},
+{ "countries" : ["sk"],
+  "tags" : {
+      "boundary" : {
+          "administrative5" : [10, 0],
+          "administrative6" : 11,
+          "administrative7" : [11, 0],
+          "administrative8" : 12,
+          "administrative9" : 16,
+          "administrative10" : 18,
+          "administrative11" : 20
+      }
+  }
 }
+
 ]
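
The new Slovakia block mixes bare rank numbers with two-element lists. In the address-level settings a bare number is used for both the search and the address rank, while a pair sets them separately, an address rank of 0 keeping the boundary out of address lists. A small illustrative sketch of that interpretation (not code from this changeset):

    # Illustrative sketch only: expand an address-level entry into explicit
    # (rank_search, rank_address) values as described above.
    def expand_rank(value):
        if isinstance(value, (list, tuple)):
            rank_search, rank_address = value
        else:
            rank_search = rank_address = value
        return rank_search, rank_address

    sk_boundaries = {
        "administrative5": [10, 0],
        "administrative6": 11,
        "administrative9": 16,
    }

    for key, value in sk_boundaries.items():
        print(key, expand_rank(value))
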
 
index 77b137a1b8019fcfce1facbee8c9f7fc32891e94..dcbb1847f8fd1d7158d4dae8122081346e061e34 100644 (file)
@@ -171,7 +171,7 @@ bt:
 #  (Bouvet Island)
 bv:
     partition: 185
-    languages: no
+    languages: "no"
 
 # Botswana (Botswana)
 bw:
@@ -1006,7 +1006,7 @@ si:
 #  (Svalbard and Jan Mayen)
 sj:
     partition: 197
-    languages: no
+    languages: "no"
 
 # Slovakia (Slovensko)
 sk:
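
The quoting change for Bouvet Island and for Svalbard and Jan Mayen works around a classic YAML pitfall: an unquoted `no` parses as the boolean false rather than the Norwegian language code. A quick PyYAML demonstration:

    import yaml

    # Unquoted 'no' is interpreted as a YAML 1.1 boolean, which broke the
    # language lookup for these territories; quoting keeps it a string.
    print(yaml.safe_load("languages: no"))     # {'languages': False}
    print(yaml.safe_load('languages: "no"'))   # {'languages': 'no'}
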
index c0c8c043c289bdb31f699ce2725e5ca8d0c631be..41760c49e0fbd2122d2f1e7fd1966fc4278d1975 100644 (file)
@@ -24,34 +24,163 @@ transliteration:
     - "[^[:Ascii:]] >"
     - ":: lower ()"
     - ":: NFC ()"
-variants:
-    - !include icu-rules/variants-bg.yaml
-    - !include icu-rules/variants-ca.yaml
-    - !include icu-rules/variants-cs.yaml
-    - !include icu-rules/variants-da.yaml
-    - !include icu-rules/variants-de.yaml
-    - !include icu-rules/variants-el.yaml
-    - !include icu-rules/variants-en.yaml
-    - !include icu-rules/variants-es.yaml
-    - !include icu-rules/variants-et.yaml
-    - !include icu-rules/variants-eu.yaml
-    - !include icu-rules/variants-fi.yaml
-    - !include icu-rules/variants-fr.yaml
-    - !include icu-rules/variants-gl.yaml
-    - !include icu-rules/variants-hu.yaml
-    - !include icu-rules/variants-it.yaml
-    - !include icu-rules/variants-ja.yaml
-    - !include icu-rules/variants-mg.yaml
-    - !include icu-rules/variants-ms.yaml
-    - !include icu-rules/variants-nl.yaml
-    - !include icu-rules/variants-no.yaml
-    - !include icu-rules/variants-pl.yaml
-    - !include icu-rules/variants-pt.yaml
-    - !include icu-rules/variants-ro.yaml
-    - !include icu-rules/variants-ru.yaml
-    - !include icu-rules/variants-sk.yaml
-    - !include icu-rules/variants-sl.yaml
-    - !include icu-rules/variants-sv.yaml
-    - !include icu-rules/variants-tr.yaml
-    - !include icu-rules/variants-uk.yaml
-    - !include icu-rules/variants-vi.yaml
+sanitizers:
+    - step: split-name-list
+    - step: strip-brace-terms
+    - step: tag-analyzer-by-language
+      filter-kind: [".*name.*"]
+      whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,no,pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
+      use-defaults: all
+      mode: append
+token-analysis:
+    - analyzer: generic
+    - id: bg
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-bg.yaml
+    - id: ca
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-ca.yaml
+    - id: cs
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-cs.yaml
+    - id: da
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-da.yaml
+    - id: de
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-de.yaml
+    - id: el
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-el.yaml
+    - id: en
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-en.yaml
+    - id: es
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-es.yaml
+    - id: et
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-et.yaml
+    - id: eu
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-eu.yaml
+    - id: fi
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-fi.yaml
+    - id: fr
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-fr.yaml
+    - id: gl
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-gl.yaml
+    - id: hu
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-hu.yaml
+    - id: it
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-it.yaml
+    - id: ja
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-ja.yaml
+    - id: mg
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-mg.yaml
+    - id: ms
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-ms.yaml
+    - id: nl
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-nl.yaml
+    - id: no
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-no.yaml
+    - id: pl
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-pl.yaml
+    - id: pt
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-pt.yaml
+    - id: ro
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-ro.yaml
+    - id: ru
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-ru.yaml
+    - id: sk
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-sk.yaml
+    - id: sl
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-sl.yaml
+    - id: sv
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-sv.yaml
+    - id: tr
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-tr.yaml
+    - id: uk
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-uk.yaml
+    - id: vi
+      analyzer: generic
+      mode: variant-only
+      variants:
+          - !include icu-rules/variants-vi.yaml
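
The flat `variants` list is replaced by a `sanitizers` pipeline plus per-language `token-analysis` entries: the tag-analyzer-by-language sanitizer attaches an `analyzer` attribute to a name, and the analysis entry with the matching `id` handles it, the entry without an `id` being the default. A rough sketch of that lookup with a trimmed-down config; the fallback to the default for unknown ids is an assumption of this sketch, not taken from the tokenizer code:

    import yaml

    CONFIG = yaml.safe_load("""
    token-analysis:
        - analyzer: generic
        - id: de
          analyzer: generic
          mode: variant-only
    """)

    # Build a lookup keyed by the optional 'id'; the entry without an id is the
    # default analysis (keyed by None), mirroring how the rule loader tests
    # further down access loader.analysis[None].
    analysis = {entry.get('id'): entry for entry in CONFIG['token-analysis']}

    def pick_analysis(name_attr):
        """ Return the analysis config for a name, falling back to the default
            entry when no dedicated entry exists (an assumption of this sketch).
        """
        return analysis.get(name_attr.get('analyzer'), analysis[None])

    print(pick_analysis({'analyzer': 'de'}).get('id'))   # de
    print(pick_analysis({}).get('id'))                   # None -> default entry
    print(pick_analysis({'analyzer': 'xx'}).get('id'))   # None -> default entry
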
index d393b8f3381a7a493d4597cd113b77d4a331287f..a5b07e5cf5e5027919066d8967da19cfccdfdc26 100644 (file)
@@ -125,9 +125,6 @@ Feature: Creation of search terms
         Then placex contains
          | object | parent_place_id |
          | N1     | N2              |
-        Then search_name contains
-         | object | name_vector | nameaddress_vector |
-         | N1     | #Walltown   | Strange, Town |
         When sending search query "23 Rose Street"
         Then exactly 1 results are returned
         And results contain
@@ -156,9 +153,6 @@ Feature: Creation of search terms
          | W1  | highway | residential | Rose Street  | :w-north |
          | N2  | place   | city        | Strange Town | :p-N1 |
         When importing
-        Then search_name contains
-         | object | name_vector      | nameaddress_vector      |
-         | N1     | #Walltown, #Blue house | Walltown, Strange, Town |
         When sending search query "23 Walltown, Strange Town"
         Then results contain
          | osm | display_name |
@@ -190,9 +184,6 @@ Feature: Creation of search terms
          | W1  | highway | residential | Rose Street  | :w-north |
          | N2  | place   | city        | Strange Town | :p-N1 |
         When importing
-        Then search_name contains
-         | object | name_vector      | nameaddress_vector      |
-         | N1     | #Moon sun, #Blue house | Moon, Sun, Strange, Town |
         When sending search query "23 Moon Sun, Strange Town"
         Then results contain
          | osm | display_name |
@@ -212,9 +203,6 @@ Feature: Creation of search terms
          | W1  | highway | residential | Rose Street  | Walltown  | :w-north |
          | N2  | place   | suburb      | Strange Town | Walltown  | :p-N1 |
         When importing
-        Then search_name contains
-         | object | name_vector | nameaddress_vector |
-         | N1     | #Walltown   | Strange, Town      |
         When sending search query "23 Rose Street, Walltown"
         Then exactly 1 result is returned
         And results contain
@@ -303,9 +291,6 @@ Feature: Creation of search terms
          | W1  | highway | residential | Rose Street  | :w-north |
          | N2  | place   | suburb      | Strange Town | :p-N1 |
         When importing
-        Then search_name contains
-         | object | name_vector | nameaddress_vector |
-         | N1     | #Green Moss | Walltown |
         When sending search query "Green Moss, Rose Street, Walltown"
         Then exactly 0 result is returned
         When sending search query "Green Moss, Walltown"
index b8a760f99bd0bc03e127c14ae60de5f93fdf0290..deaa635e0b190733a99306795d2bef779f5caaa0 100644 (file)
@@ -52,7 +52,7 @@ Feature: Import and search of names
 
     Scenario: Special characters in name
         Given the places
-          | osm | class | type      | name |
+          | osm | class | type      | name+name:de |
           | N1  | place | locality  | Jim-Knopf-Straße |
           | N2  | place | locality  | Smith/Weston |
           | N3  | place | locality  | space mountain |
index 99199de4fe24469450570090e50cc344eabbac0a..c962fc7e3fcb085b25b34b10683c1f5849c361bd 100644 (file)
@@ -1,7 +1,7 @@
 @DB
 Feature: Update parenting of objects
 
-Scenario: POI inside building inherits addr:street change
+    Scenario: POI inside building inherits addr:street change
         Given the scene building-on-street-corner
         And the named places
          | osm | class   | type       | geometry |
@@ -34,3 +34,132 @@ Scenario: POI inside building inherits addr:street change
          | N1     | W3              | 3 |
          | N2     | W3              | 3 |
          | N3     | W3              | 3 |
+
+
+    Scenario: Housenumber is reparented when street gets name matching addr:street
+        Given the grid
+         | 1 |    |   | 2 |
+         |   | 10 |   |   |
+         |   |    |   |   |
+         | 3 |    |   | 4 |
+        And the places
+         | osm | class   | type        | name     | geometry |
+         | W1  | highway | residential | A street | 1,2      |
+         | W2  | highway | residential | B street | 3,4      |
+        And the places
+         | osm | class    | type | housenr | street   | geometry |
+         | N1  | building | yes  | 3       | X street | 10       |
+        When importing
+        Then placex contains
+         | object | parent_place_id |
+         | N1     | W1              |
+        When updating places
+         | osm | class   | type        | name     | geometry |
+         | W2  | highway | residential | X street | 3,4      |
+        Then placex contains
+         | object | parent_place_id |
+         | N1     | W2              |
+
+
+    Scenario: Housenumber is reparented when street loses name matching addr:street
+        Given the grid
+         | 1 |    |   | 2 |
+         |   | 10 |   |   |
+         |   |    |   |   |
+         | 3 |    |   | 4 |
+        And the places
+         | osm | class   | type        | name     | geometry |
+         | W1  | highway | residential | A street | 1,2      |
+         | W2  | highway | residential | X street | 3,4      |
+        And the places
+         | osm | class    | type | housenr | street   | geometry |
+         | N1  | building | yes  | 3       | X street | 10       |
+        When importing
+        Then placex contains
+         | object | parent_place_id |
+         | N1     | W2              |
+        When updating places
+         | osm | class   | type        | name     | geometry |
+         | W2  | highway | residential | B street | 3,4      |
+        Then placex contains
+         | object | parent_place_id |
+         | N1     | W1              |
+
+
+    Scenario: Housenumber is reparented when street gets name matching addr:street
+        Given the grid
+         | 1 |    |   | 2 |
+         |   | 10 |   |   |
+         |   |    |   |   |
+         | 3 |    |   | 4 |
+        And the places
+         | osm | class   | type        | name     | geometry |
+         | W1  | highway | residential | A street | 1,2      |
+         | W2  | highway | residential | B street | 3,4      |
+        And the places
+         | osm | class    | type | housenr | street   | geometry |
+         | N1  | building | yes  | 3       | X street | 10       |
+        When importing
+        Then placex contains
+         | object | parent_place_id |
+         | N1     | W1              |
+        When updating places
+         | osm | class   | type        | name     | geometry |
+         | W2  | highway | residential | X street | 3,4      |
+        Then placex contains
+         | object | parent_place_id |
+         | N1     | W2              |
+
+
+    # Invalidation of geometries currently disabled for addr:place matches.
+    @Fail
+    Scenario: Housenumber is reparented when place is renamed to matching addr:place
+        Given the grid
+         | 1 |    |   | 2 |
+         |   | 10 | 4 |   |
+         |   |    |   |   |
+         |   |    | 5 |   |
+        And the places
+         | osm | class   | type        | name     | geometry |
+         | W1  | highway | residential | A street | 1,2      |
+         | N5  | place   | village     | Bdorf    | 5        |
+         | N4  | place   | village     | Other    | 4        |
+        And the places
+         | osm | class    | type | housenr | addr_place | geometry |
+         | N1  | building | yes  | 3       | Cdorf      | 10       |
+        When importing
+        Then placex contains
+         | object | parent_place_id |
+         | N1     | N4              |
+        When updating places
+         | osm | class   | type        | name     | geometry |
+         | N5  | place   | village     | Cdorf    | 5        |
+        Then placex contains
+         | object | parent_place_id |
+         | N1     | N5              |
+
+
+    Scenario: Housenumber is reparented when it loses a matching addr:place
+        Given the grid
+         | 1 |    |   | 2 |
+         |   | 10 | 4 |   |
+         |   |    |   |   |
+         |   |    | 5 |   |
+        And the places
+         | osm | class   | type        | name     | geometry |
+         | W1  | highway | residential | A street | 1,2      |
+         | N5  | place   | village     | Bdorf    | 5        |
+         | N4  | place   | village     | Other    | 4        |
+        And the places
+         | osm | class    | type | housenr | addr_place | geometry |
+         | N1  | building | yes  | 3       | Bdorf      | 10       |
+        When importing
+        Then placex contains
+         | object | parent_place_id |
+         | N1     | N5              |
+        When updating places
+         | osm | class   | type        | name     | geometry |
+         | N5  | place   | village     | Cdorf    | 5        |
+        Then placex contains
+         | object | parent_place_id |
+         | N1     | N4              |
index 2fc9772671069c06f337c0d24ca410b1c5a093b6..74d3633968cb12c6c16733f997d3cd0f621c4d94 100644 (file)
@@ -227,7 +227,7 @@ def osm2pgsql_options(temp_db):
                                  main_data='', main_index=''))
 
 @pytest.fixture
-def sql_preprocessor(temp_db_conn, tmp_path, table_factory):
+def sql_preprocessor(temp_db_conn, tmp_path, table_factory, temp_db_with_extensions):
     table_factory('country_name', 'partition INT', ((0, ), (1, ), (2, )))
     cfg = Configuration(None, SRC_DIR.resolve() / 'settings')
     cfg.set_libdirs(module='.', osm2pgsql='.', php=SRC_DIR / 'lib-php',
index 69202bc322ffd88e103f60f8ced809bfa8e82fd3..6029eb7c6620b5f088f831dec66dede835664340 100644 (file)
@@ -1,6 +1,8 @@
 """
 Tokenizer for testing.
 """
+from nominatim.indexer.place_info import PlaceInfo
+from nominatim.config import Configuration
 
 def create(dsn, data_dir):
     """ Create a new instance of the tokenizer provided by this module.
@@ -21,7 +23,8 @@ class DummyTokenizer:
         self.init_state = "new"
 
 
-    def init_from_project(self):
+    def init_from_project(self, config):
+        assert isinstance(config, Configuration)
         assert self.init_state is None
         self.init_state = "loaded"
 
@@ -68,4 +71,5 @@ class DummyNameAnalyzer:
 
     @staticmethod
     def process_place(place):
+        assert isinstance(place, PlaceInfo)
         return {}
index 41978e59135513682cb04bc088d00338df78c499..00c29a43a9a8bdf32f9531199ac0b6335fb5a502 100644 (file)
@@ -100,6 +100,6 @@ def test_get_pg_env_overwrite_variable(monkeypatch):
 
 
 def test_get_pg_env_ignore_unknown():
-    env = get_pg_env('tty=stuff', base_env={})
+    env = get_pg_env('client_encoding=stuff', base_env={})
 
     assert env == {}
index 60ad0bc4cbd3c5705891c60c847809ba74d7b985..4c9d940d09b1c2a0a0cddbe78383c34c9251af53 100644 (file)
@@ -29,6 +29,7 @@ class IndexerTestDB:
                                                 indexed_date TIMESTAMP,
                                                 partition SMALLINT,
                                                 admin_level SMALLINT,
+                                                country_code TEXT,
                                                 address HSTORE,
                                                 token_info JSONB,
                                                 geometry_sector INTEGER)""")
@@ -54,15 +55,26 @@ class IndexerTestDB:
                              END IF;
                              RETURN NEW;
                            END; $$ LANGUAGE plpgsql;""")
-            cur.execute("""CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
-                                                      OUT name HSTORE,
-                                                      OUT address HSTORE,
-                                                      OUT country_feature VARCHAR,
-                                                      OUT linked_place_id BIGINT)
+            cur.execute("DROP TYPE IF EXISTS prepare_update_info CASCADE")
+            cur.execute("""CREATE TYPE prepare_update_info AS (
+                             name HSTORE,
+                             address HSTORE,
+                             rank_address SMALLINT,
+                             country_code TEXT,
+                             class TEXT,
+                             type TEXT,
+                             linked_place_id BIGINT
+                           )""")
+            cur.execute("""CREATE OR REPLACE FUNCTION placex_indexing_prepare(p placex,
+                                                     OUT result prepare_update_info)
                            AS $$
                            BEGIN
-                            address := p.address;
-                            name := p.name;
+                             result.address := p.address;
+                             result.name := p.name;
+                             result.class := p.class;
+                             result.type := p.type;
+                             result.country_code := p.country_code;
+                             result.rank_address := p.rank_address;
                            END;
                            $$ LANGUAGE plpgsql STABLE;
                         """)
index b7101c3f67ef62229e5205d226e4c50b4c6ccad8..6a2f2f8bd04b405f7741aca6fbaa27cbe2ce113c 100644 (file)
@@ -7,9 +7,10 @@ import yaml
 import pytest
 
 from nominatim.tokenizer import icu_tokenizer
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.db import properties
+from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.indexer.place_info import PlaceInfo
 
 from mock_icu_word_table import MockIcuWordTable
 
@@ -66,16 +67,29 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
     monkeypatch.undo()
 
     def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
-                     variants=('~gasse -> gasse', 'street => st', )):
-        cfgstr = {'normalization' : list(norm),
-                   'transliteration' : list(trans),
-                   'variants' : [ {'words': list(variants)}]}
-        tok.naming_rules = ICUNameProcessorRules(loader=ICURuleLoader(cfgstr))
+                     variants=('~gasse -> gasse', 'street => st', ),
+                     sanitizers=[]):
+        cfgstr = {'normalization': list(norm),
+                  'sanitizers': sanitizers,
+                  'transliteration': list(trans),
+                  'token-analysis': [{'analyzer': 'generic',
+                                      'variants': [{'words': list(variants)}]}]}
+        (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
+        tok.loader = ICURuleLoader(test_config)
 
         return tok.name_analyzer()
 
     return _mk_analyser
 
+@pytest.fixture
+def sql_functions(temp_db_conn, def_config, src_dir):
+    orig_sql = def_config.lib_dir.sql
+    def_config.lib_dir.sql = src_dir / 'lib-sql'
+    sqlproc = SQLPreprocessor(temp_db_conn, def_config)
+    sqlproc.run_sql_file(temp_db_conn, 'functions/utils.sql')
+    sqlproc.run_sql_file(temp_db_conn, 'tokenizer/icu_tokenizer.sql')
+    def_config.lib_dir.sql = orig_sql
+
 
 @pytest.fixture
 def getorcreate_full_word(temp_db_cursor):
@@ -144,7 +158,6 @@ def test_init_new(tokenizer_factory, test_config, monkeypatch, db_prop):
     tok.init_new_db(test_config)
 
     assert db_prop(icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
-    assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) is not None
 
 
 def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
@@ -156,40 +169,32 @@ def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
     tok.init_new_db(test_config)
 
     assert word_table.get_partial_words() == {('test', 1),
-                                              ('no', 1), ('area', 2),
-                                              ('holz', 1), ('strasse', 1),
-                                              ('str', 1)}
+                                              ('no', 1), ('area', 2)}
 
 
 def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
     monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
-    monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '90300')
     tok = tokenizer_factory()
     tok.init_new_db(test_config)
     monkeypatch.undo()
 
     tok = tokenizer_factory()
-    tok.init_from_project()
+    tok.init_from_project(test_config)
 
-    assert tok.naming_rules is not None
+    assert tok.loader is not None
     assert tok.term_normalization == ':: lower();'
-    assert tok.max_word_frequency == '90300'
 
 
 def test_update_sql_functions(db_prop, temp_db_cursor,
                               tokenizer_factory, test_config, table_factory,
                               monkeypatch):
-    monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '1133')
     tok = tokenizer_factory()
     tok.init_new_db(test_config)
-    monkeypatch.undo()
-
-    assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) == '1133'
 
     table_factory('test', 'txt TEXT')
 
     func_file = test_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer.sql'
-    func_file.write_text("""INSERT INTO test VALUES ('{{max_word_freq}}')""")
+    func_file.write_text("""INSERT INTO test VALUES (1133)""")
 
     tok.update_sql_functions(test_config)
 
@@ -304,45 +309,55 @@ def test_add_country_names_extend(analyzer, word_table):
 class TestPlaceNames:
 
     @pytest.fixture(autouse=True)
-    def setup(self, analyzer, getorcreate_full_word):
-        with analyzer() as anl:
+    def setup(self, analyzer, sql_functions):
+        sanitizers = [{'step': 'split-name-list'},
+                      {'step': 'strip-brace-terms'}]
+        with analyzer(sanitizers=sanitizers) as anl:
             self.analyzer = anl
             yield anl
 
 
     def expect_name_terms(self, info, *expected_terms):
         tokens = self.analyzer.get_word_token_info(expected_terms)
-        print (tokens)
         for token in tokens:
             assert token[2] is not None, "No token for {0}".format(token)
 
         assert eval(info['names']) == set((t[2] for t in tokens))
 
 
+    def process_named_place(self, names):
+        return self.analyzer.process_place(PlaceInfo({'name': names}))
+
+
     def test_simple_names(self):
-        info = self.analyzer.process_place({'name': {'name': 'Soft bAr', 'ref': '34'}})
+        info = self.process_named_place({'name': 'Soft bAr', 'ref': '34'})
 
         self.expect_name_terms(info, '#Soft bAr', '#34', 'Soft', 'bAr', '34')
 
 
     @pytest.mark.parametrize('sep', [',' , ';'])
     def test_names_with_separator(self, sep):
-        info = self.analyzer.process_place({'name': {'name': sep.join(('New York', 'Big Apple'))}})
+        info = self.process_named_place({'name': sep.join(('New York', 'Big Apple'))})
 
         self.expect_name_terms(info, '#New York', '#Big Apple',
                                      'new', 'york', 'big', 'apple')
 
 
     def test_full_names_with_bracket(self):
-        info = self.analyzer.process_place({'name': {'name': 'Houseboat (left)'}})
+        info = self.process_named_place({'name': 'Houseboat (left)'})
 
         self.expect_name_terms(info, '#Houseboat (left)', '#Houseboat',
                                      'houseboat', 'left')
 
 
     def test_country_name(self, word_table):
-        info = self.analyzer.process_place({'name': {'name': 'Norge'},
-                                           'country_feature': 'no'})
+        place = PlaceInfo({'name' : {'name': 'Norge'},
+                           'country_code': 'no',
+                           'rank_address': 4,
+                           'class': 'boundary',
+                           'type': 'administrative'})
+
+        info = self.analyzer.process_place(place)
 
         self.expect_name_terms(info, '#norge', 'norge')
         assert word_table.get_country() == {('no', 'NORGE')}
@@ -351,14 +366,14 @@ class TestPlaceNames:
 class TestPlaceAddress:
 
     @pytest.fixture(autouse=True)
-    def setup(self, analyzer, getorcreate_full_word):
+    def setup(self, analyzer, sql_functions):
         with analyzer(trans=(":: upper()", "'🜵' > ' '")) as anl:
             self.analyzer = anl
             yield anl
 
 
     def process_address(self, **kwargs):
-        return self.analyzer.process_place({'address': kwargs})
+        return self.analyzer.process_place(PlaceInfo({'address': kwargs}))
 
 
     def name_token_set(self, *expected_terms):
@@ -424,7 +439,7 @@ class TestPlaceAddress:
     def test_process_place_street(self):
         info = self.process_address(street='Grand Road')
 
-        assert eval(info['street']) == self.name_token_set('#GRAND ROAD')
+        assert eval(info['street']) == self.name_token_set('GRAND', 'ROAD')
 
 
     def test_process_place_street_empty(self):
@@ -436,16 +451,13 @@ class TestPlaceAddress:
     def test_process_place_place(self):
         info = self.process_address(place='Honu Lulu')
 
-        assert eval(info['place_search']) == self.name_token_set('#HONU LULU',
-                                                                 'HONU', 'LULU')
-        assert eval(info['place_match']) == self.name_token_set('#HONU LULU')
+        assert eval(info['place']) == self.name_token_set('HONU', 'LULU')
 
 
     def test_process_place_place_empty(self):
         info = self.process_address(place='🜵')
 
-        assert 'place_search' not in info
-        assert 'place_match' not in info
+        assert 'place' not in info
 
 
     def test_process_place_address_terms(self):
@@ -453,16 +465,12 @@ class TestPlaceAddress:
                                     suburb='Zwickau', street='Hauptstr',
                                     full='right behind the church')
 
-        city_full = self.name_token_set('#ZWICKAU')
-        city_all = self.name_token_set('#ZWICKAU', 'ZWICKAU')
-        state_full = self.name_token_set('#SACHSEN')
-        state_all = self.name_token_set('#SACHSEN', 'SACHSEN')
+        city = self.name_token_set('ZWICKAU')
+        state = self.name_token_set('SACHSEN')
 
-        result = {k: [eval(v[0]), eval(v[1])] for k,v in info['addr'].items()}
+        result = {k: eval(v) for k,v in info['addr'].items()}
 
-        assert result == {'city': [city_all, city_full],
-                          'suburb': [city_all, city_full],
-                          'state': [state_all, state_full]}
+        assert result == {'city': city, 'suburb': city, 'state': state}
 
 
     def test_process_place_address_terms_empty(self):
diff --git a/test/python/test_tokenizer_icu_name_processor.py b/test/python/test_tokenizer_icu_name_processor.py
deleted file mode 100644 (file)
index ae05988..0000000
+++ /dev/null
@@ -1,103 +0,0 @@
-"""
-Tests for import name normalisation and variant generation.
-"""
-from textwrap import dedent
-
-import pytest
-import yaml
-
-from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
-
-from nominatim.errors import UsageError
-
-@pytest.fixture
-def cfgfile():
-    def _create_config(*variants, **kwargs):
-        content = dedent("""\
-        normalization:
-            - ":: NFD ()"
-            - "'🜳' > ' '"
-            - "[[:Nonspacing Mark:] [:Cf:]] >"
-            - ":: lower ()"
-            - "[[:Punctuation:][:Space:]]+ > ' '"
-            - ":: NFC ()"
-        transliteration:
-            - "::  Latin ()"
-            - "'🜵' > ' '"
-        """)
-        content += "variants:\n  - words:\n"
-        content += '\n'.join(("      - " + s for s in variants)) + '\n'
-        for k, v in kwargs:
-            content += "    {}: {}\n".format(k, v)
-        return yaml.safe_load(content)
-
-    return _create_config
-
-
-def get_normalized_variants(proc, name):
-    return proc.get_variants_ascii(proc.get_normalized(name))
-
-
-def test_variants_empty(cfgfile):
-    fpath = cfgfile('saint -> 🜵', 'street -> st')
-
-    rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
-    proc = ICUNameProcessor(rules)
-
-    assert get_normalized_variants(proc, '🜵') == []
-    assert get_normalized_variants(proc, '🜳') == []
-    assert get_normalized_variants(proc, 'saint') == ['saint']
-
-
-VARIANT_TESTS = [
-(('~strasse,~straße -> str', '~weg => weg'), "hallo", {'hallo'}),
-(('weg => wg',), "holzweg", {'holzweg'}),
-(('weg -> wg',), "holzweg", {'holzweg'}),
-(('~weg => weg',), "holzweg", {'holz weg', 'holzweg'}),
-(('~weg -> weg',), "holzweg",  {'holz weg', 'holzweg'}),
-(('~weg => w',), "holzweg", {'holz w', 'holzw'}),
-(('~weg -> w',), "holzweg",  {'holz weg', 'holzweg', 'holz w', 'holzw'}),
-(('~weg => weg',), "Meier Weg", {'meier weg', 'meierweg'}),
-(('~weg -> weg',), "Meier Weg", {'meier weg', 'meierweg'}),
-(('~weg => w',), "Meier Weg", {'meier w', 'meierw'}),
-(('~weg -> w',), "Meier Weg", {'meier weg', 'meierweg', 'meier w', 'meierw'}),
-(('weg => wg',), "Meier Weg", {'meier wg'}),
-(('weg -> wg',), "Meier Weg", {'meier weg', 'meier wg'}),
-(('~strasse,~straße -> str', '~weg => weg'), "Bauwegstraße",
-     {'bauweg straße', 'bauweg str', 'bauwegstraße', 'bauwegstr'}),
-(('am => a', 'bach => b'), "am bach", {'a b'}),
-(('am => a', '~bach => b'), "am bach", {'a b'}),
-(('am -> a', '~bach -> b'), "am bach", {'am bach', 'a bach', 'am b', 'a b'}),
-(('am -> a', '~bach -> b'), "ambach", {'ambach', 'am bach', 'amb', 'am b'}),
-(('saint -> s,st', 'street -> st'), "Saint Johns Street",
-     {'saint johns street', 's johns street', 'st johns street',
-      'saint johns st', 's johns st', 'st johns st'}),
-(('river$ -> r',), "River Bend Road", {'river bend road'}),
-(('river$ -> r',), "Bent River", {'bent river', 'bent r'}),
-(('^north => n',), "North 2nd Street", {'n 2nd street'}),
-(('^north => n',), "Airport North", {'airport north'}),
-(('am -> a',), "am am am am am am am am", {'am am am am am am am am'}),
-(('am => a',), "am am am am am am am am", {'a a a a a a a a'})
-]
-
-@pytest.mark.parametrize("rules,name,variants", VARIANT_TESTS)
-def test_variants(cfgfile, rules, name, variants):
-    fpath = cfgfile(*rules)
-    proc = ICUNameProcessor(ICUNameProcessorRules(loader=ICURuleLoader(fpath)))
-
-    result = get_normalized_variants(proc, name)
-
-    assert len(result) == len(set(result))
-    assert set(get_normalized_variants(proc, name)) == variants
-
-
-def test_search_normalized(cfgfile):
-    fpath = cfgfile('~street => s,st', 'master => mstr')
-
-    rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
-    proc = ICUNameProcessor(rules)
-
-    assert proc.get_search_normalized('Master Street') == 'master street'
-    assert proc.get_search_normalized('Earnes St') == 'earnes st'
-    assert proc.get_search_normalized('Nostreet') == 'nostreet'
index c3480de87ac08a1b251666c0b61fb31f6405cfba..e22ccd4b054f1a73d3e9bc99b25ba3715356ca38 100644 (file)
@@ -12,7 +12,16 @@ from nominatim.errors import UsageError
 from icu import Transliterator
 
 @pytest.fixture
-def cfgrules():
+def test_config(def_config, tmp_path):
+    project_dir = tmp_path / 'project_dir'
+    project_dir.mkdir()
+    def_config.project_dir = project_dir
+
+    return def_config
+
+
+@pytest.fixture
+def cfgrules(test_config):
     def _create_config(*variants, **kwargs):
         content = dedent("""\
         normalization:
@@ -25,36 +34,40 @@ def cfgrules():
             - "::  Latin ()"
             - "[[:Punctuation:][:Space:]]+ > ' '"
         """)
-        content += "variants:\n  - words:\n"
-        content += '\n'.join(("      - " + s for s in variants)) + '\n'
+        content += "token-analysis:\n  - analyzer: generic\n    variants:\n     - words:\n"
+        content += '\n'.join(("         - " + s for s in variants)) + '\n'
         for k, v in kwargs:
             content += "    {}: {}\n".format(k, v)
-        return yaml.safe_load(content)
+        (test_config.project_dir / 'icu_tokenizer.yaml').write_text(content)
+
+        return test_config
 
     return _create_config
 
 
-def test_empty_rule_set():
-    rule_cfg = yaml.safe_load(dedent("""\
+def test_empty_rule_set(test_config):
+    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\
         normalization:
         transliteration:
-        variants:
+        token-analysis:
+          - analyzer: generic
+            variants:
         """))
 
-    rules = ICURuleLoader(rule_cfg)
+    rules = ICURuleLoader(test_config)
     assert rules.get_search_rules() == ''
     assert rules.get_normalization_rules() == ''
     assert rules.get_transliteration_rules() == ''
-    assert list(rules.get_replacement_pairs()) == []
 
-CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants')
+CONFIG_SECTIONS = ('normalization', 'transliteration', 'token-analysis')
 
 @pytest.mark.parametrize("section", CONFIG_SECTIONS)
-def test_missing_section(section):
-    rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section}
+def test_missing_section(section, test_config):
+    rule_cfg = { s: [] for s in CONFIG_SECTIONS if s != section}
+    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg))
 
     with pytest.raises(UsageError):
-        ICURuleLoader(rule_cfg)
+        ICURuleLoader(test_config)
 
 
 def test_get_search_rules(cfgrules):
@@ -88,26 +101,36 @@ def test_get_transliteration_rules(cfgrules):
     assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
 
 
-def test_transliteration_rules_from_file(def_config, tmp_path):
-    def_config.project_dir = tmp_path
-    cfgpath = tmp_path / ('test_config.yaml')
+def test_transliteration_rules_from_file(test_config):
+    cfgpath = test_config.project_dir / ('icu_tokenizer.yaml')
     cfgpath.write_text(dedent("""\
         normalization:
         transliteration:
             - "'ax' > 'b'"
             - !include transliteration.yaml
-        variants:
+        token-analysis:
+            - analyzer: generic
+              variants:
         """))
-    transpath = tmp_path / ('transliteration.yaml')
+    transpath = test_config.project_dir / ('transliteration.yaml')
     transpath.write_text('- "x > y"')
 
-    loader = ICURuleLoader(def_config.load_sub_configuration('test_config.yaml'))
+    loader = ICURuleLoader(test_config)
     rules = loader.get_transliteration_rules()
     trans = Transliterator.createFromRules("test", rules)
 
     assert trans.transliterate(" axxt ") == " byt "
 
 
+def test_search_rules(cfgrules):
+    config = cfgrules('~street => s,st', 'master => mstr')
+    proc = ICURuleLoader(config).make_token_analysis()
+
+    assert proc.search.transliterate('Master Street').strip() == 'master street'
+    assert proc.search.transliterate('Earnes St').strip() == 'earnes st'
+    assert proc.search.transliterate('Nostreet').strip() == 'nostreet'
+
+
 class TestGetReplacements:
 
     @pytest.fixture(autouse=True)
@@ -116,9 +139,9 @@ class TestGetReplacements:
 
     def get_replacements(self, *variants):
         loader = ICURuleLoader(self.cfgrules(*variants))
-        rules = loader.get_replacement_pairs()
+        rules = loader.analysis[None].config['replacements']
 
-        return set((v.source, v.replacement) for v in rules)
+        return sorted((k, sorted(v)) for k,v in rules)
 
 
     @pytest.mark.parametrize("variant", ['foo > bar', 'foo -> bar -> bar',
@@ -130,131 +153,122 @@ class TestGetReplacements:
     def test_add_full(self):
         repl = self.get_replacements("foo -> bar")
 
-        assert repl == {(' foo ', ' bar '), (' foo ', ' foo ')}
+        assert repl == [(' foo ', [' bar', ' foo'])]
 
 
     def test_replace_full(self):
         repl = self.get_replacements("foo => bar")
 
-        assert repl == {(' foo ', ' bar ')}
+        assert repl == [(' foo ', [' bar'])]
 
 
     def test_add_suffix_no_decompose(self):
         repl = self.get_replacements("~berg |-> bg")
 
-        assert repl == {('berg ', 'berg '), ('berg ', 'bg '),
-                        (' berg ', ' berg '), (' berg ', ' bg ')}
+        assert repl == [(' berg ', [' berg', ' bg']),
+                        ('berg ', ['berg', 'bg'])]
 
 
     def test_replace_suffix_no_decompose(self):
         repl = self.get_replacements("~berg |=> bg")
 
-        assert repl == {('berg ', 'bg '), (' berg ', ' bg ')}
+        assert repl == [(' berg ', [' bg']),('berg ', ['bg'])]
 
 
     def test_add_suffix_decompose(self):
         repl = self.get_replacements("~berg -> bg")
 
-        assert repl == {('berg ', 'berg '), ('berg ', ' berg '),
-                        (' berg ', ' berg '), (' berg ', 'berg '),
-                        ('berg ', 'bg '), ('berg ', ' bg '),
-                        (' berg ', 'bg '), (' berg ', ' bg ')}
+        assert repl == [(' berg ', [' berg', ' bg', 'berg', 'bg']),
+                        ('berg ', [' berg', ' bg', 'berg', 'bg'])]
 
 
     def test_replace_suffix_decompose(self):
         repl = self.get_replacements("~berg => bg")
 
-        assert repl == {('berg ', 'bg '), ('berg ', ' bg '),
-                        (' berg ', 'bg '), (' berg ', ' bg ')}
+        assert repl == [(' berg ', [' bg', 'bg']),
+                        ('berg ', [' bg', 'bg'])]
 
 
     def test_add_prefix_no_compose(self):
         repl = self.get_replacements("hinter~ |-> hnt")
 
-        assert repl == {(' hinter', ' hinter'), (' hinter ', ' hinter '),
-                        (' hinter', ' hnt'), (' hinter ', ' hnt ')}
+        assert repl == [(' hinter', [' hinter', ' hnt']),
+                        (' hinter ', [' hinter', ' hnt'])]
 
 
     def test_replace_prefix_no_compose(self):
         repl = self.get_replacements("hinter~ |=> hnt")
 
-        assert repl ==  {(' hinter', ' hnt'), (' hinter ', ' hnt ')}
+        assert repl ==  [(' hinter', [' hnt']), (' hinter ', [' hnt'])]
 
 
     def test_add_prefix_compose(self):
         repl = self.get_replacements("hinter~-> h")
 
-        assert repl == {(' hinter', ' hinter'), (' hinter', ' hinter '),
-                        (' hinter', ' h'), (' hinter', ' h '),
-                        (' hinter ', ' hinter '), (' hinter ', ' hinter'),
-                        (' hinter ', ' h '), (' hinter ', ' h')}
+        assert repl == [(' hinter', [' h', ' h ', ' hinter', ' hinter ']),
+                        (' hinter ', [' h', ' h', ' hinter', ' hinter'])]
 
 
     def test_replace_prefix_compose(self):
         repl = self.get_replacements("hinter~=> h")
 
-        assert repl == {(' hinter', ' h'), (' hinter', ' h '),
-                        (' hinter ', ' h '), (' hinter ', ' h')}
+        assert repl == [(' hinter', [' h', ' h ']),
+                        (' hinter ', [' h', ' h'])]
 
 
     def test_add_beginning_only(self):
         repl = self.get_replacements("^Premier -> Pr")
 
-        assert repl == {('^ premier ', '^ premier '), ('^ premier ', '^ pr ')}
+        assert repl == [('^ premier ', ['^ pr', '^ premier'])]
 
 
     def test_replace_beginning_only(self):
         repl = self.get_replacements("^Premier => Pr")
 
-        assert repl == {('^ premier ', '^ pr ')}
+        assert repl == [('^ premier ', ['^ pr'])]
 
 
     def test_add_final_only(self):
         repl = self.get_replacements("road$ -> rd")
 
-        assert repl == {(' road ^', ' road ^'), (' road ^', ' rd ^')}
+        assert repl == [(' road ^', [' rd ^', ' road ^'])]
 
 
     def test_replace_final_only(self):
         repl = self.get_replacements("road$ => rd")
 
-        assert repl == {(' road ^', ' rd ^')}
+        assert repl == [(' road ^', [' rd ^'])]
 
 
     def test_decompose_only(self):
         repl = self.get_replacements("~foo -> foo")
 
-        assert repl == {('foo ', 'foo '), ('foo ', ' foo '),
-                        (' foo ', 'foo '), (' foo ', ' foo ')}
+        assert repl == [(' foo ', [' foo', 'foo']),
+                        ('foo ', [' foo', 'foo'])]
 
 
     def test_add_suffix_decompose_end_only(self):
         repl = self.get_replacements("~berg |-> bg", "~berg$ -> bg")
 
-        assert repl == {('berg ', 'berg '), ('berg ', 'bg '),
-                        (' berg ', ' berg '), (' berg ', ' bg '),
-                        ('berg ^', 'berg ^'), ('berg ^', ' berg ^'),
-                        ('berg ^', 'bg ^'), ('berg ^', ' bg ^'),
-                        (' berg ^', 'berg ^'), (' berg ^', 'bg ^'),
-                        (' berg ^', ' berg ^'), (' berg ^', ' bg ^')}
+        assert repl == [(' berg ', [' berg', ' bg']),
+                        (' berg ^', [' berg ^', ' bg ^', 'berg ^', 'bg ^']),
+                        ('berg ', ['berg', 'bg']),
+                        ('berg ^', [' berg ^', ' bg ^', 'berg ^', 'bg ^'])]
 
 
     def test_replace_suffix_decompose_end_only(self):
         repl = self.get_replacements("~berg |=> bg", "~berg$ => bg")
 
-        assert repl == {('berg ', 'bg '), (' berg ', ' bg '),
-                        ('berg ^', 'bg ^'), ('berg ^', ' bg ^'),
-                        (' berg ^', 'bg ^'), (' berg ^', ' bg ^')}
+        assert repl == [(' berg ', [' bg']),
+                        (' berg ^', [' bg ^', 'bg ^']),
+                        ('berg ', ['bg']),
+                        ('berg ^', [' bg ^', 'bg ^'])]
 
 
     def test_add_multiple_suffix(self):
         repl = self.get_replacements("~berg,~burg -> bg")
 
-        assert repl == {('berg ', 'berg '), ('berg ', ' berg '),
-                        (' berg ', ' berg '), (' berg ', 'berg '),
-                        ('berg ', 'bg '), ('berg ', ' bg '),
-                        (' berg ', 'bg '), (' berg ', ' bg '),
-                        ('burg ', 'burg '), ('burg ', ' burg '),
-                        (' burg ', ' burg '), (' burg ', 'burg '),
-                        ('burg ', 'bg '), ('burg ', ' bg '),
-                        (' burg ', 'bg '), (' burg ', ' bg ')}
+        assert repl == [(' berg ', [' berg', ' bg', 'berg', 'bg']),
+                        (' burg ', [' bg', ' burg', 'bg', 'burg']),
+                        ('berg ', [' berg', ' bg', 'berg', 'bg']),
+                        ('burg ', [' bg', ' burg', 'bg', 'burg'])]
index 4dd3a1414d4ac0872678a118fae51ae07b6920a0..53d45c1c93a0f44f375ab11b675902497789b73e 100644 (file)
@@ -5,6 +5,7 @@ import shutil
 
 import pytest
 
+from nominatim.indexer.place_info import PlaceInfo
 from nominatim.tokenizer import legacy_tokenizer
 from nominatim.db import properties
 from nominatim.errors import UsageError
@@ -131,10 +132,10 @@ def test_init_module_custom(tokenizer_factory, test_config,
     assert not (test_config.project_dir / 'module').exists()
 
 
-def test_init_from_project(tokenizer_setup, tokenizer_factory):
+def test_init_from_project(tokenizer_setup, tokenizer_factory, test_config):
     tok = tokenizer_factory()
 
-    tok.init_from_project()
+    tok.init_from_project(test_config)
 
     assert tok.normalization is not None
 
@@ -284,21 +285,21 @@ def test_add_more_country_names(analyzer, word_table, make_standard_name):
 
 
 def test_process_place_names(analyzer, make_keywords):
-    info = analyzer.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})
+    info = analyzer.process_place(PlaceInfo({'name' : {'name' : 'Soft bAr', 'ref': '34'}}))
 
     assert info['names'] == '{1,2,3}'
 
 
 @pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])
 def test_process_place_postcode(analyzer, create_postcode_id, word_table, pcode):
-    analyzer.process_place({'address': {'postcode' : pcode}})
+    analyzer.process_place(PlaceInfo({'address': {'postcode' : pcode}}))
 
     assert word_table.get_postcodes() == {pcode, }
 
 
 @pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836'])
 def test_process_place_bad_postcode(analyzer, create_postcode_id, word_table, pcode):
-    analyzer.process_place({'address': {'postcode' : pcode}})
+    analyzer.process_place(PlaceInfo({'address': {'postcode' : pcode}}))
 
     assert not word_table.get_postcodes()
 
@@ -319,7 +320,7 @@ class TestHousenumberName:
     @staticmethod
     @pytest.mark.parametrize('hnr', ['123a', '1', '101'])
     def test_process_place_housenumbers_simple(analyzer, hnr):
-        info = analyzer.process_place({'address': {'housenumber' : hnr}})
+        info = analyzer.process_place(PlaceInfo({'address': {'housenumber' : hnr}}))
 
         assert info['hnr'] == hnr
         assert info['hnr_tokens'].startswith("{")
@@ -327,15 +328,15 @@ class TestHousenumberName:
 
     @staticmethod
     def test_process_place_housenumbers_lists(analyzer):
-        info = analyzer.process_place({'address': {'conscriptionnumber' : '1; 2;3'}})
+        info = analyzer.process_place(PlaceInfo({'address': {'conscriptionnumber' : '1; 2;3'}}))
 
         assert set(info['hnr'].split(';')) == set(('1', '2', '3'))
 
 
     @staticmethod
     def test_process_place_housenumbers_duplicates(analyzer):
-        info = analyzer.process_place({'address': {'housenumber' : '134',
+        info = analyzer.process_place(PlaceInfo({'address': {'housenumber' : '134',
                                                    'conscriptionnumber' : '134',
-                                                   'streetnumber' : '99a'}})
+                                                   'streetnumber' : '99a'}}))
 
         assert set(info['hnr'].split(';')) == set(('134', '99a'))
index aed5cb7e98c75fe9a9f0b029cb04db9cceed8efe..edba32364c32d33b230dc969b118a1abd6412bd3 100644 (file)
@@ -53,7 +53,7 @@ def test_check_tokenizer(temp_db_conn, def_config, monkeypatch,
                          check_result, state):
     class _TestTokenizer:
         @staticmethod
-        def check_database():
+        def check_database(_):
             return check_result
 
     monkeypatch.setattr(chkdb.tokenizer_factory, 'get_tokenizer_for_db',
diff --git a/test/python/tokenizer/sanitizers/test_split_name_list.py b/test/python/tokenizer/sanitizers/test_split_name_list.py
new file mode 100644 (file)
index 0000000..ee74546
--- /dev/null
@@ -0,0 +1,65 @@
+"""
+Tests for the sanitizer that splits multivalue lists.
+"""
+import pytest
+
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.indexer.place_info import PlaceInfo
+
+from nominatim.errors import UsageError
+
+def run_sanitizer_on(**kwargs):
+    place = PlaceInfo({'name': kwargs})
+    name, _ = PlaceSanitizer([{'step': 'split-name-list'}]).process_names(place)
+
+    return sorted([(p.name, p.kind, p.suffix) for p in name])
+
+
+def sanitize_with_delimiter(delimiter, name):
+    place = PlaceInfo({'name': {'name': name}})
+    san = PlaceSanitizer([{'step': 'split-name-list', 'delimiters': delimiter}])
+    name, _ = san.process_names(place)
+
+    return sorted([p.name for p in name])
+
+
+def test_simple():
+    assert run_sanitizer_on(name='ABC') == [('ABC', 'name', None)]
+    assert run_sanitizer_on(name='') == [('', 'name', None)]
+
+
+def test_splits():
+    assert run_sanitizer_on(name='A;B;C') == [('A', 'name', None),
+                                              ('B', 'name', None),
+                                              ('C', 'name', None)]
+    assert run_sanitizer_on(short_name=' House, boat ') == [('House', 'short_name', None),
+                                                            ('boat', 'short_name', None)]
+
+
+def test_empty_fields():
+    assert run_sanitizer_on(name='A;;B') == [('A', 'name', None),
+                                             ('B', 'name', None)]
+    assert run_sanitizer_on(name='A; ,B') == [('A', 'name', None),
+                                              ('B', 'name', None)]
+    assert run_sanitizer_on(name=' ;B') == [('B', 'name', None)]
+    assert run_sanitizer_on(name='B,') == [('B', 'name', None)]
+
+
+def test_custom_delimiters():
+    assert sanitize_with_delimiter(':', '12:45,3') == ['12', '45,3']
+    assert sanitize_with_delimiter('\\', 'a;\\b!#@ \\') == ['a;', 'b!#@']
+    assert sanitize_with_delimiter('[]', 'foo[to]be') == ['be', 'foo', 'to']
+    assert sanitize_with_delimiter(' ', 'morning  sun') == ['morning', 'sun']
+
+
+def test_empty_delimiter_set():
+    with pytest.raises(UsageError):
+        sanitize_with_delimiter('', 'abc')
+
+
+def test_no_name_list():
+    place = PlaceInfo({'address': {'housenumber': '3'}})
+    name, address = PlaceSanitizer([{'step': 'split-name-list'}]).process_names(place)
+
+    assert not name
+    assert len(address) == 1
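
The expectations above pin down the behaviour of the new split-name-list sanitizer: names are split on a configurable delimiter set (default `,` and `;`), fragments are trimmed and empty ones dropped, and a value without any delimiter is left untouched. A hypothetical stand-alone version of that splitting rule, not the sanitizer's actual code:

    import re

    def split_name_list(name, delimiters=',;'):
        """ Hypothetical stand-alone version of the splitting rule pinned down
            by the tests above: split on any delimiter with surrounding
            whitespace, trim the fragments, drop empty ones, but leave a name
            without any delimiter untouched.
        """
        if not delimiters:
            raise ValueError("empty delimiter set")
        regexp = re.compile(r'\s*[{}]\s*'.format(re.escape(delimiters)))
        parts = regexp.split(name)
        if len(parts) == 1:          # nothing to split, keep as-is (covers name='')
            return [name]
        return [p.strip() for p in parts if p.strip()]

    assert split_name_list('A;B;C') == ['A', 'B', 'C']
    assert split_name_list(' House, boat ') == ['House', 'boat']
    assert split_name_list('') == ['']
    assert sorted(split_name_list('foo[to]be', '[]')) == ['be', 'foo', 'to']
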
diff --git a/test/python/tokenizer/sanitizers/test_strip_brace_terms.py b/test/python/tokenizer/sanitizers/test_strip_brace_terms.py
new file mode 100644 (file)
index 0000000..50af244
--- /dev/null
@@ -0,0 +1,44 @@
+"""
+Tests for the sanitizer that handles braced suffixes.
+"""
+import pytest
+
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.indexer.place_info import PlaceInfo
+
+def run_sanitizer_on(**kwargs):
+    place = PlaceInfo({'name': kwargs})
+    name, _ = PlaceSanitizer([{'step': 'strip-brace-terms'}]).process_names(place)
+
+    return sorted([(p.name, p.kind, p.suffix) for p in name])
+
+
+def test_no_braces():
+    assert run_sanitizer_on(name='foo', ref='23') == [('23', 'ref', None),
+                                                      ('foo', 'name', None)]
+
+
+def test_simple_braces():
+    assert run_sanitizer_on(name='Halle (Saale)', ref='3')\
+      == [('3', 'ref', None), ('Halle', 'name', None), ('Halle (Saale)', 'name', None)]
+    assert run_sanitizer_on(name='ack ( bar')\
+      == [('ack', 'name', None), ('ack ( bar', 'name', None)]
+
+
+def test_only_braces():
+    assert run_sanitizer_on(name='(maybe)') == [('(maybe)', 'name', None)]
+
+
+def test_double_braces():
+    assert run_sanitizer_on(name='a((b))') == [('a', 'name', None),
+                                               ('a((b))', 'name', None)]
+    assert run_sanitizer_on(name='a (b) (c)') == [('a', 'name', None),
+                                                  ('a (b) (c)', 'name', None)]
+
+
+def test_no_names():
+    place = PlaceInfo({'address': {'housenumber': '3'}})
+    name, address = PlaceSanitizer([{'step': 'strip-brace-terms'}]).process_names(place)
+
+    assert not name
+    assert len(address) == 1
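
These tests fix the behaviour of strip-brace-terms: the original name is always kept and, when there is text before the first opening brace, that text is added as an extra variant. A hypothetical single-name version of the rule, for illustration only:

    def strip_brace_terms(name):
        """ Hypothetical one-name version of the behaviour shown above: keep
            the original value and, if there is text before the first opening
            brace, add that text as an extra variant.
        """
        variants = [name]
        head = name.split('(', 1)[0].strip()
        if head and head != name:
            variants.append(head)
        return sorted(variants)

    assert strip_brace_terms('Halle (Saale)') == ['Halle', 'Halle (Saale)']
    assert strip_brace_terms('ack ( bar') == ['ack', 'ack ( bar']
    assert strip_brace_terms('(maybe)') == ['(maybe)']
    assert strip_brace_terms('a((b))') == ['a', 'a((b))']
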
diff --git a/test/python/tokenizer/sanitizers/test_tag_analyzer_by_language.py b/test/python/tokenizer/sanitizers/test_tag_analyzer_by_language.py
new file mode 100644 (file)
index 0000000..e4a836f
--- /dev/null
@@ -0,0 +1,259 @@
+"""
+Tests for the sanitizer that enables language-dependent analyzers.
+"""
+import pytest
+
+from nominatim.indexer.place_info import PlaceInfo
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.tools.country_info import setup_country_config
+
+class TestWithDefaults:
+
+    @staticmethod
+    def run_sanitizer_on(country, **kwargs):
+        place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
+                           'country_code': country})
+        name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language'}]).process_names(place)
+
+        return sorted([(p.name, p.kind, p.suffix, p.attr) for p in name])
+
+
+    def test_no_names(self):
+        assert self.run_sanitizer_on('de') == []
+
+
+    def test_simple(self):
+        res = self.run_sanitizer_on('fr', name='Foo',name_de='Zoo', ref_abc='M')
+
+        assert res == [('Foo', 'name', None, {}),
+                       ('M', 'ref', 'abc', {'analyzer': 'abc'}),
+                       ('Zoo', 'name', 'de', {'analyzer': 'de'})]
+
+
+    @pytest.mark.parametrize('suffix', ['DE', 'asbc'])
+    def test_illegal_suffix(self, suffix):
+        assert self.run_sanitizer_on('fr', **{'name_' + suffix: 'Foo'}) \
+                 == [('Foo', 'name', suffix, {})]
+
+
+class TestFilterKind:
+
+    @staticmethod
+    def run_sanitizer_on(filt, **kwargs):
+        place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
+                           'country_code': 'de'})
+        name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
+                                   'filter-kind': filt}]).process_names(place)
+
+        return sorted([(p.name, p.kind, p.suffix, p.attr) for p in name])
+
+
+    def test_single_exact_name(self):
+        res = self.run_sanitizer_on(['name'], name_fr='A', ref_fr='12',
+                                              shortname_fr='C', name='D')
+
+        assert res == [('12', 'ref',  'fr', {}),
+                       ('A',  'name', 'fr', {'analyzer': 'fr'}),
+                       ('C',  'shortname', 'fr', {}),
+                       ('D',  'name', None, {})]
+
+
+    def test_single_pattern(self):
+        res = self.run_sanitizer_on(['.*name'],
+                                    name_fr='A', ref_fr='12', namexx_fr='B',
+                                    shortname_fr='C', name='D')
+
+        assert res == [('12', 'ref',  'fr', {}),
+                       ('A',  'name', 'fr', {'analyzer': 'fr'}),
+                       ('B',  'namexx', 'fr', {}),
+                       ('C',  'shortname', 'fr', {'analyzer': 'fr'}),
+                       ('D',  'name', None, {})]
+
+
+    def test_multiple_patterns(self):
+        res = self.run_sanitizer_on(['.*name', 'ref'],
+                                    name_fr='A', ref_fr='12', oldref_fr='X',
+                                    namexx_fr='B', shortname_fr='C', name='D')
+
+        assert res == [('12', 'ref',  'fr', {'analyzer': 'fr'}),
+                       ('A',  'name', 'fr', {'analyzer': 'fr'}),
+                       ('B',  'namexx', 'fr', {}),
+                       ('C',  'shortname', 'fr', {'analyzer': 'fr'}),
+                       ('D',  'name', None, {}),
+                       ('X',  'oldref', 'fr', {})]
+
+
+class TestDefaultCountry:
+
+    @pytest.fixture(autouse=True)
+    def setup_country(self, def_config):
+        setup_country_config(def_config)
+
+    @staticmethod
+    def run_sanitizer_append(mode, country, **kwargs):
+        place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
+                           'country_code': country})
+        name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
+                                   'use-defaults': mode,
+                                   'mode': 'append'}]).process_names(place)
+
+        assert all(isinstance(p.attr, dict) for p in name)
+        assert all(len(p.attr) <= 1 for p in name)
+        assert all(not p.attr or ('analyzer' in p.attr and p.attr['analyzer'])
+                   for p in name)
+
+        return sorted([(p.name, p.attr.get('analyzer', '')) for p in name])
+
+
+    @staticmethod
+    def run_sanitizer_replace(mode, country, **kwargs):
+        place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
+                           'country_code': country})
+        name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
+                                   'use-defaults': mode,
+                                   'mode': 'replace'}]).process_names(place)
+
+        assert all(isinstance(p.attr, dict) for p in name)
+        assert all(len(p.attr) <= 1 for p in name)
+        assert all(not p.attr or ('analyzer' in p.attr and p.attr['analyzer'])
+                   for p in name)
+
+        return sorted([(p.name, p.attr.get('analyzer', '')) for p in name])
+
+
+    def test_missing_country(self):
+        place = PlaceInfo({'name': {'name': 'something'}})
+        name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
+                                   'use-defaults': 'all',
+                                   'mode': 'replace'}]).process_names(place)
+
+        assert len(name) == 1
+        assert name[0].name == 'something'
+        assert name[0].suffix is None
+        assert 'analyzer' not in name[0].attr
+
+
+    def test_mono_unknown_country(self):
+        expect = [('XX', '')]
+
+        assert self.run_sanitizer_replace('mono', 'xx', name='XX') == expect
+        assert self.run_sanitizer_append('mono', 'xx', name='XX') == expect
+
+
+    def test_mono_monoling_replace(self):
+        res = self.run_sanitizer_replace('mono', 'de', name='Foo')
+
+        assert res == [('Foo', 'de')]
+
+
+    def test_mono_monoling_append(self):
+        res = self.run_sanitizer_append('mono', 'de', name='Foo')
+
+        assert res == [('Foo', ''), ('Foo', 'de')]
+
+
+    def test_mono_multiling(self):
+        expect = [('XX', '')]
+
+        assert self.run_sanitizer_replace('mono', 'ch', name='XX') == expect
+        assert self.run_sanitizer_append('mono', 'ch', name='XX') == expect
+
+
+    def test_all_unknown_country(self):
+        expect = [('XX', '')]
+
+        assert self.run_sanitizer_replace('all', 'xx', name='XX') == expect
+        assert self.run_sanitizer_append('all', 'xx', name='XX') == expect
+
+
+    def test_all_monoling_replace(self):
+        res = self.run_sanitizer_replace('all', 'de', name='Foo')
+
+        assert res == [('Foo', 'de')]
+
+
+    def test_all_monoling_append(self):
+        res = self.run_sanitizer_append('all', 'de', name='Foo')
+
+        assert res == [('Foo', ''), ('Foo', 'de')]
+
+
+    def test_all_multiling_append(self):
+        res = self.run_sanitizer_append('all', 'ch', name='XX')
+
+        assert res == [('XX', ''),
+                       ('XX', 'de'), ('XX', 'fr'), ('XX', 'it'), ('XX', 'rm')]
+
+
+    def test_all_multiling_replace(self):
+        res = self.run_sanitizer_replace('all', 'ch', name='XX')
+
+        assert res == [('XX', 'de'), ('XX', 'fr'), ('XX', 'it'), ('XX', 'rm')]
+
+
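+# When country defaults and a whitelist are combined, only default languages
+# that are also whitelisted appear to be applied ('ca' yields just 'fr', 'ch'
+# just 'de'/'fr'); a default language outside the whitelist ('pt') gets none.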
+class TestCountryWithWhitelist:
+
+    @staticmethod
+    def run_sanitizer_on(mode, country, **kwargs):
+        place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
+                           'country_code': country})
+        name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
+                                   'use-defaults': mode,
+                                   'mode': 'replace',
+                                   'whitelist': ['de', 'fr', 'ru']}]).process_names(place)
+
+        assert all(isinstance(p.attr, dict) for p in name)
+        assert all(len(p.attr) <= 1 for p in name)
+        assert all(not p.attr or ('analyzer' in p.attr and p.attr['analyzer'])
+                   for p in name)
+
+        return sorted([(p.name, p.attr.get('analyzer', '')) for p in name])
+
+
+    def test_mono_monoling(self):
+        assert self.run_sanitizer_on('mono', 'de', name='Foo') == [('Foo', 'de')]
+        assert self.run_sanitizer_on('mono', 'pt', name='Foo') == [('Foo', '')]
+
+
+    def test_mono_multiling(self):
+        assert self.run_sanitizer_on('mono', 'ca', name='Foo') == [('Foo', '')]
+
+
+    def test_all_monoling(self):
+        assert self.run_sanitizer_on('all', 'de', name='Foo') == [('Foo', 'de')]
+        assert self.run_sanitizer_on('all', 'pt', name='Foo') == [('Foo', '')]
+
+
+    def test_all_multiling(self):
+        assert self.run_sanitizer_on('all', 'ca', name='Foo') == [('Foo', 'fr')]
+        assert self.run_sanitizer_on('all', 'ch', name='Foo') \
+            == [('Foo', 'de'), ('Foo', 'fr')]
+
+
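+# Without country defaults, the whitelist alone decides which suffixes count as
+# analyzers; suffixes not on the list, and every suffix when the list is empty,
+# are left untagged.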
+class TestWhiteList:
+
+    @staticmethod
+    def run_sanitizer_on(whitelist, **kwargs):
+        place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()}})
+        name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
+                                   'mode': 'replace',
+                                   'whitelist': whitelist}]).process_names(place)
+
+        assert all(isinstance(p.attr, dict) for p in name)
+        assert all(len(p.attr) <= 1 for p in name)
+        assert all(not p.attr or ('analyzer' in p.attr and p.attr['analyzer'])
+                   for p in name)
+
+        return sorted([(p.name, p.attr.get('analyzer', '')) for p in name])
+
+
+    def test_in_whitelist(self):
+        assert self.run_sanitizer_on(['de', 'xx'], ref_xx='123') == [('123', 'xx')]
+
+
+    def test_not_in_whitelist(self):
+        assert self.run_sanitizer_on(['de', 'xx'], ref_yy='123') == [('123', '')]
+
+
+    def test_empty_whitelist(self):
+        assert self.run_sanitizer_on([], ref_yy='123') == [('123', '')]
diff --git a/test/python/tokenizer/test_place_sanitizer.py b/test/python/tokenizer/test_place_sanitizer.py
new file mode 100644 (file)
index 0000000..389b068
--- /dev/null
@@ -0,0 +1,71 @@
+"""
+Tests for execution of the sanitization step.
+"""
+import pytest
+
+from nominatim.errors import UsageError
+import nominatim.tokenizer.place_sanitizer as sanitizer
+from nominatim.indexer.place_info import PlaceInfo
+
+
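+# PlaceName is the carrier object the sanitizers work on: a name string plus
+# its kind, suffix and a free-form attribute dict.  As the clone() tests show,
+# cloning merges new attributes into a copy without touching the original.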
+def test_placeinfo_clone_new_name():
+    place = sanitizer.PlaceName('foo', 'ki', 'su')
+
+    newplace = place.clone(name='bar')
+
+    assert place.name == 'foo'
+    assert newplace.name == 'bar'
+    assert newplace.kind == 'ki'
+    assert newplace.suffix == 'su'
+
+
+def test_placeinfo_clone_merge_attr():
+    place = sanitizer.PlaceName('foo', 'ki', 'su')
+    place.set_attr('a1', 'v1')
+    place.set_attr('a2', 'v2')
+
+    newplace = place.clone(attr={'a2': 'new', 'b2': 'foo'})
+
+    assert place.get_attr('a2') == 'v2'
+    assert place.get_attr('b2') is None
+    assert newplace.get_attr('a1') == 'v1'
+    assert newplace.get_attr('a2') == 'new'
+    assert newplace.get_attr('b2') == 'foo'
+
+
+def test_placeinfo_has_attr():
+    place = sanitizer.PlaceName('foo', 'ki', 'su')
+    place.set_attr('a1', 'v1')
+
+    assert place.has_attr('a1')
+    assert not place.has_attr('whatever')
+
+
+def test_sanitizer_default():
+    san = sanitizer.PlaceSanitizer([{'step': 'split-name-list'}])
+
+    name, address =  san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'},
+                                                  'address': {'street': 'Bald'}}))
+
+    assert len(name) == 3
+    assert all(isinstance(n, sanitizer.PlaceName) for n in name)
+    assert all(n.kind == 'name'  for n in name)
+    assert all(n.suffix == 'de:de'  for n in name)
+
+    assert len(address) == 1
+    assert all(isinstance(n, sanitizer.PlaceName) for n in address)
+
+
+@pytest.mark.parametrize('rules', [None, []])
+def test_sanitizer_empty_list(rules):
+    san = sanitizer.PlaceSanitizer(rules)
+
+    name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'}}))
+
+    assert len(name) == 1
+    assert all(isinstance(n, sanitizer.PlaceName) for n in name)
+
+
+def test_sanitizer_missing_step_definition():
+    with pytest.raises(UsageError):
+        sanitizer.PlaceSanitizer([{'id': 'split-name-list'}])
diff --git a/test/python/tokenizer/token_analysis/test_generic.py b/test/python/tokenizer/token_analysis/test_generic.py
new file mode 100644 (file)
index 0000000..02a95f2
--- /dev/null
@@ -0,0 +1,265 @@
+"""
+Tests for import name normalisation and variant generation.
+"""
+import pytest
+
+from icu import Transliterator
+
+import nominatim.tokenizer.token_analysis.generic as module
+from nominatim.errors import UsageError
+
+DEFAULT_NORMALIZATION = """ :: NFD ();
+                            '🜳' > ' ';
+                            [[:Nonspacing Mark:] [:Cf:]] >;
+                            :: lower ();
+                            [[:Punctuation:][:Space:]]+ > ' ';
+                            :: NFC ();
+                        """
+
+DEFAULT_TRANSLITERATION = """ ::  Latin ();
+                              '🜵' > ' ';
+                          """
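+
+# '🜳' and '🜵' serve as throw-away markers here: the first is removed by the
+# normalization rules, the second by the transliteration rules, which lets
+# test_variants_empty check that names reduced to nothing yield no variants.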
+
+def make_analyser(*variants, variant_only=False):
+    rules = { 'analyzer': 'generic', 'variants': [{'words': variants}]}
+    if variant_only:
+        rules['mode'] = 'variant-only'
+    config = module.configure(rules, DEFAULT_NORMALIZATION)
+    trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
+
+    return module.create(trans, config)
+
+
+def get_normalized_variants(proc, name):
+    norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
+    return proc.get_variants_ascii(norm.transliterate(name).strip())
+
+
+def test_no_variants():
+    rules = { 'analyzer': 'generic' }
+    config = module.configure(rules, DEFAULT_NORMALIZATION)
+    trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
+
+    proc = module.create(trans, config)
+
+    assert get_normalized_variants(proc, '大德!') == ['dà dé']
+
+
+def test_variants_empty():
+    proc = make_analyser('saint -> 🜵', 'street -> st')
+
+    assert get_normalized_variants(proc, '🜵') == []
+    assert get_normalized_variants(proc, '🜳') == []
+    assert get_normalized_variants(proc, 'saint') == ['saint']
+
+
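+# Variant rule syntax as exercised below: '=>' replaces the matched term while
+# '->' keeps the original alongside the variant; a leading '~' lets the term
+# also combine with the neighbouring word ('~weg' matches the 'weg' inside
+# 'holzweg'); '^' and '$' anchor a rule to the start or end of the name.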
+VARIANT_TESTS = [
+(('~strasse,~straße -> str', '~weg => weg'), "hallo", {'hallo'}),
+(('weg => wg',), "holzweg", {'holzweg'}),
+(('weg -> wg',), "holzweg", {'holzweg'}),
+(('~weg => weg',), "holzweg", {'holz weg', 'holzweg'}),
+(('~weg -> weg',), "holzweg",  {'holz weg', 'holzweg'}),
+(('~weg => w',), "holzweg", {'holz w', 'holzw'}),
+(('~weg -> w',), "holzweg",  {'holz weg', 'holzweg', 'holz w', 'holzw'}),
+(('~weg => weg',), "Meier Weg", {'meier weg', 'meierweg'}),
+(('~weg -> weg',), "Meier Weg", {'meier weg', 'meierweg'}),
+(('~weg => w',), "Meier Weg", {'meier w', 'meierw'}),
+(('~weg -> w',), "Meier Weg", {'meier weg', 'meierweg', 'meier w', 'meierw'}),
+(('weg => wg',), "Meier Weg", {'meier wg'}),
+(('weg -> wg',), "Meier Weg", {'meier weg', 'meier wg'}),
+(('~strasse,~straße -> str', '~weg => weg'), "Bauwegstraße",
+     {'bauweg straße', 'bauweg str', 'bauwegstraße', 'bauwegstr'}),
+(('am => a', 'bach => b'), "am bach", {'a b'}),
+(('am => a', '~bach => b'), "am bach", {'a b'}),
+(('am -> a', '~bach -> b'), "am bach", {'am bach', 'a bach', 'am b', 'a b'}),
+(('am -> a', '~bach -> b'), "ambach", {'ambach', 'am bach', 'amb', 'am b'}),
+(('saint -> s,st', 'street -> st'), "Saint Johns Street",
+     {'saint johns street', 's johns street', 'st johns street',
+      'saint johns st', 's johns st', 'st johns st'}),
+(('river$ -> r',), "River Bend Road", {'river bend road'}),
+(('river$ -> r',), "Bent River", {'bent river', 'bent r'}),
+(('^north => n',), "North 2nd Street", {'n 2nd street'}),
+(('^north => n',), "Airport North", {'airport north'}),
+(('am -> a',), "am am am am am am am am", {'am am am am am am am am'}),
+(('am => a',), "am am am am am am am am", {'a a a a a a a a'})
+]
+
+@pytest.mark.parametrize("rules,name,variants", VARIANT_TESTS)
+def test_variants(rules, name, variants):
+    proc = make_analyser(*rules)
+
+    result = get_normalized_variants(proc, name)
+
+    assert len(result) == len(set(result))
+    assert set(get_normalized_variants(proc, name)) == variants
+
+
+VARIANT_ONLY_TESTS = [
+(('weg => wg',), "hallo", set()),
+(('weg => wg',), "Meier Weg", {'meier wg'}),
+(('weg -> wg',), "Meier Weg", {'meier wg'}),
+]
+
+@pytest.mark.parametrize("rules,name,variants", VARIANT_ONLY_TESTS)
+def test_variants_only(rules, name, variants):
+    proc = make_analyser(*rules, variant_only=True)
+
+    result = get_normalized_variants(proc, name)
+
+    assert len(result) == len(set(result))
+    assert set(get_normalized_variants(proc, name)) == variants
+
+
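+# configure() compiles the rules into 'replacements': pairs of a normalized
+# match string and its substitutions.  Judging from the expected pairs,
+# surrounding blanks mark word boundaries, a missing blank marks a form glued
+# to the neighbouring word, '^' is the start/end-of-name marker, and '|' stops
+# separate-word and glued matches from producing each other's variants.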
+class TestGetReplacements:
+
+    @staticmethod
+    def configure_rules(*variants):
+        rules = { 'analyzer': 'generic', 'variants': [{'words': variants}]}
+        return module.configure(rules, DEFAULT_NORMALIZATION)
+
+
+    def get_replacements(self, *variants):
+        config = self.configure_rules(*variants)
+
+        return sorted((k, sorted(v)) for k, v in config['replacements'])
+
+
+    @pytest.mark.parametrize("variant", ['foo > bar', 'foo -> bar -> bar',
+                                         '~foo~ -> bar', 'fo~ o -> bar'])
+    def test_invalid_variant_description(self, variant):
+        with pytest.raises(UsageError):
+            self.configure_rules(variant)
+
+
+    @pytest.mark.parametrize("rule", ["!!! -> bar", "bar => !!!"])
+    def test_ignore_unnormalizable_terms(self, rule):
+        repl = self.get_replacements(rule)
+
+        assert repl == []
+
+
+    def test_add_full(self):
+        repl = self.get_replacements("foo -> bar")
+
+        assert repl == [(' foo ', [' bar', ' foo'])]
+
+
+    def test_replace_full(self):
+        repl = self.get_replacements("foo => bar")
+
+        assert repl == [(' foo ', [' bar'])]
+
+
+    def test_add_suffix_no_decompose(self):
+        repl = self.get_replacements("~berg |-> bg")
+
+        assert repl == [(' berg ', [' berg', ' bg']),
+                        ('berg ', ['berg', 'bg'])]
+
+
+    def test_replace_suffix_no_decompose(self):
+        repl = self.get_replacements("~berg |=> bg")
+
+        assert repl == [(' berg ', [' bg']), ('berg ', ['bg'])]
+
+
+    def test_add_suffix_decompose(self):
+        repl = self.get_replacements("~berg -> bg")
+
+        assert repl == [(' berg ', [' berg', ' bg', 'berg', 'bg']),
+                        ('berg ', [' berg', ' bg', 'berg', 'bg'])]
+
+
+    def test_replace_suffix_decompose(self):
+        repl = self.get_replacements("~berg => bg")
+
+        assert repl == [(' berg ', [' bg', 'bg']),
+                        ('berg ', [' bg', 'bg'])]
+
+
+    def test_add_prefix_no_compose(self):
+        repl = self.get_replacements("hinter~ |-> hnt")
+
+        assert repl == [(' hinter', [' hinter', ' hnt']),
+                        (' hinter ', [' hinter', ' hnt'])]
+
+
+    def test_replace_prefix_no_compose(self):
+        repl = self.get_replacements("hinter~ |=> hnt")
+
+        assert repl ==  [(' hinter', [' hnt']), (' hinter ', [' hnt'])]
+
+
+    def test_add_prefix_compose(self):
+        repl = self.get_replacements("hinter~-> h")
+
+        assert repl == [(' hinter', [' h', ' h ', ' hinter', ' hinter ']),
+                        (' hinter ', [' h', ' h', ' hinter', ' hinter'])]
+
+
+    def test_replace_prefix_compose(self):
+        repl = self.get_replacements("hinter~=> h")
+
+        assert repl == [(' hinter', [' h', ' h ']),
+                        (' hinter ', [' h', ' h'])]
+
+
+    def test_add_beginning_only(self):
+        repl = self.get_replacements("^Premier -> Pr")
+
+        assert repl == [('^ premier ', ['^ pr', '^ premier'])]
+
+
+    def test_replace_beginning_only(self):
+        repl = self.get_replacements("^Premier => Pr")
+
+        assert repl == [('^ premier ', ['^ pr'])]
+
+
+    def test_add_final_only(self):
+        repl = self.get_replacements("road$ -> rd")
+
+        assert repl == [(' road ^', [' rd ^', ' road ^'])]
+
+
+    def test_replace_final_only(self):
+        repl = self.get_replacements("road$ => rd")
+
+        assert repl == [(' road ^', [' rd ^'])]
+
+
+    def test_decompose_only(self):
+        repl = self.get_replacements("~foo -> foo")
+
+        assert repl == [(' foo ', [' foo', 'foo']),
+                        ('foo ', [' foo', 'foo'])]
+
+
+    def test_add_suffix_decompose_end_only(self):
+        repl = self.get_replacements("~berg |-> bg", "~berg$ -> bg")
+
+        assert repl == [(' berg ', [' berg', ' bg']),
+                        (' berg ^', [' berg ^', ' bg ^', 'berg ^', 'bg ^']),
+                        ('berg ', ['berg', 'bg']),
+                        ('berg ^', [' berg ^', ' bg ^', 'berg ^', 'bg ^'])]
+
+
+    def test_replace_suffix_decompose_end_only(self):
+        repl = self.get_replacements("~berg |=> bg", "~berg$ => bg")
+
+        assert repl == [(' berg ', [' bg']),
+                        (' berg ^', [' bg ^', 'bg ^']),
+                        ('berg ', ['bg']),
+                        ('berg ^', [' bg ^', 'bg ^'])]
+
+
+    @pytest.mark.parametrize('rule', ["~berg,~burg -> bg",
+                                      "~berg, ~burg -> bg",
+                                      "~berg,,~burg -> bg"])
+    def test_add_multiple_suffix(self, rule):
+        repl = self.get_replacements(rule)
+
+        assert repl == [(' berg ', [' berg', ' bg', 'berg', 'bg']),
+                        (' burg ', [' bg', ' burg', 'bg', 'burg']),
+                        ('berg ', [' berg', ' bg', 'berg', 'bg']),
+                        ('burg ', [' bg', ' burg', 'bg', 'burg'])]