Merge pull request #2454 from lonvia/sort-out-token-assignment-in-sql

author Sarah Hoffmann <lonvia@denofr.de>

Tue, 28 Sep 2021 07:45:15 +0000 (09:45 +0200)

committer GitHub <noreply@github.com>

Tue, 28 Sep 2021 07:45:15 +0000 (09:45 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Tue, 28 Sep 2021 07:45:15 +0000 (09:45 +0200)
committer GitHub <noreply@github.com>
Tue, 28 Sep 2021 07:45:15 +0000 (09:45 +0200)
diff --git a/docs/develop/Tokenizers.md b/docs/develop/Tokenizers.md

index 529315e4431dd1b2d08097ef7ed92491989c2de1..5282db1ae73a1ad375232fedc647df130ad71a9e 100644 (file)
--- a/docs/develop/Tokenizers.md
+++ b/docs/develop/Tokenizers.md
@@ -190,22 +190,21 @@ be listed with a semicolon as delimiter. Must be NULL when the place has no
  house numbers.
  
  ```sql
-FUNCTION token_addr_street_match_tokens(info JSONB) RETURNS INTEGER[]
+FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[]) RETURNS BOOLEAN
  ```
  
-Return the match token IDs by which to search a matching street from the
-`addr:street` tag. These IDs will be matched against the IDs supplied by
-`token_get_name_match_tokens`. Must be NULL when the place has no `addr:street`
-tag.
+Check if the given tokens (previously saved from `token_get_name_match_tokens()`)
+match against the `addr:street` tag name. Must return either NULL or FALSE
+when the place has no `addr:street` tag.
  
  ```sql
-FUNCTION token_addr_place_match_tokens(info JSONB) RETURNS INTEGER[]
+FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[]) RETURNS BOOLEAN
  ```
  
-Return the match token IDs by which to search a matching place from the
-`addr:place` tag. These IDs will be matched against the IDs supplied by
-`token_get_name_match_tokens`. Must be NULL when the place has no `addr:place`
-tag.
+Check if the given tokens (previously saved from `token_get_name_match_tokens()`)
+match against the `addr:place` tag name. Must return either NULL or FALSE
+when the place has no `addr:place` tag.
+
  
  ```sql
  FUNCTION token_addr_place_search_tokens(info JSONB) RETURNS INTEGER[]
@@ -216,26 +215,34 @@ are used for searches by address when no matching place can be found in the
  database. Must be NULL when the place has no `addr:place` tag.
  
  ```sql
-CREATE TYPE token_addresstoken AS (
-  key TEXT,
-  match_tokens INT[],
-  search_tokens INT[]
-);
+FUNCTION token_get_address_keys(info JSONB) RETURNS SETOF TEXT
+```
+
+Return the set of keys for which address information is provided. This
+should correspond to the list of (relevant) `addr:*` tags with the `addr:`
+prefix removed or the keys used in the `address` dictionary of the place info.
  
-FUNCTION token_get_address_tokens(info JSONB) RETURNS SETOF token_addresstoken
+```sql
+FUNCTION token_get_address_search_tokens(info JSONB, key TEXT) RETURNS INTEGER[]
  ```
  
-Return the match and search token IDs for explicit `addr:*` tags for the place
-other than `addr:street` and `addr:place`. For each address item there are
-three pieces of information returned:
-
- * _key_ contains the type of address item (city, county, etc.). This is the
-   key handed in with the `address` dictionary.
- * *match_tokens* is the list of token IDs used to find the corresponding
-   place object for the address part. The list is matched against the IDs
-   from `token_get_name_match_tokens`.
- * *search_tokens* is the list of token IDs under which to search the address
-   item. It is used when no corresponding place object was found.
+Return the array of search tokens for the given address part. `key` can be
+expected to be one of those returned with `token_get_address_keys()`. The
+search tokens are added to the address search vector of the place, when no
+corresponding OSM object could be found for the given address part from which
+to copy the name information.
+
+```sql
+FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
+```
+
+Check if the given tokens match against the address part `key`.
+
+__Warning:__ the tokens that are handed in are the lists previously saved
+from `token_get_name_search_tokens()`, _not_ from the match token list. This
+is an historical oddity which will be fixed at some point in the future.
+Currently, tokenizers are encouraged to make sure that matching works against
+both the search token list and the match token list.
  
  ```sql
  FUNCTION token_normalized_postcode(postcode TEXT) RETURNS TEXT
diff --git a/lib-sql/functions/interpolation.sql b/lib-sql/functions/interpolation.sql

index 55e44dfd646e05f497658ba401207b0fd2194b67..4ef36f4f635e50b821eb0105b803f67302f9570e 100644 (file)
--- a/lib-sql/functions/interpolation.sql
+++ b/lib-sql/functions/interpolation.sql
@@ -43,7 +43,7 @@ LANGUAGE plpgsql STABLE;
  
  
  -- find the parent road of the cut road parts
-CREATE OR REPLACE FUNCTION get_interpolation_parent(street INTEGER[], place INTEGER[],
+CREATE OR REPLACE FUNCTION get_interpolation_parent(token_info JSONB,
                                                      partition SMALLINT,
                                                      centroid GEOMETRY, geom GEOMETRY)
    RETURNS BIGINT
@@ -52,7 +52,7 @@ DECLARE
    parent_place_id BIGINT;
    location RECORD;
  BEGIN
-  parent_place_id := find_parent_for_address(street, place, partition, centroid);
+  parent_place_id := find_parent_for_address(token_info, partition, centroid);
  
    IF parent_place_id is null THEN
      FOR location IN SELECT place_id FROM placex
@@ -155,9 +155,8 @@ BEGIN
    NEW.interpolationtype = NEW.address->'interpolation';
  
    place_centroid := ST_PointOnSurface(NEW.linegeo);
-  NEW.parent_place_id = get_interpolation_parent(token_addr_street_match_tokens(NEW.token_info),
-                                                 token_addr_place_match_tokens(NEW.token_info),
-                                                 NEW.partition, place_centroid, NEW.linegeo);
+  NEW.parent_place_id = get_interpolation_parent(NEW.token_info, NEW.partition,
+                                                 place_centroid, NEW.linegeo);
  
    interpol_postcode := token_normalized_postcode(NEW.address->'postcode');
  
diff --git a/lib-sql/functions/partition-functions.sql b/lib-sql/functions/partition-functions.sql

index 53aba22c90a3e15290c76ad125c0c7f11982ae52..97afec1560f8d6f6de2ab5091aaf083eaf482ca8 100644 (file)
--- a/lib-sql/functions/partition-functions.sql
+++ b/lib-sql/functions/partition-functions.sql
@@ -66,7 +66,7 @@ LANGUAGE plpgsql STABLE;
  
  CREATE OR REPLACE FUNCTION get_address_place(in_partition SMALLINT, feature GEOMETRY,
                                               from_rank SMALLINT, to_rank SMALLINT,
-                                             extent FLOAT, tokens INT[])
+                                             extent FLOAT, token_info JSONB, key TEXT)
    RETURNS nearfeaturecentr
    AS $$
  DECLARE
@@ -80,7 +80,7 @@ BEGIN
          FROM location_area_large_{{ partition }}
          WHERE geometry && ST_Expand(feature, extent)
                AND rank_address between from_rank and to_rank
-              AND tokens && keywords
+              AND token_matches_address(token_info, key, keywords)
          GROUP BY place_id, keywords, rank_address, rank_search, isguess, postcode, centroid
          ORDER BY bool_or(ST_Intersects(geometry, feature)), distance LIMIT 1;
        RETURN r;
@@ -148,18 +148,21 @@ LANGUAGE plpgsql;
  
  CREATE OR REPLACE FUNCTION getNearestNamedRoadPlaceId(in_partition INTEGER,
                                                        point GEOMETRY,
-                                                      isin_token INTEGER[])
+                                                      token_info JSONB)
    RETURNS BIGINT
    AS $$
  DECLARE
    parent BIGINT;
  BEGIN
+  IF not token_has_addr_street(token_info) THEN
+    RETURN NULL;
+  END IF;
  
  {% for partition in db.partitions %}
    IF in_partition = {{ partition }} THEN
      SELECT place_id FROM search_name_{{ partition }}
        INTO parent
-      WHERE name_vector && isin_token
+      WHERE token_matches_street(token_info, name_vector)
              AND centroid && ST_Expand(point, 0.015)
              AND address_rank between 26 and 27
        ORDER BY ST_Distance(centroid, point) ASC limit 1;
@@ -174,19 +177,22 @@ LANGUAGE plpgsql STABLE;
  
  CREATE OR REPLACE FUNCTION getNearestNamedPlacePlaceId(in_partition INTEGER,
                                                         point GEOMETRY,
-                                                       isin_token INTEGER[])
+                                                       token_info JSONB)
    RETURNS BIGINT
    AS $$
  DECLARE
    parent BIGINT;
  BEGIN
+  IF not token_has_addr_place(token_info) THEN
+    RETURN NULL;
+  END IF;
  
  {% for partition in db.partitions %}
    IF in_partition = {{ partition }} THEN
      SELECT place_id
        INTO parent
        FROM search_name_{{ partition }}
-      WHERE name_vector && isin_token
+      WHERE token_matches_place(token_info, name_vector)
              AND centroid && ST_Expand(point, 0.04)
              AND address_rank between 16 and 25
        ORDER BY ST_Distance(centroid, point) ASC limit 1;
diff --git a/lib-sql/functions/placex_triggers.sql b/lib-sql/functions/placex_triggers.sql

index fa7156ec904c396a8376b780d03c09a37045fb43..9c2a67a1ae8f50039071c8bd8f24e50ed9c06ae6 100644 (file)
--- a/lib-sql/functions/placex_triggers.sql
+++ b/lib-sql/functions/placex_triggers.sql
@@ -104,8 +104,7 @@ CREATE OR REPLACE FUNCTION find_parent_for_poi(poi_osm_type CHAR(1),
                                                 poi_osm_id BIGINT,
                                                 poi_partition SMALLINT,
                                                 bbox GEOMETRY,
-                                               addr_street INTEGER[],
-                                               addr_place INTEGER[],
+                                               token_info JSONB,
                                                 is_place_addr BOOLEAN)
    RETURNS BIGINT
    AS $$
@@ -119,8 +118,7 @@ BEGIN
    parent_place_id := find_associated_street(poi_osm_type, poi_osm_id);
  
    IF parent_place_id is null THEN
-    parent_place_id := find_parent_for_address(addr_street, addr_place,
-                                               poi_partition, bbox);
+    parent_place_id := find_parent_for_address(token_info, poi_partition, bbox);
    END IF;
  
    IF parent_place_id is null and poi_osm_type = 'N' THEN
@@ -333,13 +331,14 @@ BEGIN
      WHERE s.place_id = parent_place_id;
  
    FOR addr_item IN
-    SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
-      FROM token_get_address_tokens(token_info)
-      WHERE not search_tokens <@ parent_address_vector
+    SELECT (get_addr_tag_rank(key, country)).*, key,
+           token_get_address_search_tokens(token_info, key) as search_tokens
+      FROM token_get_address_keys(token_info) as key
+      WHERE not token_get_address_search_tokens(token_info, key) <@ parent_address_vector
    LOOP
      addr_place := get_address_place(in_partition, geometry,
                                      addr_item.from_rank, addr_item.to_rank,
-                                    addr_item.extent, addr_item.match_tokens);
+                                    addr_item.extent, token_info, addr_item.key);
  
      IF addr_place is null THEN
        -- No place found in OSM that matches. Make it at least searchable.
@@ -447,14 +446,16 @@ BEGIN
  
    FOR location IN
      SELECT (get_address_place(partition, geometry, from_rank, to_rank,
-                              extent, match_tokens)).*, search_tokens
-      FROM (SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
-              FROM token_get_address_tokens(token_info)) x
+                              extent, token_info, key)).*, key
+      FROM (SELECT (get_addr_tag_rank(key, country)).*, key
+              FROM token_get_address_keys(token_info) as key) x
        ORDER BY rank_address, distance, isguess desc
    LOOP
      IF location.place_id is null THEN
        {% if not db.reverse_only %}
-      nameaddress_vector := array_merge(nameaddress_vector, location.search_tokens);
+      nameaddress_vector := array_merge(nameaddress_vector,
+                                        token_get_address_search_tokens(token_info,
+                                                                        location.key));
        {% endif %}
      ELSE
        {% if not db.reverse_only %}
@@ -689,9 +690,6 @@ DECLARE
    parent_address_level SMALLINT;
    place_address_level SMALLINT;
  
-  addr_street INTEGER[];
-  addr_place INTEGER[];
-
    max_rank SMALLINT;
  
    name_vector INTEGER[];
@@ -860,8 +858,6 @@ BEGIN
    END IF;
  
    NEW.housenumber := token_normalized_housenumber(NEW.token_info);
-  addr_street := token_addr_street_match_tokens(NEW.token_info);
-  addr_place := token_addr_place_match_tokens(NEW.token_info);
  
    NEW.postcode := null;
  
@@ -907,7 +903,7 @@ BEGIN
      NEW.parent_place_id := find_parent_for_poi(NEW.osm_type, NEW.osm_id,
                                                 NEW.partition,
                                                 ST_Envelope(NEW.geometry),
-                                               addr_street, addr_place,
+                                               NEW.token_info,
                                                 is_place_address);
  
      -- If we found the road take a shortcut here.
diff --git a/lib-sql/functions/utils.sql b/lib-sql/functions/utils.sql

index c308d0259b8505887d8c8fdc9a85630a20b3313f..f7d2093c9f6c8c32f4b79f1334a3b0e81f68628b 100644 (file)
--- a/lib-sql/functions/utils.sql
+++ b/lib-sql/functions/utils.sql
@@ -215,13 +215,12 @@ LANGUAGE plpgsql STABLE;
  
  -- Find the parent of an address with addr:street/addr:place tag.
  --
--- \param street     Value of addr:street or NULL if tag is missing.
--- \param place      Value of addr:place or NULL if tag is missing.
+-- \param token_info Naming info with the address information.
  -- \param partition  Partition where to search the parent.
  -- \param centroid   Location of the address.
  --
  -- \return Place ID of the parent if one was found, NULL otherwise.
-CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEGER[],
+CREATE OR REPLACE FUNCTION find_parent_for_address(token_info JSONB,
                                                     partition SMALLINT,
                                                     centroid GEOMETRY)
    RETURNS BIGINT
@@ -229,30 +228,22 @@ CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEG
  DECLARE
    parent_place_id BIGINT;
  BEGIN
-  IF street is not null THEN
-    -- Check for addr:street attributes
-    -- Note that addr:street links can only be indexed, once the street itself is indexed
-    parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, street);
-    IF parent_place_id is not null THEN
-      {% if debug %}RAISE WARNING 'Get parent form addr:street: %', parent_place_id;{% endif %}
-      RETURN parent_place_id;
-    END IF;
+  -- Check for addr:street attributes
+  parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, token_info);
+  IF parent_place_id is not null THEN
+    {% if debug %}RAISE WARNING 'Get parent from addr:street: %', parent_place_id;{% endif %}
+    RETURN parent_place_id;
    END IF;
  
    -- Check for addr:place attributes.
-  IF place is not null THEN
-    parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, place);
-    IF parent_place_id is not null THEN
-      {% if debug %}RAISE WARNING 'Get parent form addr:place: %', parent_place_id;{% endif %}
-      RETURN parent_place_id;
-    END IF;
-  END IF;
-
-  RETURN NULL;
+  parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, token_info);
+  {% if debug %}RAISE WARNING 'Get parent from addr:place: %', parent_place_id;{% endif %}
+  RETURN parent_place_id;
  END;
  $$
  LANGUAGE plpgsql STABLE;
  
+
  CREATE OR REPLACE FUNCTION delete_location(OLD_place_id BIGINT)
    RETURNS BOOLEAN
    AS $$
diff --git a/lib-sql/tiger_import_start.sql b/lib-sql/tiger_import_start.sql

index faa4efbb2abb76e408c7ef509164db03462c099a..f344e1745ac8da95a86fd3d8013a8916671f2f91 100644 (file)
--- a/lib-sql/tiger_import_start.sql
+++ b/lib-sql/tiger_import_start.sql
@@ -14,7 +14,6 @@ DECLARE
    out_partition INTEGER;
    out_parent_place_id BIGINT;
    location RECORD;
-  address_street_word_ids INTEGER[];
  
  BEGIN
  
@@ -54,13 +53,9 @@ BEGIN
  
    place_centroid := ST_Centroid(linegeo);
    out_partition := get_partition('us');
-  out_parent_place_id := null;
  
-  address_street_word_ids := token_addr_street_match_tokens(token_info);
-  IF address_street_word_ids IS NOT NULL THEN
-    out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid,
-                                                      address_street_word_ids);
-  END IF;
+  out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid,
+                                                    token_info);
  
    IF out_parent_place_id IS NULL THEN
      SELECT getNearestParallelRoadFeature(out_partition, linegeo)
diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql

index ffe6648c38e959c6279efb2d1898d835514f32a7..6092319a0578d338915b6890902d71df7ec90b1f 100644 (file)
--- a/lib-sql/tokenizer/icu_tokenizer.sql
+++ b/lib-sql/tokenizer/icu_tokenizer.sql
@@ -34,40 +34,59 @@ AS $$
  $$ LANGUAGE SQL IMMUTABLE STRICT;
  
  
-CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
-  RETURNS INTEGER[]
+CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB)
+  RETURNS BOOLEAN
+AS $$
+  SELECT info->>'street' is not null;
+$$ LANGUAGE SQL IMMUTABLE;
+
+
+CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB)
+  RETURNS BOOLEAN
  AS $$
-  SELECT (info->>'street')::INTEGER[]
+  SELECT info->>'place' is not null;
+$$ LANGUAGE SQL IMMUTABLE;
+
+
+CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[])
+  RETURNS BOOLEAN
+AS $$
+  SELECT (info->>'street')::INTEGER[] <@ street_tokens
  $$ LANGUAGE SQL IMMUTABLE STRICT;
  
  
-CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
-  RETURNS INTEGER[]
+CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[])
+  RETURNS BOOLEAN
  AS $$
-  SELECT (info->>'place_match')::INTEGER[]
+  SELECT (info->>'place')::INTEGER[] <@ place_tokens
  $$ LANGUAGE SQL IMMUTABLE STRICT;
  
  
  CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB)
    RETURNS INTEGER[]
  AS $$
-  SELECT (info->>'place_search')::INTEGER[]
+  SELECT (info->>'place')::INTEGER[]
  $$ LANGUAGE SQL IMMUTABLE STRICT;
  
  
-DROP TYPE IF EXISTS token_addresstoken CASCADE;
-CREATE TYPE token_addresstoken AS (
-  key TEXT,
-  match_tokens INT[],
-  search_tokens INT[]
-);
+CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB)
+  RETURNS SETOF TEXT
+AS $$
+  SELECT * FROM jsonb_object_keys(info->'addr');
+$$ LANGUAGE SQL IMMUTABLE STRICT;
  
-CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
-  RETURNS SETOF token_addresstoken
+
+CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT)
+  RETURNS INTEGER[]
  AS $$
-  SELECT key, (value->>1)::int[] as match_tokens,
-         (value->>0)::int[] as search_tokens
-  FROM jsonb_each(info->'addr');
+  SELECT (info->'addr'->>key)::INTEGER[];
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
+  RETURNS BOOLEAN
+AS $$
+  SELECT (info->'addr'->>key)::INTEGER[] <@ tokens;
  $$ LANGUAGE SQL IMMUTABLE STRICT;
  
  
@@ -127,15 +146,34 @@ BEGIN
          VALUES (term_id, term, 'w', json_build_object('count', term_count));
      END IF;
  
-    IF term_count < {{ max_word_freq }} THEN
-      partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
-    END IF;
+    partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
    END LOOP;
  END;
  $$
  LANGUAGE plpgsql;
  
  
+CREATE OR REPLACE FUNCTION getorcreate_partial_word(partial TEXT)
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  token INTEGER;
+BEGIN
+  SELECT min(word_id) INTO token
+    FROM word WHERE word_token = partial and type = 'w';
+
+  IF token IS NULL THEN
+    token := nextval('seq_word');
+    INSERT INTO word (word_id, word_token, type, info)
+        VALUES (token, partial, 'w', json_build_object('count', 0));
+  END IF;
+
+  RETURN token;
+END;
+$$
+LANGUAGE plpgsql;
+
+
  CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT)
    RETURNS INTEGER
    AS $$
diff --git a/lib-sql/tokenizer/legacy_tokenizer.sql b/lib-sql/tokenizer/legacy_tokenizer.sql

index a2c6b52073ec007e052b7775a148f6159fa1239d..2b734e6f2a95a5cfe97243e81e3b4c47485a3d92 100644 (file)
--- a/lib-sql/tokenizer/legacy_tokenizer.sql
+++ b/lib-sql/tokenizer/legacy_tokenizer.sql
@@ -34,17 +34,31 @@ AS $$
  $$ LANGUAGE SQL IMMUTABLE STRICT;
  
  
-CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
-  RETURNS INTEGER[]
+CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB)
+  RETURNS BOOLEAN
+AS $$
+  SELECT info->>'street' is not null;
+$$ LANGUAGE SQL IMMUTABLE;
+
+
+CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB)
+  RETURNS BOOLEAN
  AS $$
-  SELECT (info->>'street')::INTEGER[]
+  SELECT info->>'place_match' is not null;
+$$ LANGUAGE SQL IMMUTABLE;
+
+
+CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[])
+  RETURNS BOOLEAN
+AS $$
+  SELECT (info->>'street')::INTEGER[] && street_tokens
  $$ LANGUAGE SQL IMMUTABLE STRICT;
  
  
-CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
-  RETURNS INTEGER[]
+CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[])
+  RETURNS BOOLEAN
  AS $$
-  SELECT (info->>'place_match')::INTEGER[]
+  SELECT (info->>'place_match')::INTEGER[] && place_tokens
  $$ LANGUAGE SQL IMMUTABLE STRICT;
  
  
@@ -55,19 +69,24 @@ AS $$
  $$ LANGUAGE SQL IMMUTABLE STRICT;
  
  
-DROP TYPE IF EXISTS token_addresstoken CASCADE;
-CREATE TYPE token_addresstoken AS (
-  key TEXT,
-  match_tokens INT[],
-  search_tokens INT[]
-);
+CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB)
+  RETURNS SETOF TEXT
+AS $$
+  SELECT * FROM jsonb_object_keys(info->'addr');
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->'addr'->key->>0)::INTEGER[];
+$$ LANGUAGE SQL IMMUTABLE STRICT;
  
-CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
-  RETURNS SETOF token_addresstoken
+
+CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
+  RETURNS BOOLEAN
  AS $$
-  SELECT key, (value->>1)::int[] as match_tokens,
-         (value->>0)::int[] as search_tokens
-  FROM jsonb_each(info->'addr');
+  SELECT (info->'addr'->key->>1)::INTEGER[] && tokens;
  $$ LANGUAGE SQL IMMUTABLE STRICT;
  
  
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py

index 61263678d811db87e90cc0ab8ed55b885d24a57c..5768fd3596652e07fca2896a9d6a02772af8ccb5 100644 (file)
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -17,7 +17,6 @@ from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
  from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  
-DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
  DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
  
  LOG = logging.getLogger()
@@ -39,7 +38,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
          self.data_dir = data_dir
          self.naming_rules = None
          self.term_normalization = None
-        self.max_word_frequency = None
  
  
      def init_new_db(self, config, init_db=True):
@@ -52,10 +50,9 @@ class LegacyICUTokenizer(AbstractTokenizer):
                                                               config='TOKENIZER_CONFIG'))
          self.naming_rules = ICUNameProcessorRules(loader=loader)
          self.term_normalization = config.TERM_NORMALIZATION
-        self.max_word_frequency = config.MAX_WORD_FREQUENCY
  
          self._install_php(config.lib_dir.php)
-        self._save_config(config)
+        self._save_config()
  
          if init_db:
              self.update_sql_functions(config)
@@ -68,7 +65,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
          with connect(self.dsn) as conn:
              self.naming_rules = ICUNameProcessorRules(conn=conn)
              self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
-            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
  
  
      def finalize_import(self, _):
@@ -81,10 +77,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
          """ Reimport the SQL functions for this tokenizer.
          """
          with connect(self.dsn) as conn:
-            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
              sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql',
-                              max_word_freq=max_word_freq)
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  
  
      def check_database(self):
@@ -122,20 +116,19 @@ class LegacyICUTokenizer(AbstractTokenizer):
          php_file = self.data_dir / "tokenizer.php"
          php_file.write_text(dedent(f"""\
              <?php
-            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
+            @define('CONST_Max_Word_Frequency', 10000000);
              @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
              @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
              require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
  
  
-    def _save_config(self, config):
+    def _save_config(self):
          """ Save the configuration that needs to remain stable for the given
              database as database properties.
          """
          with connect(self.dsn) as conn:
              self.naming_rules.save_rules(conn)
  
-            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
              set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
  
  
@@ -424,12 +417,12 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
              elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                  hnrs.append(value)
              elif key == 'street':
-                token_info.add_street(*self._compute_name_tokens({'name': value}))
+                token_info.add_street(self._compute_partial_tokens(value))
              elif key == 'place':
-                token_info.add_place(*self._compute_name_tokens({'name': value}))
+                token_info.add_place(self._compute_partial_tokens(value))
              elif not key.startswith('_') and \
                   key not in ('country', 'full'):
-                addr_terms.append((key, *self._compute_name_tokens({'name': value})))
+                addr_terms.append((key, self._compute_partial_tokens(value)))
  
          if hnrs:
              hnrs = self._split_housenumbers(hnrs)
@@ -438,6 +431,32 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          if addr_terms:
              token_info.add_address_terms(addr_terms)
  
+    def _compute_partial_tokens(self, name):
+        """ Normalize the given term, split it into partial words and return
+            then token list for them.
+        """
+        norm_name = self.name_processor.get_search_normalized(name)
+
+        tokens = []
+        need_lookup = []
+        for partial in norm_name.split():
+            token = self._cache.partials.get(partial)
+            if token:
+                tokens.append(token)
+            else:
+                need_lookup.append(partial)
+
+        if need_lookup:
+            with self.conn.cursor() as cur:
+                cur.execute("""SELECT word, getorcreate_partial_word(word)
+                               FROM unnest(%s) word""",
+                            (need_lookup, ))
+
+                for partial, token in cur:
+                    tokens.append(token)
+                    self._cache.partials[partial] = token
+
+        return tokens
  
      def _compute_name_tokens(self, names):
          """ Computes the full name and partial name tokens for the given
@@ -551,30 +570,25 @@ class _TokenInfo:
          self.data['hnr'] = ';'.join(hnrs)
  
  
-    def add_street(self, fulls, _):
+    def add_street(self, tokens):
          """ Add addr:street match terms.
          """
-        if fulls:
-            self.data['street'] = self._mk_array(fulls)
+        if tokens:
+            self.data['street'] = self._mk_array(tokens)
  
  
-    def add_place(self, fulls, partials):
+    def add_place(self, tokens):
          """ Add addr:place search and match terms.
          """
-        if fulls:
-            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
-            self.data['place_match'] = self._mk_array(fulls)
+        if tokens:
+            self.data['place'] = self._mk_array(tokens)
  
  
      def add_address_terms(self, terms):
          """ Add additional address terms.
          """
-        tokens = {}
-
-        for key, fulls, partials in terms:
-            if fulls:
-                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
-                               self._mk_array(fulls)]
+        tokens = {key: self._mk_array(partials)
+                  for key, partials in terms if partials}
  
          if tokens:
              self.data['addr'] = tokens
@@ -588,6 +602,7 @@ class _TokenCache:
      """
      def __init__(self):
          self.names = {}
+        self.partials = {}
          self.postcodes = set()
          self.housenumbers = {}
  
diff --git a/test/bdd/db/import/search_name.feature b/test/bdd/db/import/search_name.feature

index d393b8f3381a7a493d4597cd113b77d4a331287f..a5b07e5cf5e5027919066d8967da19cfccdfdc26 100644 (file)
--- a/test/bdd/db/import/search_name.feature
+++ b/test/bdd/db/import/search_name.feature
@@ -125,9 +125,6 @@ Feature: Creation of search terms
          Then placex contains
           | object | parent_place_id |
           | N1     | N2              |
-        Then search_name contains
-         | object | name_vector | nameaddress_vector |
-         | N1     | #Walltown   | Strange, Town |
          When sending search query "23 Rose Street"
          Then exactly 1 results are returned
          And results contain
@@ -156,9 +153,6 @@ Feature: Creation of search terms
           | W1  | highway | residential | Rose Street  | :w-north |
           | N2  | place   | city        | Strange Town | :p-N1 |
          When importing
-        Then search_name contains
-         | object | name_vector      | nameaddress_vector      |
-         | N1     | #Walltown, #Blue house | Walltown, Strange, Town |
          When sending search query "23 Walltown, Strange Town"
          Then results contain
           | osm | display_name |
@@ -190,9 +184,6 @@ Feature: Creation of search terms
           | W1  | highway | residential | Rose Street  | :w-north |
           | N2  | place   | city        | Strange Town | :p-N1 |
          When importing
-        Then search_name contains
-         | object | name_vector      | nameaddress_vector      |
-         | N1     | #Moon sun, #Blue house | Moon, Sun, Strange, Town |
          When sending search query "23 Moon Sun, Strange Town"
          Then results contain
           | osm | display_name |
@@ -212,9 +203,6 @@ Feature: Creation of search terms
           | W1  | highway | residential | Rose Street  | Walltown  | :w-north |
           | N2  | place   | suburb      | Strange Town | Walltown  | :p-N1 |
          When importing
-        Then search_name contains
-         | object | name_vector | nameaddress_vector |
-         | N1     | #Walltown   | Strange, Town      |
          When sending search query "23 Rose Street, Walltown"
          Then exactly 1 result is returned
          And results contain
@@ -303,9 +291,6 @@ Feature: Creation of search terms
           | W1  | highway | residential | Rose Street  | :w-north |
           | N2  | place   | suburb      | Strange Town | :p-N1 |
          When importing
-        Then search_name contains
-         | object | name_vector | nameaddress_vector |
-         | N1     | #Green Moss | Walltown |
          When sending search query "Green Moss, Rose Street, Walltown"
          Then exactly 0 result is returned
          When sending search query "Green Moss, Walltown"
diff --git a/test/python/test_tokenizer_icu.py b/test/python/test_tokenizer_icu.py

index b7101c3f67ef62229e5205d226e4c50b4c6ccad8..ed079269561ff8ba8a19b56ecaff840ec0bd0c93 100644 (file)
--- a/test/python/test_tokenizer_icu.py
+++ b/test/python/test_tokenizer_icu.py
@@ -10,6 +10,7 @@ from nominatim.tokenizer import icu_tokenizer
  from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
  from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  from nominatim.db import properties
+from nominatim.db.sql_preprocessor import SQLPreprocessor
  
  from mock_icu_word_table import MockIcuWordTable
  
@@ -76,6 +77,15 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
  
      return _mk_analyser
  
+@pytest.fixture
+def sql_functions(temp_db_conn, def_config, src_dir):
+    orig_sql = def_config.lib_dir.sql
+    def_config.lib_dir.sql = src_dir / 'lib-sql'
+    sqlproc = SQLPreprocessor(temp_db_conn, def_config)
+    sqlproc.run_sql_file(temp_db_conn, 'functions/utils.sql')
+    sqlproc.run_sql_file(temp_db_conn, 'tokenizer/icu_tokenizer.sql')
+    def_config.lib_dir.sql = orig_sql
+
  
  @pytest.fixture
  def getorcreate_full_word(temp_db_cursor):
@@ -144,7 +154,6 @@ def test_init_new(tokenizer_factory, test_config, monkeypatch, db_prop):
      tok.init_new_db(test_config)
  
      assert db_prop(icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
-    assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) is not None
  
  
  def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
@@ -163,7 +172,6 @@ def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
  
  def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
      monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
-    monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '90300')
      tok = tokenizer_factory()
      tok.init_new_db(test_config)
      monkeypatch.undo()
@@ -173,23 +181,18 @@ def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
  
      assert tok.naming_rules is not None
      assert tok.term_normalization == ':: lower();'
-    assert tok.max_word_frequency == '90300'
  
  
  def test_update_sql_functions(db_prop, temp_db_cursor,
                                tokenizer_factory, test_config, table_factory,
                                monkeypatch):
-    monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '1133')
      tok = tokenizer_factory()
      tok.init_new_db(test_config)
-    monkeypatch.undo()
-
-    assert db_prop(icu_tokenizer.DBCFG_MAXWORDFREQ) == '1133'
  
      table_factory('test', 'txt TEXT')
  
      func_file = test_config.lib_dir.sql / 'tokenizer' / 'icu_tokenizer.sql'
-    func_file.write_text("""INSERT INTO test VALUES ('{{max_word_freq}}')""")
+    func_file.write_text("""INSERT INTO test VALUES (1133)""")
  
      tok.update_sql_functions(test_config)
  
@@ -304,7 +307,7 @@ def test_add_country_names_extend(analyzer, word_table):
  class TestPlaceNames:
  
      @pytest.fixture(autouse=True)
-    def setup(self, analyzer, getorcreate_full_word):
+    def setup(self, analyzer, sql_functions):
          with analyzer() as anl:
              self.analyzer = anl
              yield anl
@@ -351,7 +354,7 @@ class TestPlaceNames:
  class TestPlaceAddress:
  
      @pytest.fixture(autouse=True)
-    def setup(self, analyzer, getorcreate_full_word):
+    def setup(self, analyzer, sql_functions):
          with analyzer(trans=(":: upper()", "'🜵' > ' '")) as anl:
              self.analyzer = anl
              yield anl
@@ -424,7 +427,7 @@ class TestPlaceAddress:
      def test_process_place_street(self):
          info = self.process_address(street='Grand Road')
  
-        assert eval(info['street']) == self.name_token_set('#GRAND ROAD')
+        assert eval(info['street']) == self.name_token_set('GRAND', 'ROAD')
  
  
      def test_process_place_street_empty(self):
@@ -436,16 +439,13 @@ class TestPlaceAddress:
      def test_process_place_place(self):
          info = self.process_address(place='Honu Lulu')
  
-        assert eval(info['place_search']) == self.name_token_set('#HONU LULU',
-                                                                 'HONU', 'LULU')
-        assert eval(info['place_match']) == self.name_token_set('#HONU LULU')
+        assert eval(info['place']) == self.name_token_set('HONU', 'LULU')
  
  
      def test_process_place_place_empty(self):
          info = self.process_address(place='🜵')
  
-        assert 'place_search' not in info
-        assert 'place_match' not in info
+        assert 'place' not in info
  
  
      def test_process_place_address_terms(self):
@@ -453,16 +453,12 @@ class TestPlaceAddress:
                                      suburb='Zwickau', street='Hauptstr',
                                      full='right behind the church')
  
-        city_full = self.name_token_set('#ZWICKAU')
-        city_all = self.name_token_set('#ZWICKAU', 'ZWICKAU')
-        state_full = self.name_token_set('#SACHSEN')
-        state_all = self.name_token_set('#SACHSEN', 'SACHSEN')
+        city = self.name_token_set('ZWICKAU')
+        state = self.name_token_set('SACHSEN')
  
-        result = {k: [eval(v[0]), eval(v[1])] for k,v in info['addr'].items()}
+        result = {k: eval(v) for k,v in info['addr'].items()}
  
-        assert result == {'city': [city_all, city_full],
-                          'suburb': [city_all, city_full],
-                          'state': [state_all, state_full]}
+        assert result == {'city': city, 'suburb': city, 'state': state}
  
  
      def test_process_place_address_terms_empty(self):
author	Sarah Hoffmann <lonvia@denofr.de>
	Tue, 28 Sep 2021 07:45:15 +0000 (09:45 +0200)
committer	GitHub <noreply@github.com>
	Tue, 28 Sep 2021 07:45:15 +0000 (09:45 +0200)
docs/develop/Tokenizers.md		patch \| blob \| history
lib-sql/functions/interpolation.sql		patch \| blob \| history
lib-sql/functions/partition-functions.sql		patch \| blob \| history
lib-sql/functions/placex_triggers.sql		patch \| blob \| history
lib-sql/functions/utils.sql		patch \| blob \| history
lib-sql/tiger_import_start.sql		patch \| blob \| history
lib-sql/tokenizer/icu_tokenizer.sql		patch \| blob \| history
lib-sql/tokenizer/legacy_tokenizer.sql		patch \| blob \| history
nominatim/tokenizer/icu_tokenizer.py		patch \| blob \| history
test/bdd/db/import/search_name.feature		patch \| blob \| history
test/python/test_tokenizer_icu.py		patch \| blob \| history