Merge pull request #1024 from lonvia/reduce-address-search-terms

author Sarah Hoffmann <lonvia@denofr.de>

Thu, 26 Apr 2018 20:28:53 +0000 (22:28 +0200)

committer GitHub <noreply@github.com>

Thu, 26 Apr 2018 20:28:53 +0000 (22:28 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Thu, 26 Apr 2018 20:28:53 +0000 (22:28 +0200)
committer GitHub <noreply@github.com>
Thu, 26 Apr 2018 20:28:53 +0000 (22:28 +0200)
diff --git a/sql/functions.sql b/sql/functions.sql

index 1829c63dea21e7dca324fb19d67c53e30c611c69..16862acb39803c8ddf4cf08139eb2cb74ab589a2 100644 (file)
--- a/sql/functions.sql
+++ b/sql/functions.sql
@@ -1234,6 +1234,7 @@ DECLARE
    relation_members TEXT[];
    relMember RECORD;
    linkedplacex RECORD;
+  addr_item RECORD;
    search_diameter FLOAT;
    search_prevdiameter FLOAT;
    search_maxrank INTEGER;
@@ -1766,44 +1767,43 @@ BEGIN
    parent_place_id_rank = 0;
  
  
-  -- convert isin to array of tokenids
+  -- convert address store to array of tokenids
    --DEBUG: RAISE WARNING 'Starting address search';
    isin_tokens := '{}'::int[];
    IF NEW.address IS NOT NULL THEN
-    isin := avals(NEW.address);
-    IF array_upper(isin, 1) IS NOT NULL THEN
-      FOR i IN 1..array_upper(isin, 1) LOOP
-        -- TODO further split terms with comma and semicolon
-        address_street_word_id := get_name_id(make_standard_name(isin[i]));
+    FOR addr_item IN SELECT * FROM each(NEW.address)
+    LOOP
+      IF addr_item.key IN ('city', 'tiger:county', 'state', 'suburb', 'province', 'district', 'region', 'county', 'municipality', 'hamlet', 'village', 'subdistrict', 'town', 'neighbourhood', 'quarter', 'parish') THEN
+        address_street_word_id := get_name_id(make_standard_name(addr_item.value));
          IF address_street_word_id IS NOT NULL AND NOT(ARRAY[address_street_word_id] <@ isin_tokens) THEN
-          nameaddress_vector := array_merge(nameaddress_vector, ARRAY[address_street_word_id]);
            isin_tokens := isin_tokens || address_street_word_id;
          END IF;
-
-        -- merge word into address vector
-        address_street_word_id := get_word_id(make_standard_name(isin[i]));
+        address_street_word_id := get_word_id(make_standard_name(addr_item.value));
          IF address_street_word_id IS NOT NULL THEN
            nameaddress_vector := array_merge(nameaddress_vector, ARRAY[address_street_word_id]);
          END IF;
-      END LOOP;
-    END IF;
-  END IF;
+      END IF;
+      IF addr_item.key = 'is_in' THEN
+        -- is_in items need splitting
+        isin := regexp_split_to_array(addr_item.value, E'[;,]');
+        IF array_upper(isin, 1) IS NOT NULL THEN
+          FOR i IN 1..array_upper(isin, 1) LOOP
+            address_street_word_id := get_name_id(make_standard_name(isin[i]));
+            IF address_street_word_id IS NOT NULL AND NOT(ARRAY[address_street_word_id] <@ isin_tokens) THEN
+              isin_tokens := isin_tokens || address_street_word_id;
+            END IF;
  
-  -- %NOTIGERDATA% IF 0 THEN
-  -- for the USA we have an additional address table.  Merge in zip codes from there too
-  IF NEW.rank_search = 26 AND NEW.country_code = 'us' THEN
-    FOR location IN SELECT distinct postcode from location_property_tiger where parent_place_id = NEW.place_id LOOP
-      address_street_word_id := get_name_id(make_standard_name(location.postcode));
-      nameaddress_vector := array_merge(nameaddress_vector, ARRAY[address_street_word_id]);
-      isin_tokens := isin_tokens || address_street_word_id;
-
-      -- also merge in the single word version
-      address_street_word_id := get_word_id(make_standard_name(location.postcode));
-      nameaddress_vector := array_merge(nameaddress_vector, ARRAY[address_street_word_id]);
+            -- merge word into address vector
+            address_street_word_id := get_word_id(make_standard_name(isin[i]));
+            IF address_street_word_id IS NOT NULL THEN
+              nameaddress_vector := array_merge(nameaddress_vector, ARRAY[address_street_word_id]);
+            END IF;
+          END LOOP;
+        END IF;
+      END IF;
      END LOOP;
    END IF;
-  --DEBUG: RAISE WARNING 'Tiger postcodes collected';
-  -- %NOTIGERDATA% END IF;
+  nameaddress_vector := array_merge(nameaddress_vector, isin_tokens);
  
  -- RAISE WARNING 'ISIN: %', isin_tokens;
  
@@ -1884,40 +1884,6 @@ BEGIN
    END LOOP;
    --DEBUG: RAISE WARNING 'address computed';
  
-  -- try using the isin value to find parent places
-  IF array_upper(isin_tokens, 1) IS NOT NULL THEN
-    FOR i IN 1..array_upper(isin_tokens, 1) LOOP
---RAISE WARNING '  getNearestNamedFeature: % % % %',NEW.partition, place_centroid, search_maxrank, isin_tokens[i];
-      IF NOT ARRAY[isin_tokens[i]] <@ nameaddress_vector THEN
-
-        FOR location IN SELECT * from getNearestNamedFeature(NEW.partition, place_centroid, search_maxrank, isin_tokens[i]) LOOP
-
---RAISE WARNING '  ISIN: %',location;
-
-          IF location.rank_search > 4 THEN
-              nameaddress_vector := array_merge(nameaddress_vector, location.keywords::integer[]);
-              INSERT INTO place_addressline (place_id, address_place_id, fromarea, isaddress, distance, cached_rank_address)
-                VALUES (NEW.place_id, location.place_id, false, NOT address_havelevel[location.rank_address], location.distance, location.rank_address);
-              IF NEW.postcode is null AND location.postcode is not null
-                 AND NOT address_havelevel[location.rank_address] THEN
-                NEW.postcode := location.postcode;
-              END IF;
-
-              address_havelevel[location.rank_address] := true;
-
-              IF location.rank_address > parent_place_id_rank THEN
-                NEW.parent_place_id = location.place_id;
-                parent_place_id_rank = location.rank_address;
-              END IF;
-          END IF;
-        END LOOP;
-
-      END IF;
-
-    END LOOP;
-  END IF;
-  --DEBUG: RAISE WARNING 'isin tokens processed';
-
    -- for long ways we should add search terms for the entire length
    IF st_length(NEW.geometry) > 0.05 THEN
  
diff --git a/sql/indices.src.sql b/sql/indices.src.sql

index cf5c4bc1779601e5324950c2412e8adefbaa4139..61af89002b81762be8835b7b7170e73bcdfae880 100644 (file)
--- a/sql/indices.src.sql
+++ b/sql/indices.src.sql
@@ -19,8 +19,6 @@ CREATE INDEX idx_location_area_country_place_id ON location_area_country USING B
  
  CREATE INDEX idx_osmline_parent_place_id ON location_property_osmline USING BTREE (parent_place_id) {ts:search-index};
  
-CREATE INDEX idx_search_name_country_centroid ON search_name_country USING GIST (centroid) {ts:address-index};
-
  DROP INDEX IF EXISTS place_id_idx;
  CREATE UNIQUE INDEX idx_place_osm_unique on place using btree(osm_id,osm_type,class,type) {ts:address-index};
  
diff --git a/sql/partition-functions.src.sql b/sql/partition-functions.src.sql

index 32b5f0a17f3c7f5396786b1da82e01f2822759bf..cc11cf960ace6d373c1892eb5cfaddb6e33733e4 100644 (file)
--- a/sql/partition-functions.src.sql
+++ b/sql/partition-functions.src.sql
@@ -84,38 +84,6 @@ END
  $$
  LANGUAGE plpgsql;
  
-create or replace function getNearestNamedFeature(in_partition INTEGER, point GEOMETRY, maxrank INTEGER, isin_token INTEGER) RETURNS setof nearfeature AS $$
-DECLARE
-  r nearfeature%rowtype;
-BEGIN
-
--- start
-  IF in_partition = -partition- THEN
-    FOR r IN 
-      SELECT place_id, name_vector, address_rank, search_rank,
-          ST_Distance(centroid, point) as distance, null as isguess
-          FROM search_name_-partition-
-          WHERE name_vector @> ARRAY[isin_token]
-          AND search_rank < maxrank
-      UNION ALL
-      SELECT place_id, name_vector, address_rank, search_rank,
-          ST_Distance(centroid, point) as distance, null as isguess
-          FROM search_name_country
-          WHERE name_vector @> ARRAY[isin_token]
-          AND search_rank < maxrank
-      ORDER BY distance ASC limit 1
-    LOOP
-      RETURN NEXT r;
-    END LOOP;
-    RETURN;
-  END IF;
--- end
-
-  RAISE EXCEPTION 'Unknown partition %', in_partition;
-END
-$$
-LANGUAGE plpgsql;
-
  create or replace function getNearestNamedRoadFeature(in_partition INTEGER, point GEOMETRY, isin_token INTEGER[]) 
    RETURNS setof nearfeature AS $$
  DECLARE
@@ -185,15 +153,6 @@ BEGIN
    INSERT INTO search_name (place_id, search_rank, address_rank, importance, country_code, name_vector, nameaddress_vector, centroid)
      values (in_place_id, in_rank_search, in_rank_address, in_importance, in_country_code, in_name_vector, in_nameaddress_vector, in_centroid);
  
-  IF in_rank_search <= 4 THEN
-    DELETE FROM search_name_country WHERE place_id = in_place_id;
-    IF in_rank_address > 0 THEN
-      INSERT INTO search_name_country (place_id, search_rank, address_rank, name_vector, centroid)
-        values (in_place_id, in_rank_search, in_rank_address, in_name_vector, in_geometry);
-    END IF;
-    RETURN TRUE;
-  END IF;
-
  -- start
    IF in_partition = -partition- THEN
      DELETE FROM search_name_-partition- values WHERE place_id = in_place_id;
@@ -216,7 +175,6 @@ DECLARE
  BEGIN
  
    DELETE from search_name WHERE place_id = in_place_id;
-  DELETE from search_name_country WHERE place_id = in_place_id;
  
  -- start
    IF in_partition = -partition- THEN
diff --git a/sql/partition-tables.src.sql b/sql/partition-tables.src.sql

index d8f02e10825a0c851b80cbee6b100e333657b889..61ed52814b63dbaa6206570fad93e88da867666a 100644 (file)
--- a/sql/partition-tables.src.sql
+++ b/sql/partition-tables.src.sql
@@ -38,10 +38,6 @@ CREATE TABLE search_name_blank (
  CREATE TABLE location_area_country () INHERITS (location_area_large) {ts:address-data};
  CREATE INDEX idx_location_area_country_geometry ON location_area_country USING GIST (geometry) {ts:address-index};
  
-CREATE TABLE search_name_country () INHERITS (search_name_blank) {ts:address-data};
-CREATE INDEX idx_search_name_country_place_id ON search_name_country USING BTREE (place_id) {ts:address-index};
-CREATE INDEX idx_search_name_country_name_vector ON search_name_country USING GIN (name_vector) WITH (fastupdate = off) {ts:address-index};
-
  -- start
  CREATE TABLE location_area_large_-partition- () INHERITS (location_area_large) {ts:address-data};
  CREATE INDEX idx_location_area_large_-partition-_place_id ON location_area_large_-partition- USING BTREE (place_id) {ts:address-index};
diff --git a/test/bdd/db/import/search_name.feature b/test/bdd/db/import/search_name.feature

index 86bdea9bfd78133291f376d44251000039a1c7f0..cf3ce4dd266a27d2eaa8888e630324ca3e2d55f2 100644 (file)
--- a/test/bdd/db/import/search_name.feature
+++ b/test/bdd/db/import/search_name.feature
@@ -23,3 +23,57 @@ Feature: Creation of search terms
          Then search_name contains
           | object | name_vector | nameaddress_vector |
           | N1     | foo         | the road |
+
+    Scenario: Some addr: tags are added to address when the name exists
+        Given the scene roads-with-pois
+        And the places
+         | osm | class   | type        | name     | geometry |
+         | N1  | place   | state       | new york | 80 80 |
+         | N1  | place   | city        | bonn     | 81 81 |
+         | N1  | place   | suburb      | smalltown| 80 81 |
+        And the named places
+         | osm | class   | type    | addr+city | addr+state | addr+suburb | geometry |
+         | W1  | highway | service | bonn      | New York   | Smalltown   | :w-north |
+        When importing
+        Then search_name contains
+         | object | nameaddress_vector |
+         | W1     | bonn, new york, smalltown |
+
+    Scenario: A known addr:* tag is not added if the name is unknown
+        Given the scene roads-with-pois
+        And the places
+         | osm | class   | type        | name | addr+city | geometry |
+         | W1  | highway | residential | Road | Nandu     | :w-north |
+        When importing
+        Then search_name contains not
+         | object | nameaddress_vector |
+         | W1     | nandu |
+
+    Scenario: addr:postcode is not added to the address terms
+        Given the scene roads-with-pois
+        And the places
+         | osm | class   | type        | name+ref  | geometry |
+         | N1  | place   | state       | 12345     | 80 80 |
+        And the named places
+         | osm | class   | type        | addr+postcode | geometry |
+         | W1  | highway | residential | 12345 | :w-north |
+        When importing
+        Then search_name contains not
+         | object | nameaddress_vector |
+         | W1     | 12345 |
+
+    Scenario: is_in is split and added to the address search terms
+        Given the scene roads-with-pois
+        And the places
+         | osm | class   | type        | name     | geometry |
+         | N1  | place   | state       | new york | 80 80 |
+         | N1  | place   | city        | bonn     | 81 81 |
+         | N1  | place   | suburb      | smalltown| 80 81 |
+        And the named places
+         | osm | class   | type    | addr+is_in                | geometry |
+         | W1  | highway | service | bonn, New York, Smalltown | :w-north |
+        When importing
+        Then search_name contains
+         | object | nameaddress_vector |
+         | W1     | bonn, new york, smalltown |
+
diff --git a/test/bdd/steps/db_ops.py b/test/bdd/steps/db_ops.py

index 87babdad88158a1f42cb26f136d5957bfc4ad0b9..80f922222c0018c7e99c69e8064990883c51b69c 100644 (file)
--- a/test/bdd/steps/db_ops.py
+++ b/test/bdd/steps/db_ops.py
@@ -427,8 +427,8 @@ def check_placex_contents(context, exact):
  
      context.db.commit()
  
-@then("search_name contains")
-def check_search_name_contents(context):
+@then("search_name contains(?P<exclude> not)?")
+def check_search_name_contents(context, exclude):
      cur = context.db.cursor(cursor_factory=psycopg2.extras.DictCursor)
  
      for row in context.table:
@@ -446,11 +446,16 @@ def check_search_name_contents(context):
                                        FROM word, (SELECT unnest(%s) as term) t
                                        WHERE word_token = make_standard_name(t.term)""",
                                     (terms,))
-                    ok_(subcur.rowcount >= len(terms),
-                        "No word entry found for " + row[h])
+                    if not exclude:
+                        ok_(subcur.rowcount >= len(terms),
+                            "No word entry found for " + row[h])
                      for wid in subcur:
-                        assert_in(wid[0], res[h],
-                                  "Missing term for %s/%s: %s" % (pid, h, wid[1]))
+                        if exclude:
+                            assert_not_in(wid[0], res[h],
+                                          "Found term for %s/%s: %s" % (pid, h, wid[1]))
+                        else:
+                            assert_in(wid[0], res[h],
+                                      "Missing term for %s/%s: %s" % (pid, h, wid[1]))
                  else:
                      assert_db_column(res, h, row[h], context)
author	Sarah Hoffmann <lonvia@denofr.de>
	Thu, 26 Apr 2018 20:28:53 +0000 (22:28 +0200)
committer	GitHub <noreply@github.com>
	Thu, 26 Apr 2018 20:28:53 +0000 (22:28 +0200)
sql/functions.sql		patch | blob | history
sql/indices.src.sql		patch | blob | history
sql/partition-functions.src.sql		patch | blob | history
sql/partition-tables.src.sql		patch | blob | history
test/bdd/db/import/search_name.feature		patch | blob | history
test/bdd/steps/db_ops.py		patch | blob | history