From: Sarah Hoffmann Date: Sun, 5 Jan 2020 14:55:39 +0000 (+0100) Subject: move importance/wikipedia functions into separate file X-Git-Tag: v3.5.0~105^2~7 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/4088e4e3719dc0ee5b446c97eb1db9650ddbab0d move importance/wikipedia functions into separate file --- diff --git a/lib/setup/SetupClass.php b/lib/setup/SetupClass.php index 92d6347b..8e33fef8 100755 --- a/lib/setup/SetupClass.php +++ b/lib/setup/SetupClass.php @@ -651,6 +651,7 @@ class SetupFunctions $sBasePath = CONST_BasePath.'/sql/functions/'; $sTemplate = file_get_contents(CONST_BasePath.'/sql/functions.sql'); $sTemplate .= file_get_contents($sBasePath.'normalization.sql'); + $sTemplate .= file_get_contents($sBasePath.'importance.sql'); $sTemplate = str_replace('{modulepath}', $this->sModulePath, $sTemplate); if ($this->bEnableDiffUpdates) { $sTemplate = str_replace('RETURN NEW; -- %DIFFUPDATES%', '--', $sTemplate); diff --git a/sql/functions.sql b/sql/functions.sql index 908236b1..24acf7c8 100644 --- a/sql/functions.sql +++ b/sql/functions.sql @@ -2227,120 +2227,6 @@ END; $$ LANGUAGE plpgsql; --- See: http://stackoverflow.com/questions/6410088/how-can-i-mimic-the-php-urldecode-function-in-postgresql -CREATE OR REPLACE FUNCTION decode_url_part(p varchar) RETURNS varchar - AS $$ -SELECT convert_from(CAST(E'\\x' || array_to_string(ARRAY( - SELECT CASE WHEN length(r.m[1]) = 1 THEN encode(convert_to(r.m[1], 'SQL_ASCII'), 'hex') ELSE substring(r.m[1] from 2 for 2) END - FROM regexp_matches($1, '%[0-9a-f][0-9a-f]|.', 'gi') AS r(m) -), '') AS bytea), 'UTF8'); -$$ -LANGUAGE SQL IMMUTABLE STRICT; - -CREATE OR REPLACE FUNCTION catch_decode_url_part(p varchar) RETURNS varchar - AS $$ -DECLARE -BEGIN - RETURN decode_url_part(p); -EXCEPTION - WHEN others THEN return null; -END; -$$ -LANGUAGE plpgsql IMMUTABLE; - -DROP TYPE IF EXISTS wikipedia_article_match CASCADE; -create type wikipedia_article_match as ( - language TEXT, - title TEXT, - importance FLOAT -); - -CREATE OR REPLACE FUNCTION get_wikipedia_match(extratags HSTORE, country_code varchar(2)) RETURNS wikipedia_article_match - AS $$ -DECLARE - langs TEXT[]; - i INT; - wiki_article TEXT; - wiki_article_title TEXT; - wiki_article_language TEXT; - result wikipedia_article_match; -BEGIN - langs := ARRAY['english','country','ar','bg','ca','cs','da','de','en','es','eo','eu','fa','fr','ko','hi','hr','id','it','he','lt','hu','ms','nl','ja','no','pl','pt','kk','ro','ru','sk','sl','sr','fi','sv','tr','uk','vi','vo','war','zh']; - i := 1; - WHILE langs[i] IS NOT NULL LOOP - wiki_article := extratags->(case when langs[i] in ('english','country') THEN 'wikipedia' ELSE 'wikipedia:'||langs[i] END); - IF wiki_article is not null THEN - wiki_article := regexp_replace(wiki_article,E'^(.*?)([a-z]{2,3}).wikipedia.org/wiki/',E'\\2:'); - wiki_article := regexp_replace(wiki_article,E'^(.*?)([a-z]{2,3}).wikipedia.org/w/index.php\\?title=',E'\\2:'); - wiki_article := regexp_replace(wiki_article,E'^(.*?)/([a-z]{2,3})/wiki/',E'\\2:'); - --wiki_article := regexp_replace(wiki_article,E'^(.*?)([a-z]{2,3})[=:]',E'\\2:'); - wiki_article := replace(wiki_article,' ','_'); - IF strpos(wiki_article, ':') IN (3,4) THEN - wiki_article_language := lower(trim(split_part(wiki_article, ':', 1))); - wiki_article_title := trim(substr(wiki_article, strpos(wiki_article, ':')+1)); - ELSE - wiki_article_title := trim(wiki_article); - wiki_article_language := CASE WHEN langs[i] = 'english' THEN 'en' WHEN langs[i] = 'country' THEN get_country_language_code(country_code) ELSE langs[i] END; - END IF; - - select wikipedia_article.language,wikipedia_article.title,wikipedia_article.importance - from wikipedia_article - where language = wiki_article_language and - (title = wiki_article_title OR title = catch_decode_url_part(wiki_article_title) OR title = replace(catch_decode_url_part(wiki_article_title),E'\\','')) - UNION ALL - select wikipedia_article.language,wikipedia_article.title,wikipedia_article.importance - from wikipedia_redirect join wikipedia_article on (wikipedia_redirect.language = wikipedia_article.language and wikipedia_redirect.to_title = wikipedia_article.title) - where wikipedia_redirect.language = wiki_article_language and - (from_title = wiki_article_title OR from_title = catch_decode_url_part(wiki_article_title) OR from_title = replace(catch_decode_url_part(wiki_article_title),E'\\','')) - order by importance desc limit 1 INTO result; - - IF result.language is not null THEN - return result; - END IF; - END IF; - i := i + 1; - END LOOP; - RETURN NULL; -END; -$$ -LANGUAGE plpgsql; - -DROP TYPE IF EXISTS place_importance CASCADE; -create type place_importance as ( - importance FLOAT, - wikipedia TEXT -); - -CREATE OR REPLACE FUNCTION compute_importance(extratags HSTORE, country_code varchar(2), osm_type varchar(1), osm_id BIGINT) - RETURNS place_importance - AS $$ -DECLARE - match RECORD; - result place_importance; -BEGIN - FOR match IN SELECT * FROM get_wikipedia_match(extratags, country_code) - WHERE language is not NULL - LOOP - result.importance := match.importance; - result.wikipedia := match.language || ':' || match.title; - RETURN result; - END LOOP; - - IF extratags ? 'wikidata' THEN - FOR match IN SELECT * FROM wikipedia_article - WHERE wd_page_title = extratags->'wikidata' - ORDER BY language = 'en' DESC, langcount DESC LIMIT 1 LOOP - result.importance := match.importance; - result.wikipedia := match.language || ':' || match.title; - RETURN result; - END LOOP; - END IF; - - RETURN null; -END; -$$ -LANGUAGE plpgsql; - CREATE OR REPLACE FUNCTION quad_split_geometry(geometry GEOMETRY, maxarea FLOAT, maxdepth INTEGER) RETURNS SETOF GEOMETRY AS $$ diff --git a/sql/functions/importance.sql b/sql/functions/importance.sql new file mode 100644 index 00000000..0837f80f --- /dev/null +++ b/sql/functions/importance.sql @@ -0,0 +1,125 @@ +-- Functions for interpreting wkipedia/wikidata tags and computing importance. + +DROP TYPE IF EXISTS wikipedia_article_match CASCADE; +CREATE TYPE wikipedia_article_match as ( + language TEXT, + title TEXT, + importance FLOAT +); + +DROP TYPE IF EXISTS place_importance CASCADE; +CREATE TYPE place_importance as ( + importance FLOAT, + wikipedia TEXT +); + + +-- See: http://stackoverflow.com/questions/6410088/how-can-i-mimic-the-php-urldecode-function-in-postgresql +CREATE OR REPLACE FUNCTION decode_url_part(p varchar) + RETURNS varchar + AS $$ +SELECT convert_from(CAST(E'\\x' || array_to_string(ARRAY( + SELECT CASE WHEN length(r.m[1]) = 1 THEN encode(convert_to(r.m[1], 'SQL_ASCII'), 'hex') ELSE substring(r.m[1] from 2 for 2) END + FROM regexp_matches($1, '%[0-9a-f][0-9a-f]|.', 'gi') AS r(m) +), '') AS bytea), 'UTF8'); +$$ +LANGUAGE SQL IMMUTABLE STRICT; + + +CREATE OR REPLACE FUNCTION catch_decode_url_part(p varchar) + RETURNS varchar + AS $$ +DECLARE +BEGIN + RETURN decode_url_part(p); +EXCEPTION + WHEN others THEN return null; +END; +$$ +LANGUAGE plpgsql IMMUTABLE STRICT; + + +CREATE OR REPLACE FUNCTION get_wikipedia_match(extratags HSTORE, country_code varchar(2)) + RETURNS wikipedia_article_match + AS $$ +DECLARE + langs TEXT[]; + i INT; + wiki_article TEXT; + wiki_article_title TEXT; + wiki_article_language TEXT; + result wikipedia_article_match; +BEGIN + langs := ARRAY['english','country','ar','bg','ca','cs','da','de','en','es','eo','eu','fa','fr','ko','hi','hr','id','it','he','lt','hu','ms','nl','ja','no','pl','pt','kk','ro','ru','sk','sl','sr','fi','sv','tr','uk','vi','vo','war','zh']; + i := 1; + WHILE langs[i] IS NOT NULL LOOP + wiki_article := extratags->(case when langs[i] in ('english','country') THEN 'wikipedia' ELSE 'wikipedia:'||langs[i] END); + IF wiki_article is not null THEN + wiki_article := regexp_replace(wiki_article,E'^(.*?)([a-z]{2,3}).wikipedia.org/wiki/',E'\\2:'); + wiki_article := regexp_replace(wiki_article,E'^(.*?)([a-z]{2,3}).wikipedia.org/w/index.php\\?title=',E'\\2:'); + wiki_article := regexp_replace(wiki_article,E'^(.*?)/([a-z]{2,3})/wiki/',E'\\2:'); + --wiki_article := regexp_replace(wiki_article,E'^(.*?)([a-z]{2,3})[=:]',E'\\2:'); + wiki_article := replace(wiki_article,' ','_'); + IF strpos(wiki_article, ':') IN (3,4) THEN + wiki_article_language := lower(trim(split_part(wiki_article, ':', 1))); + wiki_article_title := trim(substr(wiki_article, strpos(wiki_article, ':')+1)); + ELSE + wiki_article_title := trim(wiki_article); + wiki_article_language := CASE WHEN langs[i] = 'english' THEN 'en' WHEN langs[i] = 'country' THEN get_country_language_code(country_code) ELSE langs[i] END; + END IF; + + select wikipedia_article.language,wikipedia_article.title,wikipedia_article.importance + from wikipedia_article + where language = wiki_article_language and + (title = wiki_article_title OR title = catch_decode_url_part(wiki_article_title) OR title = replace(catch_decode_url_part(wiki_article_title),E'\\','')) + UNION ALL + select wikipedia_article.language,wikipedia_article.title,wikipedia_article.importance + from wikipedia_redirect join wikipedia_article on (wikipedia_redirect.language = wikipedia_article.language and wikipedia_redirect.to_title = wikipedia_article.title) + where wikipedia_redirect.language = wiki_article_language and + (from_title = wiki_article_title OR from_title = catch_decode_url_part(wiki_article_title) OR from_title = replace(catch_decode_url_part(wiki_article_title),E'\\','')) + order by importance desc limit 1 INTO result; + + IF result.language is not null THEN + return result; + END IF; + END IF; + i := i + 1; + END LOOP; + RETURN NULL; +END; +$$ +LANGUAGE plpgsql STABLE; + + +CREATE OR REPLACE FUNCTION compute_importance(extratags HSTORE, + country_code varchar(2), + osm_type varchar(1), osm_id BIGINT) + RETURNS place_importance + AS $$ +DECLARE + match RECORD; + result place_importance; +BEGIN + FOR match IN SELECT * FROM get_wikipedia_match(extratags, country_code) + WHERE language is not NULL + LOOP + result.importance := match.importance; + result.wikipedia := match.language || ':' || match.title; + RETURN result; + END LOOP; + + IF extratags ? 'wikidata' THEN + FOR match IN SELECT * FROM wikipedia_article + WHERE wd_page_title = extratags->'wikidata' + ORDER BY language = 'en' DESC, langcount DESC LIMIT 1 LOOP + result.importance := match.importance; + result.wikipedia := match.language || ':' || match.title; + RETURN result; + END LOOP; + END IF; + + RETURN null; +END; +$$ +LANGUAGE plpgsql; +