ICU: better letter identification in normalization

[nominatim.git] / lib-sql / tokenizer / icu_tokenizer.sql
diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql

index 547facce1572f367d2f7164ad94ddeddad929074..a3dac8ddcbe82eb5fd6057bd81bb9b823befa159 100644 (file)
--- a/lib-sql/tokenizer/icu_tokenizer.sql
+++ b/lib-sql/tokenizer/icu_tokenizer.sql
@@ -1,3 +1,10 @@
+-- SPDX-License-Identifier: GPL-2.0-only
+--
+-- This file is part of Nominatim. (https://nominatim.org)
+--
+-- Copyright (C) 2022 by the Nominatim developer community.
+-- For a full list of authors see the git log.
+
  -- Get tokens used for searching the given place.
  --
  -- These are the tokens that will be saved in the search_name table.
@@ -193,3 +200,26 @@ BEGIN
  END;
  $$
  LANGUAGE plpgsql;
+
+
+-- Return the id of the 'H' word entry for norm_term. When there is none, draw a
+-- new id from seq_word and insert one 'H' row per element of lookup_terms.
+CREATE OR REPLACE FUNCTION create_analyzed_hnr_id(norm_term TEXT, lookup_terms TEXT[])
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  hnr_id INTEGER;
+BEGIN
+  -- min() is NULL when no matching row exists.
+  hnr_id := (SELECT min(w.word_id) FROM word w
+              WHERE w.word = norm_term AND w.type = 'H');
+  IF hnr_id IS NOT NULL THEN RETURN hnr_id; END IF;
+  hnr_id := nextval('seq_word');
+  -- Every inserted row shares hnr_id; info always records the first lookup term.
+  INSERT INTO word (word_id, word_token, type, word, info)
+    SELECT hnr_id, t.term, 'H', norm_term, json_build_object('lookup', lookup_terms[1])
+      FROM unnest(lookup_terms) AS t(term);
+  RETURN hnr_id;
+END;
+$$
+LANGUAGE plpgsql;