module/nominatim.c

   1 /**
   2  * SPDX-License-Identifier: GPL-2.0-only
   3  *
   4  * This file is part of Nominatim. (https://nominatim.org)
   5  *
   6  * Copyright (C) 2022 by the Nominatim developer community.
   7  * For a full list of authors see the git log.
   8  */
   9 #include "postgres.h"
  10 #include "fmgr.h"
  11 #include "mb/pg_wchar.h"
  12 #include <utfasciitable.h>
  13
  14 #ifdef PG_MODULE_MAGIC
  15 PG_MODULE_MAGIC;
  16 #endif
  17
  18 Datum transliteration( PG_FUNCTION_ARGS );
  19 Datum gettokenstring( PG_FUNCTION_ARGS );
  20 void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int);
  21 void str_dupspaces(char* buffer);
  22
  23 PG_FUNCTION_INFO_V1( transliteration );
  24 Datum
  25 transliteration( PG_FUNCTION_ARGS )
  26 {
  27         static char * ascii = UTFASCII;
  28         static uint16 asciilookup[65536] = UTFASCIILOOKUP;
  29         char * asciipos;
  30
  31         text *source;
  32         unsigned char *sourcedata;
  33         int sourcedatalength;
  34
  35         unsigned int c1,c2,c3,c4;
  36         unsigned int * wchardata;
  37         unsigned int * wchardatastart;
  38
  39         text *result;
  40         unsigned char *resultdata;
  41         int resultdatalength;
  42         int iLen;
  43
  44         if (GetDatabaseEncoding() != PG_UTF8)
  45         {
  46                 ereport(ERROR,
  47                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
  48                                          errmsg("requires UTF8 database encoding")));
  49         }
  50
  51         if (PG_ARGISNULL(0))
  52         {
  53                 PG_RETURN_NULL();
  54         }
  55
  56         // The original string
  57         source = PG_GETARG_TEXT_P(0);
  58         sourcedata = (unsigned char *)VARDATA(source);
  59         sourcedatalength = VARSIZE(source) - VARHDRSZ;
  60
  61         // Intermediate wchar version of string
  62         wchardatastart = wchardata = (unsigned int *)palloc((sourcedatalength+1)*sizeof(int));
  63
  64         // Based on pg_utf2wchar_with_len from wchar.c
  65         // Postgresql strings are not zero terminalted
  66         while (sourcedatalength > 0)
  67         {
  68                 if ((*sourcedata & 0x80) == 0)
  69                 {
  70                         *wchardata = *sourcedata++;
  71                         wchardata++;
  72                         sourcedatalength--;
  73                 }
  74                 else if ((*sourcedata & 0xe0) == 0xc0)
  75                 {
  76                         if (sourcedatalength < 2) break;
  77                         c1 = *sourcedata++ & 0x1f;
  78                         c2 = *sourcedata++ & 0x3f;
  79                         *wchardata = (c1 << 6) | c2;
  80                         if (*wchardata < 65536) wchardata++;
  81                         sourcedatalength -= 2;
  82                 }
  83                 else if ((*sourcedata & 0xf0) == 0xe0)
  84                 {
  85                         if (sourcedatalength < 3) break;
  86                         c1 = *sourcedata++ & 0x0f;
  87                         c2 = *sourcedata++ & 0x3f;
  88                         c3 = *sourcedata++ & 0x3f;
  89                         *wchardata = (c1 << 12) | (c2 << 6) | c3;
  90                         if (*wchardata < 65536) wchardata++;
  91                         sourcedatalength -= 3;
  92                 }
  93                 else if ((*sourcedata & 0xf8) == 0xf0)
  94                 {
  95                         if (sourcedatalength < 4) break;
  96                         c1 = *sourcedata++ & 0x07;
  97                         c2 = *sourcedata++ & 0x3f;
  98                         c3 = *sourcedata++ & 0x3f;
  99                         c4 = *sourcedata++ & 0x3f;
 100                         *wchardata = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
 101                         if (*wchardata < 65536) wchardata++;
 102                         sourcedatalength -= 4;
 103                 }
 104                 else if ((*sourcedata & 0xfc) == 0xf8)
 105                 {
 106                         // table does not extend beyond 4 char long, just skip
 107                         if (sourcedatalength < 5) break;
 108                         sourcedatalength -= 5;
 109                         sourcedata += 5;
 110                 }
 111                 else if ((*sourcedata & 0xfe) == 0xfc)
 112                 {
 113                         // table does not extend beyond 4 char long, just skip
 114                         if (sourcedatalength < 6) break;
 115                         sourcedatalength -= 6;
 116                         sourcedata += 6;
 117                 }
 118                 else
 119                 {
 120                         // assume lenngth 1, silently drop bogus characters
 121                         sourcedatalength--;
 122                         sourcedata += 1;
 123                 }
 124         }
 125         *wchardata = 0;
 126
 127         // calc the length of transliteration string
 128         resultdatalength = 0;
 129         wchardata = wchardatastart;
 130         while(*wchardata)
 131         {
 132                 if (*(asciilookup + *wchardata) > 0) resultdatalength += *(ascii + *(asciilookup + *wchardata));
 133                 wchardata++;
 134         }
 135
 136         // allocate & create the result
 137         result = (text *)palloc(resultdatalength + VARHDRSZ);
 138         SET_VARSIZE(result, resultdatalength + VARHDRSZ);
 139         resultdata = (unsigned char *)VARDATA(result);
 140
 141         wchardata = wchardatastart;
 142         while(*wchardata)
 143         {
 144                 if (*(asciilookup + *wchardata) > 0)
 145                 {
 146                         asciipos = ascii + *(asciilookup + *wchardata);
 147                         for(iLen = *asciipos; iLen > 0; iLen--)
 148                         {
 149                                 asciipos++;
 150                                 *resultdata = *asciipos;
 151                                 resultdata++;
 152                         }
 153                 }
 154                 /*else
 155                 {
 156                         ereport( WARNING, ( errcode( ERRCODE_SUCCESSFUL_COMPLETION ),
 157                               errmsg( "missing char: %i\n", *wchardata )));
 158
 159                 }*/
 160                 wchardata++;
 161         }
 162
 163         pfree(wchardatastart);
 164
 165         PG_RETURN_TEXT_P(result);
 166 }
 167
 168 // Set isspace=1 if the replacement _only_ adds a space before the search string.  I.e. to == " " + from
 169 void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int isspace)
 170 {
 171         char *p;
 172
 173         // Search string is too long to be present
 174         if (fromlen > *len) return;
 175
 176         p = strstr(buffer, from);
 177         while(p)
 178         {
 179                 if (!isspace || (p > buffer && *(p-1) != ' '))
 180                 {
 181                         (*changes)++;
 182                         if (tolen != fromlen) memmove(p+tolen, p+fromlen, *len-(p-buffer)+1);
 183                         memcpy(p, to, tolen);
 184                         *len += tolen - fromlen;
 185                 }
 186                 p = strstr(p+1, from);
 187         }
 188 }
 189
 190 void str_dupspaces(char* buffer)
 191 {
 192         char *out;
 193         int wasspace;
 194
 195         out = buffer;
 196         wasspace = 0;
 197         while(*buffer)
 198         {
 199                 if (wasspace && *buffer != ' ') wasspace = 0;
 200                 if (!wasspace)
 201                 {
 202                         *out = *buffer;
 203                         out++;
 204                         wasspace = (*buffer == ' ');
 205                 }
 206                 buffer++;
 207         }
 208         *out = 0;
 209 }
 210
 211 PG_FUNCTION_INFO_V1( gettokenstring );
 212 Datum
 213 gettokenstring( PG_FUNCTION_ARGS )
 214 {
 215         text *source;
 216         unsigned char *sourcedata;
 217         int sourcedatalength;
 218
 219         char * buffer;
 220         int len;
 221         int changes;
 222
 223         text *result;
 224
 225         if (GetDatabaseEncoding() != PG_UTF8)
 226         {
 227                 ereport(ERROR,
 228                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 229                                          errmsg("requires UTF8 database encoding")));
 230         }
 231
 232         if (PG_ARGISNULL(0))
 233         {
 234                 PG_RETURN_NULL();
 235         }
 236
 237         // The original string
 238         source = PG_GETARG_TEXT_P(0);
 239         sourcedata = (unsigned char *)VARDATA(source);
 240         sourcedatalength = VARSIZE(source) - VARHDRSZ;
 241
 242         // Buffer for doing the replace in - string could get slightly longer (double is massive overkill)
 243         buffer = (char *)palloc((sourcedatalength*2)*sizeof(char));
 244         memcpy(buffer+1, sourcedata, sourcedatalength);
 245         buffer[0] = 32;
 246         buffer[sourcedatalength+1] = 32;
 247         buffer[sourcedatalength+2] = 0;
 248         len = sourcedatalength+3;
 249
 250         changes = 1;
 251         str_dupspaces(buffer);
 252         while(changes)
 253         {
 254                 changes = 0;
 255                 #include <tokenstringreplacements.inc>
 256                 str_dupspaces(buffer);
 257         }
 258
 259         // 'and' in various languages
 260         str_replace(buffer, &len, &changes, " and ", 5, " ", 1, 0);
 261         str_replace(buffer, &len, &changes, " und ", 5, " ", 1, 0);
 262         str_replace(buffer, &len, &changes, " en ", 4, " ", 1, 0);
 263         str_replace(buffer, &len, &changes, " et ", 4, " ", 1, 0);
 264         str_replace(buffer, &len, &changes, " y ", 3, " ", 1, 0);
 265
 266         // 'the' (and similar)
 267         str_replace(buffer, &len, &changes, " the ", 5, " ", 1, 0);
 268         str_replace(buffer, &len, &changes, " der ", 5, " ", 1, 0);
 269         str_replace(buffer, &len, &changes, " den ", 5, " ", 1, 0);
 270         str_replace(buffer, &len, &changes, " die ", 5, " ", 1, 0);
 271         str_replace(buffer, &len, &changes, " das ", 5, " ", 1, 0);
 272         str_replace(buffer, &len, &changes, " la ", 4, " ", 1, 0);
 273         str_replace(buffer, &len, &changes, " le ", 4, " ", 1, 0);
 274         str_replace(buffer, &len, &changes, " el ", 4, " ", 1, 0);
 275         str_replace(buffer, &len, &changes, " il ", 4, " ", 1, 0);
 276
 277         // german
 278         str_replace(buffer, &len, &changes, "ae", 2, "a", 1, 0);
 279         str_replace(buffer, &len, &changes, "oe", 2, "o", 1, 0);
 280         str_replace(buffer, &len, &changes, "ue", 2, "u", 1, 0);
 281         str_replace(buffer, &len, &changes, "sss", 3, "ss", 2, 0);
 282         str_replace(buffer, &len, &changes, "ih", 2, "i", 1, 0);
 283         str_replace(buffer, &len, &changes, "eh", 2, "e", 1, 0);
 284
 285         // russian
 286         str_replace(buffer, &len, &changes, "ie", 2, "i", 1, 0);
 287         str_replace(buffer, &len, &changes, "yi", 2, "i", 1, 0);
 288
 289         // allocate & create the result
 290         len--;// Drop the terminating zero
 291         result = (text *)palloc(len + VARHDRSZ);
 292         SET_VARSIZE(result, len + VARHDRSZ);
 293         memcpy(VARDATA(result), buffer, len);
 294
 295         pfree(buffer);
 296
 297         PG_RETURN_TEXT_P(result);
 298 }
 299