module/nominatim.c

   1 #include "postgres.h"
   2 #include "fmgr.h"
   3 #include "mb/pg_wchar.h"
   4 #include <utfasciitable.h>
   5
   6 #ifdef PG_MODULE_MAGIC
   7 PG_MODULE_MAGIC;
   8 #endif
   9
  10 Datum transliteration( PG_FUNCTION_ARGS );
  11 Datum gettokenstring( PG_FUNCTION_ARGS );
  12 void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int);
  13 void str_dupspaces(char* buffer);
  14
  15 PG_FUNCTION_INFO_V1( transliteration );
  16 Datum
  17 transliteration( PG_FUNCTION_ARGS )
  18 {
  19         static char * ascii = UTFASCII;
  20         static uint16 asciilookup[65536] = UTFASCIILOOKUP;
  21         char * asciipos;
  22
  23         text *source;
  24         unsigned char *sourcedata;
  25         int sourcedatalength;
  26
  27         unsigned int c1,c2,c3,c4;
  28         unsigned int * wchardata;
  29         unsigned int * wchardatastart;
  30
  31         text *result;
  32         unsigned char *resultdata;
  33         int resultdatalength;
  34         int iLen;
  35
  36         if (GetDatabaseEncoding() != PG_UTF8)
  37         {
  38                 ereport(ERROR,
  39                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
  40                                          errmsg("requires UTF8 database encoding")));
  41         }
  42
  43         if (PG_ARGISNULL(0))
  44         {
  45                 PG_RETURN_NULL();
  46         }
  47
  48         // The original string
  49         source = PG_GETARG_TEXT_P(0);
  50         sourcedata = (unsigned char *)VARDATA(source);
  51         sourcedatalength = VARSIZE(source) - VARHDRSZ;
  52
  53         // Intermediate wchar version of string
  54         wchardatastart = wchardata = (unsigned int *)palloc((sourcedatalength+1)*sizeof(int));
  55
  56         // Based on pg_utf2wchar_with_len from wchar.c
  57         // Postgresql strings are not zero terminalted
  58         while (sourcedatalength > 0)
  59         {
  60                 if ((*sourcedata & 0x80) == 0)
  61                 {
  62                         *wchardata = *sourcedata++;
  63                         wchardata++;
  64                         sourcedatalength--;
  65                 }
  66                 else if ((*sourcedata & 0xe0) == 0xc0)
  67                 {
  68                         if (sourcedatalength < 2) break;
  69                         c1 = *sourcedata++ & 0x1f;
  70                         c2 = *sourcedata++ & 0x3f;
  71                         *wchardata = (c1 << 6) | c2;
  72                         if (*wchardata < 65536) wchardata++;
  73                         sourcedatalength -= 2;
  74                 }
  75                 else if ((*sourcedata & 0xf0) == 0xe0)
  76                 {
  77                         if (sourcedatalength < 3) break;
  78                         c1 = *sourcedata++ & 0x0f;
  79                         c2 = *sourcedata++ & 0x3f;
  80                         c3 = *sourcedata++ & 0x3f;
  81                         *wchardata = (c1 << 12) | (c2 << 6) | c3;
  82                         if (*wchardata < 65536) wchardata++;
  83                         sourcedatalength -= 3;
  84                 }
  85                 else if ((*sourcedata & 0xf8) == 0xf0)
  86                 {
  87                         if (sourcedatalength < 4) break;
  88                         c1 = *sourcedata++ & 0x07;
  89                         c2 = *sourcedata++ & 0x3f;
  90                         c3 = *sourcedata++ & 0x3f;
  91                         c4 = *sourcedata++ & 0x3f;
  92                         *wchardata = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
  93                         if (*wchardata < 65536) wchardata++;
  94                         sourcedatalength -= 4;
  95                 }
  96                 else if ((*sourcedata & 0xfc) == 0xf8)
  97                 {
  98                         // table does not extend beyond 4 char long, just skip
  99                         if (sourcedatalength < 5) break;
 100                         sourcedatalength -= 5;
 101                         sourcedata += 5;
 102                 }
 103                 else if ((*sourcedata & 0xfe) == 0xfc)
 104                 {
 105                         // table does not extend beyond 4 char long, just skip
 106                         if (sourcedatalength < 6) break;
 107                         sourcedatalength -= 6;
 108                         sourcedata += 6;
 109                 }
 110                 else
 111                 {
 112                         // assume lenngth 1, silently drop bogus characters
 113                         sourcedatalength--;
 114                         sourcedata += 1;
 115                 }
 116         }
 117         *wchardata = 0;
 118
 119         // calc the length of transliteration string
 120         resultdatalength = 0;
 121         wchardata = wchardatastart;
 122         while(*wchardata)
 123         {
 124                 if (*(asciilookup + *wchardata) > 0) resultdatalength += *(ascii + *(asciilookup + *wchardata));
 125                 wchardata++;
 126         }
 127
 128         // allocate & create the result
 129         result = (text *)palloc(resultdatalength + VARHDRSZ);
 130         SET_VARSIZE(result, resultdatalength + VARHDRSZ);
 131         resultdata = (unsigned char *)VARDATA(result);
 132
 133         wchardata = wchardatastart;
 134         while(*wchardata)
 135         {
 136                 if (*(asciilookup + *wchardata) > 0)
 137                 {
 138                         asciipos = ascii + *(asciilookup + *wchardata);
 139                         for(iLen = *asciipos; iLen > 0; iLen--)
 140                         {
 141                                 asciipos++;
 142                                 *resultdata = *asciipos;
 143                                 resultdata++;
 144                         }
 145                 }
 146                 /*else
 147                 {
 148                         ereport( WARNING, ( errcode( ERRCODE_SUCCESSFUL_COMPLETION ),
 149                               errmsg( "missing char: %i\n", *wchardata )));
 150
 151                 }*/
 152                 wchardata++;
 153         }
 154
 155         pfree(wchardatastart);
 156
 157         PG_RETURN_TEXT_P(result);
 158 }
 159
 160 void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int isspace)
 161 {
 162         char *p;
 163
 164         // Search string is too long to be pressent
 165         if (fromlen > *len) return;
 166
 167         p = strstr(buffer, from);
 168         while(p)
 169         {
 170                 if (!isspace || *(p-1) != ' ')
 171                 {
 172                         (*changes)++;
 173                         if (tolen != fromlen) memmove(p+tolen, p+fromlen, *len-(p-buffer)+1);
 174                         memcpy(p, to, tolen);
 175                         *len += tolen - fromlen;
 176                 }
 177                 p = strstr(p+1, from);
 178         }
 179 }
 180
 181 void str_dupspaces(char* buffer)
 182 {
 183         char *out;
 184         int wasspace;
 185
 186         out = buffer;
 187         wasspace = 0;
 188         while(*buffer)
 189         {
 190                 if (wasspace && *buffer != ' ') wasspace = 0;
 191                 if (!wasspace)
 192                 {
 193                         *out = *buffer;
 194                         out++;
 195                         wasspace = (*buffer == ' ');
 196                 }
 197                 buffer++;
 198         }
 199         *out = 0;
 200 }
 201
 202 PG_FUNCTION_INFO_V1( gettokenstring );
 203 Datum
 204 gettokenstring( PG_FUNCTION_ARGS )
 205 {
 206         text *source;
 207         unsigned char *sourcedata;
 208         int sourcedatalength;
 209
 210         char * buffer;
 211         int len;
 212         int changes;
 213
 214         text *result;
 215
 216         if (GetDatabaseEncoding() != PG_UTF8)
 217         {
 218                 ereport(ERROR,
 219                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 220                                          errmsg("requires UTF8 database encoding")));
 221         }
 222
 223         if (PG_ARGISNULL(0))
 224         {
 225                 PG_RETURN_NULL();
 226         }
 227
 228         // The original string
 229         source = PG_GETARG_TEXT_P(0);
 230         sourcedata = (unsigned char *)VARDATA(source);
 231         sourcedatalength = VARSIZE(source) - VARHDRSZ;
 232
 233         // Buffer for doing the replace in - string could get slightly longer (double is mastive overkill)
 234         buffer = (char *)palloc((sourcedatalength*2)*sizeof(char));
 235         memcpy(buffer+1, sourcedata, sourcedatalength);
 236         buffer[0] = 32;
 237         buffer[sourcedatalength+1] = 32;
 238         buffer[sourcedatalength+2] = 0;
 239         len = sourcedatalength+3;
 240
 241         changes = 1;
 242         str_dupspaces(buffer);
 243         while(changes)
 244         {
 245                 changes = 0;
 246                 #include <tokenstringreplacements.inc>
 247                 str_dupspaces(buffer);
 248         }
 249
 250         // 'and' in various languages
 251         str_replace(buffer, &len, &changes, " and ", 5, " ", 1, 0);
 252         str_replace(buffer, &len, &changes, " und ", 5, " ", 1, 0);
 253         str_replace(buffer, &len, &changes, " en ", 4, " ", 1, 0);
 254         str_replace(buffer, &len, &changes, " et ", 4, " ", 1, 0);
 255         str_replace(buffer, &len, &changes, " y ", 3, " ", 1, 0);
 256
 257         // 'the' (and similar)
 258         str_replace(buffer, &len, &changes, " the ", 5, " ", 1, 0);
 259         str_replace(buffer, &len, &changes, " der ", 5, " ", 1, 0);
 260         str_replace(buffer, &len, &changes, " den ", 5, " ", 1, 0);
 261         str_replace(buffer, &len, &changes, " die ", 5, " ", 1, 0);
 262         str_replace(buffer, &len, &changes, " das ", 5, " ", 1, 0);
 263         str_replace(buffer, &len, &changes, " la ", 4, " ", 1, 0);
 264         str_replace(buffer, &len, &changes, " le ", 4, " ", 1, 0);
 265         str_replace(buffer, &len, &changes, " el ", 4, " ", 1, 0);
 266         str_replace(buffer, &len, &changes, " il ", 4, " ", 1, 0);
 267
 268         // german
 269         str_replace(buffer, &len, &changes, "ae", 2, "a", 1, 0);
 270         str_replace(buffer, &len, &changes, "oe", 2, "o", 1, 0);
 271         str_replace(buffer, &len, &changes, "ue", 2, "u", 1, 0);
 272         str_replace(buffer, &len, &changes, "sss", 3, "ss", 2, 0);
 273         str_replace(buffer, &len, &changes, "ih", 2, "i", 1, 0);
 274         str_replace(buffer, &len, &changes, "eh", 2, "e", 1, 0);
 275
 276         // russian
 277         str_replace(buffer, &len, &changes, "ie", 2, "i", 1, 0);
 278         str_replace(buffer, &len, &changes, "yi", 2, "i", 1, 0);
 279
 280         // allocate & create the result
 281         len--;// Drop the terminating zero
 282         result = (text *)palloc(len + VARHDRSZ);
 283         SET_VARSIZE(result, len + VARHDRSZ);
 284         memcpy(VARDATA(result), buffer, len);
 285
 286         pfree(buffer);
 287
 288         PG_RETURN_TEXT_P(result);
 289 }
 290