3 #include "mb/pg_wchar.h"
4 #include <utfasciitable.h>
// Forward declarations for the functions defined later in this file.
// The two Datum functions use the PG_FUNCTION_ARGS calling convention.
10 Datum transliteration( PG_FUNCTION_ARGS );
11 Datum gettokenstring( PG_FUNCTION_ARGS );
// String helpers that work on a caller-supplied, NUL-terminated buffer.
// The last (unnamed) int parameter of str_replace is named `isspace` in the
// definition further down.
12 void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int);
13 void str_dupspaces(char* buffer);
// transliteration(text) -> text
//
// Decodes the UTF-8 bytes of the argument into an array of code points, then
// builds the result by looking every code point up in `asciilookup` and using
// the entry found at that offset in the `ascii` table. The first byte of an
// `ascii` entry is used as the length of the replacement.
//
// Reports ERRCODE_FEATURE_NOT_SUPPORTED when the database encoding is not
// UTF8, and emits a WARNING for code points that have no table entry.
//
// NOTE(review): several lines of this function are not visible in this view
// (braces, some declarations, loop headers, pointer increments); comments
// below only describe what the visible lines establish.
15 PG_FUNCTION_INFO_V1( transliteration );
17 transliteration( PG_FUNCTION_ARGS )
// Replacement table and the per-code-point offsets into it, both initialised
// from macros supplied by utfasciitable.h.
19 static char * ascii = UTFASCII;
20 static uint16 asciilookup[65536] = UTFASCIILOOKUP;
24 unsigned char *sourcedata;
// Payload bits of the up-to-four bytes of one UTF-8 sequence.
27 unsigned int c1,c2,c3,c4;
// Write cursor into the decoded code-point array, and the array's start.
28 unsigned int * wchardata;
29 unsigned int * wchardatastart;
32 unsigned char *resultdata;
// Only UTF-8 input can be decoded by the loop below, so refuse anything else.
36 if (GetDatabaseEncoding() != PG_UTF8)
39 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
40 errmsg("requires UTF8 database encoding")));
48 // The original string
49 source = PG_GETARG_TEXT_P(0);
50 sourcedata = (unsigned char *)VARDATA(source);
51 sourcedatalength = VARSIZE(source) - VARHDRSZ;
53 // Intermediate wchar version of string
// One slot per source byte plus one extra; presumably the extra slot holds a
// terminator written in a line not visible here - TODO confirm.
54 wchardatastart = wchardata = (unsigned int *)palloc((sourcedatalength+1)*sizeof(int));
56 // Based on pg_utf2wchar_with_len from wchar.c
// Decoding stops when the byte count is exhausted or at the first zero byte.
// NOTE(review): the visible lines never check that continuation bytes have
// the 10xxxxxx form; they are masked with 0x3f unconditionally.
57 while (sourcedatalength > 0 && *sourcedata)
// 0xxxxxxx: single-byte (ASCII) character.
59 if ((*sourcedata & 0x80) == 0)
61 *wchardata = *sourcedata++;
// 110xxxxx: lead byte of a two-byte sequence.
65 else if ((*sourcedata & 0xe0) == 0xc0)
// Truncated sequence at the end of the input: stop decoding.
67 if (sourcedatalength < 2) break;
68 c1 = *sourcedata++ & 0x1f;
69 c2 = *sourcedata++ & 0x3f;
70 *wchardata = (c1 << 6) | c2;
72 sourcedatalength -= 2;
// 1110xxxx: lead byte of a three-byte sequence.
74 else if ((*sourcedata & 0xf0) == 0xe0)
76 if (sourcedatalength < 3) break;
77 c1 = *sourcedata++ & 0x0f;
78 c2 = *sourcedata++ & 0x3f;
79 c3 = *sourcedata++ & 0x3f;
80 *wchardata = (c1 << 12) | (c2 << 6) | c3;
82 sourcedatalength -= 3;
// 11110xxx: lead byte of a four-byte sequence.
// NOTE(review): the value assembled here can be as large as 0x1FFFFF, while
// asciilookup only has 65536 entries. No range check is visible in this view,
// so the lookups further down look like they can read out of bounds - confirm.
84 else if ((*sourcedata & 0xf8) == 0xf0)
86 if (sourcedatalength < 4) break;
87 c1 = *sourcedata++ & 0x07;
88 c2 = *sourcedata++ & 0x3f;
89 c3 = *sourcedata++ & 0x3f;
90 c4 = *sourcedata++ & 0x3f;
91 *wchardata = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
93 sourcedatalength -= 4;
// 111110xx: lead byte of a five-byte sequence; nothing is stored for it.
95 else if ((*sourcedata & 0xfc) == 0xf8)
97 // table does not extend beyond 4 char long, just skip
98 if (sourcedatalength < 5) break;
99 sourcedatalength -= 5;
// 1111110x: lead byte of a six-byte sequence; nothing is stored for it.
101 else if ((*sourcedata & 0xfe) == 0xfc)
103 // table does not extend beyond 4 char long, just skip
104 if (sourcedatalength < 6) break;
105 sourcedatalength -= 6;
109 // assume length 1, silently drop bogus characters
115 // calc the length of transliteration string
116 resultdatalength = 0;
117 wchardata = wchardatastart;
// A lookup value of 0 means "no entry"; otherwise the byte at that offset in
// `ascii` is the length of the replacement and is added to the total.
120 if (*(asciilookup + *wchardata) > 0) resultdatalength += *(ascii + *(asciilookup + *wchardata));
124 // allocate & create the result
125 result = (text *)palloc(resultdatalength + VARHDRSZ);
126 SET_VARSIZE(result, resultdatalength + VARHDRSZ);
127 resultdata = (unsigned char *)VARDATA(result);
// Second pass over the decoded code points, this time emitting output.
129 wchardata = wchardatastart;
132 if (*(asciilookup + *wchardata) > 0)
// asciipos starts on the length byte of the table entry; iLen counts down the
// bytes still to be emitted. The pointer increments are in lines not visible
// here.
134 asciipos = ascii + *(asciilookup + *wchardata);
135 for(iLen = *asciipos; iLen > 0; iLen--)
138 *resultdata = *asciipos;
// Presumably the else branch of the lookup test above: warn about a code point
// with no table entry.
// NOTE(review): the warning carries ERRCODE_SUCCESSFUL_COMPLETION, prints an
// unsigned int with %i and ends the message with a newline - confirm these
// are intended.
144 ereport( WARNING, ( errcode( ERRCODE_SUCCESSFUL_COMPLETION ),
145 errmsg( "missing char: %i\n", *wchardata )));
// The intermediate code-point array is no longer needed.
151 pfree(wchardatastart);
153 PG_RETURN_TEXT_P(result);
// str_replace: replace occurrences of `from` with `to` inside `buffer`,
// editing the buffer in place.
//
//   buffer   NUL-terminated string to edit (it is searched with strstr)
//   len      in/out: adjusted by (tolen - fromlen) for every replacement
//   changes  not referenced in the lines visible here; presumably a
//            replacement counter - TODO confirm
//   from     NUL-terminated search string, fromlen bytes long
//   to       replacement bytes, tolen bytes long
//   isspace  when non-zero, a match is left alone if the character just
//            before it is a space
//
// NOTE(review): some lines of this function (braces, the declaration of `p`,
// the loop header) are not visible in this view.
156 void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int isspace)
160 // Search string is too long to be present
161 if (fromlen > *len) return;
163 p = strstr(buffer, from);
// NOTE(review): when isspace is non-zero and the match is at the very start of
// buffer, *(p-1) reads the byte before the buffer - confirm callers always
// leave a byte in front of any possible match.
166 if (!isspace || *(p-1) != ' ')
// Shift the tail of the string so the replacement fits exactly.
// NOTE(review): the byte count is measured from p rather than from p+fromlen,
// so it looks like at least fromlen bytes beyond the end of the string are
// read and written - confirm against the buffer's capacity.
169 if (tolen != fromlen) memmove(p+tolen, p+fromlen, *len-(p-buffer)+1);
170 memcpy(p, to, tolen);
171 *len += tolen - fromlen;
// The search resumes one byte after the start of the previous match.
173 p = strstr(p+1, from);
// str_dupspaces: walks `buffer` while tracking, in `wasspace`, whether the
// previously examined character was a space.
//
// NOTE(review): most of the body is not visible in this view. Judging by the
// name and the flag it presumably deals with runs of consecutive spaces -
// confirm against the full source before relying on that.
177 void str_dupspaces(char* buffer)
// A non-space character ends any run of spaces that was being tracked.
186 if (wasspace && *buffer != ' ') wasspace = 0;
// Remember whether the current character is a space.
191 wasspace = (*buffer == ' ');
// gettokenstring(text) -> text
//
// Copies the argument into a working buffer (starting at offset 1 and followed
// by a space and a NUL), passes it through str_dupspaces, through whatever
// statements tokenstringreplacements.inc contributes, and through the fixed
// list of str_replace calls below, then returns the first len-1 bytes of the
// buffer as a new text value.
//
// Reports ERRCODE_FEATURE_NOT_SUPPORTED when the database encoding is not
// UTF8.
//
// NOTE(review): several lines of this function (braces, some declarations,
// the statement that sets buffer[0]) are not visible in this view.
198 PG_FUNCTION_INFO_V1( gettokenstring );
200 gettokenstring( PG_FUNCTION_ARGS )
203 unsigned char *sourcedata;
204 int sourcedatalength;
212 if (GetDatabaseEncoding() != PG_UTF8)
215 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
216 errmsg("requires UTF8 database encoding")));
224 // The original string
225 source = PG_GETARG_TEXT_P(0);
226 sourcedata = (unsigned char *)VARDATA(source);
227 sourcedatalength = VARSIZE(source) - VARHDRSZ;
229 // Buffer for doing the replace in - string could get slightly longer (double is massive overkill)
// NOTE(review): the allocation is 2*sourcedatalength bytes, but the writes
// below touch indexes sourcedatalength+1 and sourcedatalength+2. For inputs
// shorter than 3 bytes those indexes lie outside the allocation - confirm and
// add a minimum size.
230 buffer = (char *)palloc((sourcedatalength*2)*sizeof(char));
// The copy starts at offset 1; buffer[0] is presumably filled in by a line not
// visible here (a leading space, by symmetry with the trailing one) - TODO
// confirm.
231 memcpy(buffer+1, sourcedata, sourcedatalength);
// Trailing space (ASCII 32) followed by the terminating NUL.
233 buffer[sourcedatalength+1] = 32;
234 buffer[sourcedatalength+2] = 0;
// Bytes in use: leading byte + source + trailing space + NUL.
235 len = sourcedatalength+3;
238 str_dupspaces(buffer);
// Textual include in the middle of the function body; its contents are not
// visible here.
242 #include <tokenstringreplacements.inc>
243 str_dupspaces(buffer);
246 // 'and' in various languages
// Each call turns the space-delimited word into a single space.
247 str_replace(buffer, &len, &changes, " and ", 5, " ", 1, 0);
248 str_replace(buffer, &len, &changes, " und ", 5, " ", 1, 0);
249 str_replace(buffer, &len, &changes, " en ", 4, " ", 1, 0);
250 str_replace(buffer, &len, &changes, " et ", 4, " ", 1, 0);
251 str_replace(buffer, &len, &changes, " e ", 3, " ", 1, 0);
252 str_replace(buffer, &len, &changes, " y ", 3, " ", 1, 0);
254 // 'the' (and similar)
255 str_replace(buffer, &len, &changes, " the ", 5, " ", 1, 0);
256 str_replace(buffer, &len, &changes, " der ", 5, " ", 1, 0);
257 str_replace(buffer, &len, &changes, " den ", 5, " ", 1, 0);
258 str_replace(buffer, &len, &changes, " die ", 5, " ", 1, 0);
259 str_replace(buffer, &len, &changes, " das ", 5, " ", 1, 0);
260 str_replace(buffer, &len, &changes, " la ", 4, " ", 1, 0);
261 str_replace(buffer, &len, &changes, " le ", 4, " ", 1, 0);
262 str_replace(buffer, &len, &changes, " el ", 4, " ", 1, 0);
263 str_replace(buffer, &len, &changes, " il ", 4, " ", 1, 0);
// Letter-sequence simplifications: each two- or three-letter sequence is
// shortened by one letter, anywhere in the string.
266 str_replace(buffer, &len, &changes, "ae", 2, "a", 1, 0);
267 str_replace(buffer, &len, &changes, "oe", 2, "o", 1, 0);
268 str_replace(buffer, &len, &changes, "ue", 2, "u", 1, 0);
269 str_replace(buffer, &len, &changes, "sss", 3, "ss", 2, 0);
270 str_replace(buffer, &len, &changes, "ih", 2, "i", 1, 0);
271 str_replace(buffer, &len, &changes, "eh", 2, "e", 1, 0);
274 str_replace(buffer, &len, &changes, "ie", 2, "i", 1, 0);
275 str_replace(buffer, &len, &changes, "yi", 2, "i", 1, 0);
277 // allocate & create the result
278 len--;// Drop the terminating zero
279 result = (text *)palloc(len + VARHDRSZ);
280 SET_VARSIZE(result, len + VARHDRSZ);
281 memcpy(VARDATA(result), buffer, len);
285 PG_RETURN_TEXT_P(result);