2 * SPDX-License-Identifier: GPL-2.0-only
4 * This file is part of Nominatim. (https://nominatim.org)
6 * Copyright (C) 2022 by the Nominatim developer community.
7 * For a full list of authors see the git log.
11 #include "mb/pg_wchar.h"
12 #include <utfasciitable.h>
14 #if PG_MAJORVERSION_NUM > 15
20 Datum transliteration( PG_FUNCTION_ARGS );
21 Datum gettokenstring( PG_FUNCTION_ARGS );
22 void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int);
23 void str_dupspaces(char* buffer);
25 PG_FUNCTION_INFO_V1( transliteration );
27 transliteration( PG_FUNCTION_ARGS )
29 static char * ascii = UTFASCII;
30 static uint16 asciilookup[65536] = UTFASCIILOOKUP;
34 unsigned char *sourcedata;
37 unsigned int c1,c2,c3,c4;
38 unsigned int * wchardata;
39 unsigned int * wchardatastart;
42 unsigned char *resultdata;
46 if (GetDatabaseEncoding() != PG_UTF8)
49 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
50 errmsg("requires UTF8 database encoding")));
58 // The original string
59 source = PG_GETARG_TEXT_P(0);
60 sourcedata = (unsigned char *)VARDATA(source);
61 sourcedatalength = VARSIZE(source) - VARHDRSZ;
63 // Intermediate wchar version of string
64 wchardatastart = wchardata = (unsigned int *)palloc((sourcedatalength+1)*sizeof(int));
66 // Based on pg_utf2wchar_with_len from wchar.c
67 // Postgresql strings are not zero terminalted
68 while (sourcedatalength > 0)
70 if ((*sourcedata & 0x80) == 0)
72 *wchardata = *sourcedata++;
76 else if ((*sourcedata & 0xe0) == 0xc0)
78 if (sourcedatalength < 2) break;
79 c1 = *sourcedata++ & 0x1f;
80 c2 = *sourcedata++ & 0x3f;
81 *wchardata = (c1 << 6) | c2;
82 if (*wchardata < 65536) wchardata++;
83 sourcedatalength -= 2;
85 else if ((*sourcedata & 0xf0) == 0xe0)
87 if (sourcedatalength < 3) break;
88 c1 = *sourcedata++ & 0x0f;
89 c2 = *sourcedata++ & 0x3f;
90 c3 = *sourcedata++ & 0x3f;
91 *wchardata = (c1 << 12) | (c2 << 6) | c3;
92 if (*wchardata < 65536) wchardata++;
93 sourcedatalength -= 3;
95 else if ((*sourcedata & 0xf8) == 0xf0)
97 if (sourcedatalength < 4) break;
98 c1 = *sourcedata++ & 0x07;
99 c2 = *sourcedata++ & 0x3f;
100 c3 = *sourcedata++ & 0x3f;
101 c4 = *sourcedata++ & 0x3f;
102 *wchardata = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
103 if (*wchardata < 65536) wchardata++;
104 sourcedatalength -= 4;
106 else if ((*sourcedata & 0xfc) == 0xf8)
108 // table does not extend beyond 4 char long, just skip
109 if (sourcedatalength < 5) break;
110 sourcedatalength -= 5;
113 else if ((*sourcedata & 0xfe) == 0xfc)
115 // table does not extend beyond 4 char long, just skip
116 if (sourcedatalength < 6) break;
117 sourcedatalength -= 6;
122 // assume lenngth 1, silently drop bogus characters
129 // calc the length of transliteration string
130 resultdatalength = 0;
131 wchardata = wchardatastart;
134 if (*(asciilookup + *wchardata) > 0) resultdatalength += *(ascii + *(asciilookup + *wchardata));
138 // allocate & create the result
139 result = (text *)palloc(resultdatalength + VARHDRSZ);
140 SET_VARSIZE(result, resultdatalength + VARHDRSZ);
141 resultdata = (unsigned char *)VARDATA(result);
143 wchardata = wchardatastart;
146 if (*(asciilookup + *wchardata) > 0)
148 asciipos = ascii + *(asciilookup + *wchardata);
149 for(iLen = *asciipos; iLen > 0; iLen--)
152 *resultdata = *asciipos;
158 ereport( WARNING, ( errcode( ERRCODE_SUCCESSFUL_COMPLETION ),
159 errmsg( "missing char: %i\n", *wchardata )));
165 pfree(wchardatastart);
167 PG_RETURN_TEXT_P(result);
/*
 * Replaces occurrences of `from` by `to` inside the NUL-terminated string
 * in `buffer`, in place.
 *
 *   buffer   string to edit; the caller must provide enough room for the
 *            string to grow when tolen > fromlen
 *   len      in/out: length bookkeeping of the caller, adjusted by
 *            (tolen - fromlen) for every replacement
 *   changes  in/out: incremented once per replacement
 *   from     search string, fromlen bytes long
 *   to       replacement string, tolen bytes long
 *   isspace  Set isspace=1 if the replacement _only_ adds a space before the
 *            search string. I.e. to == " " + from. Matches at the very start
 *            of the buffer or already preceded by a space are then skipped.
 *
 * The search continues one byte after each match position.
 */
void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int isspace)
{
	char *p;

	// An empty search string matches everywhere and would never terminate
	if (fromlen <= 0) return;

	// Search string is too long to be present
	if (fromlen > *len) return;

	p = strstr(buffer, from);
	while (p)
	{
		if (!isspace || (p > buffer && *(p-1) != ' '))
		{
			(*changes)++;
			// Shift exactly the remaining tail including its terminating
			// zero; moving more would touch memory behind the string.
			if (tolen != fromlen) memmove(p+tolen, p+fromlen, strlen(p+fromlen)+1);
			memcpy(p, to, tolen);
			*len += tolen - fromlen;
		}
		p = strstr(p+1, from);
	}
}
/*
 * Collapses every run of consecutive spaces in the NUL-terminated string
 * `buffer` into a single space. The string is edited in place and can only
 * get shorter; the new length is not reported, so callers that track the
 * length have to recompute it.
 */
void str_dupspaces(char* buffer)
{
	char *out = buffer;
	int wasspace = 0;

	for (; *buffer; buffer++)
	{
		int space = (*buffer == ' ');

		// Drop a space that directly follows another space
		if (space && wasspace) continue;

		*out = *buffer;
		out++;
		wasspace = space;
	}
	*out = 0;
}
213 PG_FUNCTION_INFO_V1( gettokenstring );
215 gettokenstring( PG_FUNCTION_ARGS )
218 unsigned char *sourcedata;
219 int sourcedatalength;
227 if (GetDatabaseEncoding() != PG_UTF8)
230 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
231 errmsg("requires UTF8 database encoding")));
239 // The original string
240 source = PG_GETARG_TEXT_P(0);
241 sourcedata = (unsigned char *)VARDATA(source);
242 sourcedatalength = VARSIZE(source) - VARHDRSZ;
244 // Buffer for doing the replace in - string could get slightly longer (double is massive overkill)
245 buffer = (char *)palloc((sourcedatalength*2)*sizeof(char));
246 memcpy(buffer+1, sourcedata, sourcedatalength);
248 buffer[sourcedatalength+1] = 32;
249 buffer[sourcedatalength+2] = 0;
250 len = sourcedatalength+3;
253 str_dupspaces(buffer);
257 #include <tokenstringreplacements.inc>
258 str_dupspaces(buffer);
261 // 'and' in various languages
262 str_replace(buffer, &len, &changes, " and ", 5, " ", 1, 0);
263 str_replace(buffer, &len, &changes, " und ", 5, " ", 1, 0);
264 str_replace(buffer, &len, &changes, " en ", 4, " ", 1, 0);
265 str_replace(buffer, &len, &changes, " et ", 4, " ", 1, 0);
266 str_replace(buffer, &len, &changes, " y ", 3, " ", 1, 0);
268 // 'the' (and similar)
269 str_replace(buffer, &len, &changes, " the ", 5, " ", 1, 0);
270 str_replace(buffer, &len, &changes, " der ", 5, " ", 1, 0);
271 str_replace(buffer, &len, &changes, " den ", 5, " ", 1, 0);
272 str_replace(buffer, &len, &changes, " die ", 5, " ", 1, 0);
273 str_replace(buffer, &len, &changes, " das ", 5, " ", 1, 0);
274 str_replace(buffer, &len, &changes, " la ", 4, " ", 1, 0);
275 str_replace(buffer, &len, &changes, " le ", 4, " ", 1, 0);
276 str_replace(buffer, &len, &changes, " el ", 4, " ", 1, 0);
277 str_replace(buffer, &len, &changes, " il ", 4, " ", 1, 0);
280 str_replace(buffer, &len, &changes, "ae", 2, "a", 1, 0);
281 str_replace(buffer, &len, &changes, "oe", 2, "o", 1, 0);
282 str_replace(buffer, &len, &changes, "ue", 2, "u", 1, 0);
283 str_replace(buffer, &len, &changes, "sss", 3, "ss", 2, 0);
284 str_replace(buffer, &len, &changes, "ih", 2, "i", 1, 0);
285 str_replace(buffer, &len, &changes, "eh", 2, "e", 1, 0);
288 str_replace(buffer, &len, &changes, "ie", 2, "i", 1, 0);
289 str_replace(buffer, &len, &changes, "yi", 2, "i", 1, 0);
291 // allocate & create the result
292 len--;// Drop the terminating zero
293 result = (text *)palloc(len + VARHDRSZ);
294 SET_VARSIZE(result, len + VARHDRSZ);
295 memcpy(VARDATA(result), buffer, len);
299 PG_RETURN_TEXT_P(result);