]> git.openstreetmap.org Git - nominatim.git/blob - module/nominatim.c
document tokens
[nominatim.git] / module / nominatim.c
1 #include "postgres.h"
2 #include "fmgr.h"
3 #include "mb/pg_wchar.h"
4 #include <utfasciitable.h>
5
6 #ifdef PG_MODULE_MAGIC
7 PG_MODULE_MAGIC;
8 #endif
9
10 Datum transliteration( PG_FUNCTION_ARGS );
11 Datum gettokenstring( PG_FUNCTION_ARGS );
12 void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int);
13 void str_dupspaces(char* buffer);
14
15 PG_FUNCTION_INFO_V1( transliteration );
16 Datum
17 transliteration( PG_FUNCTION_ARGS )
18 {
19         static char * ascii = UTFASCII;
20         static uint16 asciilookup[65536] = UTFASCIILOOKUP;
21         char * asciipos;
22
23         text *source;
24         unsigned char *sourcedata;
25         int sourcedatalength;
26
27         unsigned int c1,c2,c3,c4;
28         unsigned int * wchardata;
29         unsigned int * wchardatastart;
30
31         text *result;
32         unsigned char *resultdata;
33         int resultdatalength;
34         int iLen;
35
36         if (GetDatabaseEncoding() != PG_UTF8) 
37         {
38                 ereport(ERROR,
39                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
40                                          errmsg("requires UTF8 database encoding")));
41         }
42
43         if (PG_ARGISNULL(0))
44         {
45                 PG_RETURN_NULL();
46         }
47
48         // The original string
49         source = PG_GETARG_TEXT_P(0);
50         sourcedata = (unsigned char *)VARDATA(source);
51         sourcedatalength = VARSIZE(source) - VARHDRSZ;
52
53         // Intermediate wchar version of string
54         wchardatastart = wchardata = (unsigned int *)palloc((sourcedatalength+1)*sizeof(int));
55
56         // Based on pg_utf2wchar_with_len from wchar.c
57         // Postgresql strings are not zero terminalted
58         while (sourcedatalength > 0)
59         {
60                 if ((*sourcedata & 0x80) == 0)
61                 {
62                         *wchardata = *sourcedata++;
63                         wchardata++;
64                         sourcedatalength--;
65                 }
66                 else if ((*sourcedata & 0xe0) == 0xc0)
67                 {
68                         if (sourcedatalength < 2) break;
69                         c1 = *sourcedata++ & 0x1f;
70                         c2 = *sourcedata++ & 0x3f;
71                         *wchardata = (c1 << 6) | c2;
72                         if (*wchardata < 65536) wchardata++;
73                         sourcedatalength -= 2;
74                 }
75                 else if ((*sourcedata & 0xf0) == 0xe0)
76                 {
77                         if (sourcedatalength < 3) break;
78                         c1 = *sourcedata++ & 0x0f;
79                         c2 = *sourcedata++ & 0x3f;
80                         c3 = *sourcedata++ & 0x3f;
81                         *wchardata = (c1 << 12) | (c2 << 6) | c3;
82                         if (*wchardata < 65536) wchardata++;
83                         sourcedatalength -= 3;
84                 }
85                 else if ((*sourcedata & 0xf8) == 0xf0)
86                 {
87                         if (sourcedatalength < 4) break;
88                         c1 = *sourcedata++ & 0x07;
89                         c2 = *sourcedata++ & 0x3f;
90                         c3 = *sourcedata++ & 0x3f;
91                         c4 = *sourcedata++ & 0x3f;
92                         *wchardata = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
93                         if (*wchardata < 65536) wchardata++;
94                         sourcedatalength -= 4;
95                 }
96                 else if ((*sourcedata & 0xfc) == 0xf8)
97                 {
98                         // table does not extend beyond 4 char long, just skip
99                         if (sourcedatalength < 5) break;
100                         sourcedatalength -= 5;
101                         sourcedata += 5;
102                 }
103                 else if ((*sourcedata & 0xfe) == 0xfc)
104                 {
105                         // table does not extend beyond 4 char long, just skip
106                         if (sourcedatalength < 6) break;
107                         sourcedatalength -= 6;
108                         sourcedata += 6;
109                 }
110                 else
111                 {
112                         // assume lenngth 1, silently drop bogus characters
113                         sourcedatalength--;
114                         sourcedata += 1;
115                 }
116         }
117         *wchardata = 0;
118
119         // calc the length of transliteration string
120         resultdatalength = 0;
121         wchardata = wchardatastart;
122         while(*wchardata)
123         {
124                 if (*(asciilookup + *wchardata) > 0) resultdatalength += *(ascii + *(asciilookup + *wchardata));
125                 wchardata++;
126         }
127
128         // allocate & create the result
129         result = (text *)palloc(resultdatalength + VARHDRSZ);
130         SET_VARSIZE(result, resultdatalength + VARHDRSZ);
131         resultdata = (unsigned char *)VARDATA(result);
132
133         wchardata = wchardatastart;
134         while(*wchardata)
135         {
136                 if (*(asciilookup + *wchardata) > 0)
137                 {
138                         asciipos = ascii + *(asciilookup + *wchardata);
139                         for(iLen = *asciipos; iLen > 0; iLen--)
140                         {
141                                 asciipos++;
142                                 *resultdata = *asciipos;
143                                 resultdata++;
144                         }
145                 }
146                 /*else
147                 {
148                         ereport( WARNING, ( errcode( ERRCODE_SUCCESSFUL_COMPLETION ),
149                               errmsg( "missing char: %i\n", *wchardata )));
150                         
151                 }*/
152                 wchardata++;
153         }
154
155         pfree(wchardatastart);
156
157         PG_RETURN_TEXT_P(result);
158 }
159
// Replace, in place, occurrences of 'from' in the zero terminated string 'buffer' with 'to'.
//
//   buffer   zero terminated string; the caller must provide enough room for
//            the string to grow when tolen > fromlen
//   len      caller's length counter; adjusted by (tolen - fromlen) for every
//            replacement and used for the "cannot be present" early exit
//   changes  incremented once per replacement made
//   from     zero terminated search string, fromlen bytes long
//   to       replacement, tolen bytes are copied (no terminator needed)
//   isspace  Set isspace=1 if the replacement _only_ adds a space before the
//            search string.  I.e. to == " " + from.  In that mode a match is
//            left alone when it sits at the very start of the buffer or is
//            already preceded by a space.
//
// The scan resumes one byte after each match position, whether or not that
// match was replaced.
void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int isspace)
{
        char *p;

        // Search string is too long to be present
        if (fromlen > *len) return;

        p = strstr(buffer, from);
        while(p)
        {
                if (!isspace || (p > buffer && *(p-1) != ' '))
                {
                        (*changes)++;
                        // Move exactly the tail behind the match plus its terminator.
                        // The size is taken from the string itself rather than from
                        // *len so that no bytes beyond the terminator are touched,
                        // even if the caller's counter is out of date.
                        if (tolen != fromlen) memmove(p+tolen, p+fromlen, strlen(p+fromlen)+1);
                        memcpy(p, to, tolen);
                        *len += tolen - fromlen;
                }
                // An empty replacement at the end of the string leaves p on the
                // terminator; there is nothing left to search and p+1 would be
                // past the end of the string.
                if (*p == '\0') break;
                p = strstr(p+1, from);
        }
}
181
// Collapse every run of consecutive spaces in the zero terminated string
// 'buffer' into a single space.  Works in place; the string can only get
// shorter and is re-terminated at its new end.  Only the space character
// itself is affected, other whitespace is copied unchanged.
void str_dupspaces(char* buffer)
{
        char *dst;
        char *src;
        char cur;
        char prev;

        dst = buffer;
        prev = '\0';
        for (src = buffer; *src != '\0'; src++)
        {
                cur = *src;

                // A space directly following another space is dropped
                if (cur == ' ' && prev == ' ') continue;

                *dst++ = cur;
                prev = cur;
        }
        *dst = '\0';
}
202
203 PG_FUNCTION_INFO_V1( gettokenstring );
204 Datum
205 gettokenstring( PG_FUNCTION_ARGS )
206 {
207         text *source;
208         unsigned char *sourcedata;
209         int sourcedatalength;
210
211         char * buffer;
212         int len;
213         int changes;
214
215         text *result;
216
217         if (GetDatabaseEncoding() != PG_UTF8) 
218         {
219                 ereport(ERROR,
220                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
221                                          errmsg("requires UTF8 database encoding")));
222         }
223
224         if (PG_ARGISNULL(0))
225         {
226                 PG_RETURN_NULL();
227         }
228
229         // The original string
230         source = PG_GETARG_TEXT_P(0);
231         sourcedata = (unsigned char *)VARDATA(source);
232         sourcedatalength = VARSIZE(source) - VARHDRSZ;
233
234         // Buffer for doing the replace in - string could get slightly longer (double is massive overkill)
235         buffer = (char *)palloc((sourcedatalength*2)*sizeof(char));
236         memcpy(buffer+1, sourcedata, sourcedatalength);
237         buffer[0] = 32;
238         buffer[sourcedatalength+1] = 32;
239         buffer[sourcedatalength+2] = 0;
240         len = sourcedatalength+3;
241
242         changes = 1;
243         str_dupspaces(buffer);
244         while(changes)
245         {
246                 changes = 0;
247                 #include <tokenstringreplacements.inc>
248                 str_dupspaces(buffer);
249         }
250
251         // 'and' in various languages
252         str_replace(buffer, &len, &changes, " and ", 5, " ", 1, 0);
253         str_replace(buffer, &len, &changes, " und ", 5, " ", 1, 0);
254         str_replace(buffer, &len, &changes, " en ", 4, " ", 1, 0);
255         str_replace(buffer, &len, &changes, " et ", 4, " ", 1, 0);
256         str_replace(buffer, &len, &changes, " y ", 3, " ", 1, 0);
257
258         // 'the' (and similar)
259         str_replace(buffer, &len, &changes, " the ", 5, " ", 1, 0);
260         str_replace(buffer, &len, &changes, " der ", 5, " ", 1, 0);
261         str_replace(buffer, &len, &changes, " den ", 5, " ", 1, 0);
262         str_replace(buffer, &len, &changes, " die ", 5, " ", 1, 0);
263         str_replace(buffer, &len, &changes, " das ", 5, " ", 1, 0);
264         str_replace(buffer, &len, &changes, " la ", 4, " ", 1, 0);
265         str_replace(buffer, &len, &changes, " le ", 4, " ", 1, 0);
266         str_replace(buffer, &len, &changes, " el ", 4, " ", 1, 0);
267         str_replace(buffer, &len, &changes, " il ", 4, " ", 1, 0);
268
269         // german
270         str_replace(buffer, &len, &changes, "ae", 2, "a", 1, 0);
271         str_replace(buffer, &len, &changes, "oe", 2, "o", 1, 0);
272         str_replace(buffer, &len, &changes, "ue", 2, "u", 1, 0);
273         str_replace(buffer, &len, &changes, "sss", 3, "ss", 2, 0);
274         str_replace(buffer, &len, &changes, "ih", 2, "i", 1, 0);
275         str_replace(buffer, &len, &changes, "eh", 2, "e", 1, 0);
276
277         // russian
278         str_replace(buffer, &len, &changes, "ie", 2, "i", 1, 0);
279         str_replace(buffer, &len, &changes, "yi", 2, "i", 1, 0);
280
281         // allocate & create the result
282         len--;// Drop the terminating zero
283         result = (text *)palloc(len + VARHDRSZ);
284         SET_VARSIZE(result, len + VARHDRSZ);
285         memcpy(VARDATA(result), buffer, len);
286
287         pfree(buffer);
288
289         PG_RETURN_TEXT_P(result);
290 }
291