]> git.openstreetmap.org Git - nominatim.git/blob - module/nominatim.c
155531530082013c76137d9556124d148f8faa6b
[nominatim.git] / module / nominatim.c
1 #include "postgres.h"
2 #include "fmgr.h"
3 #include "mb/pg_wchar.h"
4 #include <utfasciitable.h>
5
6 #ifdef PG_MODULE_MAGIC
7 PG_MODULE_MAGIC;
8 #endif
9
10 Datum transliteration( PG_FUNCTION_ARGS );
11 Datum gettokenstring( PG_FUNCTION_ARGS );
12 void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int);
13 void str_dupspaces(char* buffer);
14
15 PG_FUNCTION_INFO_V1( transliteration );
16 Datum
17 transliteration( PG_FUNCTION_ARGS )
18 {
19         static char * ascii = UTFASCII;
20         static uint16 asciilookup[65536] = UTFASCIILOOKUP;
21         char * asciipos;
22
23         text *source;
24         unsigned char *sourcedata;
25         int sourcedatalength;
26
27         unsigned int c1,c2,c3,c4;
28         unsigned int * wchardata;
29         unsigned int * wchardatastart;
30
31         text *result;
32         unsigned char *resultdata;
33         int resultdatalength;
34         int iLen;
35
36         if (GetDatabaseEncoding() != PG_UTF8) 
37         {
38                 ereport(ERROR,
39                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
40                                          errmsg("requires UTF8 database encoding")));
41         }
42
43         if (PG_ARGISNULL(0))
44         {
45                 PG_RETURN_NULL();
46         }
47
48         // The original string
49         source = PG_GETARG_TEXT_P(0);
50         sourcedata = (unsigned char *)VARDATA(source);
51         sourcedatalength = VARSIZE(source) - VARHDRSZ;
52
53         // Intermediate wchar version of string
54         wchardatastart = wchardata = (unsigned int *)palloc((sourcedatalength+1)*sizeof(int));
55
56         // Based on pg_utf2wchar_with_len from wchar.c
57         while (sourcedatalength > 0 && *sourcedata)
58         {
59                 if ((*sourcedata & 0x80) == 0)
60                 {
61                         *wchardata = *sourcedata++;
62                         wchardata++;
63                         sourcedatalength--;
64                 }
65                 else if ((*sourcedata & 0xe0) == 0xc0)
66                 {
67                         if (sourcedatalength < 2) break;
68                         c1 = *sourcedata++ & 0x1f;
69                         c2 = *sourcedata++ & 0x3f;
70                         *wchardata = (c1 << 6) | c2;
71                         wchardata++;
72                         sourcedatalength -= 2;
73                 }
74                 else if ((*sourcedata & 0xf0) == 0xe0)
75                 {
76                         if (sourcedatalength < 3) break;
77                         c1 = *sourcedata++ & 0x0f;
78                         c2 = *sourcedata++ & 0x3f;
79                         c3 = *sourcedata++ & 0x3f;
80                         *wchardata = (c1 << 12) | (c2 << 6) | c3;
81                         wchardata++;
82                         sourcedatalength -= 3;
83                 }
84                 else if ((*sourcedata & 0xf8) == 0xf0)
85                 {
86                         if (sourcedatalength < 4) break;
87                         c1 = *sourcedata++ & 0x07;
88                         c2 = *sourcedata++ & 0x3f;
89                         c3 = *sourcedata++ & 0x3f;
90                         c4 = *sourcedata++ & 0x3f;
91                         *wchardata = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
92                         wchardata++;
93                         sourcedatalength -= 4;
94                 }
95                 else if ((*sourcedata & 0xfc) == 0xf8)
96                 {
97                         // table does not extend beyond 4 char long, just skip
98                         if (sourcedatalength < 5) break;
99                         sourcedatalength -= 5;
100                 }
101                 else if ((*sourcedata & 0xfe) == 0xfc)
102                 {
103                         // table does not extend beyond 4 char long, just skip
104                         if (sourcedatalength < 6) break;
105                         sourcedatalength -= 6;
106                 }
107                 else
108                 {
109                         // assume lenngth 1, silently drop bogus characters
110                         sourcedatalength--;
111                 }
112         }
113         *wchardata = 0;
114
115         // calc the length of transliteration string
116         resultdatalength = 0;
117         wchardata = wchardatastart;
118         while(*wchardata)
119         {
120                 if (*(asciilookup + *wchardata) > 0) resultdatalength += *(ascii + *(asciilookup + *wchardata));
121                 wchardata++;
122         }
123
124         // allocate & create the result
125         result = (text *)palloc(resultdatalength + VARHDRSZ);
126         SET_VARSIZE(result, resultdatalength + VARHDRSZ);
127         resultdata = (unsigned char *)VARDATA(result);
128
129         wchardata = wchardatastart;
130         while(*wchardata)
131         {
132                 if (*(asciilookup + *wchardata) > 0)
133                 {
134                         asciipos = ascii + *(asciilookup + *wchardata);
135                         for(iLen = *asciipos; iLen > 0; iLen--)
136                         {
137                                 asciipos++;
138                                 *resultdata = *asciipos;
139                                 resultdata++;
140                         }
141                 }
142                 else
143                 {
144                         ereport( WARNING, ( errcode( ERRCODE_SUCCESSFUL_COMPLETION ),
145                               errmsg( "missing char: %i\n", *wchardata )));
146                         
147                 }
148                 wchardata++;
149         }
150
151         pfree(wchardatastart);
152
153         PG_RETURN_TEXT_P(result);
154 }
155
// Replaces occurrences of `from` by `to` inside the NUL-terminated string in
// `buffer`, editing it in place.
//
//   buffer   NUL-terminated string to edit; when tolen > fromlen the caller
//            must provide enough room for the string to grow
//   len      in/out: caller's length bookkeeping, adjusted by (tolen - fromlen)
//            for every replacement
//   changes  in/out: incremented once for every replacement
//   from     NUL-terminated search string, fromlen is its length
//   to       replacement bytes, tolen is their count
//   isspace  when non-zero, a match directly preceded by a space is left alone
//
// After a replacement the search resumes one character behind the start of
// the match, so text that only becomes a match through the replacement at
// that very position is not replaced again in the same call.
void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int isspace)
{
        char *p;

        // Search string is too long to be present
        if (fromlen > *len) return;

        p = strstr(buffer, from);
        while(p)
        {
                // A match at the very start of the buffer has no preceding
                // character, so buffer[-1] must never be inspected.
                if (!isspace || p == buffer || *(p-1) != ' ')
                {
                        (*changes)++;
                        // Shift exactly the remaining tail including its terminating
                        // NUL; moving more would read and write behind the string.
                        if (tolen != fromlen) memmove(p+tolen, p+fromlen, strlen(p+fromlen)+1);
                        memcpy(p, to, tolen);
                        *len += tolen - fromlen;
                }
                p = strstr(p+1, from);
        }
}
176
// Collapses every run of consecutive spaces in the NUL-terminated string
// `buffer` into a single space. The string is compacted in place, so it can
// only get shorter; all other characters are kept unchanged and in order.
void str_dupspaces(char* buffer)
{
        char *dst;
        char *src;
        char prev;

        dst = buffer;
        prev = 0;
        for (src = buffer; *src; src++)
        {
                // a space directly following another space is dropped
                if (*src == ' ' && prev == ' ') continue;

                prev = *src;
                *dst++ = *src;
        }
        *dst = 0;
}
197
198 PG_FUNCTION_INFO_V1( gettokenstring );
199 Datum
200 gettokenstring( PG_FUNCTION_ARGS )
201 {
202         text *source;
203         unsigned char *sourcedata;
204         int sourcedatalength;
205
206         char * buffer;
207         int len;
208         int changes;
209
210         text *result;
211
212         if (GetDatabaseEncoding() != PG_UTF8) 
213         {
214                 ereport(ERROR,
215                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
216                                          errmsg("requires UTF8 database encoding")));
217         }
218
219         if (PG_ARGISNULL(0))
220         {
221                 PG_RETURN_NULL();
222         }
223
224         // The original string
225         source = PG_GETARG_TEXT_P(0);
226         sourcedata = (unsigned char *)VARDATA(source);
227         sourcedatalength = VARSIZE(source) - VARHDRSZ;
228
229         // Buffer for doing the replace in - string could get slightly longer (double is mastive overkill)
230         buffer = (char *)palloc((sourcedatalength*2)*sizeof(char));
231         memcpy(buffer+1, sourcedata, sourcedatalength);
232         buffer[0] = 32;
233         buffer[sourcedatalength+1] = 32;
234         buffer[sourcedatalength+2] = 0;
235         len = sourcedatalength+3;
236
237         changes = 1;
238         str_dupspaces(buffer);
239         while(changes)
240         {
241                 changes = 0;
242                 #include <tokenstringreplacements.inc>
243                 str_dupspaces(buffer);
244         }
245
246         // 'and' in various languages
247         str_replace(buffer, &len, &changes, " and ", 5, " ", 1, 0);
248         str_replace(buffer, &len, &changes, " und ", 5, " ", 1, 0);
249         str_replace(buffer, &len, &changes, " en ", 4, " ", 1, 0);
250         str_replace(buffer, &len, &changes, " et ", 4, " ", 1, 0);
251         str_replace(buffer, &len, &changes, " e ", 3, " ", 1, 0);
252         str_replace(buffer, &len, &changes, " y ", 3, " ", 1, 0);
253
254         // 'the' (and similar)
255         str_replace(buffer, &len, &changes, " the ", 5, " ", 1, 0);
256         str_replace(buffer, &len, &changes, " der ", 5, " ", 1, 0);
257         str_replace(buffer, &len, &changes, " den ", 5, " ", 1, 0);
258         str_replace(buffer, &len, &changes, " die ", 5, " ", 1, 0);
259         str_replace(buffer, &len, &changes, " das ", 5, " ", 1, 0);
260         str_replace(buffer, &len, &changes, " la ", 4, " ", 1, 0);
261         str_replace(buffer, &len, &changes, " le ", 4, " ", 1, 0);
262         str_replace(buffer, &len, &changes, " el ", 4, " ", 1, 0);
263         str_replace(buffer, &len, &changes, " il ", 4, " ", 1, 0);
264
265         // german
266         str_replace(buffer, &len, &changes, "ae", 2, "a", 1, 0);
267         str_replace(buffer, &len, &changes, "oe", 2, "o", 1, 0);
268         str_replace(buffer, &len, &changes, "ue", 2, "u", 1, 0);
269         str_replace(buffer, &len, &changes, "sss", 3, "ss", 2, 0);
270         str_replace(buffer, &len, &changes, "ih", 2, "i", 1, 0);
271         str_replace(buffer, &len, &changes, "eh", 2, "e", 1, 0);
272
273         // russian
274         str_replace(buffer, &len, &changes, "ie", 2, "i", 1, 0);
275         str_replace(buffer, &len, &changes, "yi", 2, "i", 1, 0);
276
277         // allocate & create the result
278         len--;// Drop the terminating zero
279         result = (text *)palloc(len + VARHDRSZ);
280         SET_VARSIZE(result, len + VARHDRSZ);
281         memcpy(VARDATA(result), buffer, len);
282
283         pfree(buffer);
284
285         PG_RETURN_TEXT_P(result);
286 }
287