]> git.openstreetmap.org Git - nominatim.git/blob - module/nominatim.c
Change to regular regex group
[nominatim.git] / module / nominatim.c
1 /**
2  * SPDX-License-Identifier: GPL-2.0-only
3  *
4  * This file is part of Nominatim. (https://nominatim.org)
5  *
6  * Copyright (C) 2022 by the Nominatim developer community.
7  * For a full list of authors see the git log.
8  */
9 #include "postgres.h"
10 #include "fmgr.h"
11 #include "mb/pg_wchar.h"
12 #include <utfasciitable.h>
13
14 #ifdef PG_MODULE_MAGIC
15 PG_MODULE_MAGIC;
16 #endif
17
18 Datum transliteration( PG_FUNCTION_ARGS );
19 Datum gettokenstring( PG_FUNCTION_ARGS );
20 void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int);
21 void str_dupspaces(char* buffer);
22
23 PG_FUNCTION_INFO_V1( transliteration );
24 Datum
25 transliteration( PG_FUNCTION_ARGS )
26 {
27         static char * ascii = UTFASCII;
28         static uint16 asciilookup[65536] = UTFASCIILOOKUP;
29         char * asciipos;
30
31         text *source;
32         unsigned char *sourcedata;
33         int sourcedatalength;
34
35         unsigned int c1,c2,c3,c4;
36         unsigned int * wchardata;
37         unsigned int * wchardatastart;
38
39         text *result;
40         unsigned char *resultdata;
41         int resultdatalength;
42         int iLen;
43
44         if (GetDatabaseEncoding() != PG_UTF8) 
45         {
46                 ereport(ERROR,
47                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
48                                          errmsg("requires UTF8 database encoding")));
49         }
50
51         if (PG_ARGISNULL(0))
52         {
53                 PG_RETURN_NULL();
54         }
55
56         // The original string
57         source = PG_GETARG_TEXT_P(0);
58         sourcedata = (unsigned char *)VARDATA(source);
59         sourcedatalength = VARSIZE(source) - VARHDRSZ;
60
61         // Intermediate wchar version of string
62         wchardatastart = wchardata = (unsigned int *)palloc((sourcedatalength+1)*sizeof(int));
63
64         // Based on pg_utf2wchar_with_len from wchar.c
65         // Postgresql strings are not zero terminalted
66         while (sourcedatalength > 0)
67         {
68                 if ((*sourcedata & 0x80) == 0)
69                 {
70                         *wchardata = *sourcedata++;
71                         wchardata++;
72                         sourcedatalength--;
73                 }
74                 else if ((*sourcedata & 0xe0) == 0xc0)
75                 {
76                         if (sourcedatalength < 2) break;
77                         c1 = *sourcedata++ & 0x1f;
78                         c2 = *sourcedata++ & 0x3f;
79                         *wchardata = (c1 << 6) | c2;
80                         if (*wchardata < 65536) wchardata++;
81                         sourcedatalength -= 2;
82                 }
83                 else if ((*sourcedata & 0xf0) == 0xe0)
84                 {
85                         if (sourcedatalength < 3) break;
86                         c1 = *sourcedata++ & 0x0f;
87                         c2 = *sourcedata++ & 0x3f;
88                         c3 = *sourcedata++ & 0x3f;
89                         *wchardata = (c1 << 12) | (c2 << 6) | c3;
90                         if (*wchardata < 65536) wchardata++;
91                         sourcedatalength -= 3;
92                 }
93                 else if ((*sourcedata & 0xf8) == 0xf0)
94                 {
95                         if (sourcedatalength < 4) break;
96                         c1 = *sourcedata++ & 0x07;
97                         c2 = *sourcedata++ & 0x3f;
98                         c3 = *sourcedata++ & 0x3f;
99                         c4 = *sourcedata++ & 0x3f;
100                         *wchardata = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
101                         if (*wchardata < 65536) wchardata++;
102                         sourcedatalength -= 4;
103                 }
104                 else if ((*sourcedata & 0xfc) == 0xf8)
105                 {
106                         // table does not extend beyond 4 char long, just skip
107                         if (sourcedatalength < 5) break;
108                         sourcedatalength -= 5;
109                         sourcedata += 5;
110                 }
111                 else if ((*sourcedata & 0xfe) == 0xfc)
112                 {
113                         // table does not extend beyond 4 char long, just skip
114                         if (sourcedatalength < 6) break;
115                         sourcedatalength -= 6;
116                         sourcedata += 6;
117                 }
118                 else
119                 {
120                         // assume lenngth 1, silently drop bogus characters
121                         sourcedatalength--;
122                         sourcedata += 1;
123                 }
124         }
125         *wchardata = 0;
126
127         // calc the length of transliteration string
128         resultdatalength = 0;
129         wchardata = wchardatastart;
130         while(*wchardata)
131         {
132                 if (*(asciilookup + *wchardata) > 0) resultdatalength += *(ascii + *(asciilookup + *wchardata));
133                 wchardata++;
134         }
135
136         // allocate & create the result
137         result = (text *)palloc(resultdatalength + VARHDRSZ);
138         SET_VARSIZE(result, resultdatalength + VARHDRSZ);
139         resultdata = (unsigned char *)VARDATA(result);
140
141         wchardata = wchardatastart;
142         while(*wchardata)
143         {
144                 if (*(asciilookup + *wchardata) > 0)
145                 {
146                         asciipos = ascii + *(asciilookup + *wchardata);
147                         for(iLen = *asciipos; iLen > 0; iLen--)
148                         {
149                                 asciipos++;
150                                 *resultdata = *asciipos;
151                                 resultdata++;
152                         }
153                 }
154                 /*else
155                 {
156                         ereport( WARNING, ( errcode( ERRCODE_SUCCESSFUL_COMPLETION ),
157                               errmsg( "missing char: %i\n", *wchardata )));
158                         
159                 }*/
160                 wchardata++;
161         }
162
163         pfree(wchardatastart);
164
165         PG_RETURN_TEXT_P(result);
166 }
167
/*
 * Replace occurrences of `from` with `to` inside the NUL-terminated string
 * `buffer`, in place.
 *
 *   buffer   NUL-terminated working string.  When tolen > fromlen the caller
 *            must provide enough spare room for the string to grow.
 *   len      in/out: the caller's length counter; adjusted by
 *            (tolen - fromlen) for every replacement made.
 *   changes  in/out: incremented once per replacement made.
 *   from     search string, fromlen bytes long (NUL-terminated).
 *   to       replacement bytes, tolen bytes long.
 *   isspace  Set isspace=1 if the replacement _only_ adds a space before the
 *            search string, i.e. to == " " + from.  In that mode a match at
 *            the very start of buffer, or one already preceded by a space,
 *            is left untouched.
 *
 * After each match the search resumes one byte after the match position.
 */
void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int isspace)
{
    char *p;
    size_t tail;

    // An empty search string can never be replaced meaningfully
    if (fromlen <= 0) return;

    // Search string is too long to be present
    if (fromlen > *len) return;

    p = strstr(buffer, from);
    while (p)
    {
        if (!isspace || (p > buffer && *(p-1) != ' '))
        {
            (*changes)++;
            if (tolen != fromlen)
            {
                // Shift exactly the remainder of the string, terminator
                // included.  Deriving the count from the string itself keeps
                // the move inside the string even if *len is out of date.
                tail = strlen(p + fromlen) + 1;
                memmove(p + tolen, p + fromlen, tail);
            }
            memcpy(p, to, tolen);
            *len += tolen - fromlen;
        }
        p = strstr(p + 1, from);
    }
}
189
/*
 * Collapse every run of consecutive spaces in the NUL-terminated string
 * `buffer` into a single space.  The string is rewritten in place and can
 * only become shorter.
 */
void str_dupspaces(char* buffer)
{
    const char *src;
    char *dst = buffer;
    char last = 0;      /* most recently emitted character, 0 before the first */

    for (src = buffer; *src != '\0'; src++)
    {
        /* Skip a space that directly follows an already emitted space. */
        if (*src == ' ' && last == ' ') continue;

        last = *src;
        *dst++ = last;
    }
    *dst = '\0';
}
210
211 PG_FUNCTION_INFO_V1( gettokenstring );
212 Datum
213 gettokenstring( PG_FUNCTION_ARGS )
214 {
215         text *source;
216         unsigned char *sourcedata;
217         int sourcedatalength;
218
219         char * buffer;
220         int len;
221         int changes;
222
223         text *result;
224
225         if (GetDatabaseEncoding() != PG_UTF8) 
226         {
227                 ereport(ERROR,
228                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
229                                          errmsg("requires UTF8 database encoding")));
230         }
231
232         if (PG_ARGISNULL(0))
233         {
234                 PG_RETURN_NULL();
235         }
236
237         // The original string
238         source = PG_GETARG_TEXT_P(0);
239         sourcedata = (unsigned char *)VARDATA(source);
240         sourcedatalength = VARSIZE(source) - VARHDRSZ;
241
242         // Buffer for doing the replace in - string could get slightly longer (double is massive overkill)
243         buffer = (char *)palloc((sourcedatalength*2)*sizeof(char));
244         memcpy(buffer+1, sourcedata, sourcedatalength);
245         buffer[0] = 32;
246         buffer[sourcedatalength+1] = 32;
247         buffer[sourcedatalength+2] = 0;
248         len = sourcedatalength+3;
249
250         changes = 1;
251         str_dupspaces(buffer);
252         while(changes)
253         {
254                 changes = 0;
255                 #include <tokenstringreplacements.inc>
256                 str_dupspaces(buffer);
257         }
258
259         // 'and' in various languages
260         str_replace(buffer, &len, &changes, " and ", 5, " ", 1, 0);
261         str_replace(buffer, &len, &changes, " und ", 5, " ", 1, 0);
262         str_replace(buffer, &len, &changes, " en ", 4, " ", 1, 0);
263         str_replace(buffer, &len, &changes, " et ", 4, " ", 1, 0);
264         str_replace(buffer, &len, &changes, " y ", 3, " ", 1, 0);
265
266         // 'the' (and similar)
267         str_replace(buffer, &len, &changes, " the ", 5, " ", 1, 0);
268         str_replace(buffer, &len, &changes, " der ", 5, " ", 1, 0);
269         str_replace(buffer, &len, &changes, " den ", 5, " ", 1, 0);
270         str_replace(buffer, &len, &changes, " die ", 5, " ", 1, 0);
271         str_replace(buffer, &len, &changes, " das ", 5, " ", 1, 0);
272         str_replace(buffer, &len, &changes, " la ", 4, " ", 1, 0);
273         str_replace(buffer, &len, &changes, " le ", 4, " ", 1, 0);
274         str_replace(buffer, &len, &changes, " el ", 4, " ", 1, 0);
275         str_replace(buffer, &len, &changes, " il ", 4, " ", 1, 0);
276
277         // german
278         str_replace(buffer, &len, &changes, "ae", 2, "a", 1, 0);
279         str_replace(buffer, &len, &changes, "oe", 2, "o", 1, 0);
280         str_replace(buffer, &len, &changes, "ue", 2, "u", 1, 0);
281         str_replace(buffer, &len, &changes, "sss", 3, "ss", 2, 0);
282         str_replace(buffer, &len, &changes, "ih", 2, "i", 1, 0);
283         str_replace(buffer, &len, &changes, "eh", 2, "e", 1, 0);
284
285         // russian
286         str_replace(buffer, &len, &changes, "ie", 2, "i", 1, 0);
287         str_replace(buffer, &len, &changes, "yi", 2, "i", 1, 0);
288
289         // allocate & create the result
290         len--;// Drop the terminating zero
291         result = (text *)palloc(len + VARHDRSZ);
292         SET_VARSIZE(result, len + VARHDRSZ);
293         memcpy(VARDATA(result), buffer, len);
294
295         pfree(buffer);
296
297         PG_RETURN_TEXT_P(result);
298 }
299