+/**
+ * SPDX-License-Identifier: GPL-2.0-only
+ *
+ * This file is part of Nominatim. (https://nominatim.org)
+ *
+ * Copyright (C) 2022 by the Nominatim developer community.
+ * For a full list of authors see the git log.
+ */
#include "postgres.h"
#include "fmgr.h"
#include "mb/pg_wchar.h"
#include <utfasciitable.h>
-#ifdef PG_MODULE_MAGIC
-PG_MODULE_MAGIC;
+#if PG_MAJORVERSION_NUM > 15
+#include "varatt.h"
#endif
+PG_MODULE_MAGIC;
+
Datum transliteration( PG_FUNCTION_ARGS );
Datum gettokenstring( PG_FUNCTION_ARGS );
void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int);
wchardatastart = wchardata = (unsigned int *)palloc((sourcedatalength+1)*sizeof(int));
// Based on pg_utf2wchar_with_len from wchar.c
- while (sourcedatalength > 0 && *sourcedata)
+ // PostgreSQL strings are not zero-terminated
+ while (sourcedatalength > 0)
{
if ((*sourcedata & 0x80) == 0)
{
c1 = *sourcedata++ & 0x1f;
c2 = *sourcedata++ & 0x3f;
*wchardata = (c1 << 6) | c2;
- wchardata++;
+ if (*wchardata < 65536) wchardata++;
sourcedatalength -= 2;
}
else if ((*sourcedata & 0xf0) == 0xe0)
c2 = *sourcedata++ & 0x3f;
c3 = *sourcedata++ & 0x3f;
*wchardata = (c1 << 12) | (c2 << 6) | c3;
- wchardata++;
+ if (*wchardata < 65536) wchardata++;
sourcedatalength -= 3;
}
else if ((*sourcedata & 0xf8) == 0xf0)
c3 = *sourcedata++ & 0x3f;
c4 = *sourcedata++ & 0x3f;
*wchardata = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
- wchardata++;
+ if (*wchardata < 65536) wchardata++;
sourcedatalength -= 4;
}
else if ((*sourcedata & 0xfc) == 0xf8)
// table does not cover sequences longer than 4 bytes, just skip
if (sourcedatalength < 5) break;
sourcedatalength -= 5;
+ sourcedata += 5;
}
else if ((*sourcedata & 0xfe) == 0xfc)
{
// table does not cover sequences longer than 4 bytes, just skip
if (sourcedatalength < 6) break;
sourcedatalength -= 6;
+ sourcedata += 6;
}
else
{
// assume length 1, silently drop bogus characters
sourcedatalength--;
+ sourcedata += 1;
}
}
*wchardata = 0;
resultdata++;
}
}
- else
+ /*else
{
ereport( WARNING, ( errcode( ERRCODE_SUCCESSFUL_COMPLETION ),
errmsg( "missing char: %i\n", *wchardata )));
- }
+ }*/
wchardata++;
}
PG_RETURN_TEXT_P(result);
}
+// Set isspace=1 if the replacement _only_ adds a space before the search string. I.e. to == " " + from
void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int isspace)
{
char *p;
- // Search string is too long to be pressent
+ // Search string is too long to be present
if (fromlen > *len) return;
p = strstr(buffer, from);
while(p)
{
- if (!isspace || *(p-1) != ' ')
+ if (!isspace || (p > buffer && *(p-1) != ' '))
{
(*changes)++;
if (tolen != fromlen) memmove(p+tolen, p+fromlen, *len-(p-buffer)+1);
sourcedata = (unsigned char *)VARDATA(source);
sourcedatalength = VARSIZE(source) - VARHDRSZ;
- // Buffer for doing the replace in - string could get slightly longer (double is mastive overkill)
+ // Buffer for doing the replace in - string could get slightly longer (double is massive overkill)
buffer = (char *)palloc((sourcedatalength*2)*sizeof(char));
memcpy(buffer+1, sourcedata, sourcedatalength);
buffer[0] = 32;
str_replace(buffer, &len, &changes, " und ", 5, " ", 1, 0);
str_replace(buffer, &len, &changes, " en ", 4, " ", 1, 0);
str_replace(buffer, &len, &changes, " et ", 4, " ", 1, 0);
- str_replace(buffer, &len, &changes, " e ", 3, " ", 1, 0);
str_replace(buffer, &len, &changes, " y ", 3, " ", 1, 0);
// 'the' (and similar)