]> git.openstreetmap.org Git - nominatim.git/blobdiff - module/nominatim.c
Merge pull request #2360 from AntoJvlt/postcodes-place-table
[nominatim.git] / module / nominatim.c
index 155531530082013c76137d9556124d148f8faa6b..73bf16c8df487eac4d462621fe566534e6476621 100644 (file)
@@ -54,7 +54,8 @@ transliteration( PG_FUNCTION_ARGS )
        wchardatastart = wchardata = (unsigned int *)palloc((sourcedatalength+1)*sizeof(int));
 
        // Based on pg_utf2wchar_with_len from wchar.c
        wchardatastart = wchardata = (unsigned int *)palloc((sourcedatalength+1)*sizeof(int));
 
        // Based on pg_utf2wchar_with_len from wchar.c
-        while (sourcedatalength > 0 && *sourcedata)
+       // PostgreSQL strings are not zero-terminated
+        while (sourcedatalength > 0)
         {
                 if ((*sourcedata & 0x80) == 0)
                 {
         {
                 if ((*sourcedata & 0x80) == 0)
                 {
@@ -68,7 +69,7 @@ transliteration( PG_FUNCTION_ARGS )
                         c1 = *sourcedata++ & 0x1f;
                         c2 = *sourcedata++ & 0x3f;
                         *wchardata = (c1 << 6) | c2;
                         c1 = *sourcedata++ & 0x1f;
                         c2 = *sourcedata++ & 0x3f;
                         *wchardata = (c1 << 6) | c2;
-                       wchardata++;
+                       if (*wchardata < 65536) wchardata++;
                         sourcedatalength -= 2;
                 }
                 else if ((*sourcedata & 0xf0) == 0xe0)
                         sourcedatalength -= 2;
                 }
                 else if ((*sourcedata & 0xf0) == 0xe0)
@@ -78,7 +79,7 @@ transliteration( PG_FUNCTION_ARGS )
                         c2 = *sourcedata++ & 0x3f;
                         c3 = *sourcedata++ & 0x3f;
                         *wchardata = (c1 << 12) | (c2 << 6) | c3;
                         c2 = *sourcedata++ & 0x3f;
                         c3 = *sourcedata++ & 0x3f;
                         *wchardata = (c1 << 12) | (c2 << 6) | c3;
-                       wchardata++;
+                       if (*wchardata < 65536) wchardata++;
                         sourcedatalength -= 3;
                 }
                 else if ((*sourcedata & 0xf8) == 0xf0)
                         sourcedatalength -= 3;
                 }
                 else if ((*sourcedata & 0xf8) == 0xf0)
@@ -89,7 +90,7 @@ transliteration( PG_FUNCTION_ARGS )
                         c3 = *sourcedata++ & 0x3f;
                         c4 = *sourcedata++ & 0x3f;
                         *wchardata = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
                         c3 = *sourcedata++ & 0x3f;
                         c4 = *sourcedata++ & 0x3f;
                         *wchardata = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
-                       wchardata++;
+                       if (*wchardata < 65536) wchardata++;
                         sourcedatalength -= 4;
                 }
                 else if ((*sourcedata & 0xfc) == 0xf8)
                         sourcedatalength -= 4;
                 }
                 else if ((*sourcedata & 0xfc) == 0xf8)
@@ -97,17 +98,20 @@ transliteration( PG_FUNCTION_ARGS )
                        // table does not extend beyond 4 char long, just skip
                        if (sourcedatalength < 5) break;
                        sourcedatalength -= 5;
                        // table does not extend beyond 4 char long, just skip
                        if (sourcedatalength < 5) break;
                        sourcedatalength -= 5;
+                       sourcedata += 5;
                }
                 else if ((*sourcedata & 0xfe) == 0xfc)
                 {
                        // table does not extend beyond 4 char long, just skip
                        if (sourcedatalength < 6) break;
                        sourcedatalength -= 6;
                }
                 else if ((*sourcedata & 0xfe) == 0xfc)
                 {
                        // table does not extend beyond 4 char long, just skip
                        if (sourcedatalength < 6) break;
                        sourcedatalength -= 6;
+                       sourcedata += 6;
                }
                 else
                 {
                        // assume length 1, silently drop bogus characters
                         sourcedatalength--;
                }
                 else
                 {
                        // assume length 1, silently drop bogus characters
                         sourcedatalength--;
+                       sourcedata += 1;
                 }
         }
         *wchardata = 0;
                 }
         }
         *wchardata = 0;
@@ -139,12 +143,12 @@ transliteration( PG_FUNCTION_ARGS )
                                resultdata++;
                        }
                }
                                resultdata++;
                        }
                }
-               else
+               /*else
                {
                        ereport( WARNING, ( errcode( ERRCODE_SUCCESSFUL_COMPLETION ),
                              errmsg( "missing char: %i\n", *wchardata )));
                        
                {
                        ereport( WARNING, ( errcode( ERRCODE_SUCCESSFUL_COMPLETION ),
                              errmsg( "missing char: %i\n", *wchardata )));
                        
-               }
+               }*/
                wchardata++;
        }
 
                wchardata++;
        }
 
@@ -153,17 +157,18 @@ transliteration( PG_FUNCTION_ARGS )
        PG_RETURN_TEXT_P(result);
 }
 
        PG_RETURN_TEXT_P(result);
 }
 
+// Set isspace=1 if the replacement _only_ adds a space before the search string.  I.e. to == " " + from
 void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int isspace)
 {
         char *p;
 
 void str_replace(char* buffer, int* len, int* changes, char* from, int fromlen, char* to, int tolen, int isspace)
 {
         char *p;
 
-        // Search string is too long to be pressent
+        // Search string is too long to be present
         if (fromlen > *len) return;
 
         p = strstr(buffer, from);
         while(p)
         {
         if (fromlen > *len) return;
 
         p = strstr(buffer, from);
         while(p)
         {
-                if (!isspace || *(p-1) != ' ')
+                if (!isspace || (p > buffer && *(p-1) != ' '))
                 {
                         (*changes)++;
                         if (tolen != fromlen) memmove(p+tolen, p+fromlen, *len-(p-buffer)+1);
                 {
                         (*changes)++;
                         if (tolen != fromlen) memmove(p+tolen, p+fromlen, *len-(p-buffer)+1);
@@ -226,7 +231,7 @@ gettokenstring( PG_FUNCTION_ARGS )
        sourcedata = (unsigned char *)VARDATA(source);
        sourcedatalength = VARSIZE(source) - VARHDRSZ;
 
        sourcedata = (unsigned char *)VARDATA(source);
        sourcedatalength = VARSIZE(source) - VARHDRSZ;
 
-       // Buffer for doing the replace in - string could get slightly longer (double is mastive overkill)
+       // Buffer for doing the replace in - string could get slightly longer (double is massive overkill)
        buffer = (char *)palloc((sourcedatalength*2)*sizeof(char));
        memcpy(buffer+1, sourcedata, sourcedatalength);
        buffer[0] = 32;
        buffer = (char *)palloc((sourcedatalength*2)*sizeof(char));
        memcpy(buffer+1, sourcedata, sourcedatalength);
        buffer[0] = 32;
@@ -248,7 +253,6 @@ gettokenstring( PG_FUNCTION_ARGS )
        str_replace(buffer, &len, &changes, " und ", 5, " ", 1, 0);
        str_replace(buffer, &len, &changes, " en ", 4, " ", 1, 0);
        str_replace(buffer, &len, &changes, " et ", 4, " ", 1, 0);
        str_replace(buffer, &len, &changes, " und ", 5, " ", 1, 0);
        str_replace(buffer, &len, &changes, " en ", 4, " ", 1, 0);
        str_replace(buffer, &len, &changes, " et ", 4, " ", 1, 0);
-       str_replace(buffer, &len, &changes, " e ", 3, " ", 1, 0);
        str_replace(buffer, &len, &changes, " y ", 3, " ", 1, 0);
 
        // 'the' (and similar)
        str_replace(buffer, &len, &changes, " y ", 3, " ", 1, 0);
 
        // 'the' (and similar)