Unicode study
Ioannis Nompelis
nompelis at nobelware.com
Mon Feb 18 01:50:32 UTC 2019
The email clutter I promised is attached below; my "percent encoding" done
in two ways. I also injected some verbosity. Pick one:
cc percent.c
cc -D_SWEEP_ percent.c
cc -D_SWEEP_ -D_DEBUG_ENCODE_ percent.c
The one with the _SWEEP_ uses less memory (by about 200 bytes) but does more
work. The _DEBUG_ENCODE_ shows some more stuff on the screen. This code is
meant to be instructive, not concise.
CURL has a function to do this, of course, but for those of us who sometimes
want to keep things dependency-free...
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
 * Function to percent-encode a chunk of data.
 *
 * Bytes in the "unreserved" set (A-Z, a-z, 0-9, `-', `.', `_', `~') are
 * copied verbatim; every other byte is written as a `%' followed by
 * exactly two uppercase hexadecimal digits (e.g. newline -> "%0A").
 *
 *   len   number of bytes to read from src (nothing is encoded if len <= 0)
 *   src   source bytes; any value is accepted, including NUL and bytes
 *         with the high bit set (NULL is treated as empty input)
 *   dst   destination buffer, or NULL to only compute the required size
 *
 * Returns the number of bytes written in the destination buffer.
 * This function can be called with the destination buffer as NULL, in
 * which case it will return the minimum size of the buffer that it needs
 * to store the encoded source data; this is useful for memory allocations.
 *
 * The output is NOT NUL-terminated; a caller that wants a C string must
 * reserve one extra byte and add the terminator itself. The result can be
 * as large as 3*len, which the caller must make sure fits in an int.
 *
 * Compile-time switches:
 *   _SWEEP_         search a short list of allowed characters instead of
 *                   indexing a 256-entry lookup table
 *   _DEBUG_ENCODE_  print every byte and its classification to stdout
 *                   (only meaningful for printable ASCII test input)
 *
 * Ioannis Nompelis <nompelis at nobelware.com> Created: 20190215
 * Ioannis Nompelis <nompelis at nobelware.com> Last modified: 20190217
 */
int inUtils_PercentEncode( int len, const char *src, char *dst )
{
   static const char hexdigits[] = "0123456789ABCDEF";
#ifdef _SWEEP_
   // NUL-terminated list of the characters that pass through unchanged
   static const char array[] = {
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
      'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
      'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
      'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
      'u', 'v', 'w', 'x', 'y', 'z',
      0x2D, 0x2E, 0x5F, 0x7E,    // `-`, `.', `_', `~'
      '\0' };
#else
   // 256-entry table indexed by byte value: non-NUL means "allowed"
   static const char array[] = {
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '-', '.', '\0', '0', '1',
      '2', '3', '4', '5', '6', '7', '8', '9', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', 'A', 'B', 'C', 'D', 'E',
      'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
      'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y',
      'Z', '\0', '\0', '\0', '\0', '_', '\0', 'a', 'b', 'c',
      'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
      'x', 'y', 'z', '\0', '\0', '\0', '~', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0' };
#endif
   int i,j;

   if( src == NULL ) return(0);

   i=0, j=0;
   while( i < len ) {
      // Work on the unsigned value: plain char may be signed, and a
      // negative value must neither index the table nor reach the hex
      // conversion sign-extended.
      const unsigned char byte = (unsigned char) src[i];
      int allowed;
#ifdef _SWEEP_
      int k=0;
#endif

#ifdef _DEBUG_ENCODE_
      // do this only for printable ASCII used for testing
      if( dst != NULL ) printf(" \"%c\" ",src[i]);
#endif

#ifdef _SWEEP_
      // linear search; the NUL terminator itself is never matched, so a
      // NUL byte in the input is (correctly) treated as not allowed
      allowed = 0;
      while( array[k] != '\0' && !allowed ) {
         if( src[i] == array[k] ) allowed = 1;
         ++k;
      }
#else
      allowed = ( array[ byte ] != '\0' );
#endif

      if( allowed ) {
         if( dst != NULL ) dst[j] = src[i];
         ++j;
#ifdef _DEBUG_ENCODE_
         // do this only for printable ASCII used for testing
         if( dst != NULL ) printf(" \"%c\" (allowed)\n",dst[j-1]);
#endif
      } else {
         // always two hex digits, most significant nibble first
         char asc[3];
         asc[0] = hexdigits[ (byte >> 4) & 0x0F ];
         asc[1] = hexdigits[ byte & 0x0F ];
         asc[2] = '\0';
         if( dst != NULL ) {
            dst[j  ] = '%';      // put a percent sign
            dst[j+1] = asc[0];   // followed by the most significant hex
            dst[j+2] = asc[1];   // followed by the least significant hex
         }
         j += 3;
#ifdef _DEBUG_ENCODE_
         // do this only for printable ASCII used for testing
         if( dst != NULL ) printf(" \"%%%s\" (not allowed)\n",asc);
#endif
      }

      ++i;
   }

   return(j);
}
/*
 * Small demonstration driver: percent-encodes a fixed test string and
 * prints the source, the encoded result and the encoded length.
 * Returns 0 on success, 1 if the encoded text would not fit in the buffer.
 */
int main(void)
{
   char source[] = "1234567890 qwerty %^&*-+/ more -._~";
   char dest[1000];
   int len,iret;

   printf("SOURCE: \"%s\"\n",source);
   len = (int) strlen( source );

   // size-only pass: make sure the result plus a terminator fits in dest
   iret = inUtils_PercentEncode( len, source, NULL );
   if( iret < 0 || (size_t) iret >= sizeof(dest) ) {
      fprintf(stderr,"Encoded output (%d bytes) does not fit\n",iret);
      return 1;
   }

   iret = inUtils_PercentEncode( len, source, dest );   // copy to buffer
   // the encoder does not terminate its output, so do it here before
   // handing the buffer to printf's %s
   dest[iret] = '\0';
   printf("DESTINATION: \"%s\" %d\n",dest,iret);
   return 0;
}
/*
* Code that generates the C source of the printable-ASCII lookup table used
* by the "percent encode" function
* (This is how I made the 256-byte array that is used above.)
*
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main()
{
int i,k,m;
const char array[] = {
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
'U', 'V', 'W', 'X', 'Y', 'Z',
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
'u', 'v', 'w', 'x', 'y', 'z',
0x2D, 0x2E, 0x5F, 0x7E, // `-`, `.', `_', `~'
'\0' };
for(i=0;i<256;++i) {
char c = (char) i;
k=0, m=-1;
while( array[k] != '\0' && m == -1 ) {
if( c == array[k] ) m = k;
++k;
}
if( m == -1 ) {
printf("\'\\0\', ");
} else {
printf(" \'%c\', ",c);
}
if( (i+1) % 10 == 0 ) printf("\n");
}
return 0;
}
*/
#ifdef __cplusplus
}
#endif
More information about the Friends
mailing list