Unicode study

Ioannis Nompelis nompelis at nobelware.com
Mon Feb 18 01:50:32 UTC 2019


The email clutter I promised is attached below; my "percent encoding" done
in two ways. I also injected some verbosity. Pick one:

cc percent.c
cc -D_SWEEP_ percent.c
cc -D_SWEEP_ -D_DEBUG_ENCODE_ percent.c

The one with the _SWEEP_ uses less memory (by about 200 bytes) but does more
work. The _DEBUG_ENCODE_ shows some more stuff on the screen. This code is
meant to be instructive, not concise.

CURL has a function to do this, of course, but for those of us who sometimes
want to keep things dependence-free...


#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Function to percent-encode a chunk of data
 *
 * Bytes that are ASCII letters, digits, or one of `-', `.', `_', `~' (the
 * "unreserved" set of RFC 3986) are copied verbatim. Every other byte is
 * written as a percent sign followed by exactly two uppercase hexadecimal
 * digits (for example, a space becomes "%20" and a newline becomes "%0A").
 *
 * Arguments:
 *    len   number of bytes to read from "src"
 *    src   source data; treated as raw bytes, embedded zero bytes are encoded
 *    dst   destination buffer, or NULL to only compute the required size
 *
 * Returns the number of bytes written in the destination buffer.
 * This function can be called with the destination buffer as NULL, in
 * which case it will return the minimum size of the buffer that it needs
 * to store the encoded source data; this is useful for memory allocations.
 * Returns 0 when "src" is NULL or "len" is not positive.
 *
 * NOTE: the output is NOT terminated with a '\0'; a caller that wants a C
 * string must reserve one extra byte and terminate the buffer itself.
 * NOTE: each source byte yields 1 or 3 output bytes, so "len" must not
 * exceed INT_MAX/3 for the returned count to be representable.
 *
 * Compile-time options:
 *    _SWEEP_         search a short list of allowed characters instead of
 *                    indexing a 256-byte table (less memory, more work)
 *    _DEBUG_ENCODE_  print every source byte and its encoding to stdout
 *
 * Ioannis Nompelis <nompelis at nobelware.com>       Created: 20190215
 * Ioannis Nompelis <nompelis at nobelware.com> Last modified: 20190217
 */

/*
 * Helper: returns non-zero when byte "c" may be copied without encoding.
 * The argument is an unsigned char so that bytes >= 0x80 can never turn
 * into a negative value (and hence a wild index or a false match).
 */
static int inUtils_IsUnreserved( unsigned char c )
{
#ifdef _SWEEP_
   // compact list of the allowed characters; searched linearly
   static const char allowed[] =
      "0123456789"
      "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
      "abcdefghijklmnopqrstuvwxyz"
      "-._~";
   int k;

   for( k = 0; allowed[k] != '\0'; ++k ) {
      if( c == (unsigned char) allowed[k] ) return 1;
   }

   return 0;
#else
   // one entry per byte value; non-zero entries mark the allowed characters
   // (static so that the table is not rebuilt on the stack at every call)
   static const char table[256] = {
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0',  '-',  '.', '\0',  '0',  '1',
       '2',  '3',  '4',  '5',  '6',  '7',  '8',  '9', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0',  'A',  'B',  'C',  'D',  'E',
       'F',  'G',  'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
       'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',  'X',  'Y',
       'Z', '\0', '\0', '\0', '\0',  '_', '\0',  'a',  'b',  'c',
       'd',  'e',  'f',  'g',  'h',  'i',  'j',  'k',  'l',  'm',
       'n',  'o',  'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
       'x',  'y',  'z', '\0', '\0', '\0',  '~', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
      '\0', '\0', '\0', '\0', '\0', '\0' };

   return table[c] != '\0';
#endif
}

int inUtils_PercentEncode( int len, const char *src, char *dst )
{
   static const char hexdig[] = "0123456789ABCDEF";
   int i,j;

   if( src == NULL ) return 0;

   j = 0;
   for( i = 0; i < len; ++i ) {
      // work on the raw byte value; plain "char" may be signed
      const unsigned char c = (unsigned char) src[i];

#ifdef _DEBUG_ENCODE_
      // do this only for printable ASCII used for testing
      if( dst != NULL ) printf(" \"%c\"   ",src[i]);
#endif

      if( inUtils_IsUnreserved( c ) ) {
         if( dst != NULL ) dst[j] = src[i];
         ++j;
#ifdef _DEBUG_ENCODE_
         // do this only for printable ASCII used for testing
         if( dst != NULL ) printf(" \"%c\"      (allowed)\n",dst[j-1]);
#endif
      } else {
         char asc[3];

         // always two hex digits, so bytes below 0x10 get a leading zero
         asc[0] = hexdig[ (c >> 4) & 0x0F ];   // most significant hex
         asc[1] = hexdig[ c & 0x0F ];          // least significant hex
         asc[2] = '\0';                        // terminator for debug print

         if( dst != NULL ) {
            dst[j  ] = '%';        // put a percent sign
            dst[j+1] = asc[0];     // followed by the most significant hex
            dst[j+2] = asc[1];     // followed by the least significant hex
         }
         j += 3;
#ifdef _DEBUG_ENCODE_
         // do this only for printable ASCII used for testing
         if( dst != NULL ) printf(" \"%%%s\"    (not allowed)\n",asc);
#endif
      }
   }

   return(j);
}


/*
 * Demonstration driver: percent-encodes a fixed test string and prints the
 * source, the encoded result, and the number of encoded bytes.
 * Returns 0 on success, 1 if the encoded data would not fit in the buffer.
 */
int main(void)
{
   char source[] = "1234567890 qwerty %^&*-+/ more -._~";
   char dest[1000];
   int len,iret;

   printf("SOURCE: \"%s\"\n",source);
   len = (int) strlen( source );

   // first pass with a NULL destination only calculates the required size;
   // one extra byte is needed because we terminate the string ourselves
   iret = inUtils_PercentEncode( len, source, NULL );
   if( iret < 0 || (size_t) iret >= sizeof(dest) ) {
      fprintf(stderr,"Destination buffer too small (%d bytes needed)\n",iret+1);
      return 1;
   }

   // second pass copies to the buffer; the encoder does not write a '\0',
   // so add it before printing the buffer as a string
   iret = inUtils_PercentEncode( len, source, dest );
   dest[iret] = '\0';
   printf("DESTINATION: \"%s\" %d\n",dest,iret);

   return 0;
}


/*
 * Code to generate the C source of the printable-ASCII lookup table for the
 * "percent encode" function. (This is how I made the 256-byte array used above.)
 *
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main()
{
   int i,k,m;
   const char array[] = {
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
      'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
      'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
      'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
      'u', 'v', 'w', 'x', 'y', 'z',
      0x2D, 0x2E, 0x5F, 0x7E,     // `-`, `.', `_', `~'
      '\0' };

   for(i=0;i<256;++i) {
      char c = (char) i;
      k=0, m=-1;
      while( array[k] != '\0' && m == -1 ) {
         if( c == array[k] ) m = k;
         ++k;
      }
      if( m == -1 ) {
         printf("\'\\0\', ");
      } else {
         printf(" \'%c\', ",c);
      }
      if( (i+1) % 10 == 0 ) printf("\n");
   }

   return 0;
}
 */

#ifdef __cplusplus
}
#endif


More information about the Friends mailing list