Unicode study

Ioannis Nompelis nompelis at nobelware.com
Fri Feb 22 19:03:32 UTC 2019


I had been a little busy the last few days, so I was unable to do any of this
reading. But I did manage to spend some "recreation time" to write up a base64
encoder that tries to be useful and verbose for learning purposes. So, I am
spamming you with the C code attached at the end of this message. Along with
the percent encoding that I discussed earlier, this covers 90% of what the
web and email are based on -- in my estimation. I think learning about this
was, and will be, time well-spent. Thanks Joe for following up on my study idea.

I hope you like my bitwise operations tricks and my bit-visualization hacks.
I invite the youngins to show us how it's really done!

Compile with: 'cc -D_DEBUG_ -D_DEBUG2_ base64.c' for full verbosity.

Keep hacking!

--------------------------------
#include <stdio.h>
#include <stdlib.h>

/*
 * Write the 32 low-order bits of "i" into "string" as '0'/'1' characters,
 * most-significant bit first, followed by a terminating '\0'.
 *
 *   itype == 0 : 32 digits in a row; "string" must hold at least 33 bytes.
 *   otherwise  : four groups of 8 digits separated by single spaces
 *                (35 characters); "string" must hold at least 36 bytes.
 *
 * When itype == -1 (grouped form only) zero bits are drawn with "zero_glyph",
 * which can be edited to make zeros stand out.
 */
void inUtils_Bits_int( int i, char *string, int itype )
{
   const char zero_glyph = '0';   // change this to turn zeros to, say.... '_'
   int bit, pos;

   if( itype == 0 ) {
      // plain form: bit 31 lands at index 0, bit 0 at index 31
      for( bit = 31; bit >= 0; --bit ) {
         string[31 - bit] = ( (i >> bit) & 0x0001 ) ? '1' : '0';
      }
      string[32] = '\0';
      return;
   }

   // grouped form: walk from the highest bit down, writing left to right
   pos = 0;
   for( bit = 31; bit >= 0; --bit ) {
      char glyph = '1';

      if( ( (i >> bit) & 0x0001 ) == 0 ) {
         glyph = ( itype == -1 ) ? zero_glyph : '0';
      }
      string[pos++] = glyph;

      // a space closes every group of 8 bits except the last one
      if( bit != 0 && bit % 8 == 0 ) {
         string[pos++] = ' ';
      }
   }
   string[pos] = '\0';            // pos is 35 here: 32 digits + 3 spaces
}


#ifdef _DEBUG2_
// Debug aid: print the bit pattern of "value" followed by a short note; when
// "index" is non-negative it is appended to the note (byte/character number).
static void inUtils_Base64_ShowBits( unsigned int value, const char *note,
                                     int index )
{
   char bit_string[36];

   inUtils_Bits_int( (int) value, bit_string, -1 );
   if( index >= 0 ) {
      printf(" bits in buffer: [%s] (%s %d)\n", bit_string, note, index);
   } else {
      printf(" bits in buffer: [%s] (%s)\n", bit_string, note);
   }
}
#endif

/*
 * Function to do "Base64 encoding" with the choice "+" and "/" to be the
 * last two printable characters in the index.
 *
 * Arguments
 *   length  number of bytes to encode from "src"
 *   src     the bytes to encode (treated as raw unsigned bytes)
 *   dst     where to store the result, or NULL to have this function
 *           allocate the storage with malloc()
 *
 * Returns
 *   NULL if length <= 0, if src is NULL, or if the allocation fails.
 *   Otherwise the buffer holding the encoded text, which is always
 *   4*ceil(length/3) characters long.
 *     - dst == NULL: the returned buffer was allocated here, holds the
 *       encoded characters plus a terminating '\0', and must be released
 *       by the caller with free().
 *     - dst != NULL: dst itself is returned. Exactly 4*ceil(length/3)
 *       characters are written to it and NO terminator is appended, so the
 *       caller's buffer needs that many bytes (a 4/3 size ratio, rounded up
 *       to a multiple of 4).
 *
 * Description of base64 encoding and this algorithm
 *
 * Base64 encoding is used to turn a chunk of data into something printable.
 * In base64 encoding we seek to use a set of printable characters and some
 * conventions to represent a chunk of bytes (this is because not all 256
 * characters are printable). Suppose that we have 64 printable characters
 * (and we actually have more than that). The number of bits required to
 * represent the numbers 0-63 (all 64 of them) is 6. So, if we split the
 * buffer into chunks of 6 bits, we can associate each group of 6 bits in
 * a row in the original buffer to one number, which we will call an index.
 * Then, given an ordered printable set of 64 characters, we can associate
 * each index with one of those characters. It is a mapping convention of
 * every 6 bits in the buffer to a character. This, plus some added
 * conventions to deal with complications that arise, is base64 encoding.
 *
 * If we were given a buffer that is not an integer multiple of 6 bits, for
 * the last byte we would take the remainder (2 or 4 bits) and just complete
 * the sequence with (4 or 2) zero bits to form a final sextet. However, this
 * description is missing a convention to indicate termination. The solution
 * is to always process the input in groups of 24 bits, which is 3 bytes and
 * also 4 sextets, so the output always comes in groups of 4 characters. When
 * the final group has only 1 or 2 input bytes, only 2 or 3 of its characters
 * carry data; by convention the equal sign "=" is used to pad the remaining
 * 2 or 1 character positions.
 *
 * This algorithm works as follows. The input is swept 3 bytes at a time.
 * Each byte is OR-ed into the low 8 bits of an integer "buffer" after the
 * buffer has been shifted left by 8 bits, so after a full triplet the low 24
 * bits hold the three bytes in order. A short final group is shifted left by
 * a further 8 or 16 bits so that its bits sit where they would in a full
 * triplet, with zero bits filling the gap. The 4 output characters are then
 * produced from last to first: the low 6 bits are masked off and mapped to a
 * character (or "=" is stored for a position that carries no data), and the
 * buffer is shifted right by 6 bits to expose the next sextet.
 */

char* inUtils_Base64_Encode( int length, char src[], char *dst )
{
   // map 6-bit index to a printable character
   static const char map[] =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
      "abcdefghijklmnopqrstuvwxyz"
      "0123456789"
      "+/";
   size_t n, out_len;
   size_t i, j, k;
   int allocated_here = 0;


   // sanity check
   if( length <= 0 || src == NULL ) return( NULL );

   // sizes are handled as size_t so that large inputs cannot overflow "int"
   n = (size_t) length;
   out_len = 4 * ( (n + 2) / 3 );    // 4 characters per (partial) triplet

   // allocate memory if needed (one extra byte for the terminator)
   if ( dst == NULL ) {
#ifdef _DEBUG_
      printf("Buffer length: %d \n",length);
      printf("Encoded buffer size: %zu \n",out_len);
#endif
      dst = malloc( out_len + 1 );
      if( dst == NULL ) return( NULL );
      allocated_here = 1;
   }

#ifdef _DEBUG2_
      printf(" Bits in an int   |----------------- 32 bits -------| \n");
      printf(" Table of bits:   <IGNORE> |-------- 24 bits -------| \n");
#endif
   // sweep through the input chunk in increments of 3-bytes
   k = 0;
   for( i = 0; i < n; i += 3 ) {
      unsigned int dum = 0;  // the "buffer"; only its low 24 bits are used
      int bytes = 0;         // number of bytes processed in the triplet
      int pos;               // output character position within the group

#ifdef _DEBUG_
      printf("Transcribing 3-byte triplet; start byte position %zu \n", i);
#endif
#ifdef _DEBUG2_
      inUtils_Base64_ShowBits( dum, "initial", -1 );
#endif
      // sweep through the chunk of three bytes or until the end is reached
      for( j = i; j < n && bytes < 3; ++j, ++bytes ) {

         // make room for the next byte (nothing to move for the first one)
         if( bytes != 0 ) dum <<= 8;
#ifdef _DEBUG2_
         inUtils_Base64_ShowBits( dum, "shift left by 8 bits", -1 );
#endif

         // OR the byte into the low 8 bits; the cast keeps bytes >= 0x80
         // from being sign-extended into the bits that were set earlier
         dum |= (unsigned char) src[j];
#ifdef _DEBUG2_
         inUtils_Base64_ShowBits( dum, "OR with byte", (int) j );
#endif
      }

#ifdef _DEBUG_
      printf("Number of bytes processed: %d \n",bytes);
#endif

      // a short final group is moved up so its bits line up with those of
      // a full triplet (like the table on the Wikipedia page on base64)
      if( bytes < 3 ) {
         dum <<= 8 * ( 3 - bytes );
#ifdef _DEBUG2_
         inUtils_Base64_ShowBits( dum, "remainder byte shift", -1 );
#endif
      }

#ifdef _DEBUG2_
      // show 6-bit shifts as they happen and the "index" bits are stored
      printf("Manipulating triplet's bits \n");
      inUtils_Base64_ShowBits( dum, "before drop", -1 );
#endif

      // work on the low 24 bits of "dum" 6 bits at a time, last sextet first;
      // "bytes" input bytes fill character positions 0..bytes, the rest is "="
      for( pos = 3; pos >= 0; --pos ) {
         if( pos > bytes ) {
            dst[ k + (size_t) pos ] = '=';
         } else {
            unsigned int index = dum & 0x3Fu;   // keep the low 6 bits
#ifdef _DEBUG2_
            inUtils_Base64_ShowBits( index, "bits in character", pos );
#endif
            dst[ k + (size_t) pos ] = map[ index ];
         }

         if( pos > 0 ) {
            dum >>= 6;
#ifdef _DEBUG2_
            inUtils_Base64_ShowBits( dum, "6 bit drop", -1 );
#endif
         }
      }

      k += 4;
   }

   // terminate only storage we own; a caller-supplied buffer is documented
   // to receive exactly out_len characters and may have no room for more
   if( allocated_here ) dst[ k ] = '\0';

   return( dst );
}


/*
 * Demo driver: encodes a fixed sample string with inUtils_Base64_Encode()
 * (letting it allocate the output) and prints the source length and the
 * encoded text. Always exits with EXIT_SUCCESS.
 */
int main( int argc, char *argv[] )
{
   char src[1000];
   char *encoded;
   int len;

   // form a very specific source chunk; sprintf() reports how many
   // characters it stored, which is the length of the source
   // (try "1234567890" for a longer sample)
   len = sprintf( src, "1234" );
   printf("Length of source: %d \n",len);

   encoded = inUtils_Base64_Encode( len, src, NULL );
   if( encoded == NULL ) {
      return( EXIT_SUCCESS );
   }

   printf("ENCODED: \"%s\"\n",encoded);
   free( encoded );

   return( EXIT_SUCCESS );
}



More information about the Friends mailing list