Donnerstag, 3. Oktober 2013

Serializing Arrays and Alignment in CUDA

Perhaps you have to deal with message buffers in general or serialized arrays of mixed types, i.e., different arrays packed together into a generic char buffer array on the host side and then unpacked on the device side again. There you have to take care to align the data correctly, otherwise kernel launches will fail due to misaligned memory accesses. A 64-bit value is only allowed to start at an address that is a multiple of 8 bytes.

If there is a char array with packed data in this order:

[3x int, 12x double]

... then misalignment occurs: the first double does not start at an 8-byte-aligned address. Launching a kernel that unpacks the buffer will fail. cuda-memcheck helps you out with this hint:

========= Invalid __global__ read of size 8
=========     at 0x00000248 in kernel(char*, unsigned int)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x500340004 is misaligned


To avoid this, place the arrays in descending order of element size:

[12x double, 3x int]

So remember to mind alignment next time (note to self included!).

If you just come here to see some old-fashioned serialization in action, here you get it:


//nvcc -O2 -m64 -gencode arch=compute_30,code=sm_30 mwe_char.cu
#include <stdio.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <device_launch_parameters.h>
#include <driver_types.h>


// Wrap a CUDA runtime call: check the returned cudaError and abort on failure.
#define HANDLE_ERROR(err) __handleError(err, __FILE__, __LINE__)
// Report the most recent CUDA error (e.g. after a kernel launch) without
// aborting. Note: this only catches launch-configuration errors; in-kernel
// execution faults surface at the next synchronizing call.
#define HANDLE_LAST(msg)  __handleLastError(msg, __FILE__, __LINE__)

/* Abort the program with a file/line diagnostic if a CUDA runtime call failed. */
void __handleError( cudaError err, const char *file, const int line )
{
  if (err == cudaSuccess)
    return;
  fprintf(stderr, "%s(%i) : Runtime API error %d: %s.\n",
          file, line, (int)err, cudaGetErrorString( err ) );
  exit(-1);
}
/* Print (but do not abort on) the most recent CUDA error, tagged with a
 * caller-supplied message. cudaGetLastError() also clears the error state. */
void __handleLastError( const char *errorMessage, const char *file, const int line )
{
  const cudaError_t err = cudaGetLastError();
  if (err == cudaSuccess)
    return;
  fprintf(stderr, "%s(%i) : CUDA error : %s : (%d) %s.\n",
          file, line, errorMessage, (int)err, cudaGetErrorString( err ) );
}

/*
 * Serialize `var` and `llvar` into `buffer` with layout
 * [unsigned long long, unsigned] — the 8-byte field first, so every field
 * sits at a naturally aligned offset.
 *
 * Uses memcpy instead of reinterpret_cast pointer punning: the casts violated
 * strict aliasing on the host side and would fault on the device if `buffer`
 * were ever not 8-byte aligned. memcpy is alignment- and aliasing-safe and
 * compiles to the same stores when alignment is provable.
 *
 * buffer must provide at least sizeof(unsigned long long) + sizeof(unsigned)
 * bytes. Always returns 0 (kept so existing callers that check the result
 * keep working).
 */
__device__ __host__ int toBuffer(char* buffer, unsigned var, unsigned long long llvar)
{
  memcpy(buffer, &llvar, sizeof(llvar));
  memcpy(buffer + sizeof(llvar), &var, sizeof(var));
  return 0;
}
/*
 * Deserialize `buffer` (layout [unsigned long long, unsigned], as written by
 * toBuffer) into *llvar and *var.
 *
 * Uses memcpy instead of reinterpret_cast pointer punning: the casts violated
 * strict aliasing on the host side and the 8-byte read would fault on the
 * device for a misaligned `buffer`. memcpy has no alignment precondition.
 *
 * Always returns 0 (kept so existing callers that check the result keep
 * working).
 */
__device__ __host__ int toVars(char* buffer, unsigned* var, unsigned long long* llvar)
{
  memcpy(llvar, buffer, sizeof(*llvar));
  memcpy(var, buffer + sizeof(*llvar), sizeof(*var));
  //printf("Addresses: %p %p\n", (void*)llvar, (void*)var);
  return 0;
}
/*
 * Debug kernel: dump the raw buffer bytes, then deserialize and print the
 * two packed values.
 *
 * Launch layout: any 1D grid/block; every thread except global id 0 returns
 * immediately, so a single thread does all the work (device printf is for
 * debugging only). `size` is the byte count of `buffer`.
 */
__global__ void kernel(char* buffer, unsigned size)
{
  unsigned id = threadIdx.x + blockDim.x*blockIdx.x;
  unsigned var = 0;
  unsigned long long llvar = 0;
  if(id>0)
    return;

  // Convert each byte through unsigned char before handing it to %u:
  // plain char may be signed, and a negative char default-promotes to a
  // negative int — passing that to %u is undefined behavior and prints
  // huge values for bytes >= 0x80.
  for(unsigned k=0; k<size; ++k)
    printf("%u ", (unsigned)(unsigned char)buffer[k]);
  printf("\n");

  int err = toVars(buffer, &var, &llvar);

  if(err)
   printf("Error %d.\n", err);
  else
  {
    printf("CUDA: %u %llu\n", var, llvar);
  }
}
/*
 * Round-trip demo: serialize on the host, deserialize on the host to verify,
 * then copy the buffer to the device and let the kernel deserialize it again.
 * Returns 0 on success, a nonzero toBuffer/toVars error otherwise (CUDA
 * failures abort via HANDLE_ERROR).
 */
int main()
{
  unsigned var = 42;
  unsigned long long llvar = 123456789123456789ULL;
  // 8-byte field + 4-byte field; the 8-byte field is stored first so the
  // packed layout is naturally aligned (see the alignment discussion above).
  const size_t BUFFER_COUNT = sizeof(unsigned) + sizeof(unsigned long long);
  char* buffer = new char[BUFFER_COUNT];
  char* dbuffer = NULL;
  int err = 0;
  printf("Vars: %u %llu\n", var, llvar);
  // --- CPU round trip ---
  err = toBuffer(buffer, var, llvar);
  if(err) {
    delete[] buffer;  // fix: do not leak the host buffer on early exit
    return err;
  }
  var = llvar = 0;
  err = toVars(buffer, &var, &llvar);
  if(err) {
    delete[] buffer;
    return err;
  }
  printf("CPU: toVars(): %u %llu\n", var, llvar);
  // --- CUDA round trip ---
  HANDLE_ERROR( cudaSetDevice(0) );
  HANDLE_ERROR( cudaMalloc((void**)(&dbuffer), BUFFER_COUNT) );
  HANDLE_ERROR( cudaMemcpy(dbuffer, buffer, BUFFER_COUNT, cudaMemcpyHostToDevice) );
  kernel<<<1,1>>>(dbuffer, (unsigned)BUFFER_COUNT);
  HANDLE_LAST("Kernel launch failed.");     // launch-configuration errors only
  // fix: without a synchronizing call, in-kernel faults (e.g. the misaligned
  // access this post is about) would never be reported here.
  HANDLE_ERROR( cudaDeviceSynchronize() );
  HANDLE_ERROR( cudaFree(dbuffer) );
  HANDLE_ERROR( cudaDeviceReset() );
  delete[] buffer;
  return 0;
}