StackOverflow2013

Note that there are some explanatory texts on larger screens.

plurals

PO
text
Body
copied!<p>Here is a rough outline of what I had in mind for a custom allocator and pool that would hide some of the mechanics of using a class both on the host and the device.</p> <p>I don't consider it to be a paragon of programming excellence. It is merely intended to be a rough outline of the steps that I think would be involved. I'm sure there are many bugs. I didn't include it, but I think you would want a public method that would get the <code>size</code> as well.</p> <pre><code>#include <iostream>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

// Reads (and clears) the most recent CUDA runtime error via cudaGetLastError().
// On failure, prints `msg`, the runtime's error string and the source location
// to stderr and terminates the process with exit code 1.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t err_ = cudaGetLastError(); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(err_), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

typedef float mytype;

// Device-side pool bookkeeping. All three are written from the host by
// init_pool() (pool, pool_size) or from device code by pool_reserve()
// (pool_allocated). Sizes and offsets are counted in mytype elements.
__device__ unsigned int pool_allocated = 0;
__device__ unsigned int pool_size = 0;
__device__ mytype *pool = 0;

// Reserves `size` consecutive elements from the pool and returns the element
// offset of the reservation (to be added to `pool`).
//
// The bump counter is advanced with a single atomicAdd, so concurrent callers
// receive disjoint ranges. The bounds check is done AFTER the atomic add, on
// the range this caller actually received; checking pool_allocated beforehand
// would race with other threads. Exhaustion is reported through assert(), so
// it is only detected when asserts are enabled (NDEBUG not defined).
__device__ unsigned int pool_reserve(size_t size){
    // The counter is 32-bit; refuse requests that would be silently truncated.
    assert(size <= (size_t)0xFFFFFFFFu);
    unsigned int count = (unsigned int)size;

    unsigned int offset = atomicAdd(&pool_allocated, count);

    // Whole range [offset, offset + count) must lie inside the pool.
    // Written as a subtraction so that offset + count cannot wrap around.
    assert(offset <= pool_size);
    assert(count <= pool_size - offset);
    return offset;
}

// Allocates a device buffer of `psize` elements with cudaMalloc and publishes
// its address and size to the device globals `pool` and `pool_size`.
// Any CUDA failure is fatal (see cudaCheckErrors).
__host__ void init_pool(size_t psize){
    mytype *temp;
    unsigned int my_size = (unsigned int)psize;
    // pool_size is a 32-bit counter; make sure psize survived the conversion.
    assert((size_t)my_size == psize);

    cudaMalloc((void **)&temp, psize*sizeof(mytype));
    cudaCheckErrors("init pool cudaMalloc fail");
    cudaMemcpyToSymbol(pool, &temp, sizeof(mytype *));
    cudaCheckErrors("init pool cudaMemcpyToSymbol 1 fail");
    cudaMemcpyToSymbol(pool_size, &my_size, sizeof(unsigned int));
    cudaCheckErrors("init pool cudaMemcpyToSymbol 2 fail");
}

// Array wrapper usable from both host and device code.
//
//   data   - pointer the caller works with in the current context
//   h_data - buffer obtained from malloc() in host code
//   d_data - device-side copy: cudaMalloc'ed when created from the host,
//            carved out of the pool when created from device code
//   size   - number of mytype elements
//
// The class has no destructor; none of the buffers are released by it.
class A{
    public:
    mytype *data;

    // Creates the device-side copy (d_data) of the current `data` contents.
    // Requires that allocate() has been called (size != 0) and that no device
    // copy exists yet (d_data == 0).
    __host__ __device__ void pool_allocate_and_copy() {
        assert(d_data == 0);
        assert(size != 0);
#ifdef __CUDA_ARCH__
        // Device path: take space from the pool set up by init_pool().
        unsigned int offset = pool_reserve(size);
        d_data = pool + offset;
        memcpy(d_data, data, size*sizeof(mytype));
#else
        // Host path: the pool is not used; a separate device buffer is allocated.
        cudaMalloc((void **)&d_data, size*sizeof(mytype));
        cudaCheckErrors("pool_allocate_and_copy cudaMalloc fail");
        cudaMemcpy(d_data, data, size*sizeof(mytype), cudaMemcpyHostToDevice);
        cudaCheckErrors("pool_allocate_and_copy cudaMemcpy fail");
#endif /* __CUDA_ARCH__ */
    }

    // Makes `data` valid for the context this is called from.
    // Device: points `data` at d_data.
    // Host:   ensures a host buffer exists, points `data` at it and copies the
    //         device contents into it.
    __host__ __device__ void update(){
#ifdef __CUDA_ARCH__
        assert(data != 0);
        data = d_data;
        assert(data != 0);
#else
        if (h_data == 0) h_data = (mytype *)malloc(size*sizeof(mytype));
        data = h_data;
        assert(data != 0);
        cudaMemcpy(data, d_data, size*sizeof(mytype), cudaMemcpyDeviceToHost);
        cudaCheckErrors("update cudaMempcy fail");
#endif
    }

    // Allocates storage for `asize` elements with malloc() (the host heap in
    // host code, the in-kernel heap in device code) and records the size.
    // Must be called on an object that has no `data` yet.
    __host__ __device__ void allocate(size_t asize) {
        assert(data == 0);
        data = (mytype *)malloc(asize*sizeof(mytype));
        assert(data != 0);
#ifndef __CUDA_ARCH__
        h_data = data;
#endif
        size = asize;
    }

    // Overwrites *this with a bitwise copy of *obj, then calls update() so
    // that `data` is usable in the calling context.
    // Host path uses cudaMemcpyDefault, i.e. the runtime infers the copy
    // direction from the pointer values (relies on unified virtual addressing).
    __host__ __device__ void copyobj(A *obj){
        assert(obj != 0);
#ifdef __CUDA_ARCH__
        memcpy(this, obj, sizeof(A));
#else
        cudaMemcpy(this, obj, sizeof(A), cudaMemcpyDefault);
        cudaCheckErrors("copy cudaMempcy fail");
#endif
        this->update();
    }

    __host__ __device__ A();

    private:
    unsigned int size;
    mytype *d_data;
    mytype *h_data;
};

// Creates an empty object: no buffers, zero size.
__host__ __device__ A::A(){
    data = 0;
    d_data = 0;
    h_data = 0;
    size = 0;
}

// Demo kernel; main() launches it as <<<1,1>>>.
//   obj - host-built object passed by value; its d_data was set on the host
//   res - device memory that receives a copy of an object built in the kernel
__global__ void mykernel(A obj, A *res){
    A mylocal;
    mylocal.copyobj(&obj);

    A mylocal2;
    mylocal2.allocate(24);
    mylocal2.data[0]=45;
    mylocal2.pool_allocate_and_copy();
    res->copyobj(&mylocal2);

    printf("kernel data %f\n", mylocal.data[0]);

    // *res now refers to the pool copy (update() repointed res->data to d_data),
    // so the in-kernel heap staging buffer of mylocal2 can be released.
    free(mylocal2.data);
}

int main(){
    A my_obj;
    A *d_result, h_result;

    my_obj.allocate(32);
    my_obj.data[0] = 12;
    init_pool(1048576);
    my_obj.pool_allocate_and_copy();

    cudaMalloc((void **)&d_result, sizeof(A));
    cudaCheckErrors("main cudaMalloc fail");

    mykernel<<<1,1>>>(my_obj, d_result);
    // Launch-configuration errors are reported here, before the sync.
    cudaCheckErrors("kernel launch fail");
    cudaDeviceSynchronize();
    // Errors raised while the kernel ran (including device asserts) show up here.
    cudaCheckErrors("kernel fail");

    h_result.copyobj(d_result);
    printf("host data %f\n", h_result.data[0]);

    cudaFree(d_result);
    cudaCheckErrors("main cudaFree fail");
    return 0;
}
</code></pre>

Querying!

Guidance

An individual column

Larger individual text columns get their own page to allow for proper reading.

SQuiL has stopped working due to an internal error.

If you are curious, you may find further information in the browser console, which is accessible through the devtools (F12).

Reload