Note that there are some explanatory texts on larger screens.

plurals
  1. POCuda - copy from device global memory to texture memory
    primarykey
    data
    text
    <p>I'm trying to perform two tasks (separated into 2 kernels) on the GPU using CUDA and C++. As input I take an NxM matrix (stored in memory on the host as a float array). I will then use a kernel that performs some operations on this matrix to make it an NxMxD matrix. I then have a second kernel which performs some operations on this 3D matrix (I just read the values, I don't have to write values to it). </p> <p>Operating in texture memory seems to be much faster for my task, so my question is whether it is possible to copy my data from global memory on the device after kernel 1 and transfer it directly to texture memory for kernel 2 without bringing it back to the host?</p> <p><strong>UPDATE</strong></p> <p>I've added some code to illustrate my problem better.</p> <p>Here are the two kernels. The first is just a placeholder for now and replicates the 2D matrix into 3D. </p> <pre><code>__global__ void computeFeatureVector(float* imData3D_dev, int imX, int imY, int imZ) { //calculate each thread global index int xindex=blockIdx.x*blockDim.x+threadIdx.x; int yindex=blockIdx.y*blockDim.y+threadIdx.y; #pragma unroll for (int z=0; z&lt;imZ; z++) { imData3D_dev[xindex+yindex*imX + z*imX*imY] = tex2D(texImIp,xindex,yindex); } } </code></pre> <p>The second will take this 3D matrix, now represented as a texture, and perform some operations on it. 
Blank for now.</p> <pre><code>__global__ void kernel2(float* resData_dev, int imX) { //calculate each thread global index int xindex=blockIdx.x*blockDim.x+threadIdx.x; int yindex=blockIdx.y*blockDim.y+threadIdx.y; resData_dev[xindex+yindex*imX] = tex3D(texImIp3D,xindex,yindex, 0); return; } </code></pre> <p>Then the main body of the code is as follows:</p> <pre><code>// declare textures texture&lt;float,2,cudaReadModeElementType&gt; texImIp; texture&lt;float,3,cudaReadModeElementType&gt; texImIp3D; void main_fun() { // constants int imX = 1024; int imY = 768; int imZ = 16; // input data float* imData2D = new float[sizeof(float)*imX*imY]; for(int x=0; x&lt;imX*imY; x++) imData2D[x] = (float) rand()/RAND_MAX; //create channel to describe data type cudaArray* carrayImIp; cudaChannelFormatDesc channel; channel=cudaCreateChannelDesc&lt;float&gt;(); //allocate device memory for cuda array cudaMallocArray(&amp;carrayImIp,&amp;channel,imX,imY); //copy matrix from host to device memory cudaMemcpyToArray(carrayImIp,0,0,imData2D,sizeof(float)*imX*imY,cudaMemcpyHostToDevice); // Set texture properties texImIp.filterMode=cudaFilterModePoint; texImIp.addressMode[0]=cudaAddressModeClamp; texImIp.addressMode[1]=cudaAddressModeClamp; // bind texture reference with cuda array cudaBindTextureToArray(texImIp,carrayImIp); // kernel params dim3 blocknum; dim3 blocksize; blocksize.x=16; blocksize.y=16; blocksize.z=1; blocknum.x=(int)ceil((float)imX/16); blocknum.y=(int)ceil((float)imY/16); // store output here float* imData3D_dev; cudaMalloc((void**)&amp;imData3D_dev,sizeof(float)*imX*imY*imZ); // execute kernel computeFeatureVector&lt;&lt;&lt;blocknum,blocksize&gt;&gt;&gt;(imData3D_dev, imX, imY, imZ); //unbind texture reference to free resource cudaUnbindTexture(texImIp); // check copied ok float* imData3D = new float[sizeof(float)*imX*imY*imZ]; cudaMemcpy(imData3D,imData3D_dev,sizeof(float)*imX*imY*imZ,cudaMemcpyDeviceToHost); cout &lt;&lt; " kernel 1" &lt;&lt; endl; for (int x=0; 
x&lt;10;x++) cout &lt;&lt; imData3D[x] &lt;&lt; " "; cout &lt;&lt; endl; delete [] imData3D; // // kernel 2 // // copy data on device to 3d array cudaArray* carrayImIp3D; cudaExtent volumesize; volumesize = make_cudaExtent(imX, imY, imZ); cudaMalloc3DArray(&amp;carrayImIp3D,&amp;channel,volumesize); cudaMemcpyToArray(carrayImIp3D,0,0,imData3D_dev,sizeof(float)*imX*imY*imZ,cudaMemcpyDeviceToDevice); // texture params and bind texImIp3D.filterMode=cudaFilterModePoint; texImIp3D.addressMode[0]=cudaAddressModeClamp; texImIp3D.addressMode[1]=cudaAddressModeClamp; texImIp3D.addressMode[2]=cudaAddressModeClamp; cudaBindTextureToArray(texImIp3D,carrayImIp3D,channel); // store output here float* resData_dev; cudaMalloc((void**)&amp;resData_dev,sizeof(float)*imX*imY); // kernel 2 kernel2&lt;&lt;&lt;blocknum,blocksize&gt;&gt;&gt;(resData_dev, imX); cudaUnbindTexture(texImIp3D); //copy result matrix from device to host memory float* resData = new float[sizeof(float)*imX*imY]; cudaMemcpy(resData,resData_dev,sizeof(float)*imX*imY,cudaMemcpyDeviceToHost); // check copied ok cout &lt;&lt; " kernel 2" &lt;&lt; endl; for (int x=0; x&lt;10;x++) cout &lt;&lt; resData[x] &lt;&lt; " "; cout &lt;&lt; endl; delete [] imData2D; delete [] resData; cudaFree(imData3D_dev); cudaFree(resData_dev); cudaFreeArray(carrayImIp); cudaFreeArray(carrayImIp3D); } </code></pre> <p>I'm happy that the first kernel is working correctly, but the 3D matrix imData3D_dev does not seem to be bound to the texture texImIp3D correctly. </p> <p><strong>ANSWER</strong></p> <p>I solved my problem using cudaMemcpy3D. Here is the revised code for the second part of the main function. imData3D_dev contains the 3D matrix in global memory from the first kernel. 
</p> <pre><code> cudaArray* carrayImIp3D; cudaExtent volumesize; volumesize = make_cudaExtent(imX, imY, imZ); cudaMalloc3DArray(&amp;carrayImIp3D,&amp;channel,volumesize); cudaMemcpy3DParms copyparms={0}; copyparms.extent = volumesize; copyparms.dstArray = carrayImIp3D; copyparms.kind = cudaMemcpyDeviceToDevice; copyparms.srcPtr = make_cudaPitchedPtr((void*)imData3D_dev, sizeof(float)*imX,imX,imY); cudaMemcpy3D(&amp;copyparms); // texture params and bind texImIp3D.filterMode=cudaFilterModePoint; texImIp3D.addressMode[0]=cudaAddressModeClamp; texImIp3D.addressMode[1]=cudaAddressModeClamp; texImIp3D.addressMode[2]=cudaAddressModeClamp; cudaBindTextureToArray(texImIp3D,carrayImIp3D,channel); // store output here float* resData_dev; cudaMalloc((void**)&amp;resData_dev,sizeof(float)*imX*imY); kernel2&lt;&lt;&lt;blocknum,blocksize&gt;&gt;&gt;(resData_dev, imX); // ... clean up </code></pre>
    singulars
    1. This table or related slice is empty.
    plurals
    1. This table or related slice is empty.
    1. This table or related slice is empty.
    1. This table or related slice is empty.
    1. This table or related slice is empty.
 

Querying!

 
Guidance

SQuiL has stopped working due to an internal error.

If you are curious you may find further information in the browser console, which is accessible through the devtools (F12).

Reload