CUDA performance doubts

<p>Since I didn't get a response on the CUDA forum, I'll try it here:</p>

<p>After writing a few CUDA programs, I have now started measuring their effective bandwidth. However, I am getting some strange results. For example, in the following code, which sums all the elements of a vector (of any size), the unrolled code and the "normal" code give roughly the same median bandwidth (around 3000 GB/s). I don't know whether I am doing something wrong (as far as I can tell the program works fine), but from what I have read so far, the unrolled code should reach a higher bandwidth.</p>

<pre><code>#include <stdio.h>
#include <limits.h>
#include <stdlib.h>
#include <math.h>

#define elements 1000
#define blocksize 16

// Each block reduces its blocksize-sized chunk (padded with zeros) into one
// partial sum, written to output[blockIdx.x].
__global__ void vecsumkernel(float *input, float *output, int nelements){
    __shared__ float psum[blocksize];
    int tid = threadIdx.x;

    if(tid + blockDim.x * blockIdx.x < nelements)
        psum[tid] = input[tid + blockDim.x * blockIdx.x];
    else
        psum[tid] = 0.0f;
    __syncthreads();

    //WITHOUT UNROLL
    int stride;
    for(stride = blockDim.x / 2; stride > 0; stride >>= 1){
        if(tid < stride)
            psum[tid] += psum[tid + stride];
        __syncthreads();
    }
    if(tid == 0)
        output[blockIdx.x] = psum[0];

    //WITH UNROLL
    /*
    if(blocksize >= 512 && tid < 256) psum[tid] += psum[tid + 256]; __syncthreads();
    if(blocksize >= 256 && tid < 128) psum[tid] += psum[tid + 128]; __syncthreads();
    if(blocksize >= 128 && tid < 64)  psum[tid] += psum[tid + 64];  __syncthreads();

    if(tid < 32){
        if(blocksize >= 64) psum[tid] += psum[tid + 32];
        if(blocksize >= 32) psum[tid] += psum[tid + 16];
        if(blocksize >= 16) psum[tid] += psum[tid + 8];
        if(blocksize >= 8)  psum[tid] += psum[tid + 4];
        if(blocksize >= 4)  psum[tid] += psum[tid + 2];
        if(blocksize >= 2)  psum[tid] += psum[tid + 1];
    }
    */
    if(tid == 0)
        output[blockIdx.x] = psum[0];
}

// Launches the kernel repeatedly, shrinking the array by a factor of
// blocksize each pass, until a single element remains.
void vecsumv2(float *input, float *output, int nelements){
    dim3 dimBlock(blocksize, 1, 1);
    int i;
    for(i = ((int)ceil((double)(nelements)/(double)blocksize))*blocksize;
        i > 1;
        i = (int)ceil((double)i/(double)blocksize)){
        dim3 dimGrid((int)ceil((double)i/(double)blocksize), 1, 1);
        printf("\ni=%d\ndimgrid=%u\n ", i, dimGrid.x);
        vecsumkernel<<<dimGrid, dimBlock>>>(
            i == ((int)ceil((double)(nelements)/(double)blocksize))*blocksize ? input : output,
            output,
            i == ((int)ceil((double)(nelements)/(double)blocksize))*blocksize ? elements : i);
    }
}

void printVec(float *vec, int dim){
    printf("\n{");
    for(int i = 0; i < dim; i++)
        printf("%f ", vec[i]);
    printf("}\n");
}

int main(){
    cudaEvent_t evstart, evstop;
    cudaEventCreate(&evstart);
    cudaEventCreate(&evstop);

    float *input = (float*)malloc(sizeof(float)*elements);
    for(int i = 0; i < elements; i++)
        input[i] = (float)i;
    float *output = (float*)malloc(sizeof(float)*elements);

    float *input_d, *output_d;
    cudaMalloc((void**)&input_d, elements*sizeof(float));
    cudaMalloc((void**)&output_d, elements*sizeof(float));
    cudaMemcpy(input_d, input, elements*sizeof(float), cudaMemcpyHostToDevice);

    cudaEventRecord(evstart, 0);
    vecsumv2(input_d, output_d, elements);
    cudaEventRecord(evstop, 0);
    cudaEventSynchronize(evstop);

    float time;
    cudaEventElapsedTime(&time, evstart, evstop);
    printf("\ntime elapsed:%f\n", time);

    float Bandwidth = ((1000*4*2)/10^9)/time;
    printf("\n Bandwidth:%f Gb/s\n", Bandwidth);

    cudaMemcpy(output, output_d, elements*sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(input_d);
    cudaFree(output_d);
    printf("sum of the vector");
    printVec(output, 4);
}
</code></pre>
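<p>For comparison, the last-warp part of the unrolled version is usually written as a separate helper in the classic reduction pattern. The sketch below is not taken from the program above: it assumes <code>blocksize</code> is a compile-time power of two and that the threads of a warp execute in lock-step (pre-Volta behaviour), and it adds the <code>volatile</code> qualifier so the compiler does not cache the shared-memory values in registers.</p>

<pre><code>// Warp-synchronous tail of the unrolled reduction (sketch).
// Assumes blocksize is a power of two known at compile time and that the
// warp runs in lock-step; on Volta and later, explicit __syncwarp() calls
// would be needed instead of relying on implicit warp synchronization.
__device__ void warpReduce(volatile float *psum, int tid){
    if(blocksize >= 64) psum[tid] += psum[tid + 32];
    if(blocksize >= 32) psum[tid] += psum[tid + 16];
    if(blocksize >= 16) psum[tid] += psum[tid + 8];
    if(blocksize >= 8)  psum[tid] += psum[tid + 4];
    if(blocksize >= 4)  psum[tid] += psum[tid + 2];
    if(blocksize >= 2)  psum[tid] += psum[tid + 1];
}
</code></pre>

<p>With <code>blocksize</code> set to 16 as above, only the last four branches survive compilation, so each block only ever reduces 16 values.</p>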
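<p>For reference, the effective-bandwidth formula from the CUDA C Best Practices Guide is (bytes read + bytes written) / 1e9, divided by the elapsed time in seconds. Below is a minimal host-side sketch of that formula; the helper name and the placeholder timing value are illustrative and not part of the program above. Note that <code>cudaEventElapsedTime</code> reports milliseconds, and that in C the expression <code>10^9</code> is a bitwise XOR rather than a power of ten.</p>

<pre><code>#include <stdio.h>

// Effective bandwidth in GB/s: (bytes read + bytes written) / 1e9,
// divided by the elapsed time in seconds. The time is assumed to come
// from cudaEventElapsedTime, which reports milliseconds.
static float effectiveBandwidthGBs(size_t bytesRead, size_t bytesWritten,
                                   float elapsedMs){
    return (float)((double)(bytesRead + bytesWritten) / 1e9 / (elapsedMs / 1000.0));
}

int main(void){
    const size_t n = 1000;          // number of floats, as in the program above
    const float elapsedMs = 0.05f;  // placeholder timing value
    printf("Effective bandwidth: %f GB/s\n",
           effectiveBandwidthGBs(n * sizeof(float), n * sizeof(float), elapsedMs));
    return 0;
}
</code></pre>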