Note that there are some explanatory texts on larger screens.

plurals
  1. PO
    primarykey
    data
    text
    <p>I can't reproduce this with CUDA 3.2 and QT4 on a 64 bit Ubuntu 10.04LTS system. I took this main:</p> <pre><code>#include &lt;QtCore/QCoreApplication&gt; extern float cudamain(); int main(int argc, char *argv[]) { QCoreApplication a(argc, argv); float gflops = cudamain(); return 0; } </code></pre> <p>and a <code>cudamain()</code> containing this:</p> <pre><code>#include &lt;assert.h&gt; #define blocksize 16 #define HM (4096) #define WM (4096) #define WN (4096) #define HN WM #define WP WN #define HP HM #define PTH WM #define PTW HM __global__ void nonsquare(float*M, float*N, float*P, int uWM,int uWN) { __shared__ float MS[blocksize][blocksize]; __shared__ float NS[blocksize][blocksize]; int tx=threadIdx.x, ty=threadIdx.y, bx=blockIdx.x, by=blockIdx.y; int rowM=ty+by*blocksize; int colN=tx+bx*blocksize; float Pvalue=0; for(int m=0; m&lt;uWM; m+=blocksize){ MS[ty][tx]=M[rowM*uWM+(m+tx)] ; NS[ty][tx]=M[colN + uWN*(m+ty)]; __syncthreads(); for(int k=0;k&lt;blocksize;k++) Pvalue+=MS[ty][k]*NS[k][tx]; __syncthreads(); } P[rowM*WP+colN]=Pvalue; } inline void gpuerrorchk(cudaError_t state) { assert(state == cudaSuccess); } float cudamain(){ cudaEvent_t evstart, evstop; cudaEventCreate(&amp;evstart); cudaEventCreate(&amp;evstop); float*M=(float*)malloc(sizeof(float)*HM*WM); float*N=(float*)malloc(sizeof(float)*HN*WN); for(int i=0;i&lt;WM*HM;i++) M[i]=(float)i; for(int i=0;i&lt;WN*HN;i++) N[i]=(float)i; float*P=(float*)malloc(sizeof(float)*HP*WP); float *Md,*Nd,*Pd; gpuerrorchk( cudaMalloc((void**)&amp;Md,HM*WM*sizeof(float)) ); gpuerrorchk( cudaMalloc((void**)&amp;Nd,HN*WN*sizeof(float)) ); gpuerrorchk( cudaMalloc((void**)&amp;Pd,HP*WP*sizeof(float)) ); gpuerrorchk( cudaMemcpy(Md,M,HM*WM*sizeof(float),cudaMemcpyHostToDevice) ); gpuerrorchk( cudaMemcpy(Nd,N,HN*WN*sizeof(float),cudaMemcpyHostToDevice) ); dim3 dimBlock(blocksize,blocksize);//(tile_width , tile_width); dim3 dimGrid(WN/dimBlock.x,HM/dimBlock.y);//(width/tile_width , width/tile_witdh); gpuerrorchk( 
cudaEventRecord(evstart,0) ); nonsquare&lt;&lt;&lt;dimGrid,dimBlock&gt;&gt;&gt;(Md,Nd,Pd,WM, WN); gpuerrorchk( cudaPeekAtLastError() ); gpuerrorchk( cudaEventRecord(evstop,0) ); gpuerrorchk( cudaEventSynchronize(evstop) ); float time; cudaEventElapsedTime(&amp;time,evstart,evstop); gpuerrorchk( cudaMemcpy(P,Pd,WP*HP*sizeof(float),cudaMemcpyDeviceToHost) ); cudaFree(Md); cudaFree(Nd); cudaFree(Pd); float gflops=(2.e-6*WM*WM*WM)/(time); cudaThreadExit(); return gflops; } </code></pre> <p>(pay no attention to the actual code other than it doing memory transactions and running a kernel, it is nonsense otherwise). </p> <p>Compiling the code like this:</p> <pre><code>cuda:~$ nvcc -arch=sm_20 -c -o cudamain.o cudamain.cu cuda:~$ g++ -o qtprob -I/usr/include/qt4 qtprob.cc cudamain.o -L $CUDA_INSTALL_PATH/lib64 -lQtCore -lcuda -lcudart cuda:~$ ldd qtprob linux-vdso.so.1 =&gt; (0x00007fff242c8000) libQtCore.so.4 =&gt; /opt/cuda-3.2/computeprof/bin/libQtCore.so.4 (0x00007fbe62344000) libcuda.so.1 =&gt; /usr/lib/libcuda.so.1 (0x00007fbe61a3d000) libcudart.so.3 =&gt; /opt/cuda-3.2/lib64/libcudart.so.3 (0x00007fbe617ef000) libstdc++.so.6 =&gt; /usr/lib/libstdc++.so.6 (0x00007fbe614db000) libm.so.6 =&gt; /lib/libm.so.6 (0x00007fbe61258000) libgcc_s.so.1 =&gt; /lib/libgcc_s.so.1 (0x00007fbe61040000) libc.so.6 =&gt; /lib/libc.so.6 (0x00007fbe60cbd000) libz.so.1 =&gt; /lib/libz.so.1 (0x00007fbe60aa6000) libgthread-2.0.so.0 =&gt; /usr/lib/libgthread-2.0.so.0 (0x00007fbe608a0000) libglib-2.0.so.0 =&gt; /lib/libglib-2.0.so.0 (0x00007fbe605c2000) librt.so.1 =&gt; /lib/librt.so.1 (0x00007fbe603ba000) libpthread.so.0 =&gt; /lib/libpthread.so.0 (0x00007fbe6019c000) libdl.so.2 =&gt; /lib/libdl.so.2 (0x00007fbe5ff98000) /lib64/ld-linux-x86-64.so.2 (0x00007fbe626c0000) libpcre.so.3 =&gt; /lib/libpcre.so.3 (0x00007fbe5fd69000) </code></pre> <p>produces an executable which profiles without error as many times as I care to run it with the CUDA 3.2 release profiler. 
</p> <p>All I can suggest is that you try my repro case and see whether it works or not. If it fails, then perhaps you have either a broken CUDA or Qt installation. If it doesn't fail (and I suspect it won't), then the problem lies either in the way you are building the Qt project or in the actual CUDA code you are running.</p>
    singulars
    1. This table or related slice is empty.
    1. This table or related slice is empty.
    plurals
    1. This table or related slice is empty.
    1. This table or related slice is empty.
    1. This table or related slice is empty.
    1. VO
      singulars
      1. This table or related slice is empty.
    2. VO
      singulars
      1. This table or related slice is empty.
    3. VO
      singulars
      1. This table or related slice is empty.
 

Querying!

 
Guidance

SQuiL has stopped working due to an internal error.

If you are curious, you may find further information in the browser console, which is accessible through the devtools (F12).

Reload