StackOverflow2013

Note that there are some explanatory texts on larger screens.

plurals

POCUDA DE kernel not launching
primarykey
Id
16813935
data
AcceptedAnswerId
0
AnswerCount
1
ClosedDate
CommentCount
10
CommunityOwnedDate
CreationDate
2013-05-29T12:30:17.560
FavoriteCount
1
LastActivityDate
2013-06-04T10:51:53.040
LastEditDate
2013-06-03T13:15:10.370
LastEditorUserId
755974
OwnerUserId
755974
ParentId
0
PostTypeId
1
Score
0
ViewCount
232
LastEditorDisplayName
text
Body
I'm trying to do differential evolution on CUDA, but the problem is that kernel which is responsible for "Mutation, Crossover, Evaluation, Selection" never gets launched. Any help? Here's the entire code: <pre><code>#include <iostream> #include <curand_kernel.h> using namespace std; /**** ERROR HANDLING ****/ static void HandleError(cudaError_t err,const char *file, int line ) { if (err != cudaSuccess) { printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line ); system("pause"); exit( EXIT_FAILURE ); } } #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ )) /**** HOST AND DEVICE CONSTANTS****/ const int hNP=100, hD=31, hN=10; __constant__ int NP, D, N; __constant__ float Cr, F; /*** EVAL FUNCTION******/ /* Returns 1/d^12 - 2/d^6, where d is the Euclidean distance between the 3-component points a and b. */ __device__ float lennardJones(float a[3], float b[3]) { float distance = sqrt((a[0] - b[0]) * (a[0] - b[0]) + (a[1] - b[1]) * (a[1] - b[1]) + (a[2] - b[2]) * (a[2] - b[2])); float distance6 = distance * distance * distance * distance * distance * distance; float distance12 = distance6 * distance6; return 1/distance12 - 2/distance6; } /**** RANDOM GENERATORS***/ __device__ float rndFloat(curandState* globalState, int id) { curandState localState = globalState[id]; float RANDOM = curand_uniform(&localState); globalState[id] = localState; return RANDOM; } __device__ int rndInt(curandState* globalState, int id, int max) { curandState localState = globalState[id]; float RANDOM = curand_uniform(&localState); globalState[id] = localState; return RANDOM*max; } __device__ float rndFloat(curandState* globalState, int id, int max) { curandState localState = globalState[id]; float RANDOM = curand_uniform(&localState); globalState[id] = localState; return RANDOM*max; } __device__ float rndFloat(curandState* globalState, int id, int min,int max) { curandState localState = globalState[id]; float RANDOM = curand_uniform(&localState); globalState[id] = localState; return min+RANDOM*(max-min); } /*** SEEDS ****/ __global__ void setup_kernel (curandState 
* state, unsigned long seed) { int id= threadIdx.x+blockIdx.x*blockDim.x; if(id < NP) curand_init(seed, id, 0,&state[id]); } /**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/ __global__ void kernelE(curandState* globalState, float *population) { int id= threadIdx.x+blockIdx.x*blockDim.x; if(id < NP) { //init, just populating array with some specific numbers population[D*id]=0; population[D*id+N]=0; population[D*id +2*N]=0; population[D*id+1]=rndFloat(globalState,threadIdx.x,4); population[D*id+N+1]=0; population[D*id +2*N+1]=0; for(int i=2; i<N; i++){ float min= -4 - 1/4*abs((int)((i-4)/3)); float max= 4 + 1/4*abs((int)((i-4)/3)); if(i==2) { population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359); population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max); population[D*id +2*N+2]=0; } else { population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max); population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max); population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max); } } //eval float e=0; for(int i=0; i<N; i++) { for(int j=0; j<i; j++) { float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]}; e += lennardJones(a,b); } } population[D*id + D-1]=e; } } /**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/ /* NOTE(review): main() allocates the mutation buffer as hNP*3 ints, but the writes below use stride D (set to 31 from hD), so most of them land outside that allocation. */ __global__ void kernelP(curandState* globalState, int *mutation) { int id= threadIdx.x+blockIdx.x*blockDim.x; if(id<NP) { int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP); while(a == id){a = rndInt(globalState, id, NP);} while(b == a && b==id){b=rndInt(globalState, id, NP);} while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);} mutation[D*id+0]=a; mutation[D*id+1]=b; mutation[D*id+2]=c; } } /**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/ __global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop) { int id= 
threadIdx.x+blockIdx.x*blockDim.x; if(id<NP) { /* NOTE(review): reads the hNP*3-int index buffer with stride D (31), so most of these reads fall outside the allocation. */ int a=mutation[D*id+0], b=mutation[D*id+1], c=mutation[D*id+2]; //DE mutation and crossover int j=rndInt(globalState, id, NP); for(int i=0; i<D-1; i++) { //DE mutation pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]); //DE crossover if(Cr > rndFloat(globalState, id) && i!= j) pop[D*id+i]=population[D*id +i]; } // Eval pop[D*id+D-1]=0; for(int i=0; i<N; i++) { for(int j=0; j<i; j++) { float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]}; pop[D*id+D-1] += lennardJones(a,b); } } __syncthreads(); //DE selection if(pop[D*id+D-1] < population[D*id +D-1]) { for(int i=0; i<D; i++) population[D*id +i]=pop[D*id+i]; } } } void getBestScore(float *hpopulation) { int max=0; for(int i=1; i<hNP; i++) { if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1]) max=i; } for(int j=0; j<hN; j++) cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl; cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl; } int main() { cudaEvent_t start,stop; HANDLE_ERROR(cudaEventCreate(&start)); HANDLE_ERROR(cudaEventCreate(&stop)); HANDLE_ERROR(cudaEventRecord(start,0)); int device, st=100; float hCr=0.6f, hF=0.8f; cudaDeviceProp prop; HANDLE_ERROR(cudaGetDevice(&device)); HANDLE_ERROR(cudaGetDeviceProperties(&prop, device)); // int SN = prop.maxThreadsPerBlock; //512 threads per block //int SB = (hNP+(SN-1))/SN; //constants NP, D, N, Cr, F HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int))); HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int))); HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int))); HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float))); HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float))); //seeds curandState* devStates; HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState))); setup_kernel <<< 1, hNP>>> (devStates, 50); //population float *population, *pop; float 
hpopulation[hNP*hD]; HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float))); HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float))); //mutation int *mutation, *mutation1; int *hmutation; HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault)); HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int))); HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int))); //stream cudaStream_t stream_i, stream_j; HANDLE_ERROR(cudaStreamCreate(&stream_i)); HANDLE_ERROR(cudaStreamCreate(&stream_j)); /* NOTE(review): no kernel launch in this listing is followed by cudaGetLastError(), so a failed launch is not reported at the launch site. */ kernelE<<<1,hNP, 0,stream_i>>>(devStates,population); kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation); while(st != 0) { /*** COPYING MUTATION INDICES***/ HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j)); HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i)); /**** CALLING KERNELS****/ kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation); kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop); st--; //HANDLE_ERROR(cudaStreamSynchronize(stream_i)); //HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost)); //getBestScore(hpopulation); //cin.get(); } HANDLE_ERROR(cudaStreamSynchronize(stream_i)); HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost)); getBestScore(hpopulation); cudaEventRecord(stop,0); cudaEventSynchronize(stop); float time; HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop)); cout<<endl<<"Tme: "<<time/1000<<"s"<<endl; HANDLE_ERROR(cudaEventDestroy(start)); HANDLE_ERROR(cudaEventDestroy(stop)); HANDLE_ERROR(cudaStreamDestroy(stream_i)); HANDLE_ERROR(cudaStreamDestroy(stream_j)); HANDLE_ERROR(cudaFree(population)); HANDLE_ERROR(cudaFree(pop)); HANDLE_ERROR(cudaFreeHost(hmutation)); HANDLE_ERROR(cudaFree(mutation1)); HANDLE_ERROR(cudaFree(devStates)); system("pause"); return 0; } </code></pre> UPDATE - Solution: 
<pre><code>#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <curand_kernel.h>
using namespace std;

/**** ERROR HANDLING ****/
// Prints the CUDA error string together with the source location and
// terminates the program when err is anything other than cudaSuccess.
static void HandleError(cudaError_t err, const char *file, int line)
{
    if (err != cudaSuccess)
    {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        system("pause");
        exit(EXIT_FAILURE);
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

/**** HOST AND DEVICE CONSTANTS ****/
// hNP: population size, hN: number of atoms, hD: floats per individual.
// One individual is laid out as x[0..N), y[0..N), z[0..N) followed by its
// fitness in the last slot (index D-1), hence hD == 3*hN + 1.
const int hNP = 100, hD = 31, hN = 10;
__constant__ int NP, D, N;
__constant__ float Cr, F;

/*** EVAL FUNCTION ***/
// Returns 1/d^12 - 2/d^6, where d is the Euclidean distance between the
// 3-component points a and b.
__device__ float lennardJones(float a[3], float b[3])
{
    float dx = a[0] - b[0];
    float dy = a[1] - b[1];
    float dz = a[2] - b[2];
    float distance = sqrtf(dx * dx + dy * dy + dz * dz);
    float distance6 = distance * distance * distance * distance * distance * distance;
    float distance12 = distance6 * distance6;
    return 1 / distance12 - 2 / distance6;
}

// Sums lennardJones over every unordered pair of the N atoms stored in one
// individual (layout described at the constants above).
__device__ float clusterEnergy(const float *individual)
{
    float energy = 0;
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < i; j++)
        {
            float a[3] = { individual[i], individual[N + i], individual[2 * N + i] };
            float b[3] = { individual[j], individual[N + j], individual[2 * N + j] };
            energy += lennardJones(a, b);
        }
    }
    return energy;
}

/**** RANDOM GENERATORS ****/
// Draws one uniform sample from globalState[id] and stores the advanced state
// back. curand_uniform excludes 0.0 and includes 1.0.
__device__ float rndFloat(curandState *globalState, int id)
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState;
    return RANDOM;
}

// Uniform integer in [0, max). Because the underlying sample can be exactly
// 1.0, the scaled value can reach max; it is clamped so the result is always a
// valid index.
__device__ int rndInt(curandState *globalState, int id, int max)
{
    int value = (int)(rndFloat(globalState, id) * max);
    return value < max ? value : max - 1;
}

// Uniform float in (0, max]. The bound is a float so that non-integral limits
// such as pi are not truncated.
__device__ float rndFloat(curandState *globalState, int id, float max)
{
    return rndFloat(globalState, id) * max;
}

// Uniform float in (min, max].
__device__ float rndFloat(curandState *globalState, int id, float min, float max)
{
    return min + rndFloat(globalState, id) * (max - min);
}

/*** SEEDS ***/
// One thread per RNG state; threads with id >= NP do nothing.
__global__ void setup_kernel(curandState *state, unsigned long seed)
{
    int id = threadIdx.x + blockIdx.x * blockDim.x;
    if (id < NP)
        curand_init(seed, id, 0, &state[id]);
}

/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ****/
// One thread per individual. Fills the individual's coordinates and stores its
// energy in slot D-1. Uses globalState[id], so the state array must hold at
// least NP entries.
__global__ void kernelE(curandState *globalState, float *population)
{
    int id = threadIdx.x + blockIdx.x * blockDim.x;
    if (id >= NP)
        return;

    float *self = population + D * id;

    // Atom 0 sits at the origin.
    self[0] = 0;
    self[N] = 0;
    self[2 * N] = 0;

    // Atom 1 lies on the x axis.
    self[1] = rndFloat(globalState, id, 4.0f);
    self[N + 1] = 0;
    self[2 * N + 1] = 0;

    for (int i = 2; i < N; i++)
    {
        // 0.25f instead of 1/4: the integer division evaluated to 0 and the
        // bounds never widened.
        float widen = 0.25f * abs((int)((i - 4) / 3));
        float min = -4 - widen;
        float max = 4 + widen;
        if (i == 2)
        {
            // Atom 2 stays in the z = 0 plane.
            self[2] = rndFloat(globalState, id, 3.14159265359f);
            self[N + 2] = rndFloat(globalState, id, min, max);
            self[2 * N + 2] = 0;
        }
        else
        {
            self[i] = rndFloat(globalState, id, min, max);
            self[N + i] = rndFloat(globalState, id, min, max);
            self[2 * N + i] = rndFloat(globalState, id, min, max);
        }
    }

    // eval
    self[D - 1] = clusterEnergy(self);
}

/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
// One thread per individual. Writes three donor indices at mutation[3*id..3*id+2];
// they differ from id and from each other. Requires NP >= 4, otherwise the
// rejection loops cannot terminate.
__global__ void kernelP(curandState *globalState, int *mutation)
{
    int id = threadIdx.x + blockIdx.x * blockDim.x;
    if (id >= NP)
        return;

    int a, b, c;
    do { a = rndInt(globalState, id, NP); } while (a == id);
    do { b = rndInt(globalState, id, NP); } while (b == id || b == a);
    do { c = rndInt(globalState, id, NP); } while (c == id || c == a || c == b);

    mutation[3 * id + 0] = a;
    mutation[3 * id + 1] = b;
    mutation[3 * id + 2] = c;
}

/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ****/
// One thread per individual. Builds a trial vector in pop, evaluates it and
// overwrites the individual in population when the trial has lower energy.
// Must be launched as a single block: the barrier below is what separates the
// reads of other individuals from the selection writes, and __syncthreads()
// only covers one block.
__global__ void kernelMCER(curandState *globalState, float *population, int *mutation, float *pop)
{
    int id = threadIdx.x + blockIdx.x * blockDim.x;
    bool active = id < NP;

    if (active)
    {
        int a = mutation[3 * id + 0], b = mutation[3 * id + 1], c = mutation[3 * id + 2];
        const float *target = population + D * id;
        float *trial = pop + D * id;

        // Gene that always comes from the mutant; drawn over the D-1 genes
        // (drawing over NP made it fall outside the gene range most of the time).
        int forced = rndInt(globalState, id, D - 1);
        for (int i = 0; i < D - 1; i++)
        {
            // DE mutation
            float mutant = population[D * a + i] + F * (population[D * b + i] - population[D * c + i]);
            // DE crossover; the random draw happens for every gene.
            bool keepTarget = (Cr > rndFloat(globalState, id)) && (i != forced);
            trial[i] = keepTarget ? target[i] : mutant;
        }

        // Eval
        trial[D - 1] = clusterEnergy(trial);
    }

    // Reached by every thread of the block, including those with id >= NP.
    __syncthreads();

    // DE selection
    if (active && pop[D * id + D - 1] < population[D * id + D - 1])
    {
        for (int i = 0; i < D; i++)
            population[D * id + i] = pop[D * id + i];
    }
}

// Prints the coordinates and energy of the individual with the lowest energy
// in the host copy of the population.
void getBestScore(float *hpopulation)
{
    int best = 0;
    for (int i = 1; i < hNP; i++)
    {
        if (hpopulation[hD * best + hD - 1] > hpopulation[hD * i + hD - 1])
            best = i;
    }

    for (int j = 0; j < hN; j++)
        cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*best+j]<<", "<<hpopulation[hD*best+hN+j]<<", "<<hpopulation[hD*best+hN*2+j]<<") "<<endl;
    cout<<"Result: "<<hpopulation[hD*best+hD-1]<<endl;
}

int main()
{
    cudaEvent_t start, stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));
    HANDLE_ERROR(cudaEventRecord(start, 0));

    int device, st = 100;
    float hCr = 0.6f, hF = 0.8f;
    cudaDeviceProp prop;
    HANDLE_ERROR(cudaGetDevice(&device));
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));

    // Every kernel is launched as one block of hNP threads.
    if (hNP > prop.maxThreadsPerBlock)
    {
        printf("Population size %d exceeds the device limit of %d threads per block\n",
               hNP, prop.maxThreadsPerBlock);
        return EXIT_FAILURE;
    }

    // constants NP, D, N, Cr, F
    HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
    HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));

    // seeds: each stream gets its own RNG states, because kernelP (stream_j)
    // and kernelE/kernelMCER (stream_i) may run at the same time and would
    // otherwise update the same curandState entries concurrently.
    curandState *devStates, *devStatesP;
    HANDLE_ERROR(cudaMalloc(&devStates, hNP * sizeof(curandState)));
    HANDLE_ERROR(cudaMalloc(&devStatesP, hNP * sizeof(curandState)));
    setup_kernel<<<1, hNP>>>(devStates, 50);
    HANDLE_ERROR(cudaGetLastError());
    setup_kernel<<<1, hNP>>>(devStatesP, 51);
    HANDLE_ERROR(cudaGetLastError());

    // population
    float *population, *pop;
    float hpopulation[hNP * hD];
    HANDLE_ERROR(cudaMalloc((void**)&population, hNP * hD * sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&pop, hNP * hD * sizeof(float)));

    // mutation
    const size_t mutationBytes = hNP * 3 * sizeof(int);
    int *mutation, *mutation1;
    int *hmutation;
    HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, mutationBytes, cudaHostAllocDefault));
    HANDLE_ERROR(cudaMalloc((void**)&mutation, mutationBytes));
    HANDLE_ERROR(cudaMalloc((void**)&mutation1, mutationBytes));

    // streams, plus the two events that order access to hmutation between them
    cudaStream_t stream_i, stream_j;
    cudaEvent_t indicesOnHost, hostBufferFree;
    HANDLE_ERROR(cudaStreamCreate(&stream_i));
    HANDLE_ERROR(cudaStreamCreate(&stream_j));
    HANDLE_ERROR(cudaEventCreateWithFlags(&indicesOnHost, cudaEventDisableTiming));
    HANDLE_ERROR(cudaEventCreateWithFlags(&hostBufferFree, cudaEventDisableTiming));

    kernelE<<<1, hNP, 0, stream_i>>>(devStates, population);
    HANDLE_ERROR(cudaGetLastError());
    kernelP<<<1, hNP, 0, stream_j>>>(devStatesP, mutation);
    HANDLE_ERROR(cudaGetLastError());

    while (st != 0)
    {
        /*** COPYING MUTATION INDICES ***/
        // stream_j may only overwrite hmutation once stream_i has finished
        // uploading the previous contents (no-op on the first pass, when the
        // event has not been recorded yet).
        HANDLE_ERROR(cudaStreamWaitEvent(stream_j, hostBufferFree, 0));
        HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation, mutationBytes, cudaMemcpyDeviceToHost, stream_j));
        HANDLE_ERROR(cudaEventRecord(indicesOnHost, stream_j));

        // stream_i may only read hmutation after the download has completed.
        HANDLE_ERROR(cudaStreamWaitEvent(stream_i, indicesOnHost, 0));
        HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation, mutationBytes, cudaMemcpyHostToDevice, stream_i));
        HANDLE_ERROR(cudaEventRecord(hostBufferFree, stream_i));

        /**** CALLING KERNELS ****/
        // Indices for the next generation are produced while this generation runs.
        kernelP<<<1, hNP, 0, stream_j>>>(devStatesP, mutation);
        HANDLE_ERROR(cudaGetLastError());
        kernelMCER<<<1, hNP, 0, stream_i>>>(devStates, population, mutation1, pop);
        HANDLE_ERROR(cudaGetLastError());
        st--;
    }

    // Synchronizing both streams also surfaces any asynchronous kernel failure.
    HANDLE_ERROR(cudaStreamSynchronize(stream_i));
    HANDLE_ERROR(cudaStreamSynchronize(stream_j));
    HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP * hD * sizeof(float), cudaMemcpyDeviceToHost));
    getBestScore(hpopulation);

    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    float time;
    HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
    cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;

    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));
    HANDLE_ERROR(cudaEventDestroy(indicesOnHost));
    HANDLE_ERROR(cudaEventDestroy(hostBufferFree));
    HANDLE_ERROR(cudaStreamDestroy(stream_i));
    HANDLE_ERROR(cudaStreamDestroy(stream_j));
    HANDLE_ERROR(cudaFree(population));
    HANDLE_ERROR(cudaFree(pop));
    HANDLE_ERROR(cudaFreeHost(hmutation));
    HANDLE_ERROR(cudaFree(mutation));
    HANDLE_ERROR(cudaFree(mutation1));
    HANDLE_ERROR(cudaFree(devStates));
    HANDLE_ERROR(cudaFree(devStatesP));
    system("pause");
    return 0;
}
</code></pre>
Tags
<cuda><kernel>
Title
CUDA DE kernel not launching
singulars
PostAcceptedAnswerId
1. This table or related slice is empty.
PostParentId
1. This table or related slice is empty.
PostTypePostTypeId
1. PTQuestion
UserLastEditorUserId
1. USuser755974
UserOwnerUserId
1. USuser755974
plurals
PostLinksPostIdRelatedPostId
1. PL
 singulars
 LinkTypeLinkTypeId
 LTLinked
PostLinksRelatedPostIdPostId
1. This table or related slice is empty.
PostsAcceptedAnswerId
1. This table or related slice is empty.
PostsParentIdCreationDate
1. PO
 singulars
 PostTypePostTypeId
 PTAnswer
VotesPostIdCreationDate
1. VO
 singulars
 PostPostId
 POCUDA DE kernel not launching
 UserUserId
 This table or related slice is empty.
 VoteTypeVoteTypeId
 VTClose
2. VO
 singulars
 PostPostId
 POCUDA DE kernel not launching
 UserUserId
 This table or related slice is empty.
 VoteTypeVoteTypeId
 VTClose
CommentsPostId

Querying!

Guidance

A row detail

Detail views are divided into sections. All the information in the data section comes from columns in the selected row. The other sections display data from other, related rows.

Related data can be related in a to-one or a to-many fashion. Captions of data related in a to-many fashion link to a list view showing a filtered view of the table.

Try moving around until you find a non-empty to-many entry and click on the label to get to one. You can move back to the root by clicking on the database name in the header.