Note that there is some explanatory text on larger screens.

plurals
  1. PO
    primarykey
    data
    text
    <p>we can use tiled matrix multiplication and i found it has a better execution time . </p> <pre><code>#include &lt;wb.h&gt; #define wbCheck(stmt) do { \ cudaError_t err = stmt; \ if (err != cudaSuccess) { \ wbLog(ERROR, "Failed to run stmt ", #stmt); \ return -1; \ } \ } while(0) // Compute C = A * B __global__ void matrixMultiplyShared(float * A, float * B, float * C, int numARows, int numAColumns, int numBRows, int numBColumns, int numCRows, int numCColumns) { //@@ Insert code to implement matrix multiplication here //@@ You have to use shared memory for this MP const int TILE_WIDTH = 32; __shared__ float sharedA[TILE_WIDTH][TILE_WIDTH]; __shared__ float sharedB[TILE_WIDTH][TILE_WIDTH]; int bx = blockIdx.x; int by = blockIdx.y; int tx = threadIdx.x; int ty = threadIdx.y; int Row = by*TILE_WIDTH + ty; int Col = bx*TILE_WIDTH + tx; float Cvalue = 0.0; if (numAColumns != numBRows) return ; for (int i = 0; i &lt; (int)(ceil((float)numAColumns/TILE_WIDTH)); i++) { if (i*TILE_WIDTH + tx &lt; numAColumns &amp;&amp; Row &lt; numARows){ sharedA[ty][tx] = A[Row*numAColumns + i*TILE_WIDTH + tx]; }else{ sharedA[ty][tx] = 0.0; } if (i*TILE_WIDTH + ty &lt; numBRows &amp;&amp; Col &lt; numBColumns){ sharedB[ty][tx] = B[(i*TILE_WIDTH + ty)*numBColumns + Col]; }else{ sharedB[ty][tx] = 0.0; } __syncthreads(); if(Row &lt; numARows &amp;&amp; Col &lt; numBColumns){ for(int j = 0; j &lt; TILE_WIDTH; j++) Cvalue += sharedA[ty][j] * sharedB[j][tx]; } __syncthreads(); } if (Row &lt; numCRows &amp;&amp; Col &lt; numCColumns) C[Row*numCColumns + Col] = Cvalue; } int main(int argc, char ** argv) { wbArg_t args; float * hostA; // The A matrix float * hostB; // The B matrix float * hostC; // The output C matrix float * deviceA; float * deviceB; float * deviceC; int numARows; // number of rows in the matrix A int numAColumns; // number of columns in the matrix A int numBRows; // number of rows in the matrix B int numBColumns; // number of columns in the matrix B int numCRows; // number 
of rows in the matrix C (you have to set this) int numCColumns; // number of columns in the matrix C (you have to set this) int TILE_WIDTH = 32; args = wbArg_read(argc, argv); wbTime_start(Generic, "Importing data and creating memory on host"); hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &amp;numARows, &amp;numAColumns); hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &amp;numBRows, &amp;numBColumns); //@@ Set numCRows and numCColumns numCRows = 0; numCColumns = 0; numCRows = numARows; numCColumns = numBColumns; //@@ Allocate the hostC matrix hostC = (float*) malloc(sizeof(float)*numCRows*numCColumns); wbTime_stop(Generic, "Importing data and creating memory on host"); wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns); wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns); wbTime_start(GPU, "Allocating GPU memory."); //@@ Allocate GPU memory here cudaMalloc((void**)&amp;deviceA , sizeof(float)*numARows*numAColumns ); cudaMalloc((void**)&amp;deviceB , sizeof(float)*numBRows*numBColumns); cudaMalloc((void**)&amp;deviceC , sizeof(float)*numCRows*numCColumns); wbTime_stop(GPU, "Allocating GPU memory."); wbTime_start(GPU, "Copying input memory to the GPU."); //@@ Copy memory to the GPU here cudaMemcpy(deviceA, hostA, sizeof(float)*numARows*numAColumns, cudaMemcpyHostToDevice); cudaMemcpy(deviceB, hostB, sizeof(float)*numBRows*numBColumns, cudaMemcpyHostToDevice); wbTime_stop(GPU, "Copying input memory to the GPU."); //@@ Initialize the grid and block dimensions here int dimX = (int)(ceil((float)numCColumns / TILE_WIDTH)); int dimY = (int)(ceil((float)numCRows / TILE_WIDTH)); dim3 DimGrid(dimX, dimY); dim3 DimBlock(TILE_WIDTH, TILE_WIDTH); wbTime_start(Compute, "Performing CUDA computation"); //@@ Launch the GPU Kernel here matrixMultiplyShared&lt;&lt;&lt;DimGrid , DimBlock&gt;&gt;&gt;(deviceA , deviceB , deviceC , numARows , numAColumns, numBRows ,numBColumns , numCRows , numCColumns); cudaThreadSynchronize(); 
wbTime_stop(Compute, "Performing CUDA computation"); wbTime_start(Copy, "Copying output memory to the CPU"); //@@ Copy the GPU memory back to the CPU here cudaMemcpy(hostC, deviceC, sizeof(float)*numCRows*numCColumns , cudaMemcpyDeviceToHost); wbTime_stop(Copy, "Copying output memory to the CPU"); wbTime_start(GPU, "Freeing GPU Memory"); //@@ Free the GPU memory here cudaFree(deviceA); cudaFree(deviceB); cudaFree(deviceC); wbTime_stop(GPU, "Freeing GPU Memory"); wbSolution(args, hostC, numCRows, numCColumns); free(hostA); free(hostB); free(hostC); return 0; } </code></pre>
    singulars
    1. This table or related slice is empty.
    plurals
    1. This table or related slice is empty.
    1. This table or related slice is empty.
    1. This table or related slice is empty.
    1. This table or related slice is empty.
    1. This table or related slice is empty.
    1. This table or related slice is empty.
 

Querying!

 
Guidance

SQuiL has stopped working due to an internal error.

If you are curious, you may find further information in the browser console, which is accessible through the browser's developer tools (F12).

Reload