StackOverflow2013

Note that there are some explanatory texts on larger screens.

plurals

PO2D CUDA median filter optimization
primarykey
Id
19634328
data
AcceptedAnswerId
0
AnswerCount
4
ClosedDate
CommentCount
4
CommunityOwnedDate
CreationDate
2013-10-28T12:04:12.493
FavoriteCount
4
LastActivityDate
2017-03-25T01:06:15.057
LastEditDate
2014-08-26T21:46:00.610
LastEditorUserId
1886641
OwnerUserId
1235021
ParentId
0
PostTypeId
1
Score
3
ViewCount
6930
LastEditorDisplayName
text
Body
I have implemented a 2D median filter in CUDA and the whole program is shown below. <pre><code>#include "cuda_runtime.h" #include "cuda_runtime_api.h" #include "device_launch_parameters.h" #include <iostream> #include <fstream> #include <iomanip> #include <windows.h> #include <io.h> #include <stdio.h> #include<conio.h> #include <cstdlib> #include "cstdlib" #include <process.h> #include <stdlib.h> #include <malloc.h> #include <ctime> using namespace std; #define MEDIAN_DIMENSION 3 // For matrix of 3 x 3. We can Use 5 x 5 , 7 x 7 , 9 x 9...... #define MEDIAN_LENGTH 9 // Shoul be MEDIAN_DIMENSION x MEDIAN_DIMENSION = 3 x 3 #define BLOCK_WIDTH 16 // Should be 8 If matrix is of larger then of 5 x 5 elese error occur as " uses too much shared data " at surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH] #define BLOCK_HEIGHT 16// Should be 8 If matrix is of larger then of 5 x 5 elese error occur as " uses too much shared data " at surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH] __global__ void MedianFilter_gpu( unsigned short *Device_ImageData,int Image_Width,int Image_Height){ __shared__ unsigned short surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH]; int iterator; const int Half_Of_MEDIAN_LENGTH =(MEDIAN_LENGTH/2)+1; int StartPoint=MEDIAN_DIMENSION/2; int EndPoint=StartPoint+1; const int x = blockDim.x * blockIdx.x + threadIdx.x; const int y = blockDim.y * blockIdx.y + threadIdx.y; const int tid=threadIdx.y*blockDim.y+threadIdx.x; if(x>=Image_Width || y>=Image_Height) return; //Fill surround with pixel value of Image in Matrix Pettern of MEDIAN_DIMENSION x MEDIAN_DIMENSION if (x == 0 || x == Image_Width - StartPoint || y == 0 || y == Image_Height - StartPoint) { } else { iterator = 0; for (int r = x - StartPoint; r < x + (EndPoint); r++) { for (int c = y - StartPoint; c < y + (EndPoint); c++) { surround[tid][iterator] =*(Device_ImageData+(c*Image_Width)+r); iterator++; } } //Sort the Surround Array to Find Median. 
Use Bubble Short if Matrix oF 3 x 3 Matrix //You can use Insertion commented below to Short Bigger Dimension Matrix //// bubble short // for ( int i=0; i<Half_Of_MEDIAN_LENGTH; ++i) { // Find position of minimum element int min=i; for ( int l=i+1; l<MEDIAN_LENGTH; ++l) if (surround[tid][l] <surround[tid][min] ) min=l; // Put found minimum element in its place unsigned short temp= surround[tid][i]; surround[tid][i]=surround[tid][min]; surround[tid][min]=temp; }//bubble short end //////insertion sort start // /*int t,j,i; for ( i = 1 ; i< MEDIAN_LENGTH ; i++) { j = i; while ( j > 0 && surround[tid][j] < surround[tid][j-1]) { t= surround[tid][j]; surround[tid][j]= surround[tid][j-1]; surround[tid][j-1] = t; j--; } }*/ ////insertion sort end *(Device_ImageData+(y*Image_Width)+x)= surround[tid][Half_Of_MEDIAN_LENGTH-1]; // it will give value of surround[tid][4] as Median Value if use 3 x 3 matrix __syncthreads(); } } int main( int argc, const char** argv ) { int dataLength; int p1; unsigned short* Host_ImageData = NULL; ifstream is; // Read File is.open ("D:\\Image_To_Be_Filtered.raw", ios::binary ); // get length of file: is.seekg (0, ios::end); dataLength = is.tellg(); is.seekg (0, ios::beg); Host_ImageData = new unsigned short[dataLength * sizeof(char) / sizeof(unsigned short)]; is.read ((char*)Host_ImageData,dataLength); is.close(); int Image_Width = 1580; int Image_Height = 1050; unsigned short *Host_ResultData = (unsigned short *)malloc(dataLength); unsigned short *Device_ImageData = NULL; ///////////////////////////// // As First time cudaMalloc take more time for memory alocation, i dont want to cosider this time in my process. 
//So Please Ignore Code For Displaying First CudaMelloc Time clock_t begin = clock(); unsigned short *forFirstCudaMalloc = NULL; cudaMalloc( (void**)&forFirstCudaMalloc, dataLength * sizeof(unsigned short) ); clock_t end = clock(); double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC; cout<<"First CudaMelloc time = "<<elapsed_secs<<" Second\n" ; cudaFree( forFirstCudaMalloc ); //////////////////////////// //Actual Process Starts From Here clock_t beginOverAll = clock(); // cudaMalloc( (void**)&Device_ImageData, dataLength * sizeof(unsigned short) ); cudaMemcpy(Device_ImageData, Host_ImageData, dataLength, cudaMemcpyHostToDevice);// copying Host Data To Device Memory For Filtering int x = static_cast<int>(ceilf(static_cast<float>(1580.0) /BLOCK_WIDTH)); int y = static_cast<int>(ceilf(static_cast<float>(1050.0) /BLOCK_HEIGHT)); const dim3 grid (x, y, 1); const dim3 block(BLOCK_WIDTH, BLOCK_HEIGHT, 1); begin = clock(); MedianFilter_gpu<<<grid,block>>>( Device_ImageData, Image_Width, Image_Height); cudaDeviceSynchronize(); end = clock(); elapsed_secs = double(end - begin) / CLOCKS_PER_SEC; cout<<"Process time = "<<elapsed_secs<<" Second\n" ; cudaMemcpy(Host_ResultData, Device_ImageData, dataLength, cudaMemcpyDeviceToHost); // copying Back Device Data To Host Memory To write In file After Filter Done clock_t endOverall = clock(); elapsed_secs = double(endOverall - beginOverAll) / CLOCKS_PER_SEC; cout<<"Complete Time = "<<elapsed_secs<<" Second\n" ; ofstream of2; //Write Filtered Image Into File of2.open("D:\\Filtered_Image.raw", ios::binary); of2.write((char*)Host_ResultData,dataLength); of2.close(); cout<<"\nEnd of Writing File. Press Any Key To Exit..!!"; cudaFree(Device_ImageData); delete Host_ImageData; delete Host_ResultData; getch(); return 0; } </code></pre> <a href="https://drive.google.com/file/d/0B08968iGFVeOVnJmOHBRTkJUdTA/edit?usp=sharing" rel="nofollow">Here</a> is the link for the file I use. 
I used <a href="http://rsbweb.nih.gov/ij/download.html" rel="nofollow">ImageJ</a> to store the image in "raw" format, and I use it again to read the filtered "raw" image. Each image pixel is <code>16</code>-bit, <code>unsigned short</code>. The width of the image is <code>1580</code> and the height is <code>1050</code>. I strongly believe that the filter can be made more efficient and faster by using proper CUDA optimization. Indeed, I'm running on a GeForce GT 520M card and the timings are the following: 1) For <code>MEDIAN_DIMENSION</code> of <code>3 x 3 = 0.027 seconds</code>; 2) For <code>MEDIAN_DIMENSION</code> of <code>5 x 5 = 0.206 seconds</code>; 3) For <code>MEDIAN_DIMENSION</code> of <code>7 x 7 = 1.11 seconds</code>; 4) For <code>MEDIAN_DIMENSION</code> of <code>9 x 9 = 4.931 seconds</code>. As you can see, as we increase <code>MEDIAN_DIMENSION</code>, the time increases steeply, and I have applications where I generally use a higher <code>MEDIAN_DIMENSION</code> like <code>7 x 7</code> and <code>9 x 9</code>. I think that, by using CUDA, even for <code>9 x 9</code> the time should be less than <code>1 second</code>. Since I think that the sorting part is taking most of the time here, can we make the sorting part of the algorithm faster? Can we use <code>grid</code> and <code>block</code> more efficiently? Can I use larger <code>BLOCK_WIDTH</code> and <code>BLOCK_HEIGHT</code> (like <code>32</code> and <code>32</code>) and still not hit the maximum <code>__shared__</code> memory limit, which is <code>4Kb</code> for my device? Can <code>__shared__</code> memory be used more efficiently? Any help will be appreciated. Thanks in advance. 
Tags
<c++><performance><optimization><cuda><median>
Title
2D CUDA median filter optimization
singulars
PostAcceptedAnswerId
1. This table or related slice is empty.
PostParentId
1. This table or related slice is empty.
PostTypePostTypeId
1. PTQuestion
UserLastEditorUserId
1. USJackOLantern
UserOwnerUserId
1. USJony
plurals
PostLinksPostIdRelatedPostId
1. PL
 singulars
 LinkTypeLinkTypeId
 LTLinked
PostLinksRelatedPostIdPostId
1. PL
 singulars
 LinkTypeLinkTypeId
 LTLinked
PostsAcceptedAnswerId
1. This table or related slice is empty.
PostsParentIdCreationDate
1. PO
 singulars
 PostTypePostTypeId
 PTAnswer
2. PO
 singulars
 PostTypePostTypeId
 PTAnswer
VotesPostIdCreationDate
1. VO
 singulars
 PostPostId
 PO2D CUDA median filter optimization
 UserUserId
 USJay
 VoteTypeVoteTypeId
 VTFavorite
2. VO
 singulars
 PostPostId
 PO2D CUDA median filter optimization
 UserUserId
 This table or related slice is empty.
 VoteTypeVoteTypeId
 VTUpMod
3. VO
 singulars
 PostPostId
 PO2D CUDA median filter optimization
 UserUserId
 USJony
 VoteTypeVoteTypeId
 VTFavorite
CommentsPostId

Querying!

Guidance

A row detail

Detail views are divided into sections. All the information in the data section comes from columns in the selected row. The other sections display data from other, related rows.

Related data can be linked in a to-one or a to-many fashion. Captions of data linked in a to-many fashion lead to a list view showing a filtered view of the related table.

Try moving around until you find a non-empty to-many entry, then click on its label to open the corresponding list view. You can move back to the root by clicking on the database name in the header.