Note that there are some explanatory texts on larger screens.

plurals
  1. PO2D CUDA median filter optimization
    primarykey
    data
    text
    <p>I have implemented a 2D median filter in CUDA and the whole program is shown below.</p> <pre><code>#include "cuda_runtime.h" #include "cuda_runtime_api.h" #include "device_launch_parameters.h" #include &lt;iostream&gt; #include &lt;fstream&gt; #include &lt;iomanip&gt; #include &lt;windows.h&gt; #include &lt;io.h&gt; #include &lt;stdio.h&gt; #include&lt;conio.h&gt; #include &lt;cstdlib&gt; #include "cstdlib" #include &lt;process.h&gt; #include &lt;stdlib.h&gt; #include &lt;malloc.h&gt; #include &lt;ctime&gt; using namespace std; #define MEDIAN_DIMENSION 3 // For matrix of 3 x 3. We can Use 5 x 5 , 7 x 7 , 9 x 9...... #define MEDIAN_LENGTH 9 // Shoul be MEDIAN_DIMENSION x MEDIAN_DIMENSION = 3 x 3 #define BLOCK_WIDTH 16 // Should be 8 If matrix is of larger then of 5 x 5 elese error occur as " uses too much shared data " at surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH] #define BLOCK_HEIGHT 16// Should be 8 If matrix is of larger then of 5 x 5 elese error occur as " uses too much shared data " at surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH] __global__ void MedianFilter_gpu( unsigned short *Device_ImageData,int Image_Width,int Image_Height){ __shared__ unsigned short surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH]; int iterator; const int Half_Of_MEDIAN_LENGTH =(MEDIAN_LENGTH/2)+1; int StartPoint=MEDIAN_DIMENSION/2; int EndPoint=StartPoint+1; const int x = blockDim.x * blockIdx.x + threadIdx.x; const int y = blockDim.y * blockIdx.y + threadIdx.y; const int tid=threadIdx.y*blockDim.y+threadIdx.x; if(x&gt;=Image_Width || y&gt;=Image_Height) return; //Fill surround with pixel value of Image in Matrix Pettern of MEDIAN_DIMENSION x MEDIAN_DIMENSION if (x == 0 || x == Image_Width - StartPoint || y == 0 || y == Image_Height - StartPoint) { } else { iterator = 0; for (int r = x - StartPoint; r &lt; x + (EndPoint); r++) { for (int c = y - StartPoint; c &lt; y + (EndPoint); c++) { surround[tid][iterator] =*(Device_ImageData+(c*Image_Width)+r); iterator++; } } 
//Sort the Surround Array to Find Median. Use Bubble Short if Matrix oF 3 x 3 Matrix //You can use Insertion commented below to Short Bigger Dimension Matrix //// bubble short // for ( int i=0; i&lt;Half_Of_MEDIAN_LENGTH; ++i) { // Find position of minimum element int min=i; for ( int l=i+1; l&lt;MEDIAN_LENGTH; ++l) if (surround[tid][l] &lt;surround[tid][min] ) min=l; // Put found minimum element in its place unsigned short temp= surround[tid][i]; surround[tid][i]=surround[tid][min]; surround[tid][min]=temp; }//bubble short end //////insertion sort start // /*int t,j,i; for ( i = 1 ; i&lt; MEDIAN_LENGTH ; i++) { j = i; while ( j &gt; 0 &amp;&amp; surround[tid][j] &lt; surround[tid][j-1]) { t= surround[tid][j]; surround[tid][j]= surround[tid][j-1]; surround[tid][j-1] = t; j--; } }*/ ////insertion sort end *(Device_ImageData+(y*Image_Width)+x)= surround[tid][Half_Of_MEDIAN_LENGTH-1]; // it will give value of surround[tid][4] as Median Value if use 3 x 3 matrix __syncthreads(); } } int main( int argc, const char** argv ) { int dataLength; int p1; unsigned short* Host_ImageData = NULL; ifstream is; // Read File is.open ("D:\\Image_To_Be_Filtered.raw", ios::binary ); // get length of file: is.seekg (0, ios::end); dataLength = is.tellg(); is.seekg (0, ios::beg); Host_ImageData = new unsigned short[dataLength * sizeof(char) / sizeof(unsigned short)]; is.read ((char*)Host_ImageData,dataLength); is.close(); int Image_Width = 1580; int Image_Height = 1050; unsigned short *Host_ResultData = (unsigned short *)malloc(dataLength); unsigned short *Device_ImageData = NULL; ///////////////////////////// // As First time cudaMalloc take more time for memory alocation, i dont want to cosider this time in my process. 
//So Please Ignore Code For Displaying First CudaMelloc Time clock_t begin = clock(); unsigned short *forFirstCudaMalloc = NULL; cudaMalloc( (void**)&amp;forFirstCudaMalloc, dataLength * sizeof(unsigned short) ); clock_t end = clock(); double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC; cout&lt;&lt;"First CudaMelloc time = "&lt;&lt;elapsed_secs&lt;&lt;" Second\n" ; cudaFree( forFirstCudaMalloc ); //////////////////////////// //Actual Process Starts From Here clock_t beginOverAll = clock(); // cudaMalloc( (void**)&amp;Device_ImageData, dataLength * sizeof(unsigned short) ); cudaMemcpy(Device_ImageData, Host_ImageData, dataLength, cudaMemcpyHostToDevice);// copying Host Data To Device Memory For Filtering int x = static_cast&lt;int&gt;(ceilf(static_cast&lt;float&gt;(1580.0) /BLOCK_WIDTH)); int y = static_cast&lt;int&gt;(ceilf(static_cast&lt;float&gt;(1050.0) /BLOCK_HEIGHT)); const dim3 grid (x, y, 1); const dim3 block(BLOCK_WIDTH, BLOCK_HEIGHT, 1); begin = clock(); MedianFilter_gpu&lt;&lt;&lt;grid,block&gt;&gt;&gt;( Device_ImageData, Image_Width, Image_Height); cudaDeviceSynchronize(); end = clock(); elapsed_secs = double(end - begin) / CLOCKS_PER_SEC; cout&lt;&lt;"Process time = "&lt;&lt;elapsed_secs&lt;&lt;" Second\n" ; cudaMemcpy(Host_ResultData, Device_ImageData, dataLength, cudaMemcpyDeviceToHost); // copying Back Device Data To Host Memory To write In file After Filter Done clock_t endOverall = clock(); elapsed_secs = double(endOverall - beginOverAll) / CLOCKS_PER_SEC; cout&lt;&lt;"Complete Time = "&lt;&lt;elapsed_secs&lt;&lt;" Second\n" ; ofstream of2; //Write Filtered Image Into File of2.open("D:\\Filtered_Image.raw", ios::binary); of2.write((char*)Host_ResultData,dataLength); of2.close(); cout&lt;&lt;"\nEnd of Writing File. 
Press Any Key To Exit..!!"; cudaFree(Device_ImageData); delete Host_ImageData; delete Host_ResultData; getch(); return 0; } </code></pre> <p><a href="https://drive.google.com/file/d/0B08968iGFVeOVnJmOHBRTkJUdTA/edit?usp=sharing" rel="nofollow">Here</a> is the link for the file I use. I used <a href="http://rsbweb.nih.gov/ij/download.html" rel="nofollow">ImageJ</a> to store the image in "raw" format and also to read the "raw" image back. Each image pixel is <code>16</code>-bit, <code>unsigned short</code>. The width of the image is <code>1580</code> and the height is <code>1050</code>.</p> <p><strong>I strongly believe that the filter can be made more efficient and faster by using proper CUDA optimization.</strong></p> <p>Indeed, I'm running on a GeForce GT 520M card and the timings are the following:</p> <p>1) For <code>MEDIAN_DIMENSION</code> of <code>3 x 3 = 0.027 seconds</code></p> <p>2) For <code>MEDIAN_DIMENSION</code> of <code>5 x 5 = 0.206 seconds</code></p> <p>3) For <code>MEDIAN_DIMENSION</code> of <code>7 x 7 = 1.11 seconds</code></p> <p>4) For <code>MEDIAN_DIMENSION</code> of <code>9 x 9 = 4.931 seconds</code></p> <p>As you can see, as we increase <code>MEDIAN_DIMENSION</code>, the time increases dramatically, and I have applications where I generally use a higher <code>MEDIAN_DIMENSION</code> like <code>7 x 7</code> and <code>9 x 9</code>. I think that, by using CUDA, even for <code>9 x 9</code> the time should be less than <code>1 second</code>. </p> <p>Since I think that the sorting part is taking most of the time here, can we make the sorting part of the algorithm faster? </p> <p>Can we use <code>grid</code> and <code>block</code> more efficiently? 
Can I use a larger <code>BLOCK_WIDTH</code> and <code>BLOCK_HEIGHT</code> (like <code>32</code> and <code>32</code>) and still not hit the maximum <code>__shared__</code> memory limit, which is <code>4 KB</code> for my device?</p> <p>Can <code>__shared__</code> memory be used more efficiently?</p> <p>Any help will be appreciated.</p> <p>Thanks in advance. </p>
    singulars
    1. This table or related slice is empty.
    1. This table or related slice is empty.
    plurals
    1. This table or related slice is empty.
 

Querying!

 
Guidance

SQuiL has stopped working due to an internal error.

If you are curious you may find further information in the browser console, which is accessible through the devtools (F12).

Reload