libcvd-members
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[libcvd-members] libcvd cvd/convolution.h cvd_src/convolution.cc...


From: Ethan Eade
Subject: [libcvd-members] libcvd cvd/convolution.h cvd_src/convolution.cc...
Date: Fri, 22 Feb 2008 15:24:34 +0000

CVSROOT:        /cvsroot/libcvd
Module name:    libcvd
Changes by:     Ethan Eade <ethaneade>  08/02/22 15:24:34

Modified files:
        cvd            : convolution.h 
        cvd_src        : convolution.cc 
        cvd/internal   : aligned_mem.h 

Log message:
        Added SSE optimized gaussian convolution for float -> float.  Also 
added AlignedMem
        scope guard to automatically release aligned memory.

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/libcvd/cvd/convolution.h?cvsroot=libcvd&r1=1.9&r2=1.10
http://cvs.savannah.gnu.org/viewcvs/libcvd/cvd_src/convolution.cc?cvsroot=libcvd&r1=1.1&r2=1.2
http://cvs.savannah.gnu.org/viewcvs/libcvd/cvd/internal/aligned_mem.h?cvsroot=libcvd&r1=1.10&r2=1.11

Patches:
Index: cvd/convolution.h
===================================================================
RCS file: /cvsroot/libcvd/libcvd/cvd/convolution.h,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -b -r1.9 -r1.10
--- cvd/convolution.h   4 Aug 2007 00:08:51 -0000       1.9
+++ cvd/convolution.h   22 Feb 2008 15:24:33 -0000      1.10
@@ -383,9 +383,9 @@
   template <class S> static inline T at(const T* input, const S& factor, const 
S* ) { return *input * factor; }
 };
 
-#if 1
 template <class T,class S> inline const T* convolveMiddle(const T* input, 
const S& factor, const S* kernel, int ksize, int n, T* output) {
-#define CALL_CM(I) for (int j=0; j<n; j++, input++) { *(output++) = 
ConvolveMiddle<T,I>::at(input, factor, kernel); } break
+#define CALL_CM(I) for (int j=0; j<n; ++j, ++input, ++output) { *output = 
ConvolveMiddle<T,I>::at(input, factor, kernel); } break
+    
   switch (ksize) {
   case 0: CALL_CM(0);
   case 1: CALL_CM(1);
@@ -406,14 +406,43 @@
 #undef CALL_CM
 }
 
-#else
 
-template <class T,class S> const T* convolveMiddle(const T* input, const S& 
factor, const S* kernel, int ksize, int n, T* output) {
-    assign_multiple(input, factor, output, n);    
-    for (int r=0; r <ksize; r++) {
-       add_multiple_of_sum(input-r-1, input+r+1, kernel[r], output, n);
+#if defined(CVD_HAVE_SSE) && defined(CVD_HAVE_XMMINTRIN)
+
+#include <xmmintrin.h>
+
+template <class S> const float* convolveMiddle(const float* input, const S& 
factor, const S* kernel, int ksize, int n, float* output) {
+    __m128 kkkk[ksize+1] __attribute__ ((aligned(16)));
+    kkkk[0] = _mm_set1_ps(factor);
+    for (int i=1; i<=ksize; i++)
+       kkkk[i] = _mm_set1_ps(kernel[i-1]);
+    
+    int i=0;
+    for (; i<n && !is_aligned<16>(input); i++, ++input, ++output) { 
+       *output = ConvolveMiddle<float,-1>::at(input, factor, kernel, ksize); 
+    }    
+    
+    for (; i<n-3; i+=4) {
+       __m128 sum = _mm_mul_ps(kkkk[0], _mm_load_ps(input));
+       const float* back = input - ksize-1;
+       const float* front = input + ksize+1;
+       const __m128* kp = kkkk+ksize+1;
+       while (++back != --front) {
+           --kp;
+           const __m128& kr = *kp;
+           const __m128 b = _mm_loadu_ps(back);
+           const __m128 f = _mm_loadu_ps(front);
+           sum = _mm_add_ps(sum, _mm_mul_ps(kr, _mm_add_ps(b,f)));
+       }
+       _mm_storeu_ps(output, sum);
+       output += 4;
+       input += 4;
     }
-    return input + n;
+
+    for (; i<n; i++, ++input, ++output) { 
+       *output = ConvolveMiddle<float,-1>::at(input, factor, kernel, ksize); 
+    }    
+    return input;
 }
 
 #endif
@@ -429,7 +458,8 @@
   typedef typename Pixel::traits<T>::float_type sum_type;
   assert(out.size() == I.size());
   int ksize = (int)(sigmas*sigma + 0.5);
-  sum_comp_type *kernel = new sum_comp_type[ksize];
+    //std::cerr << "sigma: " << sigma << " kernel: " << ksize << std::endl;
+    std::vector<sum_comp_type> kernel(ksize);
   sum_comp_type ksum = sum_comp_type();
   for (int i=1; i<=ksize; i++)
     ksum += (kernel[i-1] = 
static_cast<sum_comp_type>(exp(-i*i/(2*sigma*sigma))));
@@ -440,13 +470,16 @@
   int h = I.size().y;
   int swin = 2*ksize;
 
-  sum_type* buffer = Internal::aligned_mem<sum_type,16>::alloc(w*(swin+1));
-  sum_type* rowbuf = Internal::aligned_mem<sum_type,16>::alloc(w);
-  sum_type* outbuf = Internal::aligned_mem<sum_type,16>::alloc(w);
+    AlignedMem<sum_type,16> buffer(w*(swin+1));
+    AlignedMem<sum_type,16> aligned_rowbuf(w);
+    AlignedMem<sum_type,16> aligned_outbuf(w);
+
+    sum_type* rowbuf = aligned_rowbuf.data();
+    sum_type* outbuf = aligned_outbuf.data();
 
-  sum_type** rows = new sum_type*[swin+1];
+    std::vector<sum_type*> rows(swin+1);
   for (int k=0;k<swin+1;k++)
-    rows[k] = buffer + k*w;
+       rows[k] = buffer.data() + k*w;
 
   T* output = out.data();
   for (int i=0; i<h; i++) {
@@ -461,7 +494,7 @@
     }
     // middle of row
     input += ksize;
-    input = convolveMiddle<sum_type, sum_comp_type>(input, factor, kernel, 
ksize, w-swin, next_row+ksize);
+       input = convolveMiddle<sum_type, sum_comp_type>(input, factor, 
&kernel.front(), ksize, w-swin, next_row+ksize);
     // end of row
     for (int j=w-ksize; j<w; j++, input++) {
       sum_type hsum = *input * factor;
@@ -518,13 +551,12 @@
     rows[swin] = tmp;
   }
 
-  Internal::aligned_mem<sum_type,16>::release(buffer);
-  Internal::aligned_mem<sum_type,16>::release(rowbuf);
-  Internal::aligned_mem<sum_type,16>::release(outbuf);
-  delete[] kernel;
-  delete[] rows;
 }
 
+#if defined(CVD_HAVE_SSE) && defined(CVD_HAVE_XMMINTRIN)
+void convolveGaussian(const BasicImage<float>& I, BasicImage<float>& out, 
double sigma, double sigmas=3.0);
+#endif
+
 template <class T, class O, class K> void convolve_gaussian_3(const 
BasicImage<T>& I, BasicImage<O>& out, K k1, K k2)
 {    
     assert(I.size() == out.size());

Index: cvd_src/convolution.cc
===================================================================
RCS file: /cvsroot/libcvd/libcvd/cvd_src/convolution.cc,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -b -r1.1 -r1.2
--- cvd_src/convolution.cc      26 Oct 2005 20:26:12 -0000      1.1
+++ cvd_src/convolution.cc      22 Feb 2008 15:24:34 -0000      1.2
@@ -104,4 +104,186 @@
     }
 }
 
+#if defined(CVD_HAVE_SSE) && defined(CVD_HAVE_XMMINTRIN)
+inline void convolveMiddle5(const float* in, double factor, const double 
kernel[], int count, float* out)
+{
+    int i;
+    const __m128 ffff = _mm_set1_ps(factor);
+    const __m128 k1 = _mm_set1_ps(kernel[0]);
+    const __m128 k2 = _mm_set1_ps(kernel[1]);
+    for (i=0; i+3<count; i+=4, in += 4, out += 4) {
+       __m128 sum = (_mm_mul_ps(_mm_loadu_ps(in), ffff) + 
+                     _mm_mul_ps(k1, (_mm_loadu_ps(in-1) + _mm_loadu_ps(in+1))) 
+ 
+                     _mm_mul_ps(k2, (_mm_loadu_ps(in-2) + 
_mm_loadu_ps(in+2))));
+       _mm_storeu_ps(out, sum);
+    }
+
+    for (; i<count; ++i, ++in, ++out) {
+       double sum = in[0] * factor + kernel[0] * (in[-1] + in[1]) + kernel[1] 
* (in[-2] + in[2]);
+       *out = sum;
+    }
+}
+
+inline void convolveMiddle(const float* in, double factor, const 
vector<double>& kernel, int count, float* out)
+{
+    const int ksize = kernel.size();
+    if (ksize == 2) {
+       convolveMiddle5(in, factor, &kernel[0], count, out);
+       return;
+    }
+    int i;
+    const __m128 ffff = _mm_set1_ps(factor);
+    for (i=0; i+3<count; i+=4, in += 4, out += 4) {
+       __m128 sum = _mm_mul_ps(_mm_loadu_ps(in), ffff);
+       for (int k=1; k<=ksize; ++k)
+           sum += _mm_set1_ps(kernel[k-1]) * (_mm_loadu_ps(in-k) + 
_mm_loadu_ps(in+k));
+       _mm_storeu_ps(out, sum);
+    }
+
+    for (; i<count; ++i, ++in, ++out) {
+       double sum = in[0] * factor;
+       for (int k=1; k<=ksize; ++k)
+           sum += kernel[k-1] * (in[-k] + in[k]);
+       *out = sum;
+    }
+}
+
+inline void convolveVertical5(const vector<float*> row, double factor, double 
kernel[], int count, float* out)
+{
+    const int ksize = 2;
+    int i;
+    for (i=0; i<count && !is_aligned<16>(row[0] + i); ++i, ++out) {
+       double sum = row[ksize][i] * factor + (row[1][i] + row[3][i]) * 
kernel[0] + (row[0][i] + row[4][i]) * kernel[1];
+       *out = sum;
+    }
+
+    const __m128 ffff = _mm_set1_ps(factor);
+    const __m128 k1 = _mm_set1_ps(kernel[0]);
+    const __m128 k2 = _mm_set1_ps(kernel[1]);
+
+    for (; i+3<count; i+=4) {
+       __m128 sum = (_mm_load_ps(row[ksize] + i) * ffff
+                     + (_mm_load_ps(row[ksize-1]+i) + 
_mm_load_ps(row[ksize+1]+i)) * k1
+                     + (_mm_load_ps(row[ksize-2]+i) + 
_mm_load_ps(row[ksize+2]+i)) * k2);
+       _mm_store_ps(out + i, sum);
+    }    
+    for (; i<count; ++i, ++out) {
+       double sum = row[ksize][i] * factor + (row[1][i] + row[3][i]) * 
kernel[0] + (row[0][i] + row[4][i]) * kernel[1];
+       *out = sum;
+    }
+}
+
+inline void convolveVertical(const vector<float*> row, double factor, 
vector<double>& kernel, int count, float* out)
+{
+    const int ksize = kernel.size();
+    if (ksize == 2) {
+       convolveVertical5(row, factor, &kernel[0], count, out);
+       return;
+    }
+       
+    int i;
+    for (i=0; i<count && !is_aligned<16>(row[0] + i); ++i, ++out) {
+       double sum = row[ksize][i] * factor;
+       for (int k=1; k<=ksize; ++k)
+           sum += kernel[k-1] * (row[ksize-k][i] + row[ksize+k][i]);
+       *out = sum;
+    }
+    const __m128 ffff = _mm_set1_ps(factor);
+    for (; i+3<count; i+=4) {
+       __m128 sum = _mm_mul_ps(_mm_load_ps(row[ksize] + i), ffff);
+       for (int k=1; k<=ksize; ++k)
+           sum += _mm_set1_ps(kernel[k-1]) * (_mm_load_ps(row[ksize-k]+i) + 
_mm_load_ps(row[ksize+k]+i));
+       _mm_store_ps(out + i, sum);
+    }    
+    for (; i<count; ++i, ++out) {
+       double sum = row[ksize][i] * factor;
+       for (int k=1; k<=ksize; ++k)
+           sum += kernel[k-1] * (row[ksize-k][i] + row[ksize+k][i]);
+       *out = sum;
+    }
+}
+
+void convolveGaussian(const BasicImage<float>& I, BasicImage<float>& out, 
double sigma, double sigmas)
+{
+    assert(out.size() == I.size());
+    int ksize = (int)(sigmas*sigma + 0.5);
+    vector<double> kernel(ksize);
+    double ksum = 1.0;
+    for (int i=1; i<=ksize; i++)
+       ksum += 2*(kernel[i-1] = exp(-i*i/(2*sigma*sigma)));
+    double factor = 1.0/ksum;
+    for (int i=0; i<ksize; i++)
+       kernel[i] *= factor;
+    const int w = I.size().x;
+    const int h = I.size().y;
+    const int swin = 2*ksize;
+
+    AlignedMem<float,16> buffer(w*(swin+1));
+    vector<float*> rows(swin+1);
+
+    for (int k=0;k<swin+1;k++)
+       rows[k] = buffer.data() + k*w;
+
+    float* output = out.data();
+    for (int i=0; i<h; i++) {
+       float* next_row = rows[swin];
+       const float* input = I[i];
+       // beginning of row
+       for (int j=0; j<ksize; j++) {
+           double hsum = input[j] * factor;
+           for (int k=0; k<ksize; k++)
+               hsum += (input[abs(j-k-1)] + input[j+k+1]) * kernel[k];
+           next_row[j] = hsum;
+       }
+       // middle of row
+       input += ksize;
+       convolveMiddle(input, factor, kernel, w-swin, next_row+ksize);
+       input += w-swin;
+       // end of row
+       for (int j=w-ksize; j<w; j++, input++) {
+           double hsum = *input * factor;
+           const int room = w-j;
+           for (int k=0; k<ksize; k++) {
+               hsum += (input[-k-1] + (k+1 >= room ? input[2*room-k-2] : 
input[k+1])) * kernel[k];
+           }
+           next_row[j] = hsum;
+       }
+       // vertical
+       if (i >= swin) {
+           convolveVertical(rows, factor, kernel, w, output);
+           output += w;
+           if (i == h-1) {
+               for (int r=0; r<ksize; r++, output += w) {
+                   vector<float*> rrows(rows.size());
+                   rrows[ksize] = rows[ksize+r+1];
+                   for (int k=0; k<ksize; ++k) {
+                       rrows[ksize-k-1] = rows[ksize+r-k];
+                       int hi = ksize+r+k+2;
+                       rrows[ksize+k+1] = rows[hi > swin ? 2*swin - hi : hi];
+                   }
+                   convolveVertical(rrows, factor, kernel, w, output);
+               }
+           }
+       } else if (i == swin-1) {
+           for (int r=0; r<ksize; r++, output += w) {
+               vector<float*> rrows(rows.size());
+               rrows[ksize] = rows[r+1];
+               for (int k=0; k<ksize; ++k) {
+                   rrows[ksize-k-1] = rows[abs(r-k-1)+1];
+                   rrows[ksize+k+1] = rows[r+k+2];
+               }
+               convolveVertical(rrows, factor, kernel, w, output);
+           }
+       }
+    
+       float* tmp = rows[0];
+       for (int r=0;r<swin; r++)
+           rows[r] = rows[r+1];
+       rows[swin] = tmp;
+    }
+}
+#endif
+
+
 };
+

Index: cvd/internal/aligned_mem.h
===================================================================
RCS file: /cvsroot/libcvd/libcvd/cvd/internal/aligned_mem.h,v
retrieving revision 1.10
retrieving revision 1.11
diff -u -b -r1.10 -r1.11
--- cvd/internal/aligned_mem.h  15 Jan 2008 15:28:07 -0000      1.10
+++ cvd/internal/aligned_mem.h  22 Feb 2008 15:24:34 -0000      1.11
@@ -96,6 +96,20 @@
     template <class T, int N> std::map<T*,typename aligned_mem<T,N>::entry> 
aligned_mem<T,N>::buffers;
 
   }
+
+    template <class T, int N> struct AlignedMem {
+       T* mem;
+       AlignedMem(size_t count) {
+           mem = Internal::aligned_mem<T,N>::alloc(count);
+       }
+       ~AlignedMem() {
+           Internal::aligned_mem<T,N>::release(mem);
+       }
+       T* data() { return mem; }
+       const T* data() const { return mem; }
+    };    
+
+    
 }
 
 #endif




reply via email to

[Prev in Thread] Current Thread [Next in Thread]