libcvd-members
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[libcvd-members] libcvd cvd/convolution.h cvd_src/convolution.cc...


From: Ethan Eade
Subject: [libcvd-members] libcvd cvd/convolution.h cvd_src/convolution.cc...
Date: Tue, 06 May 2008 19:56:04 +0000

CVSROOT:        /cvsroot/libcvd
Module name:    libcvd
Changes by:     Ethan Eade <ethaneade>  08/05/06 19:56:03

Modified files:
        cvd            : convolution.h 
        cvd_src        : convolution.cc 
        cvd_src/noarch : convolve_gaussian.cc 
        cvd_src/i686   : convolve_gaussian.cc 

Log message:
        Added implementation of Young - van Vliet recursive Gaussian filters for
        float images.  These do Gaussian blurring using causal and anti-causal 
IIR 
        filters in series. The result is an approximation to true Gaussian 
blurring 
        that gets more accurate with increasing sigma, and which has runtime 
        independent of sigma.
        
        The approximation is as good as a 3-sigma truncated Gaussian when 
sigma>=1, with the
        hitch that it can produce values slightly outside of the input range.  
If
        this proves to be a problem for anyone, clamping can be done inside the
        implementation if the correct range is known, at little to no 
performance
        cost.
        
        I've supplied an SSE-optimized version and a normal floating point 
version,
        both for images of floats only.  Using timings from my computer, I've 
attempted
        to make convolveGaussian switch to the fastest method for a given sigma
        (standard truncated-kernel blurring is faster for small sigma).
        
        On my current hardware, blurring a 640x480 grayscale image takes 1.54 
ms,
        independent of sigma.  With the non-SSE IIR version, it takes 6.2 ms.
        
        I've also fixed a couple of bugs in the existing truncated-kernel 
implementation.
        
        Both algorithms now assume an infinite extension of the border 
rows/columns
        instead of using reflection.

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/libcvd/cvd/convolution.h?cvsroot=libcvd&r1=1.12&r2=1.13
http://cvs.savannah.gnu.org/viewcvs/libcvd/cvd_src/convolution.cc?cvsroot=libcvd&r1=1.4&r2=1.5
http://cvs.savannah.gnu.org/viewcvs/libcvd/cvd_src/noarch/convolve_gaussian.cc?cvsroot=libcvd&r1=1.1&r2=1.2
http://cvs.savannah.gnu.org/viewcvs/libcvd/cvd_src/i686/convolve_gaussian.cc?cvsroot=libcvd&r1=1.1&r2=1.2

Patches:
Index: cvd/convolution.h
===================================================================
RCS file: /cvsroot/libcvd/libcvd/cvd/convolution.h,v
retrieving revision 1.12
retrieving revision 1.13
diff -u -b -r1.12 -r1.13
--- cvd/convolution.h   28 Feb 2008 00:27:21 -0000      1.12
+++ cvd/convolution.h   6 May 2008 19:56:01 -0000       1.13
@@ -396,10 +396,6 @@
   case 6: CALL_CM(6);
   case 7: CALL_CM(7);
   case 8: CALL_CM(8);
-  case 9: CALL_CM(9);
-  case 10: CALL_CM(10);
-  case 11: CALL_CM(11);
-  case 12: CALL_CM(12);
   default: for (int j=0; j<n; j++, input++) { *(output++) = 
ConvolveMiddle<T,-1>::at(input, factor, kernel, ksize); }     
   }
   return input;
@@ -417,7 +413,7 @@
     typedef typename Pixel::traits<typename 
Pixel::Component<T>::type>::float_type sum_comp_type;
     typedef typename Pixel::traits<T>::float_type sum_type;
     assert(out.size() == I.size());
-    int ksize = (int)(sigmas*sigma + 0.5);
+    int ksize = (int)ceil(sigmas*sigma);
     //std::cerr << "sigma: " << sigma << " kernel: " << ksize << std::endl;
     std::vector<sum_comp_type> kernel(ksize);
     sum_comp_type ksum = sum_comp_type();
@@ -449,7 +445,7 @@
        for (int j=0; j<ksize; j++) {
            sum_type hsum = input[j] * factor;
            for (int k=0; k<ksize; k++)
-               hsum += (input[abs(j-k-1)] + input[j+k+1]) * kernel[k];
+               hsum += (input[std::max(j-k-1,0)] + input[j+k+1]) * kernel[k];
            next_row[j] = hsum;
        }
        // middle of row
@@ -460,7 +456,7 @@
            sum_type hsum = *input * factor;
            const int room = w-j;
            for (int k=0; k<ksize; k++) {
-               hsum += (input[-k-1] + (k+1 >= room ? input[2*room-k-2] : 
input[k+1])) * kernel[k];
+               hsum += (input[-k-1] + input[std::min(k+1,room-1)]) * kernel[k];
            }
            next_row[j] = hsum;
        }
@@ -483,7 +479,7 @@
                    for (int k=0; k<ksize; k++) {
                        const sum_comp_type m = kernel[k];
                        const sum_type* row1 = rows[ksize+r-k];
-                       const sum_type* row2 = rows[ksize+r+k+2 > swin ? 2*swin 
- (ksize+r+k+2) : ksize+r+k+2];
+                       const sum_type* row2 = rows[std::min(ksize+r+k+2, 
swin)];
                        add_multiple_of_sum(row1, row2, m, outbuf, w);
                    }
                    cast_copy(outbuf, output, w);
@@ -496,7 +492,7 @@
                assign_multiple(middle_row, factor, outbuf, w);
                for (int k=0; k<ksize; k++) {
                    const sum_comp_type m = kernel[k];
-                   const sum_type* row1 = rows[abs(r-k-1)+1];
+                   const sum_type* row1 = rows[std::max(r-k-1,0)+1];
                    const sum_type* row2 = rows[r+k+2]; 
                    add_multiple_of_sum(row1, row2, m, outbuf, w);
                }
@@ -512,6 +508,10 @@
     }
 }
 
+void compute_van_vliet_b(double sigma, double b[]);
+void compute_triggs_M(const double b[], double M[][3]);
+void van_vliet_blur(const double b[], const SubImage<float> in, 
SubImage<float> out);
+
 void convolveGaussian(const BasicImage<float>& I, BasicImage<float>& out, 
double sigma, double sigmas=3.0);
 
 template <class T, class O, class K> void convolve_gaussian_3(const 
BasicImage<T>& I, BasicImage<O>& out, K k1, K k2)

Index: cvd_src/convolution.cc
===================================================================
RCS file: /cvsroot/libcvd/libcvd/cvd_src/convolution.cc,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -b -r1.4 -r1.5
--- cvd_src/convolution.cc      28 Feb 2008 00:27:22 -0000      1.4
+++ cvd_src/convolution.cc      6 May 2008 19:56:02 -0000       1.5
@@ -103,6 +103,282 @@
     }
 }
 
+double compute_van_vliet_variance(const double d[])
+{
+    const double a = d[0];
+    const double b = d[1];
+    const double num = 2*(a*(a*(a-2) + 1 + b*b) - 2*b*b);
+    const double den1 = a*a - b*b -2*a + 1;
+    const double den2 = 2*(a-1)*b;
+    const double inv_den = 1.0/(den1*den1 + den2*den2);
+
+    const double inv_den3 = 1.0/((d[2]-1)*(d[2]-1));
+
+    return 2*(num * inv_den + d[2] * inv_den3);
+}
+
+double compute_van_vliet_variance(const double d[], double J[3])
+{
+    const double a = d[0];
+    const double b = d[1];
+    const double num = 2*(a*(a*(a-2) + 1 + b*b) - 2*b*b);
+    const double den1 = a*a - b*b -2*a + 1;
+    const double den2 = 2*(a-1)*b;
+    const double inv_den = 1.0/(den1*den1 + den2*den2);
+    const double inv_den3 = 1.0/((d[2]-1)*(d[2]-1));
+
+    double N_D = num*inv_den;
+    J[0] = 2*inv_den * (2*(a*(3*a - 4) + (1+b*b)) - N_D * 4*(den1*(a-1) + 
den2*b));
+    J[1] = 2*inv_den * (4*b*(a - 2) - N_D * 4*(den2*(a-1) - den1*b));
+    J[2] = 2*inv_den3*(1 - d[2]*inv_den3*2*(d[2]-1));
+
+    return 2*(N_D + d[2] * inv_den3);
+}
+
+void compute_scaling_jacobian(const double d[], double J[])
+{
+    double rr = d[0]*d[0] + d[1]*d[1];
+    double lnr = 0.5 * log(rr);
+    double theta = atan2(d[1], d[0]);
+    J[0] = d[0]*lnr - d[1]*theta;
+    J[1] = d[0]*theta + d[1]*lnr;
+    J[2] = d[2]*log(d[2]);
+}
+
+void scale_d(double d[], double p)
+{
+    double theta = atan2(d[1], d[0]);
+    double rr = d[0]*d[0] + d[1]*d[1];
+    double new_theta = theta*p;
+    double new_r = pow(rr, p*0.5);
+    d[0] = new_r * cos(new_theta);
+    d[1] = new_r * sin(new_theta);
+    d[2] = pow(d[2], p);
+}
+
+void compute_van_vliet_scaled_d(double sigma, double d[])
+{
+    // Approximately sigma = 2
+    d[0] = 1.41650;
+    d[1] = 1.00829;
+    d[2] = 1.86543;
+
+    // Cubic fitted to log(p) s.t. poles->poles^p gives sigma
+    const double A = 3.69892409013e-03;
+    const double B =-4.28967830150e-02;
+    const double C =-3.38065667167e-01;
+    const double D = 5.44264202732e-01;
+
+    double log_var = 2*log(sigma);
+    double log_p = D + log_var*(C + log_var*(B + log_var*A));
+    double predicted_p = exp(log_p);
+    
+    scale_d(d, predicted_p);
+
+    double total_p = 1;
+    const double target_var = sigma*sigma;
+
+    // Newton-Rapheson on scale of poles
+
+    for (int i=0; i<5; ++i)
+    {
+       double sj[3], vj[3];
+       compute_scaling_jacobian(d, sj);
+       double v = target_var - compute_van_vliet_variance(d, vj);
+       double step = v / (vj[0]*sj[0] + vj[1]*sj[1] + vj[2]*sj[2]);
+       if (std::abs(step) < 1e-6)
+           break;
+       double exp_step = exp(std::min(std::max(step, -1.0), 1.0));
+       scale_d(d, exp_step);
+       total_p *= exp_step;
+    }
+}
+
+void build_van_vliet_b(const double d[], double b[])
+{
+    double B = 1.0/(d[2]*(d[0]*d[0] + d[1]*d[1]));
+    b[2] = -B;
+    b[1] = B*(2*d[0] + d[2]);
+    b[0] = -B*(d[0]*d[0] + d[1]*d[1] + 2*(d[0] * d[2]));
+}
+
+    void compute_van_vliet_b(double sigma, double b[])
+    {
+       double d[3];
+       compute_van_vliet_scaled_d(sigma, d);
+       build_van_vliet_b(d, b);
+    }
+
+void compute_triggs_M(const double b[], double M[][3])
+{
+    const double a1= -b[0];
+    const double a2= -b[1];
+    const double a3= -b[2];
+
+    const double factor = 1.0/((1+a1-a2+a3)*
+                              (1-a1-a2-a3)*
+                              (1+a2+(a1-a3)*a3));
+    M[0][0] = factor*(1-a2-a1*a3-a3*a3);
+    M[0][1] = factor*(a3+a1)*(a2+a1*a3);
+    M[0][2] = factor*a3*(a1 + a2*a3);
+    M[1][0] = a1*M[0][0] + M[0][1];
+    M[1][1] = a2*M[0][0] + M[0][2];
+    M[1][2] = a3*M[0][0];
+    M[2][0] = a1*M[1][0] + M[1][1];
+    M[2][1] = a2*M[1][0] + M[1][2];    
+    M[2][2] = a3*M[1][0];
+}
+
+inline void forward_to_backward(const double M[][3], const double i_plus, 
const double inv_alpha, double& y1, double& y2, double& y3)
+{
+    const double u_plus = i_plus*inv_alpha;
+    const double v_plus = u_plus*inv_alpha;
+    double x1=y1-u_plus, x2=y2-u_plus, x3=y3-u_plus;
+    y1 = M[0][0]*x1 + M[0][1]*x2 + M[0][2]*x3 + v_plus;
+    y2 = M[1][0]*x1 + M[1][1]*x2 + M[1][2]*x3 + v_plus;
+    y3 = M[2][0]*x1 + M[2][1]*x2 + M[2][2]*x3 + v_plus;    
+}
+
+template <class T> inline T clamp01(T x) { return x < 0 ? 0 : (x > 1 ? 1 : x); 
}
+
+
+// Implementation of Young-van Vliet third-order recursive gaussian filter.
+// See "Recursive Gaussian Derivative Filters", by van Vliet, Young and 
Verbeck, 1998
+// and "Boundary Conditions for Young - van Vliet Recursive Filtering", by 
Triggs and Sdika, 2005
+// Can result in values just outside of the input range
+void van_vliet_blur(const double b[], const CVD::SubImage<float> in, 
CVD::SubImage<float> out)
+{
+    assert(in.size() == out.size());
+    const int w = in.size().x;
+    const int h = in.size().y;
+
+    double M[3][3];
+    compute_triggs_M(b, M);
+
+    vector<double> tmp(w);
+    
+    const double b0 = b[0];
+    const double b1 = b[1];
+    const double b2 = b[2];
+
+    const int rw = w%4;
+    const double alpha = 1 + b0 + b1 + b2;
+    const double inv_alpha = 1.0/alpha;
+
+    for (int i=0; i<h; ++i) {
+       const float* p = in[i]+w-1;
+
+       double y3, y2, y1;
+       y3 = y2 = y1 = inv_alpha* *p;
+
+
+       for (int j=w-1; j-3>=0; j-=4, p-=4) {
+           double y0 = p[0] - (b0*y1 + b1*y2 + b2 * y3);
+           y3 = p[-1] - (b0*y0 + b1*y1 + b2 * y2);
+           y2 = p[-2] - (b0*y3 + b1*y0 + b2 * y1);
+           y1 = p[-3] - (b0*y2 + b1*y3 + b2 * y0);
+           tmp[j] = y0;
+           tmp[j-1] = y3;
+           tmp[j-2] = y2;
+           tmp[j-3] = y1;
+       }
+
+       for (int j=rw-1; j>=0; --j, --p) {
+           double y0 = p[0] - (b0*y1 + b1*y2 + b2 * y3);
+           tmp[j] = y0;
+           y3 = y2; y2 = y1; y1 = y0;
+       }
+
+       {
+           const double i_plus = p[1];
+           double y0 = i_plus - (b0*y1 + b1*y2 + b2*y3);
+           y3=y2; y2=y1; y1=y0;
+           forward_to_backward(M, i_plus, inv_alpha, y1, y2, y3);              
    
+       }
+
+       float* o = out[i];
+       for (int j=0; j+3<w; j+=4, o+=4) {
+           double y0 = tmp[j] - (b0*y1 + b1*y2 + b2 * y3);
+           y3 = tmp[j+1] - (b0*y0 + b1*y1 + b2 * y2);
+           y2 = tmp[j+2] - (b0*y3 + b1*y0 + b2 * y1);
+           y1 = tmp[j+3] - (b0*y2 + b1*y3 + b2 * y0);
+           o[0] = y0;
+           o[1] = y3;
+           o[2] = y2;
+           o[3] = y1;
+       }
+
+       for (int j=w-rw; j<w; ++j, ++o) {
+           double y0 = tmp[j] - (b0*y1 + b1*y2 + b2 * y3);
+           o[0] = y0;
+           y3 = y2; y2 = y1; y1 = y0;
+       }
+
+    }
+
+    tmp.resize(h);
+
+    const double alpha_fourth = alpha*alpha*alpha*alpha;
+   
+    const int rh = h%4;
+    const int stride = out.row_stride();
+    for (int i=0; i<w; ++i) {
+       double y3, y2, y1;
+
+       const float* in = out[h-1] + i;
+       y3 = y2 = y1 = inv_alpha* *in;
+
+       for (int j=h-1; j-3>=0; j-=4, in -= stride) {
+           double y0 = in[0] - (b0*y1 + b1*y2 + b2 * y3);
+           in -= stride;
+           y3 = in[0] - (b0*y0 + b1*y1 + b2 * y2);
+           in -= stride;
+           y2 = in[0] - (b0*y3 + b1*y0 + b2 * y1);
+           in -= stride;
+           y1 = in[0] - (b0*y2 + b1*y3 + b2 * y0);
+           tmp[j] = y0;
+           tmp[j-1] = y3;
+           tmp[j-2] = y2;
+           tmp[j-3] = y1;
+       }
+
+       for (int j=rh-1; j>=0; --j, in-=stride) {
+           double y0 = in[0] - (b0*y1 + b1*y2 + b2 * y3);
+           tmp[j] = y0;
+           y3 = y2; y2 = y1; y1 = y0;
+       }
+
+       {
+           const double i_plus = in[stride];
+           double y0 = i_plus - (b0*y1 + b1*y2 + b2*y3);
+           y3=y2; y2=y1; y1=y0;
+           forward_to_backward(M, i_plus, inv_alpha, y1, y2, y3);
+       }
+
+       float* o = out[0]+i;
+       for (int j=0; j+3<h; j+=4) {
+           double y0 = tmp[j] - (b0*y1 + b1*y2 + b2 * y3);
+           y3 = tmp[j+1] - (b0*y0 + b1*y1 + b2 * y2);
+           y2 = tmp[j+2] - (b0*y3 + b1*y0 + b2 * y1);
+           y1 = tmp[j+3] - (b0*y2 + b1*y3 + b2 * y0);
+           o[0] = alpha_fourth*y0;//clamp01(alpha_fourth*y0); 
+           o+=stride;
+           o[0] = alpha_fourth*y3;//clamp01(alpha_fourth*y3); 
+           o+=stride;
+           o[0] = alpha_fourth*y2;//clamp01(alpha_fourth*y2); 
+           o+=stride;
+           o[0] = alpha_fourth*y1;//clamp01(alpha_fourth*y1); 
+           o+=stride;
+       }
+
+       for (int j=h-rh; j<h; ++j, o+=stride) {
+           double y0 = tmp[j] - (b0*y1 + b1*y2 + b2 * y3);
+           o[0] = alpha_fourth*y0;//clamp01(alpha_fourth*y0);
+           y3 = y2; y2 = y1; y1 = y0;
+       }
+    }
+}
+
 
 };
 

Index: cvd_src/noarch/convolve_gaussian.cc
===================================================================
RCS file: /cvsroot/libcvd/libcvd/cvd_src/noarch/convolve_gaussian.cc,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -b -r1.1 -r1.2
--- cvd_src/noarch/convolve_gaussian.cc 28 Feb 2008 00:27:24 -0000      1.1
+++ cvd_src/noarch/convolve_gaussian.cc 6 May 2008 19:56:02 -0000       1.2
@@ -1,9 +1,19 @@
-#include "cvd/convolution.h"
+#include <cvd/convolution.h>
+using namespace std;
 
 namespace CVD
 {
+
+    // Try to choose the fastest method
        void convolveGaussian(const BasicImage<float>& I, BasicImage<float>& 
out, double sigma, double sigmas)
        {
+       int ksize = (int)ceil(sigma*sigmas);
+       if (ksize > 6) {
+           double b[3];
+           compute_van_vliet_b(sigma, b);
+           van_vliet_blur(b, I, out);
+       } else
                convolveGaussian<float>(I, out, sigma, sigmas);
        }
+    
 }

Index: cvd_src/i686/convolve_gaussian.cc
===================================================================
RCS file: /cvsroot/libcvd/libcvd/cvd_src/i686/convolve_gaussian.cc,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -b -r1.1 -r1.2
--- cvd_src/i686/convolve_gaussian.cc   28 Feb 2008 00:27:23 -0000      1.1
+++ cvd_src/i686/convolve_gaussian.cc   6 May 2008 19:56:03 -0000       1.2
@@ -61,11 +61,11 @@
     const __m128 k1 = _mm_set1_ps(kernel[0]);
     const __m128 k2 = _mm_set1_ps(kernel[1]);
 
-    for (; i+3<count; i+=4) {
+    for (; i+3<count; i+=4, out+=4) {
        __m128 sum = (_mm_load_ps(row[ksize] + i) * ffff
                      + (_mm_load_ps(row[ksize-1]+i) + 
_mm_load_ps(row[ksize+1]+i)) * k1
                      + (_mm_load_ps(row[ksize-2]+i) + 
_mm_load_ps(row[ksize+2]+i)) * k2);
-       _mm_store_ps(out + i, sum);
+       _mm_store_ps(out, sum);
     }    
     for (; i<count; ++i, ++out) {
        double sum = row[ksize][i] * factor + (row[1][i] + row[3][i]) * 
kernel[0] + (row[0][i] + row[4][i]) * kernel[1];
@@ -89,11 +89,11 @@
        *out = sum;
     }
     const __m128 ffff = _mm_set1_ps(factor);
-    for (; i+3<count; i+=4) {
+    for (; i+3<count; i+=4, out+=4) {
        __m128 sum = _mm_mul_ps(_mm_load_ps(row[ksize] + i), ffff);
        for (int k=1; k<=ksize; ++k)
            sum += _mm_set1_ps(kernel[k-1]) * (_mm_load_ps(row[ksize-k]+i) + 
_mm_load_ps(row[ksize+k]+i));
-       _mm_store_ps(out + i, sum);
+       _mm_store_ps(out, sum);
     }    
     for (; i<count; ++i, ++out) {
        double sum = row[ksize][i] * factor;
@@ -103,10 +103,10 @@
     }
 }
 
-void convolveGaussian(const BasicImage<float>& I, BasicImage<float>& out, 
double sigma, double sigmas)
+void convolveGaussian_simd(const BasicImage<float>& I, BasicImage<float>& out, 
double sigma, double sigmas)
 {
     assert(out.size() == I.size());
-    int ksize = (int)(sigmas*sigma + 0.5);
+    int ksize = (int)ceil(sigmas*sigma);
     vector<double> kernel(ksize);
     double ksum = 1.0;
     for (int i=1; i<=ksize; i++)
@@ -132,7 +132,7 @@
        for (int j=0; j<ksize; j++) {
            double hsum = input[j] * factor;
            for (int k=0; k<ksize; k++)
-               hsum += (input[abs(j-k-1)] + input[j+k+1]) * kernel[k];
+               hsum += (input[std::max(j-k-1,0)] + input[j+k+1]) * kernel[k];
            next_row[j] = hsum;
        }
        // middle of row
@@ -144,7 +144,7 @@
            double hsum = *input * factor;
            const int room = w-j;
            for (int k=0; k<ksize; k++) {
-               hsum += (input[-k-1] + (k+1 >= room ? input[2*room-k-2] : 
input[k+1])) * kernel[k];
+               hsum += (input[-k-1] + input[std::min(k+1,room-1)]) * kernel[k];
            }
            next_row[j] = hsum;
        }
@@ -159,7 +159,7 @@
                    for (int k=0; k<ksize; ++k) {
                        rrows[ksize-k-1] = rows[ksize+r-k];
                        int hi = ksize+r+k+2;
-                       rrows[ksize+k+1] = rows[hi > swin ? 2*swin - hi : hi];
+                       rrows[ksize+k+1] = rows[std::min(hi,swin)];
                    }
                    convolveVertical(rrows, factor, kernel, w, output);
                }
@@ -169,7 +169,7 @@
                vector<float*> rrows(rows.size());
                rrows[ksize] = rows[r+1];
                for (int k=0; k<ksize; ++k) {
-                   rrows[ksize-k-1] = rows[abs(r-k-1)+1];
+                   rrows[ksize-k-1] = rows[std::max(r-k-1,0)+1];
                    rrows[ksize+k+1] = rows[r+k+2];
                }
                convolveVertical(rrows, factor, kernel, w, output);
@@ -183,5 +183,181 @@
     }
 }
 
+inline void transpose(__m128& x1, __m128& x2, __m128& x3, __m128& x4)
+{
+    // abcd  x1
+    // efgh  x2
+    // ijkl  x3
+    // mnop  x4
+    __m128 aebf = _mm_unpacklo_ps(x1,x2);
+    __m128 imjn = _mm_unpacklo_ps(x3,x4); 
+    __m128 cgdh = _mm_unpackhi_ps(x1,x2);
+    __m128 kolp = _mm_unpackhi_ps(x3,x4);
+    x1 = _mm_shuffle_ps(aebf, imjn, _MM_SHUFFLE(1,0,1,0));  //aeim
+    x2 = _mm_shuffle_ps(aebf, imjn, _MM_SHUFFLE(3,2,3,2));  //bfjn
+    x3 = _mm_shuffle_ps(cgdh, kolp, _MM_SHUFFLE(1,0,1,0));  //cgko
+    x4 = _mm_shuffle_ps(cgdh, kolp, _MM_SHUFFLE(3,2,3,2));  //dhlp
+}
+
+inline void forward_to_backward(const __m128 M[], const __m128 i_plus, const 
__m128 inv_alpha, __m128& y1, __m128& y2, __m128& y3)
+{
+    __m128 u_plus = inv_alpha * i_plus;
+    __m128 v_plus = inv_alpha * u_plus;
+    __m128 x1=y1-u_plus, x2=y2-u_plus, x3=y3-u_plus;
+    y1 = M[0]*x1 + M[1]*x2 + M[2]*x3 + v_plus;
+    y2 = M[3]*x1 + M[4]*x2 + M[5]*x3 + v_plus;
+    y3 = M[6]*x1 + M[7]*x2 + M[8]*x3 + v_plus;
+}
+
+
+// Optimized implementation of Young-van Vliet third-order recursive Gaussian 
filter.
+// See "Recursive Gaussian Derivative Filters", by van Vliet, Young and 
Verbeck, 1998
+// and "Boundary Conditions for Young - van Vliet Recursive Filtering", by 
Triggs and Sdika, 2005
+// This can produce output with values slightly outside the input range.
+void van_vliet_blur_simd(const double b[], const SubImage<float> in, 
SubImage<float> out)
+{
+    assert(in.size() == out.size());
+    const int w = in.size().x;
+    const int h = in.size().y;
+    const int is = in.row_stride();
+    const int os = out.row_stride();
+    assert(is_aligned<16>(in[0]) && is_aligned<16>(out[0]));
+    assert((w%4)==0 && (h%4)==0 && (is%4) == 0 && (os%4) == 0);
+
+    unsigned int csr_state = _mm_getcsr();
+    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
+    
+    __m128 tmp[w>h?w:h] __attribute__((aligned(16)));
+    __m128 M[9] __attribute__((aligned(16)));
+
+    {  
+       double m[3][3];
+       compute_triggs_M(b, m);
+       for (int i=0; i<3; ++i)
+           for (int j=0; j<3; ++j)
+               M[i*3+j] = _mm_set1_ps(m[i][j]);
+    }
+
+    const double alpha = 1 + b[0] + b[1] + b[2];
+    
+    const __m128 inv_alpha = _mm_set1_ps(1.0/alpha);
+    const __m128 bb1 = _mm_set1_ps(b[0]);
+    const __m128 bb2 = _mm_set1_ps(b[1]);
+    const __m128 bb3 = _mm_set1_ps(b[2]);
+    
+    // horizontal
+    for (int i=0; i<h; i+=4) {
+       const float* p = in[i];
+
+       __m128 y3, y2, y1;
+       y3 = y2 = y1 = inv_alpha*_mm_setr_ps(p[0], p[is], p[2*is], p[3*is]);
+           
+       for (int j=0; j+3<w; j+=4, p+=4) {
+           __m128 x0 = _mm_setr_ps(p[0], p[is], p[2*is], p[3*is]);
+           __m128 x1 =  _mm_setr_ps(p[1], p[is+1], p[2*is+1], p[3*is+1]);
+           __m128 x2 = _mm_setr_ps(p[2], p[is+2], p[2*is+2], p[3*is+2]);
+           __m128 x3 = _mm_setr_ps(p[3], p[is+3], p[2*is+3], p[3*is+3]);
+           __m128 y0 = x0 - (bb1*y1 + bb2*y2 + bb3*y3);
+           tmp[j] = y0;
+           tmp[j+1] = y3 = x1 - (bb1*y0 + bb2*y1 + bb3*y2);
+           tmp[j+2] = y2 = x2 - (bb1*y3 + bb2*y0 + bb3*y1);
+           tmp[j+3] = y1 = x3 - (bb1*y2 + bb2*y3 + bb3*y0);
+       }
+       
+       {
+           const __m128 i_plus = _mm_setr_ps(p[-1], p[is-1], p[2*is-1], 
p[3*is-1]);
+           __m128 y0 = i_plus - (bb1*y1 + bb2*y2 + bb3*y3);
+           y3 = y2; y2 = y1; y1 = y0;
+           forward_to_backward(M, i_plus, inv_alpha, y1, y2, y3);
+           //y3 = y2 = y1 = tmp[w-1]*inv_alpha;
+       }
+
+       float* o = out[i]+w-4;      
+       for (int j=w-1; j>=0; j-=4, o-=4) {
+           __m128 y00 = tmp[j] - (bb1*y1 + bb2*y2 + bb3*y3);
+           __m128 y01 = y3 = tmp[j-1] - (bb1*y00 + bb2*y1 + bb3*y2);
+           __m128 y02 = y2 = tmp[j-2] - (bb1*y01 + bb2*y00 + bb3*y1);
+           __m128 y03 = y1 = tmp[j-3] - (bb1*y02 + bb2*y01 + bb3*y00);
+           transpose(y03,y02,y01,y00);
+           _mm_store_ps(o, y03);
+           _mm_store_ps(o+os, y02);
+           _mm_store_ps(o+2*os, y01);
+           _mm_store_ps(o+3*os, y00);
+       }
+    }
+
+    const __m128 a4 = _mm_set1_ps(alpha*alpha*alpha*alpha);
+
+    // vertical
+    for (int i=0; i<w; i+=4) {
+       const float* in = out[0] + i;
+       __m128 y3, y2, y1;
+       y3 = y2 = y1 = inv_alpha*_mm_load_ps(in);
+       
+       for (int j=0; j<h; j+=4, in += os) {
+           __m128 y0 = _mm_load_ps(in) - (bb1*y1 + bb2*y2 + bb3*y3);
+           tmp[j] = y0;
+           in += os;
+           tmp[j+1] = y3 = _mm_load_ps(in) - (bb1*y0 + bb2*y1 + bb3*y2);
+           in += os;
+           tmp[j+2] = y2 = _mm_load_ps(in) - (bb1*y3 + bb2*y0 + bb3*y1);
+           in += os;
+           tmp[j+3] = y1 = _mm_load_ps(in) - (bb1*y2 + bb2*y3 + bb3*y0);
+       }
+       
+       {
+           const __m128 i_plus = _mm_load_ps(in - os);
+           __m128 y0 = i_plus - (bb1*y1 + bb2*y2 + bb3*y3);
+           y3 = y2; y2 = y1; y1 = y0;
+           forward_to_backward(M, i_plus, inv_alpha, y1, y2, y3);
+           //y3 = y2 = y1 = tmp[h-1]*inv_alpha;
+       }
+
+
+       float* o = out[h-1] + i;
+       //const __m128 zeros = _mm_setzero_ps();
+       //const __m128 ones = _mm_set1_ps(1.0);
+
+       for (int j=h-1; j>=0; j-=4, o-=os) {
+           __m128 y0 = tmp[j] - (bb1*y1 + bb2*y2 + bb3*y3);        
+           y3 = tmp[j-1] - (bb1*y0 + bb2*y1 + bb3*y2);
+           y2 = tmp[j-2] - (bb1*y3 + bb2*y0 + bb3*y1);
+           y1 = tmp[j-3] - (bb1*y2 + bb2*y3 + bb3*y0);
+
+           _mm_store_ps(o, a4*y0);//_mm_min_ps(ones, _mm_max_ps(zeros,a4*y0)));
+           o -= os;
+           _mm_store_ps(o, a4*y3);//_mm_min_ps(ones, _mm_max_ps(zeros,a4*y3)));
+           o -= os;
+           _mm_store_ps(o, a4*y2);//_mm_min_ps(ones, _mm_max_ps(zeros,a4*y2)));
+           o -= os;
+           _mm_store_ps(o, a4*y1);//_mm_min_ps(ones, _mm_max_ps(zeros,a4*y1)));
+       }
+    }
+    _mm_setcsr(csr_state);
+}
+
+// Try to choose the fastest method
+void convolveGaussian(const BasicImage<float>& I, BasicImage<float>& out, 
double sigma, double sigmas)
+{
+    int ksize = (int)ceil(sigma*sigmas);
+    bool nice = ((I.size().x%4) == 0 && 
+                (I.size().y%4)==0 && 
+                (I.row_stride()%4) == 0 &&
+                (out.row_stride()%4) == 0 &&
+                is_aligned<16>(I[0]) && 
+                is_aligned<16>(out[0]));
+    if (nice && ksize > 2) {
+       double b[3];
+       compute_van_vliet_b(sigma, b);
+       van_vliet_blur_simd(b, I, out);
+    } else if (ksize > 6) {
+       double b[3];
+       compute_van_vliet_b(sigma, b);
+       van_vliet_blur(b, I, out);
+    } else
+       convolveGaussian_simd(I, out, sigma, sigmas);    
+}
+
+
 };
 




reply via email to

[Prev in Thread] Current Thread [Next in Thread]