[Libcvd-members] libcvd cvd/convolution.h cvd/utility.h cvd_src/... [vid
From: Ethan Eade
Subject: [Libcvd-members] libcvd cvd/convolution.h cvd/utility.h cvd_src/... [videofilebuffer_test_23052006]
Date: Wed, 24 May 2006 00:38:32 +0000
CVSROOT: /cvsroot/libcvd
Module name: libcvd
Branch: videofilebuffer_test_23052006
Changes by: Ethan Eade <address@hidden> 06/05/24 00:38:31
Modified files:
cvd : convolution.h utility.h
cvd_src : utility.cc
Log message:
Fixed recent changes to convolution.h to avoid explicitly specifying
function template parameters, so that proper overload resolution is
used by the compiler.
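
For illustration (a sketch, not part of the patch): supplying explicit
template arguments restricts a call to the function template itself, so
the accelerated non-template overloads declared in cvd/utility.h can
never be selected. With the declarations from that header:

    template <class A, class B>
    void differences(const A* a, const A* b, B* diff, unsigned int count);
    void differences(const float* a, const float* b, float* diff, unsigned int size);

    differences<float,float>(a, b, d, n); // always the generic template
    differences(a, b, d, n);              // may pick the SIMD overload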
CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/libcvd/libcvd/cvd/convolution.h.diff?only_with_tag=videofilebuffer_test_23052006&tr1=1.5&tr2=1.5.2.1&r1=text&r2=text
http://cvs.savannah.gnu.org/viewcvs/libcvd/libcvd/cvd/utility.h.diff?only_with_tag=videofilebuffer_test_23052006&tr1=1.4&tr2=1.4.2.1&r1=text&r2=text
http://cvs.savannah.gnu.org/viewcvs/libcvd/libcvd/cvd_src/utility.cc.diff?only_with_tag=videofilebuffer_test_23052006&tr1=1.5&tr2=1.5.2.1&r1=text&r2=text
Patches:
Index: libcvd/cvd/convolution.h
diff -u /dev/null libcvd/cvd/convolution.h:1.5.2.1
--- /dev/null Wed May 24 00:38:31 2006
+++ libcvd/cvd/convolution.h Wed May 24 00:38:31 2006
@@ -0,0 +1,545 @@
+/*
+ This file is part of the CVD Library.
+
+ Copyright (C) 2005 The Authors
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#ifndef CVD_CONVOLUTION_H_
+#define CVD_CONVOLUTION_H_
+
+#include <vector>
+#include <memory>
+#include <numeric>
+#include <algorithm>
+
+#include <cvd/config.h>
+#include <cvd/exceptions.h>
+#include <cvd/image.h>
+#include <cvd/internal/pixel_operations.h>
+#include <cvd/internal/aligned_mem.h>
+#include <cvd/utility.h>
+
+namespace CVD {
+
+/// creates a Gaussian kernel with given maximum value and standard deviation.
+/// All elements of the passed vector are filled up, therefore the vector
+/// defines the size of the computed kernel. The normalizing value is returned.
+/// @param k vector of T's holds the kernel values
+/// @param maxval the maximum value to be used
+/// @param stddev standard deviation of the kernel
+/// @return the sum of the kernel elements for normalization
+/// @ingroup gVision
+template <class T>
+T gaussianKernel(std::vector<T>& k, T maxval, double stddev)
+{
+ double sum = 0;
+ unsigned int i, argmax=0;
+ std::vector<double> kernel(k.size());
+ for (i=0;i<k.size();i++) {
+ double x = i +0.5 - k.size()/2.0;
+ sum += kernel[i] = exp(-x*x/(2*stddev*stddev));
+ if (kernel[i] > kernel[argmax])
+ argmax = i;
+ }
+ T finalSum = 0;
+ for (i=0;i<k.size();i++)
+ finalSum += k[i] = (T)(kernel[i]*maxval/kernel[argmax]);
+ return finalSum;
+}
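+// Illustrative use (a sketch, not part of the patch): build a 5-tap integer
+// kernel with peak value 255 and sigma 1.0:
+//   std::vector<int> k(5);
+//   int norm = gaussianKernel(k, 255, 1.0);  // norm = sum of the entries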
+
+/// scales a GaussianKernel to a different maximum value. The new kernel is
+/// returned in scaled. The new normalizing value is returned.
+/// @param k input kernel
+/// @param scaled output vector to hold the resulting kernel
+/// @param maxval the new maximum value
+/// @return sum of the new kernel elements for normalization
+/// @ingroup gVision
+template <class S, class T>
+T scaleKernel(const std::vector<S>& k, std::vector<T>& scaled, T maxval)
+{
+ unsigned int i,argmax=0;
+ for (i=1;i<k.size();i++)
+ if (k[i]>k[argmax])
+ argmax = i;
+ scaled.resize(k.size());
+ T sum = 0;
+ for (i=0;i<k.size();i++)
+ sum += (scaled[i] = (T)((k[i]*maxval)/k[argmax]));
+ return sum;
+}
+
+template <class T>
+void convolveGaussian5_1(BasicImage<T>& I)
+{
+ int w = I.size().x;
+ int h = I.size().y;
+ int i,j;
+ for (j=0;j<w;j++) {
+ T* src = I.data()+j;
+ T* end = src + w*(h-4);
+ while (src != end) {
+ T sum= (T)(0.0544887*(src[0]+src[4*w])
+ + 0.2442010*(src[w]+src[3*w])
+ + 0.4026200*src[2*w]);
+ *(src) = sum;
+ src += w;
+ }
+ }
+ for (i=h-5;i>=0;i--) {
+ T* src = I.data()+i*w;
+ T* end = src + w-4;
+ while (src != end) {
+ T sum= (T)(0.0544887*(src[0]+src[4])
+ + 0.2442010*(src[1]+src[3])
+ + 0.4026200*src[2]);
+ *(src+2*w+2)=sum;
+ ++src;
+ }
+ }
+}
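+// Note: the three constants above form a 5-tap Gaussian (sigma near 1) whose
+// taps sum to roughly 1; the two passes apply it separably, and the second
+// pass writes its results shifted by (2,2) so the filter can run in place,
+// leaving the 2-pixel border unfiltered. Illustrative call (types assumed):
+//   Image<float> im(ImageRef(640,480));
+//   convolveGaussian5_1(im);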
+
+namespace Exceptions {
+
+ /// %Exceptions specific to vision algorithms
+ /// @ingroup gException
+ namespace Convolution {
+ /// Base class for all Image_IO exceptions
+ /// @ingroup gException
+ struct All: public CVD::Exceptions::All {};
+
+ /// Input images have incompatible dimensions
+ /// @ingroup gException
+ struct IncompatibleImageSizes : public All {
+ IncompatibleImageSizes(const std::string & function)
+ {
+ what = "Incompatible image sizes in " + function;
+ };
+ };
+ }
+}
+//void convolveGaussian5_1(BasicImage<byte>& I);
+
+/// convolves an image with a box of given size.
+/// @param I input image
+/// @param J output image, must have the same size as I
+/// @param hwin window size, this is half of the box size
+/// @ingroup gVision
+template <class T> void convolveWithBox(const BasicImage<T>& I, BasicImage<T>& J, ImageRef hwin)
+{
+ typedef typename Pixel::traits<T>::wider_type sum_type;
+ if (I.size() != J.size()) {
+ throw Exceptions::Convolution::IncompatibleImageSizes("convolveWithBox");
+ }
+ int w = I.size().x;
+ int h = I.size().y;
+ ImageRef win = 2*hwin+ImageRef(1,1);
+ const double factor = 1.0/(win.x*win.y);
+ std::vector<sum_type> buffer(w*win.y);
+ std::vector<sum_type> sums_v(w);
+ sum_type* sums = &sums_v[0];
+ sum_type* next_row = &buffer[0];
+ sum_type* oldest_row = &buffer[0];
+ zeroPixels(sums, w);
+ const T* input = I.data();
+ T* output = J[hwin.y] - hwin.x;
+ for (int i=0; i<h; i++) {
+ sum_type hsum=sum_type();
+ const T* back = input;
+ int j;
+ for (j=0; j<win.x-1; j++)
+ hsum += input[j];
+ for (; j<w; j++) {
+ hsum += input[j];
+ next_row[j] = hsum;
+ sums[j] += hsum;
+ hsum -= *(back++);
+ }
+ if (i >= win.y-1) {
+ assign_multiple(sums+win.x-1, factor, output+win.x-1, w-win.x+1);
+ differences(sums+win.x-1, oldest_row+win.x-1, sums+win.x-1, w-win.x+1);
+ output += w;
+ oldest_row += w;
+ if (oldest_row == &buffer[0] + w*win.y)
+ oldest_row = &buffer[0];
+ }
+ input += w;
+ next_row += w;
+ if (next_row == &buffer[0] + w*win.y)
+ next_row = &buffer[0];
+ }
+}
+
+template <class T> inline void convolveWithBox(const BasicImage<T>& I, BasicImage<T>& J, int hwin)
+{
+ convolveWithBox(I, J, ImageRef(hwin,hwin));
+}
+
+template <class T> inline void convolveWithBox(BasicImage<T>& I, int hwin) {
+ convolveWithBox(I,I,hwin);
+}
+
+template <class T> inline void convolveWithBox(BasicImage<T>& I, ImageRef hwin) {
+ convolveWithBox(I,I,hwin);
+}
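+// Illustrative use (a sketch): an 11x11 box blur (hwin = 5), out of place:
+//   Image<byte> in(ImageRef(320,240)), out(in.size());
+//   convolveWithBox(in, out, 5);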
+
+
+template <class T, int A, int B, int C> void convolveSymmetric(Image<T>& I)
+ {
+ typedef typename Pixel::traits<T>::wider_type wider;
+ static const wider S = (A+B+C+B+A);
+ int width = I.size().x;
+ int height = I.size().y;
+ T* p = I.data();
+ int i,j;
+ for (i=0; i<height; i++) {
+ wider a = p[0];
+ wider b = p[1];
+ wider c = p[2];
+ wider d = p[3];
+ p[0] = (T)(((c+c)*A+(b+b)*B + a*C) /S);
+ p[1] = (T)(((b+d)*A+(a+c)*B + b*C) /S);
+ for (j=0;j<width-4;j++,p++) {
+ wider e = p[4];
+ p[2] = (T)(((a+e)*A + (b+d)*B + c*C)/S);
+ a = b; b = c; c = d; d = e;
+ }
+ p[2] = (T)(((a+c)*A + (b+d)*B + c*C) /S);
+ p[3] = (T)(((b+b)*A + (c+c)*B + d*C) /S);
+ p += 4;
+ }
+ for (j=0;j<width;j++) {
+ p = I.data()+j;
+ wider a = p[0];
+ wider b = p[width];
+ p[0] = (T)(((p[2*width]+p[2*width])*A+(b+b)*B + a*C) /S);
+ p[width] = (T)(((b+p[width*3])*A+(a+p[2*width])*B + b*C) /S);
+ for (i=0;i<height-4;i++) {
+ wider c = p[2*width];
+ p[2*width] = (T)(((a+p[4*width])*A + (b+p[3*width])*B + c*C)/S);
+ a=b; b=c;
+ p += width;
+ }
+ wider c = p[2*width];
+ p[2*width] = (T)(((a+c)*A + (b+p[width*3])*B + c*C) /S);
+ p[3*width] = (T)(((b+b)*A + (c+c)*B + p[width*3]*C) /S);
+ }
+ }
+
+ template <class T, int A, int B, int C, int D> void convolveSymmetric(Image<T>& I)
+ {
+ typedef typename Pixel::traits<T>::wider_type wider;
+ static const wider S = (A+B+C+D+C+B+A);
+ int width = I.size().x;
+ int height = I.size().y;
+ T* p = I.data();
+ int i,j;
+ for (i=0; i<height; i++) {
+ wider a = p[0];
+ wider b = p[1];
+ wider c = p[2];
+ wider d = p[3];
+ p[0] = (T)(((d+d)*A + (c+c)*B + (b+b)*C + a*D)/S);
+ p[1] = (T)(((c+p[4])*A + (b+d)*B + (a+c)*C + b*D)/S);
+ p[2] = (T)(((b+p[5])*A + (a+p[4])*B + (b+d)*C + c*D)/S);
+ for (j=0;j<width-6;j++,p++) {
+ d = p[3];
+ p[3] = (T)(((a+p[6])*A + (b+p[5])*B + (c+p[4])*C + d*D)/S);
+ a=b; b=c; c=d;
+ }
+ d = p[3];
+ wider e = p[4];
+ p[3] = (T)(((a+e)*A + (b+p[5])*B + (c+e)*C + d*D)/S);
+ p[4] = (T)(((b+d)*A + (c+e)*B + (d+p[5])*C + e*D)/S);
+ p[5] = (T)(((c+c)*A + (d+d)*B + (e+e)*C + p[5]*D)/S);
+ p += 6;
+ }
+ for (j=0;j<width;j++) {
+ p = I.data()+j;
+ wider a = p[0];
+ wider b = p[width];
+ wider c = p[2*width];
+ wider d = p[3*width];
+ p[0] = (T)(((d+d)*A + (c+c)*B + (b+b)*C + a*D)/S);
+ p[width] = (T)(((c+p[4*width])*A + (b+d)*B + (a+c)*C + b*D)/S);
+ p[2*width] = (T)(((b+p[5*width])*A + (a+p[4*width])*B + (b+d)*C + c*D)/S);
+ for (i=0;i<height-6;i++) {
+ d = p[3*width];
+ p[3*width] = (T)(((a+p[width*6])*A + (b+p[width*5])*B + (c+p[width*4])*C+d*D)/S);
+ a=b; b=c; c=d;
+ p += width;
+ }
+ d = p[3*width];
+ wider e = p[4*width];
+ p[3*width] = (T)(((a+e)*A + (b+p[5*width])*B + (c+e)*C + d*D)/S);
+ p[4*width] = (T)(((b+d)*A + (c+e)*B + (d+p[5*width])*C + e*D)/S);
+ p[5*width] = (T)(((c+c)*A + (d+d)*B + (e+e)*C + p[5*width]*D)/S);
+ }
+ }
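+ // Illustrative use (a sketch): the kernel taps are integer template
+ // parameters and cannot be deduced, so this is one place where explicit
+ // template arguments are genuinely required, e.g. a binomial 1 4 6 4 1
+ // filter on a byte image:
+ //   Image<byte> im(ImageRef(320,240));
+ //   convolveSymmetric<byte,1,4,6>(im);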
+
+template <class T, class K> void convolveSeparableSymmetric(Image<T>& I, const std::vector<K>& kernel, K divisor)
+ {
+ typedef typename Pixel::traits<T>::wider_type sum_type;
+ int w = I.size().x;
+ int h = I.size().y;
+ int r = (int)kernel.size()/2;
+ int i,j;
+ int m;
+ double factor = 1.0/divisor;
+ for (j=0;j<w;j++) {
+ T* src = I.data()+j;
+ for (i=0; i<h-2*r; i++,src+=w) {
+ sum_type sum = src[r*w]*kernel[r];
+ for (m=0; m<r; m++)
+ sum += (src[m*w] + src[(2*r-m)*w]) * kernel[m];
+ *(src) = static_cast<T>(sum * factor);
+ }
+ }
+ int offset = r*w + r;
+ for (i=h-2*r-1;i>=0;i--) {
+ T* src = I[i];
+ for (j=0;j<w-2*r;j++, src++) {
+ sum_type sum = src[r] * kernel[r];
+ for (m=0; m<r; m++)
+ sum += (src[m] + src[2*r-m])*kernel[m];
+ *(src+offset) = static_cast<T>(sum*factor);
+ }
+ }
+ }
+
+
+template <class A, class B> struct GetPixelRowTyped {
+ static inline const B* get(const A* row, int w, B* rowbuf) {
+ std::copy(row, row+w, rowbuf);
+ return rowbuf;
+ }
+};
+
+template <class T> struct GetPixelRowTyped<T,T> {
+ static inline const T* get(const T* row, int w, T* rowbuf) {
+ return row;
+ }
+};
+
+template <class A, class B> const B* getPixelRowTyped(const A* row, int n, B* rowbuf) {
+ return GetPixelRowTyped<A,B>::get(row,n,rowbuf);
+}
+
+template <class T, class S> struct CastCopy {
+ static inline void cast_copy(const T* from, S* to, int count) {
+ for (int i=0; i<count; i++)
+ to[i] = static_cast<S>(from[i]);
+ }
+};
+
+template <class T> struct CastCopy<T,T> {
+ static inline void cast_copy(const T* from, T* to, int count) {
+ std::copy(from, from+count, to);
+ }
+};
+
+template <class T, class S> inline void cast_copy(const T* from, S* to, int count) { CastCopy<T,S>::cast_copy(from,to,count); }
+
+template <class T, int N=-1, int C = Pixel::Component<T>::count> struct ConvolveMiddle {
+ template <class S> static inline T at(const T* input, const S& factor, const S* kernel) { return ConvolveMiddle<T,-1,C>::at(input,factor, kernel, N); }
+};
+
+template <class T, int N> struct ConvolveMiddle<T,N,1> {
+ template <class S> static inline T at(const T* input, const S& factor, const S* kernel) { return ConvolveMiddle<T,N-1>::at(input,factor, kernel) + (input[-N]+input[N])*kernel[N-1]; }
+};
+
+template <class T> struct ConvolveMiddle<T,-1,1> {
+ template <class S> static inline T at(const T* input, const S& factor, const S* kernel, int ksize) {
+ T hsum = *input * factor;
+ for (int k=0; k<ksize; k++)
+ hsum += (input[-k-1] + input[k+1]) * kernel[k];
+ return hsum;
+ }
+};
+
+template <class T, int C> struct ConvolveMiddle<T,-1, C> {
+ template <class S> static inline T at(const T* input, const S& factor, const S* kernel, int ksize) {
+ T hsum = *input * factor;
+ for (int k=0; k<ksize; k++)
+ hsum += (input[-k-1] + input[k+1]) * kernel[k];
+ return hsum;
+ }
+};
+
+template <class T> struct ConvolveMiddle<T,0,1> {
+ template <class S> static inline T at(const T* input, const S& factor, const S* kernel) { return *input * factor; }
+};
+
+#if 1
+template <class T,class S> inline const T* convolveMiddle(const T* input, const S& factor, const S* kernel, int ksize, int n, T* output) {
+#define CALL_CM(I) for (int j=0; j<n; j++, input++) { *(output++) = ConvolveMiddle<T,I>::at(input, factor, kernel); } break
+ switch (ksize) {
+ case 0: CALL_CM(0);
+ case 1: CALL_CM(1);
+ case 2: CALL_CM(2);
+ case 3: CALL_CM(3);
+ case 4: CALL_CM(4);
+ case 5: CALL_CM(5);
+ case 6: CALL_CM(6);
+ case 7: CALL_CM(7);
+ case 8: CALL_CM(8);
+ case 9: CALL_CM(9);
+ case 10: CALL_CM(10);
+ case 11: CALL_CM(11);
+ case 12: CALL_CM(12);
+ default: for (int j=0; j<n; j++, input++) { *(output++) = ConvolveMiddle<T,-1>::at(input, factor, kernel, ksize); }
+ }
+ return input;
+#undef CALL_CM
+}
+
+#else
+
+template <class T,class S> const T* convolveMiddle(const T* input, const S& factor, const S* kernel, int ksize, int n, T* output) {
+ assign_multiple(input, factor, output, n);
+ for (int r=0; r <ksize; r++) {
+ add_multiple_of_sum(input-r-1, input+r+1, kernel[r], output, n);
+ }
+ return input + n;
+}
+
+#endif
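+// The switch in the first convolveMiddle variant dispatches the run-time
+// kernel radius to the recursive ConvolveMiddle<T,N> templates, so for
+// ksize <= 12 the horizontal inner loop is fully unrolled at compile time;
+// larger kernels fall back to the ConvolveMiddle<T,-1> run-time loop. For
+// example, ConvolveMiddle<T,2>::at expands to:
+//   input[0]*factor + (input[-1]+input[1])*kernel[0]
+//                   + (input[-2]+input[2])*kernel[1]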
+
+template <class T> inline void convolveGaussian(BasicImage<T>& I, double sigma, double sigmas=3.0)
+{
+ convolveGaussian(I,I,sigma,sigmas);
+}
+
+template <class T> void convolveGaussian(const BasicImage<T>& I, BasicImage<T>& out, double sigma, double sigmas=3.0)
+{
+ typedef typename Pixel::traits<typename Pixel::Component<T>::type>::float_type sum_comp_type;
+ typedef typename Pixel::traits<T>::float_type sum_type;
+ assert(out.size() == I.size());
+ int ksize = (int)(sigmas*sigma + 0.5);
+ sum_comp_type kernel[ksize];
+ sum_comp_type ksum = sum_comp_type();
+ for (int i=1; i<=ksize; i++)
+ ksum += (kernel[i-1] = static_cast<sum_comp_type>(exp(-i*i/(2*sigma*sigma))));
+ for (int i=0; i<ksize; i++)
+ kernel[i] /= (2*ksum+1);
+ double factor = 1.0/(2*ksum+1);
+ int w = I.size().x;
+ int h = I.size().y;
+ int swin = 2*ksize;
+
+ sum_type* buffer = Internal::aligned_mem<sum_type,16>::alloc(w*(swin+1));
+ sum_type* rowbuf = Internal::aligned_mem<sum_type,16>::alloc(w);
+ sum_type* outbuf = Internal::aligned_mem<sum_type,16>::alloc(w);
+
+ sum_type* rows[swin+1];
+ for (int k=0;k<swin+1;k++)
+ rows[k] = buffer + k*w;
+
+ T* output = out.data();
+ for (int i=0; i<h; i++) {
+ sum_type* next_row = rows[swin];
+ const sum_type* input = getPixelRowTyped(I[i], w, rowbuf);
+ // beginning of row
+ for (int j=0; j<ksize; j++) {
+ sum_type hsum = input[j] * factor;
+ for (int k=0; k<ksize; k++)
+ hsum += (input[abs(j-k-1)] + input[j+k+1]) * kernel[k];
+ next_row[j] = hsum;
+ }
+ // middle of row
+ input += ksize;
+ input = convolveMiddle<sum_type, sum_comp_type>(input, factor, kernel, ksize, w-swin, next_row+ksize);
+ // end of row
+ for (int j=w-ksize; j<w; j++, input++) {
+ sum_type hsum = *input * factor;
+ const int room = w-j;
+ for (int k=0; k<ksize; k++) {
+ hsum += (input[-k-1] + (k+1 >= room ? input[2*room-k-2] : input[k+1])) * kernel[k];
+ }
+ next_row[j] = hsum;
+ }
+ // vertical
+ if (i >= swin) {
+ const sum_type* middle_row = rows[ksize];
+ assign_multiple(middle_row, factor, outbuf, w);
+ for (int k=0; k<ksize; k++) {
+ const sum_comp_type m = kernel[k];
+ const sum_type* row1 = rows[ksize-k-1];
+ const sum_type* row2 = rows[ksize+k+1];
+ add_multiple_of_sum(row1, row2, m, outbuf, w);
+ }
+ cast_copy(outbuf, output, w);
+ output += w;
+ if (i == h-1) {
+ for (int r=0; r<ksize; r++) {
+ const sum_type* middle_row = rows[ksize+r+1];
+ assign_multiple(middle_row, factor, outbuf, w);
+ for (int k=0; k<ksize; k++) {
+ const sum_comp_type m = kernel[k];
+ const sum_type* row1 = rows[ksize+r-k];
+ const sum_type* row2 = rows[ksize+r+k+2 > swin ? 2*swin - (ksize+r+k+2) : ksize+r+k+2];
+ add_multiple_of_sum(row1, row2, m, outbuf, w);
+ }
+ cast_copy(outbuf, output, w);
+ output += w;
+ }
+ }
+ } else if (i == swin-1) {
+ for (int r=0; r<ksize; r++) {
+ const sum_type* middle_row = rows[r+1];
+ assign_multiple(middle_row, factor, outbuf, w);
+ for (int k=0; k<ksize; k++) {
+ const sum_comp_type m = kernel[k];
+ const sum_type* row1 = rows[abs(r-k-1)+1];
+ const sum_type* row2 = rows[r+k+2];
+ add_multiple_of_sum(row1, row2, m, outbuf, w);
+ }
+ cast_copy(outbuf, output, w);
+ output += w;
+ }
+ }
+
+ sum_type* tmp = rows[0];
+ for (int r=0;r<swin; r++)
+ rows[r] = rows[r+1];
+ rows[swin] = tmp;
+ }
+
+ Internal::aligned_mem<sum_type,16>::release(buffer);
+ Internal::aligned_mem<sum_type,16>::release(rowbuf);
+ Internal::aligned_mem<sum_type,16>::release(outbuf);
+}
+
+template <class T, class O, class K> void convolve_gaussian_3(const BasicImage<T>& I, BasicImage<O>& out, K k1, K k2)
+{
+ assert(I.size() == out.size());
+ const T* a=I.data();
+ const int w = I.size().x;
+ O* o = out.data()+w+1;
+ int total = I.totalsize() - 2*w-2;
+ const double cross = k1*k2;
+ k1 *= k1;
+ k2 *= k2;
+ while (total--) {
+ const double sum = k1*(a[0] + a[2] + a[w*2] + a[w*2+2]) + cross*(a[1] + a[w*2+1] + a[w] + a[w+2]) + k2*a[w+1];
+ *o++ = Pixel::scalar_convert<O,T,double>(sum);
+ ++a;
+ }
+}
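+// Illustrative use (a sketch): a separable 3x3 Gaussian with per-axis taps
+// [k1 k2 k1], e.g. the normalized binomial pair 0.25/0.5:
+//   Image<float> in(ImageRef(320,240)), out(in.size());
+//   convolve_gaussian_3(in, out, 0.25, 0.5);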
+
+} // namespace CVD
+
+#endif
Index: libcvd/cvd/utility.h
diff -u /dev/null libcvd/cvd/utility.h:1.4.2.1
--- /dev/null Wed May 24 00:38:31 2006
+++ libcvd/cvd/utility.h Wed May 24 00:38:31 2006
@@ -0,0 +1,197 @@
+#ifndef CVD_UTILITY_H
+#define CVD_UTILITY_H
+
+#include <cvd/config.h>
+#include <cvd/image.h>
+#include <cvd/internal/is_pod.h>
+#include <cvd/internal/pixel_traits.h>
+#include <cvd/internal/convert_pixel_types.h>
+
+namespace CVD { //begin namespace
+
+ /** Generic image copy function for copying sub rectangles of images into
+ other images. This performs pixel type conversion if the input and output
+ images are different pixel types.
+ @param in input image to copy from
+ @param out output image to copy into
+ @param size size of the area to copy. By default this is the entirety of the input image
+ @param begin upper left corner of the area to copy, by default the upper left corner of the input image
+ @param dst upper left corner of the destination in the output image, by default the upper left corner of the output image
+ @throw ImageRefNotInImage if either begin is not in the input image or dst not in the output image
+ @ingroup gImageIO
+ */
+ template<class S, class T> void copy(const BasicImage<S>& in, BasicImage<T>& out, ImageRef size=ImageRef(-1,-1), ImageRef begin = ImageRef(), ImageRef dst = ImageRef())
+ {
+ if (size.x == -1 && size.y == -1)
+ size = in.size();
+ // FIXME: This should be an exception, but which one do I use?
+ // I refuse to define another "ImageRefNotInImage" in this namespace.
+ if (!(in.in_image(begin) && out.in_image(dst) && in.in_image(begin+size - ImageRef(1,1)) && out.in_image(dst+size - ImageRef(1,1)))){
+ std::cerr << "bad copy: " << in.size() << " " << out.size() << " " << size << " " << begin << " " << dst << std::endl;
+ int *p = 0;
+ *p = 1;
+ }
+ if (in.size() == out.size() && size == in.size() && begin == ImageRef() && dst == ImageRef()) {
+ Pixel::ConvertPixels<S,T>::convert(in.data(), out.data(), in.totalsize());
+ return;
+ }
+
+ const S* from = &in[begin];
+ T* to = &out[dst];
+ int i = 0;
+ while (i++<size.y) {
+ Pixel::ConvertPixels<S,T>::convert(from, to, size.x);
+ from += in.size().x;
+ to += out.size().x;
+ }
+ }
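+ // Illustrative use (a sketch): copy a 100x100 patch with top-left (10,20)
+ // in a byte image into a float image at (0,0), converting pixel types:
+ //   copy(src, dest, ImageRef(100,100), ImageRef(10,20));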
+
+ template <class T, bool pod = Internal::is_POD<T>::is_pod> struct ZeroPixel {
+ static void zero(T& t) {
+ for (unsigned int c=0; c<Pixel::Component<T>::count; c++)
+ Pixel::Component<T>::get(t,c) = 0;
+ }
+ };
+
+ template <class T> struct ZeroPixel<T,true> {
+ static void zero(T& t) { memset(&t,0,sizeof(T)); }
+ };
+
+ template <class T, bool pod = Internal::is_POD<T>::is_pod> struct ZeroPixels {
+ static void zero(T* pixels, int count) {
+ if (count) {
+ ZeroPixel<T>::zero(*pixels);
+ std::fill(pixels+1, pixels+count, *pixels);
+ }
+ }
+ };
+
+ template <class T> struct ZeroPixels<T,true> {
+ static void zero(T* pixels, int count) {
+ memset(pixels, 0, sizeof(T)*count);
+ }
+ };
+
+
+ /// Set a pixel to the default value (typically 0)
+ /// For multi-component pixels, this zeros all components (sets them to defaults)
+ template <class T> inline void zeroPixel(T& pixel) { ZeroPixel<T>::zero(pixel); }
+
+ /// Set many pixels to the default value (typically 0)
+ /// For multi-component pixels, this zeros all components (sets them to defaults)
+ template <class T> inline void zeroPixels(T* pixels, int count) { ZeroPixels<T>::zero(pixels, count); }
+
+ /// Set the one-pixel border (top, bottom, sides) of an image to zero values
+ template <class T> void zeroBorders(BasicImage<T>& I)
+ {
+ if (I.size().y == 0)
+ return;
+ zeroPixels(I[0], I.size().x);
+ for (int r=0;r<I.size().y-1; r++)
+ zeroPixels(I[r]+I.size().x-1,2);
+ zeroPixels(I[I.size().y-1], I.size().x);
+ }
+
+ /// Compute pointwise differences (a_i - b_i) and store in diff_i
+ /// This is accelerated using SIMD for some platforms and data types (alignment is checked at runtime)
+ /// Do not specify template parameters explicitly so that overloading can choose the right implementation
+ template <class A, class B> inline void differences(const A* a, const A* b, B* diff, unsigned int count)
+ {
+ while (count--)
+ *(diff++) = (B)*(a++) - (B)*(b++);
+ }
+
+ /// Compute pointwise (a_i + b_i) * c and add to out_i
+ /// This is accelerated using SIMD for some platforms and data types (alignment is checked at runtime)
+ /// Do not specify template parameters explicitly so that overloading can choose the right implementation
+ template <class A, class B> inline void add_multiple_of_sum(const A* a, const A* b, const A& c, B* out, unsigned int count)
+ {
+ while (count--)
+ *(out++) += (*(a++) + *(b++)) * c;
+ }
+
+ /// Compute pointwise a_i * c and store in out_i
+ /// This is accelerated using SIMD for some platforms and data types (alignment is checked at runtime)
+ /// Do not specify template parameters explicitly so that overloading can choose the right implementation
+ template <class A, class B> inline void assign_multiple(const A* a, const A& c, B* out, unsigned int count)
+ {
+ while (count--)
+ *(out++) = static_cast<B>(*(a++) * c);
+ }
+
+ /// Compute sum(a_i*b_i)
+ /// This is accelerated using SIMD for some platforms and data types (alignment is checked at runtime)
+ /// Do not specify template parameters explicitly so that overloading can choose the right implementation
+ template <class T> double inner_product(const T* a, const T* b, unsigned int count) {
+ double dot = 0;
+ while (count--)
+ dot += *(a++) * *(b++);
+ return dot;
+ }
+
+ template <class R, class D, class T> struct SumSquaredDifferences {
+ static inline R sum_squared_differences(const T* a, const T* b, size_t count) {
+ R ssd = 0;
+ while (count--) {
+ D d = *a++ - *b++;
+ ssd += d*d;
+ }
+ return ssd;
+ }
+ };
+
+ /// Compute sum of (a_i - b_i)^2 (the SSD)
+ /// This is accelerated using SIMD for some platforms and data types (alignment is checked at runtime)
+ /// Do not specify template parameters explicitly so that overloading can choose the right implementation
+ template <class T> inline double sum_squared_differences(const T* a, const T* b, size_t count) {
+ return SumSquaredDifferences<double,double,T>::sum_squared_differences(a,b,count);
+ }
+
+ /// Check if the pointer is aligned to the specified byte granularity
+ template<int bytes> bool is_aligned(const void* ptr);
+ template<> inline bool is_aligned<8>(const void* ptr) { return ((reinterpret_cast<size_t>(ptr)) & 0x7) == 0; }
+ template<> inline bool is_aligned<16>(const void* ptr) { return ((reinterpret_cast<size_t>(ptr)) & 0xF) == 0; }
+
+ /// Compute the number of pointer increments necessary to yield alignment of A bytes
+ template<int A, class T> inline size_t steps_to_align(const T* ptr)
+ {
+ return is_aligned<A>(ptr) ? 0 : (A-((reinterpret_cast<size_t>(ptr)) & (A-1)))/sizeof(T);
+ }
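+ // Worked example: for a float* whose address has (addr & 15) == 4, the
+ // deficit to the next 16-byte boundary is 12 bytes, so
+ // steps_to_align<16>(ptr) returns 12/sizeof(float) == 3 increments.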
+
+#if defined(CVD_HAVE_MMXEXT) && defined(CVD_HAVE_MMINTRIN)
+ void differences(const byte* a, const byte* b, short* diff, unsigned int size);
+ void differences(const short* a, const short* b, short* diff, unsigned int size);
+#endif
+
+
+#if defined(CVD_HAVE_SSE) && defined(CVD_HAVE_XMMINTRIN)
+ void differences(const float* a, const float* b, float* diff, unsigned int size);
+ void add_multiple_of_sum(const float* a, const float* b, const float& c, float* out, unsigned int count);
+ void assign_multiple(const float* a, const float& c, float* out, unsigned int count);
+ double inner_product(const float* a, const float* b, unsigned int count);
+ double sum_squared_differences(const float* a, const float* b, size_t count);
+#endif
+
+#if defined (CVD_HAVE_SSE2) && defined(CVD_HAVE_EMMINTRIN)
+ void differences(const int32_t* a, const int32_t* b, int32_t* diff, unsigned int size);
+ void differences(const double* a, const double* b, double* diff, unsigned int size);
+ void add_multiple_of_sum(const double* a, const double* b, const double& c, double* out, unsigned int count);
+ void assign_multiple(const double* a, const double& c, double* out, unsigned int count);
+ double inner_product(const double* a, const double* b, unsigned int count);
+ double sum_squared_differences(const double* a, const double* b, size_t count);
+ long long sum_squared_differences(const byte* a, const byte* b, size_t count);
+#else
+ inline long long sum_squared_differences(const byte* a, const byte* b, size_t count) {
+ return SumSquaredDifferences<long long,int,byte>::sum_squared_differences(a,b,count);
+ }
+#endif
+
+}
+
+#endif
Index: libcvd/cvd_src/utility.cc
diff -u /dev/null libcvd/cvd_src/utility.cc:1.5.2.1
--- /dev/null Wed May 24 00:38:31 2006
+++ libcvd/cvd_src/utility.cc Wed May 24 00:38:31 2006
@@ -0,0 +1,585 @@
+#include <cvd/config.h>
+#include <cvd/utility.h>
+// internal functions used by CVD vision algorithm implementations
+#include <cvd/internal/assembly.h>
+
+#if CVD_HAVE_MMINTRIN
+# include <mmintrin.h>
+#endif
+
+#if CVD_HAVE_XMMINTRIN
+# include <xmmintrin.h>
+#endif
+
+#if CVD_HAVE_EMMINTRIN
+# include <emmintrin.h>
+#endif
+
+using namespace std;
+
+ using CVD::is_aligned;
+ using CVD::steps_to_align;
+ template <class F, class T1, class T2, int A, int M> inline void maybe_aligned_differences(const T1* a, const T1* b, T2* c, unsigned int count)
+ {
+ if (count < M*2) {
+ F::unaligned_differences(a,b,c,count);
+ return;
+ }
+ if (!is_aligned<A>(a)) {
+ unsigned int steps = steps_to_align<A>(a);
+ F::unaligned_differences(a,b,c,steps);
+ count -= steps;
+ a += steps;
+ b += steps;
+ c += steps;
+ }
+ if (!is_aligned<A>(c) || count < M) {
+ F::unaligned_differences(a,b,c,count);
+ return;
+ }
+ unsigned int block = (count/M)*M;
+ F::aligned_differences(a,b,c,block);
+ if (count > block) {
+ F::unaligned_differences(a+block,b+block,c+block,count-block);
+ }
+ }
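+ // Pattern shared by all the maybe_aligned_* helpers: run the scalar
+ // routine as a prologue until the source is A-byte aligned, hand the
+ // largest multiple-of-M middle block to the SIMD routine, then finish
+ // the remainder with the scalar epilogue; tiny inputs skip SIMD entirely.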
+
+ template <class F, class T1, class T2, int A, int M> inline void maybe_aligned_add_mul_add(const T1* a, const T1* b, const T1& c, T2* out, unsigned int count)
+ {
+ if (count < M*2) {
+ F::unaligned_add_mul_add(a,b,c,out,count);
+ return;
+ }
+ if (!is_aligned<A>(a)) {
+ unsigned int steps = steps_to_align<A>(a);
+ F::unaligned_add_mul_add(a,b,c,out,steps);
+ count -= steps;
+ a += steps;
+ b += steps;
+ out += steps;
+ if (count < M || !is_aligned<16>(out)) {
+ F::unaligned_add_mul_add(a,b,c,out,count);
+ return;
+ }
+ }
+ else if (count < M || !is_aligned<16>(out)) {
+ F::unaligned_add_mul_add(a,b,c,out,count);
+ return;
+ }
+ unsigned int block = (count/M)*M;
+ F::aligned_add_mul_add(a,b,c,out,block);
+ if (count > block)
+ F::unaligned_add_mul_add(a+block,b+block,c, out+block,count-block);
+ }
+
+ template <class F, class T1, class T2, int A, int M> inline void maybe_aligned_assign_mul(const T1* a, const T1& c, T2* out, unsigned int count)
+ {
+ if (count < M*2) {
+ F::unaligned_assign_mul(a,c,out,count);
+ return;
+ }
+ if (!is_aligned<A>(a)) {
+ unsigned int steps = steps_to_align<A>(a);
+ F::unaligned_assign_mul(a,c,out,steps);
+ count -= steps;
+ a += steps;
+ out += steps;
+ if (count < M) {
+ F::unaligned_assign_mul(a,c,out,count);
+ return;
+ }
+ }
+ unsigned int block = (count/M)*M;
+ F::aligned_assign_mul(a,c,out,block);
+ if (count > block) {
+ F::unaligned_assign_mul(a+block,c, out+block,count-block);
+ }
+ }
+
+ template <class F, class R, class T1, int A, int M> inline R maybe_aligned_inner_product(const T1* a, const T1* b, unsigned int count)
+ {
+ if (count < M*2) {
+ return F::unaligned_inner_product(a,b,count);
+ }
+ R sum = 0;
+ if (!is_aligned<A>(a)) {
+ unsigned int steps = steps_to_align<A>(a);
+ sum = F::unaligned_inner_product(a,b,steps);
+ count -= steps;
+ a += steps;
+ b += steps;
+ if (count < M) {
+ return sum + F::unaligned_inner_product(a,b,count);
+ }
+ }
+ unsigned int block = (count/M)*M;
+ sum += F::aligned_inner_product(a,b,block);
+ if (count > block)
+ sum += F::unaligned_inner_product(a+block,b+block,count-block);
+ return sum;
+ }
+
+ template <class F, class R, class T1, int A, int M> inline R maybe_aligned_ssd(const T1* a, const T1* b, unsigned int count)
+ {
+ if (count < M*2) {
+ return F::unaligned_ssd(a,b,count);
+ }
+ R sum = 0;
+ if (!is_aligned<A>(a)) {
+ unsigned int steps = steps_to_align<A>(a);
+ sum = F::unaligned_ssd(a,b,steps);
+ count -= steps;
+ a += steps;
+ b += steps;
+ if (count < M) {
+ return sum + F::unaligned_ssd(a,b,count);
+ }
+ }
+ unsigned int block = (count/M)*M;
+ sum += F::aligned_ssd(a,b,block);
+ if (count > block)
+ sum += F::unaligned_ssd(a+block,b+block,count-block);
+ return sum;
+ }
+
+
+
+namespace CVD {
+
+#if defined(CVD_HAVE_MMXEXT) && defined(CVD_HAVE_MMINTRIN)
+
+
+ void byte_to_short_differences(const __m64* a, const __m64* b, __m64* diff, unsigned int count)
+ {
+ __m64 z = _mm_setzero_si64();
+ for (;count; --count, ++a, ++b, diff+=2) {
+ __m64 aq = *a;
+ __m64 bq = *b;
+ __m64 alo = _mm_unpacklo_pi8(aq,z);
+ __m64 ahi = _mm_unpackhi_pi8(aq,z);
+ __m64 blo = _mm_unpacklo_pi8(bq,z);
+ __m64 bhi = _mm_unpackhi_pi8(bq,z);
+ diff[0] = _mm_sub_pi16(alo,blo);
+ diff[1] = _mm_sub_pi16(ahi,bhi);
+ }
+ _mm_empty();
+ }
+
+ void short_differences(const __m64* a, const __m64* b, __m64* diff, unsigned int count)
+ {
+ while (count--) {
+ *(diff++) = _mm_sub_pi16(*(a++), *(b++));
+ }
+ _mm_empty();
+ }
+
+
+ struct MMX_funcs {
+ template <class T1, class T2> static inline void unaligned_differences(const T1* a, const T1* b, T2* diff, size_t count) {
+ differences<T1,T2>(a,b,diff,count);
+ }
+ static inline void aligned_differences(const byte* a, const byte* b, short* diff, unsigned int count) {
+ if (is_aligned<8>(b))
+ byte_to_short_differences((const __m64*)a,(const __m64*)b, (__m64*)diff, count>>3);
+ else
+ unaligned_differences(a,b,diff,count);
+ }
+
+ static inline void aligned_differences(const short* a, const short* b, short* diff, unsigned int count) {
+ if (is_aligned<8>(b))
+ short_differences((const __m64*)a, (const __m64*)b, (__m64*)diff, count>>2);
+ else
+ unaligned_differences(a,b,diff,count);
+ }
+ };
+
+ void differences(const byte* a, const byte* b, short* diff, unsigned int count) {
+ maybe_aligned_differences<MMX_funcs, byte, short, 8, 8>(a,b,diff,count);
+ }
+
+ void differences(const short* a, const short* b, short* diff, unsigned int count) {
+ maybe_aligned_differences<MMX_funcs, short, short, 8, 4>(a,b,diff,count);
+ }
+#endif
+
+
+#if defined(CVD_HAVE_SSE) && defined(CVD_HAVE_XMMINTRIN)
+
+ template <bool Aligned> inline __m128 load_ps(const void* addr) { return _mm_loadu_ps((const float*)addr); }
+ template <> inline __m128 load_ps<true>(const void* addr) { return _mm_load_ps((const float*)addr); }
+
+ template <bool Aligned> inline void store_ps(__m128 m, void* addr) { return _mm_storeu_ps((float*)addr, m); }
+ template <> inline void store_ps<true>(__m128 m, void* addr) { return _mm_store_ps((float*)addr, m); }
+
+ template <bool Aligned_b> inline void float_differences(const __m128* a, const __m128* b, __m128* diff, unsigned int count)
+ {
+ while (count--) {
+ *(diff++) = _mm_sub_ps(*(a++), load_ps<Aligned_b>(b++));
+ }
+ }
+
+ template <bool Aligned_b> void float_add_multiple_of_sum(const __m128* a, const __m128* b, const float& c, __m128* out, unsigned int count)
+ {
+ __m128 cccc = _mm_set1_ps(c);
+ while (count--) {
+ *out = _mm_add_ps(_mm_mul_ps(_mm_add_ps(*(a++), load_ps<Aligned_b>(b++)), cccc), *out);
+ ++out;
+ }
+ }
+
+ template <bool Aligned_out> inline void float_assign_multiple(const __m128* a, const float& c, __m128* out, unsigned int count)
+ {
+ const __m128 cccc = _mm_set1_ps(c);
+ while (count--)
+ store_ps<Aligned_out>(_mm_mul_ps(*(a++), cccc), out++);
+
+ }
+
+ template <bool Aligned_b> double float_inner_product(const __m128* a, const __m128* b, unsigned int count)
+ {
+ float sums_store[4];
+ const unsigned int BLOCK = 1<<10;
+ double dot = 0;
+ while (count) {
+ size_t pass = std::min(count, BLOCK);
+ count-=pass;
+ __m128 sums = _mm_setzero_ps();
+ while (pass--) {
+ __m128 prod = _mm_mul_ps(*(a++), load_ps<Aligned_b>(b++));
+ sums = _mm_add_ps(prod, sums);
+ }
+ _mm_storeu_ps(sums_store, sums);
+ dot += sums_store[0] + sums_store[1] + sums_store[2] + sums_store[3];
+ }
+ return dot;
+ }
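+ // Design note: partial sums stay in a single __m128 for at most BLOCK
+ // vectors and are then flushed into the double accumulator, bounding the
+ // single-precision rounding error on long inputs. The SSD routine below
+ // uses the same blocking.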
+
+ template <bool Aligned_b> inline double float_sum_squared_differences(const __m128* a, const __m128* b, size_t count)
+ {
+ float sums_store[4];
+ const size_t BLOCK = 1<<10;
+ double ssd = 0;
+ while (count) {
+ size_t pass = std::min(count, BLOCK);
+ count-=pass;
+ __m128 sums = _mm_setzero_ps();
+ while (pass--) {
+ __m128 diff = _mm_sub_ps(*(a++), load_ps<Aligned_b>(b++));
+ sums = _mm_add_ps(_mm_mul_ps(diff,diff), sums);
+ }
+ _mm_storeu_ps(sums_store, sums);
+ ssd += sums_store[0] + sums_store[1] + sums_store[2] + sums_store[3];
+ }
+ return ssd;
+ }
+
+ struct SSE_funcs {
+ template <class T1, class T2> static inline void unaligned_differences(const T1* a, const T1* b, T2* diff, size_t count) {
+ differences<T1,T2>(a,b,diff,count);
+ }
+
+ static inline void aligned_differences(const float* a, const float* b, float* diff, unsigned int count) {
+ if (is_aligned<16>(b))
+ float_differences<true>((const __m128*)a, (const __m128*)b, (__m128*)diff, count>>2);
+ else
+ float_differences<false>((const __m128*)a, (const __m128*)b, (__m128*)diff, count>>2);
+ }
+
+ template <class T1, class T2> static inline void unaligned_add_mul_add(const T1* a, const T1* b, const T1& c, T2* out, size_t count) {
+ add_multiple_of_sum<T1,T2>(a,b,c,out,count);
+ }
+ static inline void aligned_add_mul_add(const float* a, const float* b, const float& c, float* out, size_t count) {
+ if (is_aligned<16>(b))
+ float_add_multiple_of_sum<true>((const __m128*)a, (const __m128*)b, c, (__m128*)out, count>>2);
+ else
+ float_add_multiple_of_sum<false>((const __m128*)a, (const __m128*)b, c, (__m128*)out, count>>2);
+ }
+
+ template <class T1, class T2> static inline void unaligned_assign_mul(const T1* a, const T1& c, T2* out, size_t count) {
+ assign_multiple<T1,T2>(a,c,out,count);
+ }
+ static inline void aligned_assign_mul(const float* a, const float& c, float* out, size_t count) {
+ if (is_aligned<16>(out))
+ float_assign_multiple<true>((const __m128*)a, c, (__m128*)out, count>>2);
+ else
+ float_assign_multiple<false>((const __m128*)a, c, (__m128*)out, count>>2);
+ }
+
+ template <class T1> static inline double unaligned_inner_product(const T1* a, const T1* b, size_t count) {
+ return inner_product<T1>(a,b,count);
+ }
+
+ static inline double aligned_inner_product(const float* a, const float* b, unsigned int count)
+ {
+ if (is_aligned<16>(b))
+ return float_inner_product<true>((const __m128*) a, (const __m128*) b, count>>2);
+ else
+ return float_inner_product<false>((const __m128*) a, (const __m128*) b, count>>2);
+ }
+
+ template <class T1> static inline double unaligned_ssd(const T1* a, const T1* b, size_t count) {
+ return sum_squared_differences<T1>(a,b,count);
+ }
+
+ static inline double aligned_ssd(const float* a, const float* b, unsigned int count)
+ {
+ if (is_aligned<16>(b))
+ return float_sum_squared_differences<true>((const __m128*) a, (const __m128*) b, count>>2);
+ else
+ return float_sum_squared_differences<false>((const __m128*) a, (const __m128*) b, count>>2);
+ }
+ };
+
+ void differences(const float* a, const float* b, float* diff, unsigned int size)
+ {
+ maybe_aligned_differences<SSE_funcs, float, float, 16, 4>(a,b,diff,size);
+ }
+
+ void add_multiple_of_sum(const float* a, const float* b, const float& c, float* out, unsigned int count)
+ {
+ maybe_aligned_add_mul_add<SSE_funcs,float,float,16,4>(a,b,c,out,count);
+ }
+
+ void assign_multiple(const float* a, const float& c, float* out, unsigned int count)
+ {
+ maybe_aligned_assign_mul<SSE_funcs,float,float,16,4>(a,c,out,count);
+ }
+
+
+ double inner_product(const float* a, const float* b, unsigned int count)
+ {
+ return maybe_aligned_inner_product<SSE_funcs,double,float,16,4>(a,b,count);
+ }
+
+ double sum_squared_differences(const float* a, const float* b, size_t count)
+ {
+ return maybe_aligned_ssd<SSE_funcs,double,float,16,4>(a,b,count);
+ }
+
+#endif
+
+#if defined (CVD_HAVE_SSE2) && defined(CVD_HAVE_EMMINTRIN)
+
+
+ static inline __m128i zero_si128() { __m128i x; asm ( "pxor %0, %0 \n\t"
: "=x"(x) ); return x; }
+ template <bool Aligned> inline __m128i load_si128(const void* addr) {
return _mm_loadu_si128((const __m128i*)addr); }
+ template <> inline __m128i load_si128<true>(const void* addr) { return
_mm_load_si128((const __m128i*)addr); }
+ template <bool Aligned> inline __m128d load_pd(const void* addr) { return
_mm_loadu_pd((const double*)addr); }
+ template <> inline __m128d load_pd<true>(const void* addr) { return
_mm_load_pd((const double*)addr); }
+
+ template <bool Aligned> inline void store_pd(__m128d m, void* addr) {
return _mm_storeu_pd((double*)addr, m); }
+ template <> inline void store_pd<true>(__m128d m, void* addr) { return
_mm_store_pd((double*)addr, m); }
+
+ template <bool Aligned_b> void int_differences(const __m128i* a, const __m128i* b, __m128i* diff, unsigned int count)
+ {
+ while (count--) {
+ *(diff++) = _mm_sub_epi32(*(a++), load_si128<Aligned_b>(b++));
+ }
+ }
+
+ template <bool Aligned_b> void double_differences(const __m128d* a, const __m128d* b, __m128d* diff, unsigned int count)
+ {
+ while (count--) {
+ *(diff++) = _mm_sub_pd(*(a++), load_pd<Aligned_b>(b++));
+ }
+ }
+
+ template <bool Aligned_b> void double_add_multiple_of_sum(const __m128d* a, const __m128d* b, const double& c, __m128d* out, unsigned int count)
+ {
+ __m128d cc = _mm_set1_pd(c);
+ while (count--) {
+ *out = _mm_add_pd(_mm_mul_pd(_mm_add_pd(*(a++), load_pd<Aligned_b>(b++)), cc), *out);
+ ++out;
+ }
+ }
+
+ template <bool Aligned_out> void double_assign_multiple(const __m128d* a, const double& c, __m128d* out, unsigned int count)
+ {
+ __m128d cc = _mm_set1_pd(c);
+ while (count--)
+ store_pd<Aligned_out>(_mm_mul_pd(*(a++), cc), out++);
+ }
+ template <bool Aligned_b> double double_inner_product(const __m128d* a, const __m128d* b, unsigned int count)
+ {
+ double sums_store[2];
+ const unsigned int BLOCK = 1<<16;
+ double dot = 0;
+ while (count) {
+ size_t pass = std::min(count, BLOCK);
+ count-=pass;
+ __m128d sums = _mm_setzero_pd();
+ while (pass--) {
+ __m128d prod = _mm_mul_pd(*(a++), load_pd<Aligned_b>(b++));
+ sums = _mm_add_pd(prod, sums);
+ }
+ _mm_storeu_pd(sums_store, sums);
+ dot += sums_store[0] + sums_store[1];
+ }
+ return dot;
+ }
+
+ template <bool Aligned_b> long long byte_sum_squared_differences(const __m128i* a, const __m128i* b, unsigned int count) {
+ unsigned long sums_store[4];
+ const unsigned int BLOCK = 1<<15;
+ long long ssd = 0;
+ while (count) {
+ size_t pass = std::min(count, BLOCK);
+ count -= pass;
+ __m128i sums = _mm_setzero_si128();
+ while (pass--) {
+ __m128i lo_a = load_si128<true>(a++);
+ __m128i lo_b = load_si128<Aligned_b>(b++);
+ __m128i hi_a = _mm_unpackhi_epi8(lo_a, sums);
+ __m128i hi_b = _mm_unpackhi_epi8(lo_b, sums);
+ lo_a = _mm_unpacklo_epi8(lo_a, sums);
+ lo_b = _mm_unpacklo_epi8(lo_b, sums);
+ lo_a = _mm_sub_epi16(lo_a, lo_b);
+ hi_a = _mm_sub_epi16(hi_a, hi_b);
+ lo_a = _mm_madd_epi16(lo_a,lo_a);
+ hi_a = _mm_madd_epi16(hi_a,hi_a);
+ sums = _mm_add_epi32(_mm_add_epi32(lo_a, hi_a), sums);
+ }
+ _mm_storeu_si128((__m128i*)sums_store, sums);
+ ssd += sums_store[0] + sums_store[1] + sums_store[2] + sums_store[3];
+ }
+ return ssd;
+ }
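+ // Note on the loop above: 'sums' doubles as the filler operand for the
+ // byte unpacks; whatever it holds lands in the high byte of the same
+ // 16-bit lane of both operands and cancels in _mm_sub_epi16, then
+ // _mm_madd_epi16 squares and pairwise-adds the 16-bit differences into
+ // 32-bit partial sums.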
+
+ template <bool Aligned_b> inline double double_sum_squared_differences(const __m128d* a, const __m128d* b, unsigned int count)
+ {
+ double sums_store[2];
+ const unsigned int BLOCK = 1<<10;
+ double ssd = 0;
+ while (count) {
+ size_t pass = std::min(count, BLOCK);
+ count-=pass;
+ __m128d sums = _mm_setzero_pd();
+ while (pass--) {
+ __m128d diff = _mm_sub_pd(*(a++), load_pd<Aligned_b>(b++));
+ sums = _mm_add_pd(_mm_mul_pd(diff,diff), sums);
+ }
+ _mm_storeu_pd(sums_store, sums);
+ ssd += sums_store[0] + sums_store[1];
+ }
+ return ssd;
+ }
+
+
+ struct SSE2_funcs {
+ template <class T1, class T2> static inline void unaligned_differences(const T1* a, const T1* b, T2* diff, size_t count) {
+ differences<T1,T2>(a,b,diff,count);
+ }
+
+ static inline void aligned_differences(const int32_t* a, const int32_t* b, int32_t* diff, unsigned int count) {
+ if (is_aligned<16>(b))
+ int_differences<true>((const __m128i*)a, (const __m128i*)b, (__m128i*)diff, count>>2);
+ else
+ int_differences<false>((const __m128i*)a, (const __m128i*)b, (__m128i*)diff, count>>2);
+ }
+
+ static inline void aligned_differences(const double* a, const double* b, double* diff, unsigned int count)
+ {
+ if (is_aligned<16>(b))
+ double_differences<true>((const __m128d*)a,(const __m128d*)b,(__m128d*)diff,count>>1);
+ else
+ double_differences<false>((const __m128d*)a,(const __m128d*)b,(__m128d*)diff,count>>1);
+ }
+
+ template <class T1, class T2> static inline void unaligned_add_mul_add(const T1* a, const T1* b, const T1& c, T2* out, size_t count) {
+ add_multiple_of_sum<T1,T2>(a,b,c,out,count);
+ }
+
+ static inline void aligned_add_mul_add(const double* a, const double* b, const double& c, double* out, unsigned int count)
+ {
+ if (is_aligned<16>(b))
+ double_add_multiple_of_sum<true>((const __m128d*)a, (const __m128d*)b, c, (__m128d*)out, count>>1);
+ else
+ double_add_multiple_of_sum<false>((const __m128d*)a, (const __m128d*)b, c, (__m128d*)out, count>>1);
+ }
+
+ template <class T1, class T2> static inline void unaligned_assign_mul(const T1* a, const T1& c, T2* out, size_t count) {
+ assign_multiple<T1,T2>(a,c,out,count);
+ }
+
+ static inline void aligned_assign_mul(const double* a, const double& c, double* out, unsigned int count)
+ {
+ if (is_aligned<16>(out))
+ double_assign_multiple<true>((const __m128d*)a, c, (__m128d*)out, count>>1);
+ else
+ double_assign_multiple<false>((const __m128d*)a, c, (__m128d*)out, count>>1);
+ }
+
+ template <class T1> static inline double unaligned_inner_product(const T1* a, const T1* b, size_t count) {
+ return inner_product<T1>(a,b,count);
+ }
+
+ static inline double aligned_inner_product(const double* a, const double* b, unsigned int count)
+ {
+ if (is_aligned<16>(b))
+ return double_inner_product<true>((const __m128d*) a, (const __m128d*) b, count>>1);
+ else
+ return double_inner_product<false>((const __m128d*) a, (const __m128d*) b, count>>1);
+ }
+
+ template <class T1> static inline double unaligned_ssd(const T1* a, const T1* b, size_t count) {
+ return sum_squared_differences<T1>(a,b,count);
+ }
+
+ static inline long long unaligned_ssd(const byte* a, const byte* b, size_t count) {
+ return SumSquaredDifferences<long long, int, byte>::sum_squared_differences(a,b,count);
+ }
+
+ static inline double aligned_ssd(const double* a, const double* b, size_t count)
+ {
+ if (is_aligned<16>(b))
+ return double_sum_squared_differences<true>((const __m128d*)a, (const __m128d*)b, count>>1);
+ else
+ return double_sum_squared_differences<false>((const __m128d*)a, (const __m128d*)b, count>>1);
+ }
+
+ static inline long long aligned_ssd(const byte* a, const byte* b, size_t count)
+ {
+ if (is_aligned<16>(b))
+ return byte_sum_squared_differences<true>((const __m128i*)a, (const __m128i*)b, count>>4);
+ else
+ return byte_sum_squared_differences<false>((const __m128i*)a, (const __m128i*)b, count>>4);
+ }
+ };
+
+ void differences(const int32_t* a, const int32_t* b, int32_t* diff, unsigned int size)
+ {
+ maybe_aligned_differences<SSE2_funcs, int32_t, int32_t, 16, 4>(a,b,diff,size);
+ }
+
+ void differences(const double* a, const double* b, double* diff, unsigned int size)
+ {
+ maybe_aligned_differences<SSE2_funcs, double, double, 16, 2>(a,b,diff,size);
+ }
+
+ void add_multiple_of_sum(const double* a, const double* b, const double& c, double* out, unsigned int count)
+ {
+ maybe_aligned_add_mul_add<SSE2_funcs, double, double, 16, 2>(a,b,c,out,count);
+ }
+
+ void assign_multiple(const double* a, const double& c, double* out, unsigned int count)
+ {
+ maybe_aligned_assign_mul<SSE2_funcs, double, double, 16, 2>(a,c,out,count);
+ }
+
+ double inner_product(const double* a, const double* b, unsigned int count)
+ {
+ return maybe_aligned_inner_product<SSE2_funcs, double, double, 16, 2>(a,b,count);
+ }
+
+ double sum_squared_differences(const double* a, const double* b, size_t count)
+ {
+ return maybe_aligned_ssd<SSE2_funcs, double, double, 16, 2>(a,b,count);
+ }
+
+ long long sum_squared_differences(const byte* a, const byte* b, size_t count)
+ {
+ return maybe_aligned_ssd<SSE2_funcs, long long, byte, 16, 16>(a,b,count);
+ }
+#endif
+
+}