traverso-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Traverso-commit] traverso/src audiofileio/audiofileio.pro base.p...


From: Remon Sijrier
Subject: [Traverso-commit] traverso/src audiofileio/audiofileio.pro base.p...
Date: Sat, 27 Oct 2007 17:57:16 +0000

CVSROOT:        /sources/traverso
Module name:    traverso
Changes by:     Remon Sijrier <r_sijrier>       07/10/27 17:57:16

Modified files:
        src/audiofileio: audiofileio.pro 
        src            : base.pri 
        src/common     : Mixer.cpp Mixer.h 
        src/core       : core.pro 
        src/engine     : engine.pro 
        src/traverso   : Traverso.cpp Traverso.h 
Added files:
        src/core       : fpu.cc fpu.h 
        src/engine     : sse_functions_64bit.S 

Log message:
        * sync sse based code to ardour2's version 

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/traverso/src/audiofileio/audiofileio.pro?cvsroot=traverso&r1=1.15&r2=1.16
http://cvs.savannah.gnu.org/viewcvs/traverso/src/base.pri?cvsroot=traverso&r1=1.47&r2=1.48
http://cvs.savannah.gnu.org/viewcvs/traverso/src/common/Mixer.cpp?cvsroot=traverso&r1=1.1&r2=1.2
http://cvs.savannah.gnu.org/viewcvs/traverso/src/common/Mixer.h?cvsroot=traverso&r1=1.1&r2=1.2
http://cvs.savannah.gnu.org/viewcvs/traverso/src/core/core.pro?cvsroot=traverso&r1=1.43&r2=1.44
http://cvs.savannah.gnu.org/viewcvs/traverso/src/core/fpu.cc?cvsroot=traverso&rev=1.1
http://cvs.savannah.gnu.org/viewcvs/traverso/src/core/fpu.h?cvsroot=traverso&rev=1.1
http://cvs.savannah.gnu.org/viewcvs/traverso/src/engine/engine.pro?cvsroot=traverso&r1=1.20&r2=1.21
http://cvs.savannah.gnu.org/viewcvs/traverso/src/engine/sse_functions_64bit.S?cvsroot=traverso&rev=1.1
http://cvs.savannah.gnu.org/viewcvs/traverso/src/traverso/Traverso.cpp?cvsroot=traverso&r1=1.47&r2=1.48
http://cvs.savannah.gnu.org/viewcvs/traverso/src/traverso/Traverso.h?cvsroot=traverso&r1=1.9&r2=1.10

Patches:
Index: audiofileio/audiofileio.pro
===================================================================
RCS file: /sources/traverso/traverso/src/audiofileio/audiofileio.pro,v
retrieving revision 1.15
retrieving revision 1.16
diff -u -b -r1.15 -r1.16
--- audiofileio/audiofileio.pro 20 Oct 2007 17:38:16 -0000      1.15
+++ audiofileio/audiofileio.pro 27 Oct 2007 17:57:15 -0000      1.16
@@ -46,7 +46,3 @@
     INCLUDEPATH += ../../3thparty/include .
 }
 
-HEADERS -= PeakDataReader.h \
-decode/PeakDataReader.h
-SOURCES -= decode/PeakDataReader.cpp
-

Index: base.pri
===================================================================
RCS file: /sources/traverso/traverso/src/base.pri,v
retrieving revision 1.47
retrieving revision 1.48
diff -u -b -r1.47 -r1.48
--- base.pri    26 Sep 2007 16:32:52 -0000      1.47
+++ base.pri    27 Oct 2007 17:57:15 -0000      1.48
@@ -20,7 +20,7 @@
 
 DEFINES += JACK_SUPPORT
 DEFINES += ALSA_SUPPORT
-#DEFINES += PORTAUDIO_SUPPORT
+DEFINES += PORTAUDIO_SUPPORT
 DEFINES += LV2_SUPPORT
 DEFINES += QT_OPENGL_SUPPORT
 
@@ -49,7 +49,7 @@
 #################################################
 
 
-# DEFINES += STATIC_BUILD
+DEFINES += STATIC_BUILD
 
 #
 # Use Memory Locking 
@@ -97,15 +97,47 @@
                
                MACHINETYPE = $$system(arch)
                
-               contains( MACHINETYPE, x86_64 ) {
-                       QMAKE_CXXFLAGS_RELEASE += -mtune=athlon64
-               }
+               X86_FLAGS = $$system(cat /proc/cpuinfo | grep '^flags')
+               
+               HOST_SUPPORTS_SSE = 0
                
-               contains( MACHINETYPE, i[456]86) {
+               contains(X86_FLAGS, sse) {
+                       HOST_SUPPORTS_SSE = 1
                        DEFINES += SSE_OPTIMIZATIONS
+               }
+               
+               contains(X86_FLAGS, mmx) {
+                       QMAKE_CXXFLAGS_RELEASE += -mmmx
+               }
+               
+               contains(X86_FLAGS, 3dnow) {
+                       QMAKE_CXXFLAGS_RELEASE += -m3dnow
+               }
+               
+               contains(MACHINETYPE, i586) {
+                       QMAKE_CXXFLAGS_RELEASE += -march=i586
+               }
+
+               contains(MACHINETYPE, i686) {
+                       QMAKE_CXXFLAGS_RELEASE += -march=i686
+                       # HOST_SUPPORTS_SSE is 1 only when /proc/cpuinfo lists the sse flag.
+                       # eval() always returns true, so the value has to be tested with contains().
+                       contains(HOST_SUPPORTS_SSE, 1) {
                        QMAKE_CXXFLAGS_RELEASE += -msse -mfpmath=sse
+                               DEFINES += USE_XMMINTRIN
+                       }
+               }
+
+               contains(MACHINETYPE, x86_64) {
+                       # Same test as for i686: only enable SSE code when the host reports it.
+                       contains(HOST_SUPPORTS_SSE, 1) {
+                               QMAKE_CXXFLAGS_RELEASE += -msse -mfpmath=sse
+                               DEFINES += USE_XMMINTRIN USE_X86_64_ASM
+                       }
                }
                
+               contains(MACHINETYPE, i[456]86) {
+                       DEFINES += ARCH_X86
+               }
+
+               
        }
        
        GCCVERSION = $$system(gcc -dumpversion)
@@ -138,6 +170,9 @@
        QMAKE_LFLAGS_SONAME  = -Wl,-install_name,@executable_path/../Frameworks/
 
        RC_FILE = ../../resources/images/traverso_mac.icns
+       
+# Uncomment if the deployment target is at least Mac OS X Tiger (may also work on other targets).
+# DEFINES += BUILD_VECLIB_OPTIMIZATIONS
 }
 
 win32 { 

Index: common/Mixer.cpp
===================================================================
RCS file: /sources/traverso/traverso/src/common/Mixer.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -b -r1.1 -r1.2
--- common/Mixer.cpp    20 Oct 2007 17:38:16 -0000      1.1
+++ common/Mixer.cpp    27 Oct 2007 17:57:15 -0000      1.2
@@ -17,7 +17,7 @@
     along with this program; if not, write to the Free Software
     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA.
  
-    $Id: Mixer.cpp,v 1.1 2007/10/20 17:38:16 r_sijrier Exp $
+    $Id: Mixer.cpp,v 1.2 2007/10/27 17:57:15 r_sijrier Exp $
 */
 
 #include "Mixer.h"
@@ -31,7 +31,7 @@
 
 
 
-float default_compute_peak (audio_sample_t* buf, nframes_t nsamples, float 
current)
+float default_compute_peak (const audio_sample_t* buf, nframes_t nsamples, 
float current)
 {
         for (nframes_t i = 0; i < nsamples; ++i) {
                 current = f_max (current, fabsf (buf[i]));
@@ -46,16 +46,52 @@
                 buf[i] *= gain;
 }
 
-void default_mix_buffers_with_gain (audio_sample_t* dst, audio_sample_t* src, 
nframes_t nframes, float gain)
+void default_mix_buffers_with_gain (audio_sample_t* dst, const audio_sample_t* 
src, nframes_t nframes, float gain)
 {
         for (nframes_t i = 0; i < nframes; i++) {
                 dst[i] += src[i] * gain;
         }
 }
 
-void default_mix_buffers_no_gain (audio_sample_t* dst, audio_sample_t* src, 
nframes_t nframes)
+void default_mix_buffers_no_gain (audio_sample_t* dst, const audio_sample_t* 
src, nframes_t nframes)
 {
         for (nframes_t i=0; i < nframes; i++) {
                 dst[i] += src[i];
         }
 }
+
+
+#if defined (__APPLE__) && defined (BUILD_VECLIB_OPTIMIZATIONS)
+#include <Accelerate/Accelerate.h>
+
+float veclib_compute_peak (const audio_sample_t* buf, nframes_t nsamples, 
float current)
+{ // VecLib counterpart of default_compute_peak
+       float tmpmax = 0.0f;
+       vDSP_maxmgv(buf, 1, &tmpmax, nsamples);
+       return f_max(current, tmpmax); // never lower than the peak passed in
+}
+
+void veclib_find_peaks (const audio_sample_t* buf, nframes_t nframes, float 
*min, float *max)
+{ // stores the vDSP_maxv result in *max and the vDSP_minv result in *min
+       vDSP_maxv (const_cast<audio_sample_t*>(buf), 1, max, nframes);
+       vDSP_minv (const_cast<audio_sample_t*>(buf), 1, min, nframes);
+}
+
+void veclib_apply_gain_to_buffer (audio_sample_t * buf, nframes_t nframes, 
float gain)
+{ // VecLib counterpart of default_apply_gain_to_buffer; buf is input and output
+       vDSP_vsmul(buf, 1, &gain, buf, 1, nframes);
+}
+
+void veclib_mix_buffers_with_gain (audio_sample_t * dst, const audio_sample_t 
* src, nframes_t nframes, float gain)
+{ // VecLib counterpart of default_mix_buffers_with_gain; dst is input and output
+       vDSP_vsma(src, 1, &gain, dst, 1, dst, 1, nframes);
+}
+
+void veclib_mix_buffers_no_gain (audio_sample_t * dst, const audio_sample_t * 
src, nframes_t nframes)
+{
+       // Same call as veclib_mix_buffers_with_gain, with the gain fixed at 1.0. NOTE(review): a plain vector add (vDSP_vadd) may do this directly - confirm.
+       float gain = 1.0f;
+       vDSP_vsma(src, 1, &gain, dst, 1, dst, 1, nframes);
+}
+
+#endif

Index: common/Mixer.h
===================================================================
RCS file: /sources/traverso/traverso/src/common/Mixer.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -b -r1.1 -r1.2
--- common/Mixer.h      20 Oct 2007 17:38:17 -0000      1.1
+++ common/Mixer.h      27 Oct 2007 17:57:15 -0000      1.2
@@ -17,11 +17,11 @@
     along with this program; if not, write to the Free Software
     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA.
  
-    $Id: Mixer.h,v 1.1 2007/10/20 17:38:17 r_sijrier Exp $
+    $Id: Mixer.h,v 1.2 2007/10/27 17:57:15 r_sijrier Exp $
 */
 
-#ifndef MIXER_H
-#define MIXER_H
+#ifndef TRAVERSO_MIXER_H
+#define TRAVERSO_MIXER_H
 
 #include "defines.h"
 #include <cmath>
@@ -60,32 +60,40 @@
 }
 
 
-float default_compute_peak                     (audio_sample_t*  buf, 
nframes_t nsamples, float current);
+float default_compute_peak                     (const audio_sample_t*  buf, 
nframes_t nsamples, float current);
 void  default_apply_gain_to_buffer             (audio_sample_t*  buf, 
nframes_t nframes, float gain);
-void  default_mix_buffers_with_gain            (audio_sample_t*  dst, 
audio_sample_t*  src, nframes_t nframes, float gain);
-void  default_mix_buffers_no_gain              (audio_sample_t*  dst, 
audio_sample_t*  src, nframes_t nframes);
+void  default_mix_buffers_with_gain            (audio_sample_t*  dst, const 
audio_sample_t*  src, nframes_t nframes, float gain);
+void  default_mix_buffers_no_gain              (audio_sample_t*  dst, const 
audio_sample_t*  src, nframes_t nframes);
 
-#if defined (SSE_OPTIMIZATIONS)
+
+#if defined (ARCH_X86) && defined (SSE_OPTIMIZATIONS)
 
 extern "C"
 {
         /* SSE functions */
-        float x86_sse_compute_peak             (audio_sample_t*  buf, 
nframes_t nsamples, float current);
+        float x86_sse_compute_peak             (const audio_sample_t*  buf, 
nframes_t nsamples, float current);
         void  x86_sse_apply_gain_to_buffer     (audio_sample_t*  buf, 
nframes_t nframes, float gain);
-        void  x86_sse_mix_buffers_with_gain    (audio_sample_t*  dst, 
audio_sample_t*  src, nframes_t nframes, float gain);
-        void  x86_sse_mix_buffers_no_gain      (audio_sample_t*  dst, 
audio_sample_t*  src, nframes_t nframes);
+        void  x86_sse_mix_buffers_with_gain    (audio_sample_t*  dst, const 
audio_sample_t*  src, nframes_t nframes, float gain);
+        void  x86_sse_mix_buffers_no_gain      (audio_sample_t*  dst, const 
audio_sample_t*  src, nframes_t nframes);
 }
 #endif
 
+#if defined (__APPLE__)  && defined (BUILD_VECLIB_OPTIMIZATIONS)
+
+float veclib_compute_peak              (const audio_sample_t* buf, nframes_t 
nsamples, float current);
+void  veclib_apply_gain_to_buffer      (audio_sample_t* buf, nframes_t 
nframes, float gain);
+void  veclib_mix_buffers_with_gain     (audio_sample_t* dst, const 
audio_sample_t* src, nframes_t nframes, float gain);
+void  veclib_mix_buffers_no_gain       (audio_sample_t* dst, const 
audio_sample_t* src, nframes_t nframes);
 
+#endif
 
 class Mixer
 {
 public:
-        typedef float (*compute_peak_t)                        
(audio_sample_t* , nframes_t, float);
+        typedef float (*compute_peak_t)                        (const 
audio_sample_t* , nframes_t, float);
         typedef void  (*apply_gain_to_buffer_t)                
(audio_sample_t* , nframes_t, float);
-        typedef void  (*mix_buffers_with_gain_t)       (audio_sample_t* , 
audio_sample_t* , nframes_t, float);
-        typedef void  (*mix_buffers_no_gain_t)         (audio_sample_t* , 
audio_sample_t* , nframes_t);
+        typedef void  (*mix_buffers_with_gain_t)       (audio_sample_t* , 
const audio_sample_t* , nframes_t, float);
+        typedef void  (*mix_buffers_no_gain_t)         (audio_sample_t* , 
const audio_sample_t* , nframes_t);
 
         static compute_peak_t          compute_peak;
         static apply_gain_to_buffer_t  apply_gain_to_buffer;

Index: core/core.pro
===================================================================
RCS file: /sources/traverso/traverso/src/core/core.pro,v
retrieving revision 1.43
retrieving revision 1.44
diff -u -b -r1.43 -r1.44
--- core/core.pro       22 Oct 2007 16:49:43 -0000      1.43
+++ core/core.pro       27 Oct 2007 17:57:15 -0000      1.44
@@ -57,7 +57,8 @@
        Marker.cpp \
        Themer.cpp \
        AudioFileMerger.cpp \
-       ProjectConverter.cpp
+       ProjectConverter.cpp \
+       fpu.cc
 HEADERS = precompile.h \
        ../common/Utils.h \
        ../common/Tsar.h \
@@ -96,7 +97,6 @@
        gdither_types.h \
        gdither_types_internal.h \
        noise.h \
-       FastDelegate.h \
        SnapList.h \
        Snappable.h \
        CommandPlugin.h \
@@ -104,7 +104,8 @@
        Marker.h \
        Themer.h \
        AudioFileMerger.h \
-       ProjectConverter.h
+       ProjectConverter.h \
+       fpu.h
 macx{
     QMAKE_LIBDIR += /usr/local/qt/lib
 }

Index: engine/engine.pro
===================================================================
RCS file: /sources/traverso/traverso/src/engine/engine.pro,v
retrieving revision 1.20
retrieving revision 1.21
diff -u -b -r1.20 -r1.21
--- engine/engine.pro   20 Oct 2007 17:38:19 -0000      1.20
+++ engine/engine.pro   27 Oct 2007 17:57:15 -0000      1.21
@@ -53,7 +53,14 @@
 }
 
 unix{
-    contains(DEFINES, SSE_OPTIMIZATIONS): SOURCES += sse_functions.S
+       contains(DEFINES, SSE_OPTIMIZATIONS) {
+               # base.pri sets USE_X86_64_ASM on x86_64 hosts and ARCH_X86 on i[456]86 hosts,
+               # so exactly one of the two assembly files is picked.
+               contains(DEFINES, USE_X86_64_ASM) {
+                       SOURCES += sse_functions_64bit.S
+               }
+               contains(DEFINES, ARCH_X86) {
+                       SOURCES += sse_functions.S
+               }
+       }
 }
 
 macx{

Index: traverso/Traverso.cpp
===================================================================
RCS file: /sources/traverso/traverso/src/traverso/Traverso.cpp,v
retrieving revision 1.47
retrieving revision 1.48
diff -u -b -r1.47 -r1.48
--- traverso/Traverso.cpp       31 Aug 2007 09:19:51 -0000      1.47
+++ traverso/Traverso.cpp       27 Oct 2007 17:57:16 -0000      1.48
@@ -35,6 +35,14 @@
 #include <ContextPointer.h>
 #include <Information.h>
 #include "defines.h"
+#include "fpu.h"
+#ifdef __SSE__
+#include <xmmintrin.h>
+#endif
+#if defined (__APPLE__)
+#include <Carbon/Carbon.h> // For Gestalt
+#endif
+
 
 // Always put me below _all_ includes, this is needed
 // in case we run with memory leak detection enabled!
@@ -100,6 +108,7 @@
        init_sse();
        
        QMetaObject::invokeMethod(this, "create_interface", 
Qt::QueuedConnection);
+//     create_interface();
        
        connect(this, SIGNAL(lastWindowClosed()), &pm(), SLOT(exit()));
 }
@@ -160,23 +169,13 @@
 {
        bool generic_mix_functions = true;
 
-#if defined (SSE_OPTIMIZATIONS)
+       FPU fpu;
 
-       unsigned int use_sse = 0;
+#if defined (ARCH_X86) && defined (SSE_OPTIMIZATIONS)
 
-       asm volatile (
-               "mov $1, %%eax\n"
-               "pushl %%ebx\n"
-               "cpuid\n"
-               "popl %%ebx\n"
-               "andl $33554432, %%edx\n"
-               "movl %%edx, %0\n"
-               : "=m" (use_sse)
-               :
-               : "%eax", "%ecx", "%edx", "memory");
+       if (fpu.has_sse()) {
 
-       if (use_sse) {
-               printf("Enabling SSE optimized routines\n");
+               printf("Using SSE optimized routines\n");
 
                // SSE SET
                Mixer::compute_peak             = x86_sse_compute_peak;
@@ -185,9 +184,31 @@
                Mixer::mix_buffers_no_gain      = x86_sse_mix_buffers_no_gain;
 
                generic_mix_functions = false;
+
+       }
+
+#elif defined (__APPLE__) && defined (BUILD_VECLIB_OPTIMIZATIONS)
+       long sysVersion = 0;
+
+       if (noErr != Gestalt(gestaltSystemVersion, &sysVersion))
+               sysVersion = 0;
+
+       if (sysVersion >= 0x00001040) { // Tiger at least
+               Mixer::compute_peak           = veclib_compute_peak;
+               Mixer::apply_gain_to_buffer   = veclib_apply_gain_to_buffer;
+               Mixer::mix_buffers_with_gain  = veclib_mix_buffers_with_gain;
+               Mixer::mix_buffers_no_gain    = veclib_mix_buffers_no_gain;
+
+               generic_mix_functions = false;
+
+               info << "Apple VecLib H/W specific optimizations in use" << 
endmsg;
        }
 #endif
 
+       /* consider FPU denormal handling to be "h/w optimization" */
+
+       setup_fpu ();
+
 
        if (generic_mix_functions) {
                Mixer::compute_peak             = default_compute_peak;
@@ -200,6 +221,68 @@
 
 }
 
+
+/**
+ * Program the SSE control/status register (MXCSR) so that denormal inputs
+ * are treated as zero when the CPU supports it.
+ *
+ * Does nothing when TRAVERSO_RUNNING_UNDER_VALGRIND is set in the
+ * environment, when the build has no xmmintrin based code path, or when the
+ * CPU offers neither flush-to-zero nor denormals-are-zero.
+ */
+void Traverso::setup_fpu()
+{
+
+       // export TRAVERSO_RUNNING_UNDER_VALGRIND to disable assembler stuff below!
+       if (getenv("TRAVERSO_RUNNING_UNDER_VALGRIND")) {
+               printf("TRAVERSO_RUNNING_UNDER_VALGRIND=TRUE\n");
+               // valgrind doesn't understand this assembler stuff
+               // September 10th, 2007
+               return;
+       }
+
+       // base.pri defines ARCH_X86 only for i[456]86; x86_64 builds are
+       // recognisable by USE_X86_64_ASM and provide the same intrinsics.
+#if (defined(ARCH_X86) || defined(USE_X86_64_ASM)) && defined(USE_XMMINTRIN)
+
+       FPU fpu;
+
+       // Nothing to configure when the CPU offers neither mode.
+       if (!fpu.has_flush_to_zero() && !fpu.has_denormals_are_zero()) {
+               return;
+       }
+
+       unsigned int MXCSR = _mm_getcsr();
+
+       // Only the "denormals are zero" model is applied; there is no
+       // configuration option yet to select one of the other models of the
+       // ardour original:
+       //   none:    clear both flush-to-zero and denormals-are-zero
+       //   FTZ:     set flush-to-zero if supported
+       //   FTZ+DAZ: set flush-to-zero if supported, plus DAZ if supported
+       //
+       // _MM_FLUSH_ZERO_ON is MXCSR bit 15 (0x8000). Denormals-are-zero is
+       // MXCSR bit 6 (0x0040), the same bit FPU tests in the MXCSR mask.
+       MXCSR &= ~_MM_FLUSH_ZERO_ON;
+       if (fpu.has_denormals_are_zero()) {
+               MXCSR |= 0x0040;
+       }
+
+       _mm_setcsr (MXCSR);
+
+#endif
+}
+
+
 void Traverso::prepare_audio_device( )
 {
        int rate = config().get_property("Hardware", "samplerate", 
44100).toInt();

Index: traverso/Traverso.h
===================================================================
RCS file: /sources/traverso/traverso/src/traverso/Traverso.h,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -b -r1.9 -r1.10
--- traverso/Traverso.h 1 May 2007 21:11:42 -0000       1.9
+++ traverso/Traverso.h 27 Oct 2007 17:57:16 -0000      1.10
@@ -17,7 +17,7 @@
     along with this program; if not, write to the Free Software
     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA.
 
-    $Id: Traverso.h,v 1.9 2007/05/01 21:11:42 r_sijrier Exp $
+    $Id: Traverso.h,v 1.10 2007/10/27 17:57:16 r_sijrier Exp $
 */
 
 #ifndef Traverso_H
@@ -44,6 +44,7 @@
 
 private :
        void init_sse();
+       void setup_fpu();
         void prepare_audio_device();
        
 private slots:

Index: core/fpu.cc
===================================================================
RCS file: core/fpu.cc
diff -N core/fpu.cc
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ core/fpu.cc 27 Oct 2007 17:57:15 -0000      1.1
@@ -0,0 +1,115 @@
+/*
+Copyright (C) 2007 Remon Sijrier
+
+Copyright (C) 2000-2007 Paul Davis 
+
+This file is part of Traverso
+
+Traverso is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA.
+
+*/
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#include <fpu.h>
+
+#include "Debugger.h"
+
+/**
+ * Probe the host CPU once and record which SSE related features it offers.
+ *
+ * CPUID leaf 1 (EDX) supplies bit 24, which gates the FXSAVE probe, and the
+ * SSE (bit 25) and SSE2 (bit 26) feature bits. When bit 24 is set, an FXSAVE image is
+ * taken to read the MXCSR mask at byte offset 28; bit 6 of that mask tells
+ * whether denormals-are-zero is supported. On builds for other architectures
+ * the flags stay empty.
+ */
+FPU::FPU ()
+{
+       unsigned long cpuflags = 0;
+
+       _flags = Flags (0);
+
+       /* base.pri defines ARCH_X86 for i[456]86 and USE_X86_64_ASM for x86_64.
+          Anything else must not even compile the x86 instructions below. */
+#if !defined (ARCH_X86) && !defined (USE_X86_64_ASM)
+       return;
+#else
+
+       /* cpuid overwrites %ebx/%rbx, which is not in the clobber list, so it
+          is saved around the instruction. It is restored BEFORE the result is
+          copied out, because the compiler is free to choose that very
+          register for operand 0. */
+#ifndef USE_X86_64_ASM
+       asm volatile (
+               "mov $1, %%eax\n"
+               "pushl %%ebx\n"
+               "cpuid\n"
+               "popl %%ebx\n"
+               "movl %%edx, %0\n"
+               : "=r" (cpuflags)
+               :
+               : "%eax", "%ecx", "%edx", "memory"
+               );
+
+#else
+
+       asm volatile (
+               "pushq %%rbx\n"
+               "movq $1, %%rax\n"
+               "cpuid\n"
+               "popq %%rbx\n"
+               "movq %%rdx, %0\n"
+               : "=r" (cpuflags)
+               :
+               : "%rax", "%rcx", "%rdx", "memory"
+               );
+
+#endif /* USE_X86_64_ASM */
+
+       if (cpuflags & (1<<25)) {
+               _flags = Flags (_flags | (HasSSE|HasFlushToZero));
+       }
+
+       if (cpuflags & (1<<26)) {
+               _flags = Flags (_flags | HasSSE2);
+       }
+
+       if (cpuflags & (1 << 24)) {
+
+               /* fxsave needs a 512 byte save area on a 16 byte boundary */
+               char* fxraw = 0; /* what the allocator returned, handed to free() */
+               char* fxbuf = 0; /* 16 byte aligned start of the save area */
+
+#ifdef NO_POSIX_MEMALIGN
+               /* malloc makes no 16 byte promise: over-allocate and round up */
+               if ((fxraw = (char *) malloc(512 + 15)) != 0) {
+                       fxbuf = fxraw + ((16 - ((uintptr_t) fxraw & 15)) & 15);
+               }
+#else
+               if (posix_memalign ((void**)&fxraw, 16, 512)) {
+                       fxraw = 0;
+               }
+               fxbuf = fxraw;
+#endif
+               if (fxbuf == 0) {
+                       PERROR("cannot allocate 16 byte aligned buffer for h/w feature detection");
+               } else {
+
+                       asm volatile (
+                               "fxsave (%0)"
+                               :
+                               : "r" (fxbuf)
+                               : "memory"
+                               );
+
+                       uint32_t mxcsr_mask = *((uint32_t*) &fxbuf[28]);
+
+                       /* if the mask is zero, set its default value (from intel specs) */
+
+                       if (mxcsr_mask == 0) {
+                               mxcsr_mask = 0xffbf;
+                       }
+
+                       if (mxcsr_mask & (1<<6)) {
+                               _flags = Flags (_flags | HasDenormalsAreZero);
+                       }
+
+                       free (fxraw);
+               }
+       }
+
+#endif /* ARCH_X86 || USE_X86_64_ASM */
+}
+
+FPU::~FPU ()
+{ /* nothing to release: the constructor frees its probe buffer itself */
+}

Index: core/fpu.h
===================================================================
RCS file: core/fpu.h
diff -N core/fpu.h
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ core/fpu.h  27 Oct 2007 17:57:15 -0000      1.1
@@ -0,0 +1,50 @@
+/*
+Copyright (C) 2007 Remon Sijrier
+
+Copyright (C) 2000-2007 Paul Davis 
+
+This file is part of Traverso
+
+Traverso is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA.
+
+*/
+
+#ifndef __pbd_fpu_h__
+#define __pbd_fpu_h__
+
+
+class FPU { /* SSE related capabilities of the host CPU, probed once by the constructor */
+  private:
+       enum Flags { /* capability bits, OR-ed together in _flags */
+               HasFlushToZero = 0x1, /* set together with HasSSE (CPUID leaf 1, EDX bit 25) */
+               HasDenormalsAreZero = 0x2, /* set when bit 6 of the FXSAVE MXCSR mask is set */
+               HasSSE = 0x4, /* CPUID leaf 1, EDX bit 25 */
+               HasSSE2 = 0x8 /* CPUID leaf 1, EDX bit 26 */
+       };
+
+  public:
+       FPU (); /* runs the feature detection implemented in fpu.cc */
+       ~FPU ();
+
+       bool has_flush_to_zero () const { return _flags & HasFlushToZero; } /* each accessor reports whether its Flags bit is set */
+       bool has_denormals_are_zero () const { return _flags & 
HasDenormalsAreZero; }
+       bool has_sse () const { return _flags & HasSSE; }
+       bool has_sse2 () const { return _flags & HasSSE2; }
+       
+  private:
+       Flags _flags; /* written by the constructor only; the accessors are all const */
+};
+
+#endif /* __pbd_fpu_h__ */

Index: engine/sse_functions_64bit.S
===================================================================
RCS file: engine/sse_functions_64bit.S
diff -N engine/sse_functions_64bit.S
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ engine/sse_functions_64bit.S        27 Oct 2007 17:57:15 -0000      1.1
@@ -0,0 +1,609 @@
+/*
+    Copyright (C) 2005-2006 Paul Davis, John Rigg
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+       Author: Sampo Savolainen
+       64-bit conversion: John Rigg
+
+    $Id: sse_functions_64bit.S,v 1.1 2007/10/27 17:57:15 r_sijrier Exp $
+*/
+
+
+#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
+#;
+#; dst[i] += src[i] * gain for every i in [0, nframes).
+#; GAS / AT&T syntax, System V AMD64 calling convention.
+#;
+#; In:       %rdi = dst, %rsi = src, %edx = nframes, %xmm0 = gain
+#; Out:      nothing
+#; Modifies: %rax, %rdx, %xmm0, %xmm1, flags (%rbx, %rdi, %rsi are restored)
+#;
+#; Strategy: when dst and src sit at the same offset from a 16 byte boundary,
+#; 1-3 scalar frames bring both up to the boundary and then four frames per
+#; iteration are mixed with aligned packed loads/stores. Every other frame is
+#; mixed one at a time.
+
+.globl x86_sse_mix_buffers_with_gain
+       .type   x86_sse_mix_buffers_with_gain,@function
+
+x86_sse_mix_buffers_with_gain:
+
+#; %rdi float  *dst
+#; %rsi float  *src
+#; %rdx unsigned int nframes
+#; %xmm0 float gain
+
+       pushq %rbp
+       movq %rsp, %rbp
+
+       #; save the registers
+       pushq %rbx
+       pushq %rdi
+       pushq %rsi
+
+       #; nframes is a 32 bit unsigned int: the ABI leaves bits 63..32 of
+       #; %rdx undefined, so clear them before the whole register is used
+       #; as the frame counter
+       movl %edx, %edx
+
+       #; if nframes == 0, go to end
+       cmp     $0, %rdx
+       je      .MBWG_END
+
+       #; Check for alignment
+
+       movq %rdi, %rax
+       andq $12, %rax #; dst offset from a 16 byte boundary: 0, 4, 8 or 12
+
+       movq %rsi, %rbx
+       andq $12, %rbx #; src offset from a 16 byte boundary: 0, 4, 8 or 12
+
+       cmp %rax, %rbx
+       jne .MBWG_NONALIGN #; offsets differ: no common alignment, go scalar
+
+       #; same offset: if it is zero both buffers are already aligned
+       cmp $0, %rbx
+       jz .MBWG_SSE
+
+       #; Pre-loop, we need to run 1-3 frames "manually" without
+       #; SSE instructions
+
+.MBWG_PRELOOP:
+
+       #; gain is already in %xmm0
+       movss (%rsi), %xmm1
+       mulss %xmm0, %xmm1
+       addss (%rdi), %xmm1
+       movss %xmm1, (%rdi)
+
+       addq $4, %rdi #; dst++
+       addq $4, %rsi #; src++
+       decq %rdx         #; nframes--
+       jz .MBWG_END
+
+       addq $4, %rbx
+
+       cmp $16, %rbx #; test if we've reached 16 byte alignment
+       jne .MBWG_PRELOOP
+
+
+.MBWG_SSE:
+
+       cmp $4, %rdx #; we know it's not zero, but if it's not >=4, then
+       jnge .MBWG_NONALIGN #; we jump straight to the "normal" code
+
+       #; gain is already in %xmm0: copy it into all four lanes
+       shufps  $0x00, %xmm0, %xmm0
+
+
+.MBWG_SSELOOP:
+
+       movaps  (%rsi), %xmm1 #; source => xmm1
+       mulps   %xmm0,  %xmm1 #; apply gain to source
+       addps   (%rdi), %xmm1 #; mix with destination
+       movaps  %xmm1, (%rdi) #; copy result to destination
+
+       addq $16, %rdi #; dst+=4
+       addq $16, %rsi #; src+=4
+
+       subq $4, %rdx #; nframes-=4
+       cmp $4, %rdx
+       jge .MBWG_SSELOOP
+
+       cmp $0, %rdx
+       je .MBWG_END
+
+       #; if there are remaining frames, the nonalign code will do nicely
+       #; for the rest 1-3 frames.
+
+.MBWG_NONALIGN:
+       #; not aligned!
+
+       #; gain is already in %xmm0 (only the low lane is used here)
+
+.MBWG_NONALIGNLOOP:
+
+       movss (%rsi), %xmm1
+       mulss %xmm0, %xmm1
+       addss (%rdi), %xmm1
+       movss %xmm1, (%rdi)
+
+       addq $4, %rdi
+       addq $4, %rsi
+
+       decq %rdx
+       jnz .MBWG_NONALIGNLOOP
+
+.MBWG_END:
+
+       popq %rsi
+       popq %rdi
+       popq %rbx
+
+       #; return
+       leave
+       ret
+
+.size  x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
+
+
+#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
+#;
+#; dst[i] += src[i] for every i in [0, nframes).
+#; GAS / AT&T syntax, System V AMD64 calling convention.
+#;
+#; In:       %rdi = dst, %rsi = src, %edx = nframes
+#; Out:      nothing
+#; Modifies: %rax, %rdx, %xmm0, flags (%rbx, %rdi, %rsi are restored)
+#;
+#; Strategy: when dst and src sit at the same offset from a 16 byte boundary,
+#; 1-3 scalar frames bring both up to the boundary and then four frames per
+#; iteration are mixed with aligned packed loads/stores. Every other frame is
+#; mixed one at a time.
+
+.globl x86_sse_mix_buffers_no_gain
+       .type   x86_sse_mix_buffers_no_gain,@function
+
+x86_sse_mix_buffers_no_gain:
+
+#; %rdi float *dst
+#; %rsi float *src
+#; %rdx unsigned int nframes
+
+       pushq %rbp
+       movq %rsp, %rbp
+
+       #; save the registers
+       pushq %rbx
+       pushq %rdi
+       pushq %rsi
+
+       #; the real function
+
+       #; nframes is a 32 bit unsigned int: the ABI leaves bits 63..32 of
+       #; %rdx undefined, so clear them before the whole register is used
+       #; as the frame counter
+       movl %edx, %edx
+
+       #; if nframes == 0, go to end
+       cmp     $0, %rdx
+       je      .MBNG_END
+
+       #; Check for alignment
+
+       movq %rdi, %rax
+       andq $12, %rax #; dst offset from a 16 byte boundary: 0, 4, 8 or 12
+
+       movq %rsi, %rbx
+       andq $12, %rbx #; src offset from a 16 byte boundary: 0, 4, 8 or 12
+
+       cmp %rax, %rbx
+       jne .MBNG_NONALIGN #; offsets differ: no common alignment, go scalar
+
+       #; same offset: if it is zero both buffers are already aligned
+       cmp $0, %rbx
+       je .MBNG_SSE
+
+       #; Pre-loop, we need to run 1-3 frames "manually" without
+       #; SSE instructions
+
+.MBNG_PRELOOP:
+
+       movss (%rsi), %xmm0
+       addss (%rdi), %xmm0
+       movss %xmm0, (%rdi)
+
+       addq $4, %rdi #; dst++
+       addq $4, %rsi #; src++
+       decq %rdx         #; nframes--
+       jz      .MBNG_END
+       addq $4, %rbx
+
+       cmp $16, %rbx #; test if we've reached 16 byte alignment
+       jne .MBNG_PRELOOP
+
+.MBNG_SSE:
+
+       cmp $4, %rdx #; if there are frames left, but less than 4
+       jnge .MBNG_NONALIGN #; we can't run SSE
+
+.MBNG_SSELOOP:
+
+       movaps  (%rsi), %xmm0 #; source => xmm0
+       addps   (%rdi), %xmm0 #; mix with destination
+       movaps  %xmm0, (%rdi) #; copy result to destination
+
+       addq $16, %rdi #; dst+=4
+       addq $16, %rsi #; src+=4
+
+       subq $4, %rdx #; nframes-=4
+       cmp $4, %rdx
+       jge .MBNG_SSELOOP
+
+       cmp $0, %rdx
+       je .MBNG_END
+
+       #; if there are remaining frames, the nonalign code will do nicely
+       #; for the rest 1-3 frames.
+
+.MBNG_NONALIGN:
+       #; not aligned! (this label is also the head of the scalar loop)
+
+       movss (%rsi), %xmm0 #; src => xmm0
+       addss (%rdi), %xmm0 #; xmm0 += dst
+       movss %xmm0, (%rdi) #; xmm0 => dst
+
+       addq $4, %rdi
+       addq $4, %rsi
+
+       decq %rdx
+       jnz .MBNG_NONALIGN
+
+.MBNG_END:
+
+       popq %rsi
+       popq %rdi
+       popq %rbx
+
+       #; return
+       leave
+       ret
+
+.size  x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
+
+
+#; void x86_sse_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
+#;
+#; Scales buf[0 .. nframes-1] in place: buf[i] *= gain.
+#;
+#; ABI:    System V AMD64 (AT&T syntax)
+#; In:     %rdi  = buf; only address bits 2-3 are examined for alignment,
+#;                 so buf is assumed to be at least 4-byte aligned
+#;         %esi  = nframes (32-bit argument)
+#;         %xmm0 = gain in the low lane
+#; Out:    none
+#; Clobb:  %rax, %rcx, %rdx, %xmm0, %xmm1, flags (%rdi is restored)
+
+.globl x86_sse_apply_gain_to_buffer
+       .type   x86_sse_apply_gain_to_buffer,@function
+
+x86_sse_apply_gain_to_buffer:
+
+       pushq %rbp
+       movq %rsp, %rbp
+
+       #; %rdi is advanced through the buffer; keep the caller's value
+       pushq %rdi
+
+       #; nframes is a 32-bit argument, so only %esi is defined by the
+       #; caller. The 32-bit move zero-extends it into %rcx instead of
+       #; trusting whatever is in the upper half of %rsi.
+       movl %esi, %ecx #; %rcx = frames left to process
+       testq %rcx, %rcx
+       jz      .AG_END #; nframes == 0: nothing to do
+
+       #; broadcast gain to all four lanes of %xmm0
+       shufps  $0x00, %xmm0, %xmm0
+
+       #; %rdx = offset of buf inside its 16 byte line: 0, 4, 8 or 12
+       movq %rdi, %rdx
+       andq $12, %rdx
+       jz      .AG_SSE #; buf is already 16 byte aligned
+
+       #; PRE-LOOP
+       #; runs 1-3 times with scalar SSE multiplies until "buf" (=%rdi)
+       #; reaches a 16 byte boundary or the frames run out
+
+.AGLP_START:
+
+       movss (%rdi), %xmm1
+       mulss %xmm0, %xmm1
+       movss %xmm1, (%rdi)
+
+       addq $4, %rdi #; buf++
+
+       decq %rcx   #; nframes--
+       jz      .AG_END #; ran out of frames before reaching alignment
+
+       addq $4, %rdx #; offset of the next float within the 16 byte line
+       cmp $16, %rdx
+       jne .AGLP_START #; not on a 16 byte boundary yet
+
+.AG_SSE:
+
+       #; "buf" (=%rdi) is 16 byte aligned here and %rcx > 0
+
+       movq %rcx, %rax
+       shrq $2, %rax #; %rax = number of full 4-float iterations
+       jz .AGPOST_START #; fewer than 4 frames left: scalar tail only
+
+.AGLP_SSE:
+
+       movaps (%rdi), %xmm1
+       mulps %xmm0, %xmm1
+       movaps %xmm1, (%rdi)
+
+       addq $16, %rdi #; buf += 4
+
+       decq %rax
+       jnz .AGLP_SSE
+
+       #; the frames the SSE loop could not cover are nframes % 4
+       andq $3, %rcx
+       jz .AG_END
+
+.AGPOST_START:
+
+       #; scalar tail, entered with 1-3 frames left in %rcx
+
+       movss (%rdi), %xmm1
+       mulss %xmm0, %xmm1
+       movss %xmm1, (%rdi)
+
+       addq $4, %rdi #; buf++
+
+       decq %rcx   #; nframes--
+       jnz     .AGPOST_START #; loop while frames remain
+
+.AG_END:
+
+       popq %rdi
+
+       #; return
+       leave
+       ret
+
+.size  x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
+#; end proc
+
+
+#; x86_sse_apply_gain_vector(float *buf, float *gain_vector, unsigned int nframes)
+#;
+#; Multiplies buf element-wise by gain_vector, in place:
+#; buf[i] *= gain_vector[i] for i in 0 .. nframes-1.
+#;
+#; The packed SSE loop is only used when both pointers have the same
+#; offset within a 16 byte line; otherwise every frame goes through the
+#; scalar loop.
+#;
+#; ABI:    System V AMD64 (AT&T syntax)
+#; In:     %rdi = buf
+#;         %rsi = gain_vector
+#;         %edx = nframes (32-bit argument)
+#; Out:    none (buf is modified in place)
+#; Clobb:  %rax, %rdx, %xmm0, %xmm1, flags
+#;         (%rdi, %rsi and %rbx are saved and restored)
+
+.globl x86_sse_apply_gain_vector
+        .type   x86_sse_apply_gain_vector,@function
+
+x86_sse_apply_gain_vector:
+
+       pushq %rbp
+       movq %rsp, %rbp
+
+       #; Save registers
+       pushq %rdi
+       pushq %rsi
+       pushq %rbx
+
+       #; nframes is a 32-bit argument, so only %edx is defined by the
+       #; caller. The 32-bit move zero-extends it so the 64-bit counter
+       #; below does not depend on the upper half of %rdx.
+       movl %edx, %edx
+       testq %rdx, %rdx
+       jz .AGA_END #; nframes == 0: nothing to do
+
+       #; Check alignment: %rax / %rbx = offset of buf / gain_vector
+       #; within a 16 byte line (0, 4, 8 or 12)
+       movq %rdi, %rax
+       andq $12, %rax
+
+       movq %rsi, %rbx
+       andq $12, %rbx
+
+       cmp %rax,%rbx
+       jne .AGA_ENDLOOP #; different offsets: scalar loop for everything
+
+       testq %rax, %rax
+       jz .AGA_SSE #; if buffers are aligned, jump to the SSE loop
+
+#; Buffers aren't 16 byte aligned, but they are unaligned by the same amount
+.AGA_ALIGNLOOP:
+
+       movss (%rdi), %xmm0 #; buf => xmm0
+       movss (%rsi), %xmm1 #; gain value => xmm1
+       mulss %xmm1, %xmm0  #; xmm1 * xmm0 => xmm0
+       movss %xmm0, (%rdi) #; signal with gain => buf
+
+       decq %rdx
+       jz .AGA_END
+
+       addq $4, %rdi #; buf++
+       addq $4, %rsi #; gain_vector++
+
+       addq $4, %rax #; offset of the next float within the 16 byte line
+       cmp $16, %rax
+       jne .AGA_ALIGNLOOP
+
+#; There are frames left for sure, as that is checked in the beginning
+#; and within the previous loop. BUT, there might be less than 4 frames
+#; to process
+
+.AGA_SSE:
+       movq %rdx, %rax #; nframes => %rax
+       shrq $2, %rax #; unsigned divide by 4; sets ZF when the result is 0
+       jz .AGA_ENDLOOP #; fewer than 4 frames: scalar loop only
+
+.AGA_SSELOOP:
+       movaps (%rdi), %xmm0
+       movaps (%rsi), %xmm1
+       mulps %xmm1, %xmm0
+       movaps %xmm0, (%rdi)
+
+       addq $16, %rdi
+       addq $16, %rsi
+
+       decq %rax
+       jnz .AGA_SSELOOP
+
+       andq $3, %rdx #; Remaining frames are nframes & 3
+       jz .AGA_END
+
+
+#; Inside this loop, we know there are frames left to process
+#; but because either there are < 4 frames left, or the buffers
+#; are not aligned, we can't use the parallel SSE ops
+.AGA_ENDLOOP:
+       movss (%rdi), %xmm0 #; buf => xmm0
+       movss (%rsi), %xmm1 #; gain value => xmm1
+       mulss %xmm1, %xmm0  #; xmm1 * xmm0 => xmm0
+       movss %xmm0, (%rdi) #; signal with gain => buf
+
+       addq $4,%rdi
+       addq $4,%rsi
+       decq %rdx #; nframes--
+       jnz .AGA_ENDLOOP
+
+.AGA_END:
+
+       popq %rbx
+       popq %rsi
+       popq %rdi
+
+       leave
+       ret
+
+.size  x86_sse_apply_gain_vector, .-x86_sse_apply_gain_vector
+#; end proc
+
+
+#; float x86_sse_compute_peak(float *buf, long nframes, float current);
+#;
+#; Folds the absolute value of every float in buf[0 .. nframes-1] into
+#; "current" with maxss/maxps and returns the result in the low lane of
+#; %xmm0. With nframes == 0, %xmm0 is returned untouched.
+#;
+#; ABI:    System V AMD64 (AT&T syntax)
+#; In:     %rdi  = buf; only address bits 2-3 are examined for alignment
+#;         %rsi  = nframes, read as a full 64-bit value
+#;         %xmm0 = current (low lane)
+#; Out:    %xmm0 = peak (low lane)
+#; Clobb:  %rax, %rcx, %rdx, %xmm1, %xmm2, flags (%rdi is restored)
+#;
+#; NOTE(review): earlier notes on this routine described nframes as
+#; "unsigned int" while the prototype above says "long". Confirm against
+#; the C declaration; a 32-bit argument would not define the upper half
+#; of %rsi.
+
+.globl x86_sse_compute_peak
+       .type   x86_sse_compute_peak,@function
+
+
+x86_sse_compute_peak:
+
+       pushq   %rbp
+       movq    %rsp, %rbp
+
+       #; %rdi walks the buffer; the caller's value is restored on exit
+       pushq   %rdi
+
+       #; %rcx counts the frames still to be examined
+       movq    %rsi, %rcx
+       testq   %rcx, %rcx
+       jz      .CP_END
+
+       #; build the "abs" mask (sign bit clear) in all lanes of %xmm2,
+       #; bouncing the constant through a temporary stack slot
+       pushq   $0x7fffffff
+       movss   (%rsp), %xmm2
+       addq    $8, %rsp
+       shufps  $0x00, %xmm2, %xmm2
+
+       #; %rdx = offset of buf inside its 16 byte line: 0, 4, 8 or 12
+       movq    %rdi, %rdx
+       andq    $12, %rdx
+       jz      .CP_SSE         #; already on a 16 byte boundary
+
+       #; Scalar lead-in: 1-3 single-float steps until %rdi reaches a
+       #; 16 byte boundary or the frames run out
+
+.CP_PRELOOP:
+
+       movss   (%rdi), %xmm1   #; next sample
+       andps   %xmm2, %xmm1    #; clear its sign bit
+       maxss   %xmm1, %xmm0    #; fold into the running peak
+
+       addq    $4, %rdi        #; buf++
+
+       decq    %rcx            #; nframes--
+       jz      .CP_END         #; buffer exhausted during the lead-in
+
+       addq    $4, %rdx        #; offset of the next float in the line
+       cmpq    $16, %rdx
+       jne     .CP_PRELOOP     #; still short of the boundary
+
+.CP_SSE:
+
+       #; %rdi is 16 byte aligned from here on and %rcx > 0
+
+       movq    %rcx, %rax
+       shrq    $2, %rax        #; %rax = count of 4-float iterations
+       jz      .CP_TAIL        #; fewer than 4 frames: scalar tail only
+
+       #; replicate the running peak into all four lanes
+       shufps  $0x00, %xmm0, %xmm0
+
+       #;prefetcht0 16(%rdi)
+
+.CP_SSELOOP:
+
+       movaps  (%rdi), %xmm1   #; four samples
+       andps   %xmm2, %xmm1    #; absolute values
+       maxps   %xmm1, %xmm0    #; lane-wise peak
+
+       addq    $16, %rdi       #; buf += 4
+
+       decq    %rax
+       jnz     .CP_SSELOOP
+
+       #; Horizontal reduction of the four lane peaks held in %xmm0
+       movaps  %xmm0, %xmm1
+       shufps  $0x4e, %xmm1, %xmm1 #; swap the two pairs (1234 => 3412)
+       maxps   %xmm1, %xmm0        #; per-pair maximum
+       movaps  %xmm0, %xmm1
+       shufps  $0xb1, %xmm1, %xmm1 #; swap within each pair (1234 => 2143)
+       maxps   %xmm1, %xmm0
+
+       #; all four lanes of %xmm0 now hold the same peak value
+
+       #; frames the packed loop could not cover: nframes % 4
+       andq    $3, %rcx
+       jz      .CP_END
+
+.CP_TAIL:
+
+       #; scalar tail, entered with 1-3 frames left in %rcx
+
+       movss   (%rdi), %xmm1
+       andps   %xmm2, %xmm1
+       maxss   %xmm1, %xmm0
+
+       addq    $4, %rdi        #; buf++
+
+       decq    %rcx            #; nframes--
+       jnz     .CP_TAIL
+
+.CP_END:
+
+       popq    %rdi
+
+       #; return; the result is already in %xmm0
+       leave
+       ret
+
+.size  x86_sse_compute_peak, .-x86_sse_compute_peak
+#; end proc
+
+#; An empty .note.GNU-stack section tells the GNU linker that this object
+#; does not need an executable stack. The section only exists on ELF
+#; targets, hence the preprocessor guard.
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
+




reply via email to

[Prev in Thread] Current Thread [Next in Thread]