[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Traverso-commit] traverso/src audiofileio/audiofileio.pro base.p...
From: |
Remon Sijrier |
Subject: |
[Traverso-commit] traverso/src audiofileio/audiofileio.pro base.p... |
Date: |
Sat, 27 Oct 2007 17:57:16 +0000 |
CVSROOT: /sources/traverso
Module name: traverso
Changes by: Remon Sijrier <r_sijrier> 07/10/27 17:57:16
Modified files:
src/audiofileio: audiofileio.pro
src : base.pri
src/common : Mixer.cpp Mixer.h
src/core : core.pro
src/engine : engine.pro
src/traverso : Traverso.cpp Traverso.h
Added files:
src/core : fpu.cc fpu.h
src/engine : sse_functions_64bit.S
Log message:
* sync sse based code to ardour2's version
CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/traverso/src/audiofileio/audiofileio.pro?cvsroot=traverso&r1=1.15&r2=1.16
http://cvs.savannah.gnu.org/viewcvs/traverso/src/base.pri?cvsroot=traverso&r1=1.47&r2=1.48
http://cvs.savannah.gnu.org/viewcvs/traverso/src/common/Mixer.cpp?cvsroot=traverso&r1=1.1&r2=1.2
http://cvs.savannah.gnu.org/viewcvs/traverso/src/common/Mixer.h?cvsroot=traverso&r1=1.1&r2=1.2
http://cvs.savannah.gnu.org/viewcvs/traverso/src/core/core.pro?cvsroot=traverso&r1=1.43&r2=1.44
http://cvs.savannah.gnu.org/viewcvs/traverso/src/core/fpu.cc?cvsroot=traverso&rev=1.1
http://cvs.savannah.gnu.org/viewcvs/traverso/src/core/fpu.h?cvsroot=traverso&rev=1.1
http://cvs.savannah.gnu.org/viewcvs/traverso/src/engine/engine.pro?cvsroot=traverso&r1=1.20&r2=1.21
http://cvs.savannah.gnu.org/viewcvs/traverso/src/engine/sse_functions_64bit.S?cvsroot=traverso&rev=1.1
http://cvs.savannah.gnu.org/viewcvs/traverso/src/traverso/Traverso.cpp?cvsroot=traverso&r1=1.47&r2=1.48
http://cvs.savannah.gnu.org/viewcvs/traverso/src/traverso/Traverso.h?cvsroot=traverso&r1=1.9&r2=1.10
Patches:
Index: audiofileio/audiofileio.pro
===================================================================
RCS file: /sources/traverso/traverso/src/audiofileio/audiofileio.pro,v
retrieving revision 1.15
retrieving revision 1.16
diff -u -b -r1.15 -r1.16
--- audiofileio/audiofileio.pro 20 Oct 2007 17:38:16 -0000 1.15
+++ audiofileio/audiofileio.pro 27 Oct 2007 17:57:15 -0000 1.16
@@ -46,7 +46,3 @@
INCLUDEPATH += ../../3thparty/include .
}
-HEADERS -= PeakDataReader.h \
-decode/PeakDataReader.h
-SOURCES -= decode/PeakDataReader.cpp
-
Index: base.pri
===================================================================
RCS file: /sources/traverso/traverso/src/base.pri,v
retrieving revision 1.47
retrieving revision 1.48
diff -u -b -r1.47 -r1.48
--- base.pri 26 Sep 2007 16:32:52 -0000 1.47
+++ base.pri 27 Oct 2007 17:57:15 -0000 1.48
@@ -20,7 +20,7 @@
DEFINES += JACK_SUPPORT
DEFINES += ALSA_SUPPORT
-#DEFINES += PORTAUDIO_SUPPORT
+DEFINES += PORTAUDIO_SUPPORT
DEFINES += LV2_SUPPORT
DEFINES += QT_OPENGL_SUPPORT
@@ -49,7 +49,7 @@
#################################################
-# DEFINES += STATIC_BUILD
+DEFINES += STATIC_BUILD
#
# Use Memory Locking
@@ -97,15 +97,47 @@
MACHINETYPE = $$system(arch)
- contains( MACHINETYPE, x86_64 ) {
- QMAKE_CXXFLAGS_RELEASE += -mtune=athlon64
- }
+ X86_FLAGS = $$system(cat /proc/cpuinfo | grep '^flags')
+
+ HOST_SUPPORTS_SSE = 0
- contains( MACHINETYPE, i[456]86) {
+ contains(X86_FLAGS, sse) {
+ HOST_SUPPORTS_SSE = 1
DEFINES += SSE_OPTIMIZATIONS
+ }
+
+ contains(X86_FLAGS, mmx) {
+ QMAKE_CXXFLAGS_RELEASE += -mmmx
+ }
+
+ contains(X86_FLAGS, 3dnow) {
+ QMAKE_CXXFLAGS_RELEASE += -m3dnow
+ }
+
+ contains(MACHINETYPE, i586) {
+ QMAKE_CXXFLAGS_RELEASE += -march=i586
+ }
+
+ contains(MACHINETYPE, i686) {
+ QMAKE_CXXFLAGS_RELEASE += -march=i686
+ eval(HOST_SUPPORTS_SSE == 1) {
QMAKE_CXXFLAGS_RELEASE += -msse -mfpmath=sse
+ DEFINES += USE_XMMINTRIN
+ }
+ }
+
+ contains(MACHINETYPE, x86_64) {
+ eval(HOST_SUPPORTS_SSE == 1) {
+ QMAKE_CXXFLAGS_RELEASE += -msse -mfpmath=sse
+ DEFINES += USE_XMMINTRIN USE_X86_64_ASM
+ }
}
+ contains(MACHINETYPE, i[456]86) {
+ DEFINES += ARCH_X86
+ }
+
+
}
GCCVERSION = $$system(gcc -dumpversion)
@@ -138,6 +170,9 @@
QMAKE_LFLAGS_SONAME = -Wl,-install_name,@executable_path/../Frameworks/
RC_FILE = ../../resources/images/traverso_mac.icns
+
+# Uncomment if the destination target is (at least) Tiger (might work on other targets as well?)
+# DEFINES += BUILD_VECLIB_OPTIMIZATIONS
}
win32 {
Index: common/Mixer.cpp
===================================================================
RCS file: /sources/traverso/traverso/src/common/Mixer.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -b -r1.1 -r1.2
--- common/Mixer.cpp 20 Oct 2007 17:38:16 -0000 1.1
+++ common/Mixer.cpp 27 Oct 2007 17:57:15 -0000 1.2
@@ -17,7 +17,7 @@
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- $Id: Mixer.cpp,v 1.1 2007/10/20 17:38:16 r_sijrier Exp $
+ $Id: Mixer.cpp,v 1.2 2007/10/27 17:57:15 r_sijrier Exp $
*/
#include "Mixer.h"
@@ -31,7 +31,7 @@
-float default_compute_peak (audio_sample_t* buf, nframes_t nsamples, float
current)
+float default_compute_peak (const audio_sample_t* buf, nframes_t nsamples,
float current)
{
for (nframes_t i = 0; i < nsamples; ++i) {
current = f_max (current, fabsf (buf[i]));
@@ -46,16 +46,52 @@
buf[i] *= gain;
}
-void default_mix_buffers_with_gain (audio_sample_t* dst, audio_sample_t* src,
nframes_t nframes, float gain)
+void default_mix_buffers_with_gain (audio_sample_t* dst, const audio_sample_t*
src, nframes_t nframes, float gain)
{
for (nframes_t i = 0; i < nframes; i++) {
dst[i] += src[i] * gain;
}
}
-void default_mix_buffers_no_gain (audio_sample_t* dst, audio_sample_t* src,
nframes_t nframes)
+void default_mix_buffers_no_gain (audio_sample_t* dst, const audio_sample_t*
src, nframes_t nframes)
{
for (nframes_t i=0; i < nframes; i++) {
dst[i] += src[i];
}
}
+
+
+#if defined (__APPLE__) && defined (BUILD_VECLIB_OPTIMIZATIONS)
+#include <Accelerate/Accelerate.h>
+
+/**
+ * Accelerate (vecLib) variant of the peak computation.
+ *
+ * Asks vDSP_maxmgv for the maximum magnitude of the nsamples values in
+ * buf (unit stride) and returns the larger of that value and current.
+ */
+float veclib_compute_peak (const audio_sample_t* buf, nframes_t nsamples,
float current)
+{
+ float tmpmax = 0.0f;
+ vDSP_maxmgv(buf, 1, &tmpmax, nsamples);
+ return f_max(current, tmpmax);
+}
+
+/**
+ * Accelerate (vecLib) variant of min/max detection.
+ *
+ * Stores the result of vDSP_maxv in *max and the result of vDSP_minv in
+ * *min for the nframes values in buf (unit stride). The const qualifier
+ * is cast away only to match the vDSP parameter types used here.
+ *
+ * NOTE(review): the values *min and *max hold on entry are not consulted;
+ * confirm that no caller expects them to be merged into the result.
+ */
+void veclib_find_peaks (const audio_sample_t* buf, nframes_t nframes, float
*min, float *max)
+{
+ vDSP_maxv (const_cast<audio_sample_t*>(buf), 1, max, nframes);
+ vDSP_minv (const_cast<audio_sample_t*>(buf), 1, min, nframes);
+}
+
+/**
+ * Accelerate (vecLib) variant of apply_gain_to_buffer.
+ *
+ * Multiplies the nframes values in buf by gain in place, using
+ * vDSP_vsmul with buf as both input and output (unit stride).
+ */
+void veclib_apply_gain_to_buffer (audio_sample_t * buf, nframes_t nframes,
float gain)
+{
+ vDSP_vsmul(buf, 1, &gain, buf, 1, nframes);
+}
+
+/**
+ * Accelerate (vecLib) variant of mix_buffers_with_gain.
+ *
+ * Uses vDSP_vsma (vector * scalar + vector) with src, gain and dst as
+ * inputs and dst as output, i.e. dst[i] = src[i] * gain + dst[i] for
+ * nframes values at unit stride.
+ */
+void veclib_mix_buffers_with_gain (audio_sample_t * dst, const audio_sample_t
* src, nframes_t nframes, float gain)
+{
+ vDSP_vsma(src, 1, &gain, dst, 1, dst, 1, nframes);
+}
+
+/**
+ * Accelerate (vecLib) variant of mix_buffers_no_gain.
+ *
+ * Adds the nframes values of src onto dst in place:
+ * dst[i] = src[i] + dst[i], unit stride on both buffers.
+ */
+void veclib_mix_buffers_no_gain (audio_sample_t * dst, const audio_sample_t * src, nframes_t nframes)
+{
+    // vDSP_vadd is the plain vector add; no dummy gain of 1.0 and no
+    // multiply-add (vDSP_vsma) is needed for an unscaled mix.
+    vDSP_vadd(src, 1, dst, 1, dst, 1, nframes);
+}
+
+#endif
Index: common/Mixer.h
===================================================================
RCS file: /sources/traverso/traverso/src/common/Mixer.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -b -r1.1 -r1.2
--- common/Mixer.h 20 Oct 2007 17:38:17 -0000 1.1
+++ common/Mixer.h 27 Oct 2007 17:57:15 -0000 1.2
@@ -17,11 +17,11 @@
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- $Id: Mixer.h,v 1.1 2007/10/20 17:38:17 r_sijrier Exp $
+ $Id: Mixer.h,v 1.2 2007/10/27 17:57:15 r_sijrier Exp $
*/
-#ifndef MIXER_H
-#define MIXER_H
+#ifndef TRAVERSO_MIXER_H
+#define TRAVERSO_MIXER_H
#include "defines.h"
#include <cmath>
@@ -60,32 +60,40 @@
}
-float default_compute_peak (audio_sample_t* buf,
nframes_t nsamples, float current);
+float default_compute_peak (const audio_sample_t* buf,
nframes_t nsamples, float current);
void default_apply_gain_to_buffer (audio_sample_t* buf,
nframes_t nframes, float gain);
-void default_mix_buffers_with_gain (audio_sample_t* dst,
audio_sample_t* src, nframes_t nframes, float gain);
-void default_mix_buffers_no_gain (audio_sample_t* dst,
audio_sample_t* src, nframes_t nframes);
+void default_mix_buffers_with_gain (audio_sample_t* dst, const
audio_sample_t* src, nframes_t nframes, float gain);
+void default_mix_buffers_no_gain (audio_sample_t* dst, const
audio_sample_t* src, nframes_t nframes);
-#if defined (SSE_OPTIMIZATIONS)
+
+#if defined (ARCH_X86) && defined (SSE_OPTIMIZATIONS)
extern "C"
{
/* SSE functions */
- float x86_sse_compute_peak (audio_sample_t* buf,
nframes_t nsamples, float current);
+ float x86_sse_compute_peak (const audio_sample_t* buf,
nframes_t nsamples, float current);
void x86_sse_apply_gain_to_buffer (audio_sample_t* buf,
nframes_t nframes, float gain);
- void x86_sse_mix_buffers_with_gain (audio_sample_t* dst,
audio_sample_t* src, nframes_t nframes, float gain);
- void x86_sse_mix_buffers_no_gain (audio_sample_t* dst,
audio_sample_t* src, nframes_t nframes);
+ void x86_sse_mix_buffers_with_gain (audio_sample_t* dst, const
audio_sample_t* src, nframes_t nframes, float gain);
+ void x86_sse_mix_buffers_no_gain (audio_sample_t* dst, const
audio_sample_t* src, nframes_t nframes);
}
#endif
+#if defined (__APPLE__) && defined (BUILD_VECLIB_OPTIMIZATIONS)
+
+float veclib_compute_peak (const audio_sample_t* buf, nframes_t
nsamples, float current);
+void veclib_apply_gain_to_buffer (audio_sample_t* buf, nframes_t
nframes, float gain);
+void veclib_mix_buffers_with_gain (audio_sample_t* dst, const
audio_sample_t* src, nframes_t nframes, float gain);
+void veclib_mix_buffers_no_gain (audio_sample_t* dst, const
audio_sample_t* src, nframes_t nframes);
+#endif
class Mixer
{
public:
- typedef float (*compute_peak_t)
(audio_sample_t* , nframes_t, float);
+ typedef float (*compute_peak_t) (const
audio_sample_t* , nframes_t, float);
typedef void (*apply_gain_to_buffer_t)
(audio_sample_t* , nframes_t, float);
- typedef void (*mix_buffers_with_gain_t) (audio_sample_t* ,
audio_sample_t* , nframes_t, float);
- typedef void (*mix_buffers_no_gain_t) (audio_sample_t* ,
audio_sample_t* , nframes_t);
+ typedef void (*mix_buffers_with_gain_t) (audio_sample_t* ,
const audio_sample_t* , nframes_t, float);
+ typedef void (*mix_buffers_no_gain_t) (audio_sample_t* ,
const audio_sample_t* , nframes_t);
static compute_peak_t compute_peak;
static apply_gain_to_buffer_t apply_gain_to_buffer;
Index: core/core.pro
===================================================================
RCS file: /sources/traverso/traverso/src/core/core.pro,v
retrieving revision 1.43
retrieving revision 1.44
diff -u -b -r1.43 -r1.44
--- core/core.pro 22 Oct 2007 16:49:43 -0000 1.43
+++ core/core.pro 27 Oct 2007 17:57:15 -0000 1.44
@@ -57,7 +57,8 @@
Marker.cpp \
Themer.cpp \
AudioFileMerger.cpp \
- ProjectConverter.cpp
+ ProjectConverter.cpp \
+ fpu.cc
HEADERS = precompile.h \
../common/Utils.h \
../common/Tsar.h \
@@ -96,7 +97,6 @@
gdither_types.h \
gdither_types_internal.h \
noise.h \
- FastDelegate.h \
SnapList.h \
Snappable.h \
CommandPlugin.h \
@@ -104,7 +104,8 @@
Marker.h \
Themer.h \
AudioFileMerger.h \
- ProjectConverter.h
+ ProjectConverter.h \
+ fpu.h
macx{
QMAKE_LIBDIR += /usr/local/qt/lib
}
Index: engine/engine.pro
===================================================================
RCS file: /sources/traverso/traverso/src/engine/engine.pro,v
retrieving revision 1.20
retrieving revision 1.21
diff -u -b -r1.20 -r1.21
--- engine/engine.pro 20 Oct 2007 17:38:19 -0000 1.20
+++ engine/engine.pro 27 Oct 2007 17:57:15 -0000 1.21
@@ -53,7 +53,14 @@
}
unix{
- contains(DEFINES, SSE_OPTIMIZATIONS): SOURCES += sse_functions.S
+    # Pick the hand-written SSE assembly matching the defines set in base.pri:
+    # USE_X86_64_ASM selects the 64 bit file, ARCH_X86 the 32 bit one.
+    contains(DEFINES, SSE_OPTIMIZATIONS) {
+        contains(DEFINES, USE_X86_64_ASM) {
+            SOURCES += sse_functions_64bit.S
+        }
+        contains(DEFINES, ARCH_X86) {
+            SOURCES += sse_functions.S
+        }
+    }
}
macx{
Index: traverso/Traverso.cpp
===================================================================
RCS file: /sources/traverso/traverso/src/traverso/Traverso.cpp,v
retrieving revision 1.47
retrieving revision 1.48
diff -u -b -r1.47 -r1.48
--- traverso/Traverso.cpp 31 Aug 2007 09:19:51 -0000 1.47
+++ traverso/Traverso.cpp 27 Oct 2007 17:57:16 -0000 1.48
@@ -35,6 +35,14 @@
#include <ContextPointer.h>
#include <Information.h>
#include "defines.h"
+#include "fpu.h"
+#ifdef __SSE__
+#include <xmmintrin.h>
+#endif
+#if defined (__APPLE__)
+#include <Carbon/Carbon.h> // For Gestalt
+#endif
+
// Always put me below _all_ includes, this is needed
// in case we run with memory leak detection enabled!
@@ -100,6 +108,7 @@
init_sse();
QMetaObject::invokeMethod(this, "create_interface",
Qt::QueuedConnection);
+// create_interface();
connect(this, SIGNAL(lastWindowClosed()), &pm(), SLOT(exit()));
}
@@ -160,23 +169,13 @@
{
bool generic_mix_functions = true;
-#if defined (SSE_OPTIMIZATIONS)
+ FPU fpu;
- unsigned int use_sse = 0;
+#if defined (ARCH_X86) && defined (SSE_OPTIMIZATIONS)
- asm volatile (
- "mov $1, %%eax\n"
- "pushl %%ebx\n"
- "cpuid\n"
- "popl %%ebx\n"
- "andl $33554432, %%edx\n"
- "movl %%edx, %0\n"
- : "=m" (use_sse)
- :
- : "%eax", "%ecx", "%edx", "memory");
+ if (fpu.has_sse()) {
- if (use_sse) {
- printf("Enabling SSE optimized routines\n");
+ printf("Using SSE optimized routines\n");
// SSE SET
Mixer::compute_peak = x86_sse_compute_peak;
@@ -185,9 +184,31 @@
Mixer::mix_buffers_no_gain = x86_sse_mix_buffers_no_gain;
generic_mix_functions = false;
+
+ }
+
+#elif defined (__APPLE__) && defined (BUILD_VECLIB_OPTIMIZATIONS)
+ long sysVersion = 0;
+
+ if (noErr != Gestalt(gestaltSystemVersion, &sysVersion))
+ sysVersion = 0;
+
+ if (sysVersion >= 0x00001040) { // Tiger at least
+ Mixer::compute_peak = veclib_compute_peak;
+ Mixer::apply_gain_to_buffer = veclib_apply_gain_to_buffer;
+ Mixer::mix_buffers_with_gain = veclib_mix_buffers_with_gain;
+ Mixer::mix_buffers_no_gain = veclib_mix_buffers_no_gain;
+
+ generic_mix_functions = false;
+
+ info << "Apple VecLib H/W specific optimizations in use" <<
endmsg;
}
#endif
+ /* consider FPU denormal handling to be "h/w optimization" */
+
+ setup_fpu ();
+
if (generic_mix_functions) {
Mixer::compute_peak = default_compute_peak;
@@ -200,6 +221,68 @@
}
+
+/**
+ * Configure how the SSE unit treats denormal numbers.
+ *
+ * Does nothing when TRAVERSO_RUNNING_UNDER_VALGRIND is set in the
+ * environment, when the build lacks ARCH_X86 / USE_XMMINTRIN, or when the
+ * CPU reports neither FlushToZero nor DenormalsAreZero support. Otherwise
+ * FlushToZero is switched off in MXCSR and DenormalsAreZero is switched on
+ * if the CPU supports it.
+ */
+void Traverso::setup_fpu()
+{
+    // export TRAVERSO_RUNNING_UNDER_VALGRIND to skip the MXCSR handling
+    // below; valgrind did not understand it (September 10th, 2007).
+    if (getenv("TRAVERSO_RUNNING_UNDER_VALGRIND")) {
+        printf("TRAVERSO_RUNNING_UNDER_VALGRIND=TRUE\n");
+        return;
+    }
+
+#if defined(ARCH_X86) && defined(USE_XMMINTRIN)
+
+    // MXCSR bit 6 is DenormalsAreZero. Bit 15 (0x8000) is FlushToZero,
+    // i.e. _MM_FLUSH_ZERO_ON, so it must not be used for DAZ: OR-ing it in
+    // would simply undo the FlushToZero clear below.
+    const int MXCSR_DENORMALS_ARE_ZERO = 0x0040;
+
+    FPU fpu;
+
+    if (!fpu.has_flush_to_zero() && !fpu.has_denormals_are_zero()) {
+        return;
+    }
+
+    int MXCSR = _mm_getcsr();
+
+    // Only the "DAZ" denormal model is implemented; there is no
+    // configuration switch for the other models (none, FTZ, FTZ+DAZ).
+    MXCSR &= ~_MM_FLUSH_ZERO_ON;
+    if (fpu.has_denormals_are_zero()) {
+        MXCSR |= MXCSR_DENORMALS_ARE_ZERO;
+    }
+
+    _mm_setcsr (MXCSR);
+
+#endif
+}
+
+
void Traverso::prepare_audio_device( )
{
int rate = config().get_property("Hardware", "samplerate",
44100).toInt();
Index: traverso/Traverso.h
===================================================================
RCS file: /sources/traverso/traverso/src/traverso/Traverso.h,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -b -r1.9 -r1.10
--- traverso/Traverso.h 1 May 2007 21:11:42 -0000 1.9
+++ traverso/Traverso.h 27 Oct 2007 17:57:16 -0000 1.10
@@ -17,7 +17,7 @@
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- $Id: Traverso.h,v 1.9 2007/05/01 21:11:42 r_sijrier Exp $
+ $Id: Traverso.h,v 1.10 2007/10/27 17:57:16 r_sijrier Exp $
*/
#ifndef Traverso_H
@@ -44,6 +44,7 @@
private :
void init_sse();
+ void setup_fpu();
void prepare_audio_device();
private slots:
Index: core/fpu.cc
===================================================================
RCS file: core/fpu.cc
diff -N core/fpu.cc
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ core/fpu.cc 27 Oct 2007 17:57:15 -0000 1.1
@@ -0,0 +1,115 @@
+/*
+Copyright (C) 2007 Remon Sijrier
+
+Copyright (C) 2000-2007 Paul Davis
+
+This file is part of Traverso
+
+Traverso is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+
+*/
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#include <fpu.h>
+
+#include "Debugger.h"
+
+/**
+ * Probe the CPU and record the detected features in _flags.
+ *
+ * With ARCH_X86 defined, CPUID leaf 1 is queried: EDX bit 25 sets HasSSE and
+ * HasFlushToZero, bit 26 sets HasSSE2. If bit 24 (FXSAVE/FXRSTOR) is set,
+ * an FXSAVE image is taken and bit 6 of its MXCSR_MASK field decides
+ * HasDenormalsAreZero. Without ARCH_X86 no probing is done and all flags
+ * stay cleared.
+ */
+FPU::FPU ()
+{
+    _flags = Flags (0);
+
+#ifdef ARCH_X86
+    /* Everything below is x86 only. It is compiled out (not merely
+       skipped at run time) on other targets, so the x86 instructions are
+       never handed to a foreign assembler. */
+
+    unsigned long cpuflags = 0;
+
+    /* EBX/RBX is saved and restored by hand around CPUID. The result is
+       copied to operand %0 only AFTER the restore: if the compiler picks
+       EBX/RBX for %0, copying first would let the pop overwrite it. */
+
+#ifndef USE_X86_64_ASM
+    asm volatile (
+        "mov $1, %%eax\n"
+        "pushl %%ebx\n"
+        "cpuid\n"
+        "popl %%ebx\n"
+        "movl %%edx, %0\n"
+        : "=r" (cpuflags)
+        :
+        : "%eax", "%ecx", "%edx", "memory"
+    );
+
+#else
+
+    asm volatile (
+        "pushq %%rbx\n"
+        "movq $1, %%rax\n"
+        "cpuid\n"
+        "popq %%rbx\n"
+        "movq %%rdx, %0\n"
+        : "=r" (cpuflags)
+        :
+        : "%rax", "%rcx", "%rdx", "memory"
+    );
+
+#endif /* USE_X86_64_ASM */
+
+    if (cpuflags & (1<<25)) {
+        _flags = Flags (_flags | (HasSSE|HasFlushToZero));
+    }
+
+    if (cpuflags & (1<<26)) {
+        _flags = Flags (_flags | HasSSE2);
+    }
+
+    if (cpuflags & (1 << 24)) {
+
+        /* FXSAVE needs a 512 byte area aligned to 16 bytes.
+           fxraw is what free() must receive, fxbuf the aligned start. */
+        char* fxraw = 0;
+        char* fxbuf = 0;
+
+#ifdef NO_POSIX_MEMALIGN
+        /* malloc() does not promise 16 byte alignment, so over-allocate
+           and round the start address up to the next multiple of 16. */
+        if ((fxraw = (char *) malloc (512 + 15)) != 0) {
+            fxbuf = fxraw + ((16 - ((uintptr_t) fxraw & 15)) & 15);
+        }
+#else
+        if (posix_memalign ((void**) &fxraw, 16, 512) == 0) {
+            fxbuf = fxraw;
+        } else {
+            fxraw = 0;
+        }
+#endif
+
+        if (fxbuf == 0) {
+            PERROR("cannot allocate 16 byte aligned buffer for h/w feature detection");
+        } else {
+
+            /* Clear the area first: the "mask is zero" test below is only
+               meaningful if a CPU that does not write MXCSR_MASK leaves
+               zeroes there rather than stale heap contents. */
+            for (int i = 0; i < 512; ++i) {
+                fxbuf[i] = 0;
+            }
+
+            asm volatile (
+                "fxsave (%0)"
+                :
+                : "r" (fxbuf)
+                : "memory"
+            );
+
+            /* MXCSR_MASK lives at byte offset 28 of the FXSAVE image */
+            uint32_t mxcsr_mask = *((uint32_t*) &fxbuf[28]);
+
+            /* if the mask is zero, set its default value (from intel specs) */
+
+            if (mxcsr_mask == 0) {
+                mxcsr_mask = 0xffbf;
+            }
+
+            if (mxcsr_mask & (1<<6)) {
+                _flags = Flags (_flags | HasDenormalsAreZero);
+            }
+
+            free (fxraw);
+        }
+    }
+
+#endif /* ARCH_X86 */
+}
+
+/* Nothing to release: the scratch buffer used for feature detection is
+   already freed inside the constructor. */
+FPU::~FPU ()
+{
+}
Index: core/fpu.h
===================================================================
RCS file: core/fpu.h
diff -N core/fpu.h
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ core/fpu.h 27 Oct 2007 17:57:15 -0000 1.1
@@ -0,0 +1,50 @@
+/*
+Copyright (C) 2007 Remon Sijrier
+
+Copyright (C) 2000-2007 Paul Davis
+
+This file is part of Traverso
+
+Traverso is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+
+*/
+
+#ifndef TRAVERSO_FPU_H
+#define TRAVERSO_FPU_H
+
+
+/**
+ * Snapshot of the floating point / SIMD capabilities of the host CPU.
+ *
+ * The constructor fills in the flags; afterwards the object is read-only
+ * and is queried through the has_*() accessors.
+ */
+class FPU {
+    private:
+        /* Bit values combined into _flags */
+        enum Flags {
+            HasFlushToZero = 0x1,
+            HasDenormalsAreZero = 0x2,
+            HasSSE = 0x4,
+            HasSSE2 = 0x8
+        };
+
+    public:
+        FPU ();
+        ~FPU ();
+
+        /* Each accessor reports whether the matching bit is set in _flags */
+        bool has_flush_to_zero () const { return (_flags & HasFlushToZero) != 0; }
+        bool has_denormals_are_zero () const { return (_flags & HasDenormalsAreZero) != 0; }
+        bool has_sse () const { return (_flags & HasSSE) != 0; }
+        bool has_sse2 () const { return (_flags & HasSSE2) != 0; }
+
+    private:
+        Flags _flags;
+};
+
+#endif /* TRAVERSO_FPU_H */
Index: engine/sse_functions_64bit.S
===================================================================
RCS file: engine/sse_functions_64bit.S
diff -N engine/sse_functions_64bit.S
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ engine/sse_functions_64bit.S 27 Oct 2007 17:57:15 -0000 1.1
@@ -0,0 +1,609 @@
+/*
+ Copyright (C) 2005-2006 Paul Davis, John Rigg
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ Author: Sampo Savolainen
+ 64-bit conversion: John Rigg
+
+ $Id: sse_functions_64bit.S,v 1.1 2007/10/27 17:57:15 r_sijrier Exp $
+*/
+
+
+#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
+
+#; Mixes src into dst with a gain factor: dst[i] += src[i] * gain.
+#;
+#; If dst and src sit at the same offset inside a 16 byte line, 1-3 frames
+#; are handled one at a time until both are 16 byte aligned, then four
+#; frames per iteration are processed with aligned packed loads/stores, and
+#; any rest is again done one frame at a time. If the offsets differ, the
+#; whole buffer is processed one frame at a time.
+#;
+#; NOTE(review): the frame count is tested and decremented as the full
+#; 64 bit %rdx although it is documented below as "unsigned int"; confirm
+#; that callers always pass it with the upper 32 bits cleared.
+.globl x86_sse_mix_buffers_with_gain
+ .type x86_sse_mix_buffers_with_gain,@function
+
+x86_sse_mix_buffers_with_gain:
+
+#; %rdi float *dst
+#; %rsi float *src
+#; %rdx unsigned int nframes
+#; %xmm0 float gain
+
+ pushq %rbp
+ movq %rsp, %rbp
+
+ #; save the registers (restored in reverse order at .MBWG_END)
+ pushq %rbx
+ pushq %rdi
+ pushq %rsi
+
+ #; if nframes == 0, go to end
+ cmp $0, %rdx
+ je .MBWG_END
+
+ #; Check for alignment
+
+ movq %rdi, %rax
+ andq $12, %rax #; dst offset inside a 16 byte line: 0, 4, 8 or 12
+
+ movq %rsi, %rbx
+ andq $12, %rbx #; src offset inside a 16 byte line: 0, 4, 8 or 12
+
+ cmp %rax, %rbx
+ jne .MBWG_NONALIGN #; offsets differ: process everything frame by frame
+
+ #; same offset; if that offset is 0 both are already 16 byte aligned
+ cmp $0, %rbx
+ jz .MBWG_SSE
+
+ #; Pre-loop, we need to run 1-3 frames "manually" without
+ #; packed SSE instructions
+
+.MBWG_PRELOOP:
+
+ #; gain is already in %xmm0
+ movss (%rsi), %xmm1
+ mulss %xmm0, %xmm1
+ addss (%rdi), %xmm1
+ movss %xmm1, (%rdi)
+
+ addq $4, %rdi #; dst++
+ addq $4, %rsi #; src++
+ decq %rdx #; nframes--
+ jz .MBWG_END
+
+ addq $4, %rbx
+
+ cmp $16, %rbx #; test if we've reached 16 byte alignment
+ jne .MBWG_PRELOOP
+
+
+.MBWG_SSE:
+
+ cmp $4, %rdx #; we know it's not zero, but if it's not >=4, then
+ jnge .MBWG_NONALIGN #; we jump straight to the "normal" code
+
+ #; gain is already in %xmm0; copy it to all four lanes
+ shufps $0x00, %xmm0, %xmm0
+
+
+.MBWG_SSELOOP:
+
+ movaps (%rsi), %xmm1 #; source => xmm1
+ mulps %xmm0, %xmm1 #; apply gain to source
+ addps (%rdi), %xmm1 #; mix with destination
+ movaps %xmm1, (%rdi) #; copy result to destination
+
+ addq $16, %rdi #; dst+=4
+ addq $16, %rsi #; src+=4
+
+ subq $4, %rdx #; nframes-=4
+ cmp $4, %rdx
+ jge .MBWG_SSELOOP
+
+ cmp $0, %rdx
+ je .MBWG_END
+
+ #; if there are remaining frames, the nonalign code will do nicely
+ #; for the remaining 1-3 frames.
+
+.MBWG_NONALIGN:
+ #; not aligned, or fewer than four frames left
+
+ #; gain is already in %xmm0
+
+.MBWG_NONALIGNLOOP:
+
+ movss (%rsi), %xmm1
+ mulss %xmm0, %xmm1
+ addss (%rdi), %xmm1
+ movss %xmm1, (%rdi)
+
+ addq $4, %rdi
+ addq $4, %rsi
+
+ decq %rdx
+ jnz .MBWG_NONALIGNLOOP
+
+.MBWG_END:
+
+ popq %rsi
+ popq %rdi
+ popq %rbx
+
+ #; return
+ leave
+ ret
+
+.size x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
+
+
+#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
+
+#; Mixes src into dst without gain: dst[i] += src[i].
+#;
+#; If dst and src sit at the same offset inside a 16 byte line, 1-3 frames
+#; are handled one at a time until both are 16 byte aligned, then four
+#; frames per iteration are processed with aligned packed loads/stores, and
+#; any rest is again done one frame at a time. If the offsets differ, the
+#; whole buffer is processed one frame at a time.
+#;
+#; NOTE(review): the frame count is tested and decremented as the full
+#; 64 bit %rdx although it is documented below as "unsigned int"; confirm
+#; that callers always pass it with the upper 32 bits cleared.
+.globl x86_sse_mix_buffers_no_gain
+ .type x86_sse_mix_buffers_no_gain,@function
+
+x86_sse_mix_buffers_no_gain:
+
+#; %rdi float *dst
+#; %rsi float *src
+#; %rdx unsigned int nframes
+
+ pushq %rbp
+ movq %rsp, %rbp
+
+ #; save the registers (restored in reverse order at .MBNG_END)
+ pushq %rbx
+ pushq %rdi
+ pushq %rsi
+
+ #; the real function
+
+ #; if nframes == 0, go to end
+ cmp $0, %rdx
+ je .MBNG_END
+
+ #; Check for alignment
+
+ movq %rdi, %rax
+ andq $12, %rax #; dst offset inside a 16 byte line: 0, 4, 8 or 12
+
+ movq %rsi, %rbx
+ andq $12, %rbx #; src offset inside a 16 byte line: 0, 4, 8 or 12
+
+ cmp %rax, %rbx
+ jne .MBNG_NONALIGN #; offsets differ: process everything frame by frame
+
+ #; same offset; if that offset is 0 both are already 16 byte aligned
+ cmp $0, %rbx
+ je .MBNG_SSE
+
+ #; Pre-loop, we need to run 1-3 frames "manually" without
+ #; packed SSE instructions
+
+.MBNG_PRELOOP:
+
+ movss (%rsi), %xmm0
+ addss (%rdi), %xmm0
+ movss %xmm0, (%rdi)
+
+ addq $4, %rdi #; dst++
+ addq $4, %rsi #; src++
+ decq %rdx #; nframes--
+ jz .MBNG_END
+ addq $4, %rbx
+
+ cmp $16, %rbx #; test if we've reached 16 byte alignment
+ jne .MBNG_PRELOOP
+
+.MBNG_SSE:
+
+ cmp $4, %rdx #; if there are frames left, but less than 4
+ jnge .MBNG_NONALIGN #; we can't run the packed loop
+
+.MBNG_SSELOOP:
+
+ movaps (%rsi), %xmm0 #; source => xmm0
+ addps (%rdi), %xmm0 #; mix with destination
+ movaps %xmm0, (%rdi) #; copy result to destination
+
+ addq $16, %rdi #; dst+=4
+ addq $16, %rsi #; src+=4
+
+ subq $4, %rdx #; nframes-=4
+ cmp $4, %rdx
+ jge .MBNG_SSELOOP
+
+ cmp $0, %rdx
+ je .MBNG_END
+
+ #; if there are remaining frames, the nonalign code will do nicely
+ #; for the remaining 1-3 frames.
+
+.MBNG_NONALIGN:
+ #; not aligned, or fewer than four frames left; this label is also
+ #; the head of the frame-by-frame loop
+
+ movss (%rsi), %xmm0 #; src => xmm0
+ addss (%rdi), %xmm0 #; xmm0 += dst
+ movss %xmm0, (%rdi) #; xmm0 => dst
+
+ addq $4, %rdi
+ addq $4, %rsi
+
+ decq %rdx
+ jnz .MBNG_NONALIGN
+
+.MBNG_END:
+
+ popq %rsi
+ popq %rdi
+ popq %rbx
+
+ #; return
+ leave
+ ret
+
+.size x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
+
+
+#; void x86_sse_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
+
+#; Scales a buffer in place: buf[i] *= gain.
+#;
+#; 1-3 frames are handled one at a time until buf is 16 byte aligned, then
+#; four frames per iteration are processed with aligned packed
+#; loads/stores, and the remaining 1-3 frames are done one at a time.
+#;
+#; NOTE(review): the frame count is copied and tested as the full 64 bit
+#; %rsi although it is documented below as "unsigned int"; confirm that
+#; callers always pass it with the upper 32 bits cleared.
+.globl x86_sse_apply_gain_to_buffer
+ .type x86_sse_apply_gain_to_buffer,@function
+
+x86_sse_apply_gain_to_buffer:
+
+#; %rdi float *buf
+#; %rsi unsigned int nframes
+#; %xmm0 float gain
+#; %xmm1 float buf[0]
+
+ pushq %rbp
+ movq %rsp, %rbp
+
+ #; save %rdi
+ pushq %rdi
+
+ #; the real function
+
+ #; if nframes == 0, go to end
+ movq %rsi, %rcx #; nframes
+ cmp $0, %rcx
+ je .AG_END
+
+ #; set up the gain buffer (gain is already in %xmm0): copy it to all
+ #; four lanes; the low lane still serves the scalar loops
+ shufps $0x00, %xmm0, %xmm0
+
+ #; Check for alignment
+
+ movq %rdi, %rdx #; buf => %rdx
+ andq $12, %rdx #; keep bits 2 & 3, result = 0, 4, 8 or 12
+ jz .AG_SSE #; if buffer IS aligned
+
+ #; PRE-LOOP
+ #; we iterate 1-3 times, doing scalar SSE multiplies
+ #; so we reach a 16 byte aligned "buf" (=%rdi) value
+
+.AGLP_START:
+
+ #; Load next value from the buffer into %xmm1
+ movss (%rdi), %xmm1
+ mulss %xmm0, %xmm1
+ movss %xmm1, (%rdi)
+
+ #; increment buffer, decrement counter
+ addq $4, %rdi #; buf++;
+
+ decq %rcx #; nframes--
+ jz .AG_END #; if we run out of frames, we go to the end
+
+ addq $4, %rdx #; one non-aligned frame less
+ cmp $16, %rdx
+ jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
+
+.AG_SSE:
+
+ #; We have reached the 16 byte aligned "buf" ("rdi") value
+
+ #; Figure out how many loops we should do
+ movq %rcx, %rax #; copy remaining nframes to %rax for division
+ movq $0, %rdx #; clear %rdx, the upper half of the dividend
+
+
+ #; %rdi is borrowed as the divisor and restored right after
+ pushq %rdi
+ movq $4, %rdi
+ divq %rdi #; %rax = nframes / 4, %rdx = nframes % 4
+ popq %rdi
+
+ #; %rax = SSE iterations
+ cmp $0, %rax
+ je .AGPOST_START
+
+
+.AGLP_SSE:
+
+ movaps (%rdi), %xmm1
+ mulps %xmm0, %xmm1
+ movaps %xmm1, (%rdi)
+
+ addq $16, %rdi
+ subq $4, %rcx #; nframes-=4
+
+ decq %rax
+ jnz .AGLP_SSE
+
+ #; Next we need to post-process all remaining frames
+ #; the remaining frame count is in %rcx
+
+ #; if no remaining frames, jump to the end
+ #; (the flags tested by je are those set by andq; it overwrites the
+ #; result of the cmp before it)
+ cmp $0, %rcx
+ andq $3, %rcx #; nframes % 4
+ je .AG_END
+
+.AGPOST_START:
+
+ movss (%rdi), %xmm1
+ mulss %xmm0, %xmm1
+ movss %xmm1, (%rdi)
+
+ #; increment buffer, decrement counter
+ addq $4, %rdi #; buf++;
+
+ decq %rcx #; nframes--
+ jnz .AGPOST_START #; loop while frames remain
+
+.AG_END:
+
+
+ popq %rdi
+
+ #; return
+ leave
+ ret
+
+.size x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
+#; end proc
+
+
+#; x86_sse_apply_gain_vector(float *buf, float *gain_vector, unsigned int nframes)
+
+#; Applies a per-frame gain in place: buf[i] *= gain_vector[i].
+#;
+#; If buf and gain_vector sit at the same offset inside a 16 byte line,
+#; 1-3 frames are handled one at a time until both are 16 byte aligned,
+#; then four frames per iteration are processed with aligned packed
+#; loads/stores, and any rest is done one frame at a time. If the offsets
+#; differ, the whole buffer is processed one frame at a time.
+#;
+#; NOTE(review): the frame count is tested and decremented as the full
+#; 64 bit %rdx although it is documented below as "unsigned int"; confirm
+#; that callers always pass it with the upper 32 bits cleared.
+.globl x86_sse_apply_gain_vector
+ .type x86_sse_apply_gain_vector,@function
+
+x86_sse_apply_gain_vector:
+
+#; %rdi float *buf
+#; %rsi float *gain_vector
+#; %rdx unsigned int nframes
+
+ pushq %rbp
+ movq %rsp, %rbp
+
+ #; Save registers (restored in reverse order at .AGA_END)
+ pushq %rdi
+ pushq %rsi
+ pushq %rbx
+
+ #; if nframes == 0 go to end
+ cmp $0, %rdx
+ je .AGA_END
+
+ #; Check alignment: offset of each pointer inside a 16 byte line
+ movq %rdi, %rax
+ andq $12, %rax
+
+ movq %rsi, %rbx
+ andq $12, %rbx
+
+ cmp %rax,%rbx
+ jne .AGA_ENDLOOP #; offsets differ: process everything frame by frame
+
+ cmp $0, %rax
+ jz .AGA_SSE #; if buffers are aligned, jump to the SSE loop
+
+#; Buffers aren't 16 byte aligned, but they are unaligned by the same amount
+.AGA_ALIGNLOOP:
+
+ movss (%rdi), %xmm0 #; buf => xmm0
+ movss (%rsi), %xmm1 #; gain value => xmm1
+ mulss %xmm1, %xmm0 #; xmm1 * xmm0 => xmm0
+ movss %xmm0, (%rdi) #; signal with gain => buf
+
+ decq %rdx
+ jz .AGA_END
+
+ addq $4, %rdi #; buf++
+ addq $4, %rsi #; gain_vector++
+
+ addq $4, %rax
+ cmp $16, %rax
+ jne .AGA_ALIGNLOOP
+
+#; There are frames left for sure, as that is checked in the beginning
+#; and within the previous loop. BUT, there might be less than 4 frames
+#; to process
+
+.AGA_SSE:
+ movq %rdx, %rax #; nframes => %rax
+ shr $2, %rax #; unsigned divide by 4
+
+ cmp $0, %rax #; (translated from Finnish) if it works without this, nice
+ je .AGA_ENDLOOP
+
+.AGA_SSELOOP:
+ movaps (%rdi), %xmm0
+ movaps (%rsi), %xmm1
+ mulps %xmm1, %xmm0
+ movaps %xmm0, (%rdi)
+
+ addq $16, %rdi
+ addq $16, %rsi
+
+ decq %rax
+ jnz .AGA_SSELOOP
+
+ andq $3, %rdx #; Remaining frames are nframes & 3
+ jz .AGA_END
+
+
+#; Inside this loop, we know there are frames left to process
+#; but because either there are < 4 frames left, or the buffers
+#; are not aligned, we can't use the parallel SSE ops
+.AGA_ENDLOOP:
+ movss (%rdi), %xmm0 #; buf => xmm0
+ movss (%rsi), %xmm1 #; gain value => xmm1
+ mulss %xmm1, %xmm0 #; xmm1 * xmm0 => xmm0
+ movss %xmm0, (%rdi) #; signal with gain => buf
+
+ addq $4,%rdi
+ addq $4,%rsi
+ decq %rdx #; nframes--
+ jnz .AGA_ENDLOOP
+
+.AGA_END:
+
+ popq %rbx
+ popq %rsi
+ popq %rdi
+
+ leave
+ ret
+
+.size x86_sse_apply_gain_vector, .-x86_sse_apply_gain_vector
+#; end proc
+
+
+#; float x86_sse_compute_peak(float *buf, long nframes, float current);
+
+#; Peak detection: leaves max(current, |buf[i]|) over all frames in %xmm0.
+#;
+#; 1-3 frames are handled one at a time until buf is 16 byte aligned, then
+#; four frames per iteration are processed with aligned packed loads, the
+#; four lane maxima are folded into one value, and the remaining 1-3
+#; frames are done one at a time.
+#;
+#; NOTE(review): the frame count is copied and tested as the full 64 bit
+#; %rsi although it is documented below as "unsigned int"; confirm that
+#; callers always pass it with the upper 32 bits cleared.
+.globl x86_sse_compute_peak
+ .type x86_sse_compute_peak,@function
+
+
+x86_sse_compute_peak:
+
+#; %rdi float *buf
+#; %rsi unsigned int nframes
+#; %xmm0 float current
+#; %xmm1 float buf[0]
+
+ pushq %rbp
+ movq %rsp, %rbp
+
+ #; save %rdi
+ pushq %rdi
+
+ #; if nframes == 0, go to end
+ movq %rsi, %rcx #; nframes
+ cmp $0, %rcx
+ je .CP_END
+
+ #; create the "abs" mask in %xmm2: 2147483647 = 0x7fffffff, i.e. every
+ #; bit except the sign bit. It is pushed, loaded, dropped from the stack
+ #; again and then copied to all four lanes.
+ pushq $2147483647
+ movss (%rsp), %xmm2
+ addq $8, %rsp
+ shufps $0x00, %xmm2, %xmm2
+
+ #; Check for alignment
+
+ movq %rdi, %rdx #; buf => %rdx
+ andq $12, %rdx #; keep bits 2 & 3, result = 0, 4, 8 or 12
+ jz .CP_SSE #; if buffer IS aligned
+
+ #; PRE-LOOP
+ #; we iterate 1-3 times, doing scalar SSE comparisons
+ #; so we reach a 16 byte aligned "buf" (=%rdi) value
+
+.LP_START:
+
+ #; Load next value from the buffer, clear its sign bit, keep the maximum
+ movss (%rdi), %xmm1
+ andps %xmm2, %xmm1
+ maxss %xmm1, %xmm0
+
+ #; increment buffer, decrement counter
+ addq $4, %rdi #; buf++;
+
+ decq %rcx #; nframes--
+ jz .CP_END #; if we run out of frames, we go to the end
+
+ addq $4, %rdx #; one non-aligned frame less
+ cmp $16, %rdx
+ jne .LP_START #; if more non-aligned frames exist, we do a do-over
+
+.CP_SSE:
+
+ #; We have reached the 16 byte aligned "buf" ("rdi") value
+
+ #; Figure out how many loops we should do
+ movq %rcx, %rax #; copy remaining nframes to %rax for division
+
+ shr $2,%rax #; unsigned divide by 4
+ jz .POST_START
+
+ #; %rax = SSE iterations
+
+ #; current maximum is at %xmm0, but we need to ..
+ shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's
+
+ #;prefetcht0 16(%rdi)
+
+.LP_SSE:
+
+ movaps (%rdi), %xmm1
+ andps %xmm2, %xmm1
+ maxps %xmm1, %xmm0
+
+ addq $16, %rdi
+
+ decq %rax
+ jnz .LP_SSE
+
+ #; Calculate the maximum value contained in the 4 FP's in %xmm0
+ movaps %xmm0, %xmm1
+ shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
+ maxps %xmm1, %xmm0 #; maximums of the two pairs
+ movaps %xmm0, %xmm1
+ #; shuffle the floats inside the two pairs (1234 => 2143)
+ shufps $0xb1, %xmm1, %xmm1
+ maxps %xmm1, %xmm0
+
+ #; now every float in %xmm0 is the same value, current maximum value
+
+ #; Next we need to post-process all remaining frames
+ #; the remaining frame count is in %rcx
+
+ #; if no remaining frames, jump to the end
+
+ andq $3, %rcx #; nframes % 4
+ jz .CP_END
+
+.POST_START:
+
+ movss (%rdi), %xmm1
+ andps %xmm2, %xmm1
+ maxss %xmm1, %xmm0
+
+ addq $4, %rdi #; buf++;
+
+ decq %rcx #; nframes--;
+ jnz .POST_START
+
+.CP_END:
+
+ popq %rdi
+
+ #; return (the result is in the low float of %xmm0)
+ leave
+ ret
+
+.size x86_sse_compute_peak, .-x86_sse_compute_peak
+#; end proc
+
+/* On ELF targets, emit an empty .note.GNU-stack section for this object. */
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
+
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [Traverso-commit] traverso/src audiofileio/audiofileio.pro base.p...,
Remon Sijrier <=