Openjpeg improvements

From Second Life Wiki
Jump to navigation Jump to search

Introduction

There are two main directions to improve OpenJPEG.

  • Optimize the entropy coder (T1)
  • Optimize the wavelet transform (DWT)

Depending on the image characteristics, the complexity is distributed between these two blocks, as illustrated in the analysis below.

Says bushing:

Analysis on OSX/PPC showed:

- 27.1% dwt_decode_1_real (OpenJPEG)
- 22.9% tcd_decode_tile (OpenJPEG)
- 5.6% t1_dec_sigpass_step (OpenJPEG)
- 5.4% t1_decode_cblks (OpenJPEG)
- 5.4% t1_dec_refpass_step (OpenJPEG)
- 4.7% mqc_decode (OpenJPEG)

tcd_decode_tile is really just a call to dwt_decode_real, so half of the
time is spent in dwt_decode_real and dwt_decode_1_real.

OpenJpeg: 1 min 25 sec
GeoJasPer: 1 min 34 sec
Kakadu: 19 sec (!)

Related issues/patches in JIRA

T1 Optimization

Stefan Westerfeld has started to optimize the T1.

DWT Optimization

GCC4 Vector Ops

GCC supports vectorization, and you can code general vector operations without any assembly. GCC4.0 will automatically convert the vector operations to machine instructions of the defined processor platform. This makes it easy to support several vector processors, like SSE, MMX, ALTIVEC, and simulated.

Here is demo code to accomplish this task with the 9/7 encode; it should achieve at least a 2x speedup: User:Dzonatas_Sol/DEMO_DWT

Here is Dzonatas Sol's modification in the form of a patch: (I can't guarantee I didn't mess this up, gcc just core dumps if I try and compile this for pentium3. Compiling for i686 results in messed up textures. I'll have to try it later on x86_64...) Seg Baphomet 03:13, 20 March 2007 (PDT)

diff -ur OpenJPEG/libopenjpeg/dwt.c OpenJPEG.patched/libopenjpeg/dwt.c
--- OpenJPEG/libopenjpeg/dwt.c  2007-01-15 03:55:40.000000000 -0600
+++ OpenJPEG.patched/libopenjpeg/dwt.c  2007-03-20 03:21:40.000000000 -0500
@@ -51,6 +51,40 @@
  *  mail: ive@lilysoft.com
  */
 
+//#include <xmmintrin.h>
+
+/***************************
+ * DEMO: 1-22-2007
+ * DWT
+ *  * modified to use vector operations and processors
+ *  * kept same lifting scheme but also optimized redundant calculations
+ * - dzonatas@dzonux.net
+ *
+ */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+
+typedef union
+       {
+       float   f[4] ;
+       v4sf    v ;
+       } v4 ;
+
+typedef struct
+       {
+       v4              *m ;
+       int             dn ;
+       int             sn ;
+       int             ddn ;
+       int             dsn ;
+       int             rdn ;
+       int             rsn ;
+       int             cas ;
+       } v4dwt ; 
+
+const v4 v8192         = { 8192.0f , 8192.0f , 8192.0f , 8192.0f } ;
+
 #include "opj_includes.h"
 
 /** @defgroup DWT DWT - Implementation of a discrete wavelet transform */
@@ -70,7 +104,7 @@
 /**
 Inverse lazy transform (horizontal)
 */
-static void dwt_interleave_h(int *a, int *b, int dn, int sn, int cas);
+//static void dwt_interleave_h(v4dwt *w , int * a);
 /**
 Inverse lazy transform (vertical)
 */
@@ -155,6 +189,38 @@
 /* <summary>                             */
 /* Inverse lazy transform (horizontal).  */
 /* </summary>                            */
+static void v4dwt_interleave_h( v4dwt * s , int * a , int x ) {
+    int i;
+    int *ai = NULL;
+    v4 *bi = NULL;
+    ai = a ;
+    bi = s->m + s->cas;
+    i = s->sn ;
+    while( i-- )
+       {
+               bi->f[0] = *ai;
+               bi->f[1] = *(ai + x);
+               bi->f[2] = *(ai + x*2);
+               bi->f[3] = *(ai + x*3);
+               bi->v /= v8192.v ;              // precision shift
+               bi += 2;  
+               ai++;
+           }
+    ai = a + s->sn ;   // update: NOT NEEDED ?
+    bi = s->m + 1 - s->cas ;
+    i = s->dn ;
+    while( i-- )
+       {
+               bi->f[0] = *ai;
+               bi->f[1] = *(ai + x);
+               bi->f[2] = *(ai + x*2);
+               bi->f[3] = *(ai + x*3);
+               bi->v /= v8192.v ;
+               bi += 2;
+               ai++;
+           }
+}
+
 static void dwt_interleave_h(int *a, int *b, int dn, int sn, int cas) {
     int i;
     int *ai = NULL;
@@ -178,6 +244,36 @@
 /* <summary>                             */  
 /* Inverse lazy transform (vertical).    */
 /* </summary>                            */ 
+static void v4dwt_interleave_v( v4dwt * s , int *a , int x ) {
+    int i;
+    int *ai = NULL;
+    v4 *bi = NULL;
+    ai = a;
+    bi = s->m + s->cas;
+    i = s->sn ;
+    while( i-- )
+       {
+               bi->f[0] = *ai;
+               bi->f[1] = *(ai+1);
+               bi->f[2] = *(ai+2);
+               bi->f[3] = *(ai+3);
+               bi += 2;
+               ai += x;
+       }
+    ai = a + (s->sn * x);
+    bi = s->m + 1 - s->cas;
+    i = s->dn ;
+    while( i-- )
+       {
+               bi->f[0] = *ai;
+               bi->f[1] = *(ai+1);
+               bi->f[2] = *(ai+2);
+               bi->f[3] = *(ai+3);
+               bi += 2;  
+               ai += x;
+               }
+}
+
 static void dwt_interleave_v(int *a, int *b, int dn, int sn, int x, int cas) {
     int i;
     int *ai = NULL;
@@ -279,9 +375,122 @@
        }
 }
 
+
+static void v4dwt_mula( v4* x , const v4 n , int i , int r )
+       {
+       v4* l = x ;
+       while( i-- )
+               {
+               (x-1)->v += ( x->v + l->v ) * n.v ;
+               l = x ;
+               x += 2 ;
+               }
+       if( r )
+               {
+               x-- ;
+               while( r-- )
+                       ( x += 2 )->v += ( l->v + l->v ) * n.v ;
+               }
+       }
+
+static void v4dwt_mulb( v4* x , const v4 n , int d , int r )
+       {
+       v4* l = x ;
+       while( d-- )
+               {
+               x += 2 ;
+               (x-1)->v += ( l->v + x->v ) * n.v ;
+               l = x ;
+               }
+       if( r )
+               {
+               x-- ;
+               while( r-- )
+                       ( x += 2 )->v += ( l->v + l->v ) * n.v ;
+               }
+       }
+
+
+
+
 /* <summary>                             */
 /* Inverse 9-7 wavelet transform in 1-D. */
 /* </summary>                            */
+
+#define PF( x )        { x / 8192.0f , x / 8192.0f , x / 8192.0f , x / 8192.0f } ;
+
+const v4 v10078                = PF( 10078.0f ) ;
+const v4 v13318                = PF( 13318.0f ) ;
+const v4 v3633         = PF( -3633.0f ) ;
+const v4 v7233         = PF( -7233.0f )  ;
+const v4 v434          = PF( 434.0f  )  ;
+const v4 v12994                = PF( 12994.0f )  ;
+
+void v4dwt_decode_1_real( v4dwt* s )
+       {
+       int i , y ;
+       v4* a = s->m ;
+       v4* b = a + 1 ;
+       v4      *x , n , *l ;
+
+       if (!s->cas)
+               {
+               if( s->dn <= 0 && s->sn <= 1 )
+                       return ;
+
+               i = s->sn ;
+               x = a - 2 ;
+               while( i-- )
+                       ( x += 2 )->v *= v10078.v ;
+
+               i = s->dn ;
+               x = b - 2 ;
+               while( i-- )
+                       ( x += 2 )->v *= v13318.v ;
+
+               if( ( i = s->dsn ) )
+                       v4dwt_mula( b , v3633 , i , s->rsn ) ;
+
+               if( ( i = s->ddn ) )
+                       v4dwt_mulb( a , v7233 , i , s->rdn ) ;
+
+               if( ( i = s->dsn ) )
+                       v4dwt_mula( b , v434 , i , s->rsn ) ;
+
+               if( ( i = s->ddn ) )
+                       v4dwt_mulb( a , v12994 , i , s->rdn ) ;
+
+               }
+       else
+               {
+               if( s->sn <= 0 && s->dn <= 1 )
+                       return ;
+
+               i = s->sn ;
+               x = b - 2 ;
+               while( i-- )
+                       ( x += 2 )->v *= v10078.v ;
+
+               i = s->dn ;
+               x = a - 2 ;
+               while( i-- )
+                       ( x += 2 )->v *= v13318.v ;
+
+               if( ( i = s->dsn ) )
+                       v4dwt_mulb( a , v3633 , i , s->rsn ) ;
+
+               if( ( i = s->ddn ) )
+                       v4dwt_mula( b , v7233 , i , s->rdn ) ;
+
+               if( ( i = s->dsn ) )
+                       v4dwt_mulb( a , v434 , i , s->rsn ) ;
+
+               if( ( i = s->ddn ) )
+                       v4dwt_mula( b , v12994 , i , s->rdn ) ;
+               }
+       }
+
+
 static void dwt_decode_1_real(int *a, int dn, int sn, int cas) {
        int i;
        if (!cas) {
@@ -525,6 +734,92 @@
        int i, j, k;
        int *a = NULL;
        int *aj = NULL;
+       int w, l;
+       int mr = 0;                     /* max width/height of the resolution level computed                       */
+       v4 *m = NULL ;
+       v4dwt s ;
+
+       w = tilec->x1-tilec->x0;
+       l = tilec->numresolutions-1;
+       a = tilec->data;
+
+       for (i = l-1; i >= stop; i--)
+               {
+               int r ;
+               r = tilec->resolutions[l - i].x1 - tilec->resolutions[l - i].x0;
+               if( r > mr )
+                       mr = r ;
+               r = tilec->resolutions[l - i].y1 - tilec->resolutions[l - i].y0;
+               if( r > mr )
+                       mr = r ;
+               }
+       m = (v4*)opj_malloc( (mr+5) * sizeof(v4));
+       s.m = (v4*)((unsigned)m + 16 - ( (unsigned)m % 16 )) ;
+
+       for (i = l-1; i >= stop; i--) {
+               int rw;                 /* width of the resolution level computed                       */
+               int rh;                 /* heigth of the resolution level computed                      */
+               int rw1;                /* width of the resolution level once lower than computed one   */
+               int rh1;                /* height of the resolution level once lower than computed one  */
+
+               rw = tilec->resolutions[l - i].x1 - tilec->resolutions[l - i].x0;
+               rh = tilec->resolutions[l - i].y1 - tilec->resolutions[l - i].y0;
+               rw1= tilec->resolutions[l - i - 1].x1 - tilec->resolutions[l - i - 1].x0;
+               rh1= tilec->resolutions[l - i - 1].y1 - tilec->resolutions[l - i - 1].y0;
+
+
+               s.sn = rw1;
+               s.dn = rw-rw1;
+               s.cas = tilec->resolutions[l - i].x0 % 2; /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */
+               s.ddn = ( s.dn <= s.sn ? s.dn : s.sn ) ;
+               s.dsn = ( s.sn <= s.dn ? s.sn : s.dn ) ;
+               s.rdn = ( s.dn > s.sn ? s.dn - s.sn : 0 ) ;
+               s.rsn = ( s.sn > s.dn ? s.sn - s.dn : 0 ) ;
+               for (j = 0; j < rh; j+=4) {
+                       aj = a + j * w;
+                       v4dwt_interleave_h( &s, aj , w ) ;
+                       v4dwt_decode_1_real( &s ) ;
+                       k = rw ;
+                       while( k-- )
+                               {
+                               s.m[k].v *= v8192.v ;
+                               aj[k] = floorf( s.m[k].f[0] ) ;
+                               aj[k+w] = floorf( s.m[k].f[1] ) ;
+                               aj[k+w*2] = floorf( s.m[k].f[2] ) ;
+                               aj[k+w*3] = floorf( s.m[k].f[3] ) ;
+                               }
+               }
+
+               s.sn = rh1;
+               s.dn = rh-rh1;
+               s.cas = tilec->resolutions[l - i].y0 % 2; /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering   */
+               s.ddn = ( s.dn <= s.sn ? s.dn : s.sn ) ;
+               s.dsn = ( s.sn <= s.dn ? s.sn : s.dn ) ;
+               s.rdn = ( s.dn > s.sn ? s.dn - s.sn : 0 ) ;
+               s.rsn = ( s.sn > s.dn ? s.sn - s.dn : 0 ) ;
+               for (j = 0; j < rw; j+=4) {
+                       aj = a + j;
+                       v4dwt_interleave_v( &s , aj , w ) ;
+                       v4dwt_decode_1_real( &s ) ;
+                       k = rh ;
+                       while( k-- )
+                               {
+//                             s.m[k].v *= v8192.v ;
+                               aj[k*w] = floorf( s.m[k].f[0] ) ;
+                               aj[1+k*w] = floorf( s.m[k].f[1] ) ;
+                               aj[2+k*w] = floorf( s.m[k].f[2] ) ;
+                               aj[3+k*w] = floorf( s.m[k].f[3] ) ;
+                               }
+               }
+       }
+       opj_free(m);
+}
+
+#if 0
+void dwt_decode_real(opj_tcd_tilecomp_t * tilec, int stop) {
+       int i, j, k;
+       int *a = NULL;
+       int *aj = NULL;
        int *bj = NULL;
        int w, l;
 
@@ -572,7 +867,7 @@
                opj_free(bj);
        }
 }
-
+#endif
 
 /* <summary>                          */
 /* Get gain of 9-7 wavelet transform. */

SSE Implementation

From the author of the below code: Balatoni Denes, dbalatoni at programozo.hu

I see that openjpeg uses a BSD licence. Yes you can use that code under BSD licence, thanks for asking too  :) . I might not have time to do the integration myself, but of course I can answer questions if needed.

ps: this code can not do dwt on arbitrary size vectors. if you have k levels of dwt, than you can only have IIRC vectorsizes of multiplies of 2^k. for arbitrary size vectors, more coding is needed.

static float coeffs_cdf97_sse[12*4] __attribute__ ((__aligned__ (32))) = {
     -1.586134342,-1.586134342,-1.586134342,-1.586134342,
     -0.052980118,-0.052980118,-0.052980118,-0.052980118,
      0.882911075, 0.882911075, 0.882911075, 0.882911075,
      0.443506852, 0.443506852, 0.443506852, 0.443506852,
     -1.230174105,-1.230174105,-1.230174105,-1.230174105,
      0.812893066, 0.812893066, 0.812893066, 0.812893066,
      1.230174105, 1.230174105, 1.230174105, 1.230174105,
     -0.812893066,-0.812893066,-0.812893066,-0.812893066,
     -0.443506852,-0.443506852,-0.443506852,-0.443506852,
     -0.882911075,-0.882911075,-0.882911075,-0.882911075,
      0.052980118, 0.052980118, 0.052980118, 0.052980118,
      1.586134342, 1.586134342, 1.586134342, 1.586134342
    };


#define HORIZ_LIFTING_STEP8(i1, i2, o1, o2, ofs, o2b) \
	"movaps " #ofs "(%2), %%xmm5   \n\t" \
	"movaps " #i1 ", " #o2 "       \n\t" \
	"shufps $147, %%xmm7, %%xmm7   \n\t" \
	"shufps $147, " #o2 ", " #o2 " \n\t" \
	"movss  %%xmm7, %%xmm0         \n\t" \
	"movss  " #o2 ", %%xmm7        \n\t" \
	"movss  %%xmm0, " #o2 "        \n\t" \
	"addps  " #o2b ", " #o1 "      \n\t" \
	"mulps  %%xmm5, " #o1 "        \n\t" \
	"addps  " #i2 ", " #o1 "       \n\t" 

#define HORIZ_LIFTING_STEP8_END(i1, i2, o1, o2, ofs, o2b) \
	"movaps " #ofs "(%2), %%xmm5   \n\t" \
	"movaps " #i1 ", " #o2 "       \n\t" \
	"shufps $147, %%xmm7, %%xmm7   \n\t" \
	"shufps $147, " #o2 ", " #o2 " \n\t" \
	"movss  %%xmm7, " #o2 "        \n\t" \
	"addps  " #o2b ", " #o1 "      \n\t" \
	"mulps  %%xmm5, " #o1 "        \n\t" \
	"addps  " #i2 ", " #o1 "       \n\t" 

#define HORIZ_LIFTING_STEP8_BEGIN(i1, i2, o1, o2, ofs, o2b, shu) \
	"movaps " #ofs "(%2), %%xmm5       \n\t" \
	"movaps " #i1 ", " #o2 "           \n\t" \
	"shufps $147, %%xmm7, %%xmm7       \n\t" \
	"shufps $147, " #o2 ", " #o2 "     \n\t" \
	"movss  " #o2 ", %%xmm7            \n\t" \
	"shufps " #shu ", " #o2 ", " #o2 " \n\t" \
	"addps  " #o2b ", " #o1 "          \n\t" \
	"mulps  %%xmm5, " #o1 "            \n\t" \
	"addps  " #i2 ", " #o1 "           \n\t" 

#define HORIZ_LIFTING_STEP16(i1, i2, i3, i4, o1, o2, o3, o4, ofs, o3b, o4b) \
	"movaps " #i2 ", " #o4 "       \n\t" \
	"movaps " #i1 ", " #o3 "       \n\t" \
	"shufps $147, %%xmm7, %%xmm7   \n\t" \
	"shufps $147, " #o4 ", " #o4 " \n\t" \
	"shufps $147, " #o3 ", " #o3 " \n\t" \
	"movss  %%xmm7, %%xmm0         \n\t" \
	"movss  " #o4 ", %%xmm7        \n\t" \
	"movss  " #o3 ", " #o4 "       \n\t" \
	"movss  %%xmm0, " #o3 "        \n\t" \
	"movaps " #ofs "(%2), %%xmm0   \n\t" \
	"addps  " #o3b ", " #o1 "      \n\t" \
	"addps  " #o4b ", " #o2 "      \n\t" \
	"mulps  %%xmm0, " #o1 "        \n\t" \
	"mulps  %%xmm0, " #o2 "        \n\t" \
	"addps  " #i3 ", " #o1 "       \n\t" \
	"addps  " #i4 ", " #o2 "       \n\t"	

/* Calculate 4 lifting steps to get 2 samples vertical, xmm0-xmm3 : buffer
*/

#define VERT_MAIN(in1, in2, ofs) \
	"movaps " #ofs "(%2), %%xmm7          \n\t" \
 \
	"movaps " #in1 ", %%xmm4              \n\t" \
	"movaps %%xmm0, %%xmm5                \n\t" \
	"addps  %%xmm4, %%xmm5                \n\t" \
	"mulps  %%xmm7, %%xmm5                \n\t" \
	"addps  " #in2 ", %%xmm5              \n\t" \
 \
	"movaps 16+" #ofs "(%2), %%xmm7       \n\t" \
 \
	"movaps %%xmm5, %%xmm6                \n\t" \
	"addps  %%xmm1, %%xmm6                \n\t" \
	"mulps  %%xmm7, %%xmm6                \n\t" \
	"addps  %%xmm0, %%xmm6                \n\t" \
 \
	"movaps 32+" #ofs "(%2), %%xmm7       \n\t" \
	"movaps %%xmm4, %%xmm0                \n\t" \
 \
	"movaps %%xmm6, %%xmm4                \n\t" \
	"addps  %%xmm2, %%xmm4                \n\t" \
	"mulps  %%xmm7, %%xmm4                \n\t" \
	"addps  %%xmm1, %%xmm4                \n\t" \
 \
	"movaps 48+" #ofs "(%2), %%xmm7       \n\t" \
	"movaps %%xmm5, %%xmm1                \n\t" \
 \
	"movaps %%xmm4, %%xmm5                \n\t" \
	"addps  %%xmm3, %%xmm5                \n\t" \
	"mulps  %%xmm7, %%xmm5                \n\t" \
	"addps  %%xmm2, %%xmm5                \n\t" \
 \
	"movaps %%xmm4, %%xmm3                \n\t" \
	"movaps %%xmm6, %%xmm2                \n\t" 


static inline void analyze_inplace(float *in, float *coeffs, int step, int iter) {
    int lof1, lof2;

    asm volatile(
// Anal_begin
	"movaps (%1), %%xmm7       \n\t"
	"movaps (%1), %%xmm1       \n\t"
	"movaps (%1), %%xmm0       \n\t"
	"movaps (%1), %%xmm2       \n\t"
	"shufps   $1, %%xmm1, %%xmm1 \n\t"
	"shufps   $2, %%xmm7, %%xmm7 \n\t"

	"movaps (%2), %%xmm5         \n\t"
	"addss  %%xmm7, %%xmm0       \n\t"
	"mulss  %%xmm5, %%xmm0       \n\t"	
	"addss  %%xmm1, %%xmm0       \n\t"

	"shufps $147, %%xmm7, %%xmm7 \n\t"
	"movss  %%xmm0, %%xmm7       \n\t"
	
	"movaps 16(%2), %%xmm5       \n\t"
	"addss  %%xmm0, %%xmm0       \n\t"
	"mulss  %%xmm5, %%xmm0       \n\t"
	"addss  %%xmm2, %%xmm0       \n\t"	

	"shufps $147, %%xmm7, %%xmm7 \n\t"
	"movss  %%xmm0, %%xmm7       \n\t"
	"shufps $147, %%xmm7, %%xmm7 \n\t"		

	"movaps (%1, %5), %%xmm1       \n\t"
	"movaps (%1, %5, 2), %%xmm0       \n\t"
	"movaps %%xmm1, %%xmm2       \n\t"
	"movups 12(%1), %%xmm4       \n\t"
	"shufps $221, %%xmm0, %%xmm1 \n\t"
	"shufps $136, %%xmm0, %%xmm2 \n\t"
	"shufps $147, %%xmm1, %%xmm1 \n\t"
	"movss  %%xmm4, %%xmm1       \n\t"

	HORIZ_LIFTING_STEP8(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 0, %%xmm3)	
	HORIZ_LIFTING_STEP8(%%xmm2, %%xmm3, %%xmm2, %%xmm1, 16, %%xmm1)	
	HORIZ_LIFTING_STEP8(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 32, %%xmm3)	
	HORIZ_LIFTING_STEP8_BEGIN(%%xmm2, %%xmm3, %%xmm1, %%xmm1, 48, %%xmm2, $229)	

	"movaps 64(%2), %%xmm6       \n\t"
	"mulps  %%xmm6, %%xmm2       \n\t"	
	"movaps 80(%2), %%xmm5       \n\t"
	"mulps  %%xmm5, %%xmm1       \n\t"	

	"movaps %%xmm1, (%1)       \n\t"
	"movaps %%xmm2, (%1, %5)       \n\t"

	"addl %5, %1                 \n\t"
	"addl %5, %1                 \n\t"

//Anal sse16
	".balign 16                  \n\t"
	"1:                          \n\t"
	"movaps (%1, %5), %%xmm1       \n\t"
	"movaps (%1, %5, 2), %%xmm0       \n\t"
	"movaps (%1, %6), %%xmm4       \n\t"
	"movaps (%1, %5, 4), %%xmm6       \n\t"
	"movaps %%xmm1, %%xmm2       \n\t"
	"movaps %%xmm4, %%xmm5       \n\t"
	"shufps $221, %%xmm0, %%xmm1 \n\t"
	"shufps $221, %%xmm6, %%xmm4 \n\t"
	"shufps $136, %%xmm0, %%xmm2 \n\t"
	"shufps $136, %%xmm6, %%xmm5 \n\t"
	"shufps $147, %%xmm1, %%xmm1 \n\t"
	"shufps $147, %%xmm4, %%xmm4 \n\t"
	"movups 12(%1), %%xmm3       \n\t"
	"movss  %%xmm3, %%xmm1       \n\t"
	"movups 12(%1, %5, 2), %%xmm3       \n\t"
	"movss  %%xmm3, %%xmm4       \n\t"
	
	HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm1, %%xmm4, %%xmm2, %%xmm5, %%xmm3, %%xmm6, 0, %%xmm3, %%xmm6)	
	HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm3, %%xmm6, %%xmm2, %%xmm5, %%xmm1, %%xmm4, 16, %%xmm1, %%xmm4)	
	HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm1, %%xmm4, %%xmm2, %%xmm5, %%xmm3, %%xmm6, 32, %%xmm3, %%xmm6)	
	HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm3, %%xmm6, %%xmm1, %%xmm4, %%xmm1, %%xmm4, 48, %%xmm2, %%xmm5)	

	"movaps 64(%2), %%xmm0       \n\t"
	"movaps 80(%2), %%xmm6       \n\t"
	"mulps  %%xmm0, %%xmm2       \n\t"	
	"mulps  %%xmm0, %%xmm5       \n\t"	
	"mulps  %%xmm6, %%xmm1       \n\t"	
	"mulps  %%xmm6, %%xmm4       \n\t"	

	"movaps %%xmm1, (%1)       \n\t"
	"movaps %%xmm2, (%1, %5)       \n\t"
	"movaps %%xmm4, (%1, %5, 2)       \n\t"
	"movaps %%xmm5, (%1, %6)       \n\t"

	"addl %6, %1                \n\t"
	"addl %5, %1                \n\t"
	"subl  $1, %0                \n\t"
	"jnz 1b                      \n\t"

//Anal_end

	"movaps 16-16(%1, %5), %%xmm1       \n\t"
	"movaps 16-16(%1, %5), %%xmm0       \n\t"
	"shufps  $6, %%xmm0, %%xmm0 \n\t" //27 volt
	"movaps %%xmm1, %%xmm2       \n\t"
	"movups 12(%1), %%xmm4       \n\t"
	"shufps $221, %%xmm0, %%xmm1 \n\t"
	"shufps $136, %%xmm0, %%xmm2 \n\t"
	"shufps $147, %%xmm1, %%xmm1 \n\t"
	"movss  %%xmm4, %%xmm1       \n\t"

	HORIZ_LIFTING_STEP8_END(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 0, %%xmm3)	
	HORIZ_LIFTING_STEP8_END(%%xmm2, %%xmm3, %%xmm2, %%xmm1, 16, %%xmm1)	
	HORIZ_LIFTING_STEP8_END(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 32, %%xmm3)	
	HORIZ_LIFTING_STEP8_END(%%xmm2, %%xmm3, %%xmm1, %%xmm1, 48, %%xmm2)	

	"movaps 64(%2), %%xmm7       \n\t"
	"mulps  %%xmm7, %%xmm2       \n\t"	
	"movaps 80(%2), %%xmm5       \n\t"
	"mulps  %%xmm5, %%xmm1       \n\t"	

	"movaps %%xmm1,  0(%1)       \n\t"
	"movaps %%xmm2, 16-16(%1, %5)       \n\t"


	: "=r" (lof1), "=r" (lof2)
	: "r" (coeffs), "1" (in), "0" (iter), "r" (step) , "r" (step*3)
    );
}

static inline void synthetize_inplace(float *in, float *coeffs, int step, int iter) {
    int lof1, lof2;

    asm volatile(
//Synth_begin
	"movaps (%1), %%xmm1         \n\t"
	"movaps (%1, %5), %%xmm2     \n\t"
	
	"movaps 96(%2), %%xmm5       \n\t"
	"mulps  %%xmm5, %%xmm1       \n\t"
	"movaps 112(%2), %%xmm5      \n\t"
	"mulps  %%xmm5, %%xmm2       \n\t"

	HORIZ_LIFTING_STEP8_BEGIN(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 128, %%xmm3, $229)	
	HORIZ_LIFTING_STEP8_BEGIN(%%xmm2, %%xmm3, %%xmm2, %%xmm1, 128+16, %%xmm1, $230)	
	HORIZ_LIFTING_STEP8_BEGIN(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 128+32, %%xmm3, $228)	
	HORIZ_LIFTING_STEP8_BEGIN(%%xmm2, %%xmm3, %%xmm2, %%xmm1, 128+48, %%xmm1, $228)	
	
	"unpckhps %%xmm2, %%xmm1     \n\t"
	"movaps %%xmm1, (%1)       \n\t"

	"addl %5, %1                 \n\t"

//Synth_sse16
	".balign 16                  \n\t"
	"1:                          \n\t"
	"movaps (%1, %5), %%xmm1       \n\t"
	"movaps (%1, %5, 2), %%xmm2       \n\t"
	"movaps (%1, %6), %%xmm4       \n\t"
	"movaps (%1, %5, 4), %%xmm5       \n\t"

	"movaps 96(%2), %%xmm0       \n\t"
	"mulps  %%xmm0, %%xmm1       \n\t"
	"mulps  %%xmm0, %%xmm4       \n\t"
	"movaps 112(%2), %%xmm0      \n\t"
	"mulps  %%xmm0, %%xmm2       \n\t"
	"mulps  %%xmm0, %%xmm5       \n\t"

	HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm1, %%xmm4, %%xmm2, %%xmm5, %%xmm3, %%xmm6, 128, %%xmm3, %%xmm6)	
	HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm3, %%xmm6, %%xmm2, %%xmm5, %%xmm1, %%xmm4, 128+16, %%xmm1, %%xmm4)	
	HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm1, %%xmm4, %%xmm2, %%xmm5, %%xmm3, %%xmm6, 128+32, %%xmm3, %%xmm6)	
	HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm3, %%xmm6, %%xmm2, %%xmm5, %%xmm1, %%xmm4, 128+48, %%xmm1, %%xmm4)	

	"movaps %%xmm1, %%xmm3       \n\t"
	"movaps %%xmm4, %%xmm6       \n\t"

	"unpckhps %%xmm2, %%xmm3     \n\t"
	"unpcklps %%xmm2, %%xmm1     \n\t"
	"unpckhps %%xmm5, %%xmm6     \n\t"
	"unpcklps %%xmm5, %%xmm4     \n\t"
	"movaps %%xmm1, (%1)       \n\t"
	"movaps %%xmm3, (%1, %5)       \n\t"
	"movaps %%xmm4, (%1, %5, 2)       \n\t"
	"movaps %%xmm6, (%1, %6)       \n\t"

	"addl %5, %1                 \n\t"
	"addl %6, %1                 \n\t"	
	"subl  $1, %0                \n\t"
	"jnz 1b                      \n\t"

//Synthend_sse
	"movaps (%1, %5), %%xmm1       \n\t"
	"movaps (%1, %5, 2), %%xmm2       \n\t"
	
	"movaps 96(%2), %%xmm5       \n\t"
	"mulps  %%xmm5, %%xmm1       \n\t"
	"movaps 112(%2), %%xmm5      \n\t"
	"mulps  %%xmm5, %%xmm2       \n\t"

	HORIZ_LIFTING_STEP8(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 128, %%xmm3)	
	HORIZ_LIFTING_STEP8(%%xmm2, %%xmm3, %%xmm2, %%xmm1, 128+16, %%xmm1)	
	HORIZ_LIFTING_STEP8(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 128+32, %%xmm3)	
	HORIZ_LIFTING_STEP8(%%xmm2, %%xmm3, %%xmm2, %%xmm1, 128+48, %%xmm1)	

	"movaps %%xmm1, %%xmm3       \n\t"

	"unpckhps %%xmm2, %%xmm3     \n\t"
	"unpcklps %%xmm2, %%xmm1     \n\t"
	"movaps %%xmm1, (%1)       \n\t"
	"movaps %%xmm3, (%1, %5)       \n\t"

// Vege
	"shufps $147, %%xmm7, %%xmm7 \n\t"
	"movss %%xmm7, %%xmm1        \n\t"

	"shufps $147, %%xmm7, %%xmm7 \n\t"
	"movss %%xmm7, %%xmm2        \n\t"	
	"movss %%xmm7, %%xmm3        \n\t"	
	"movaps 128+16(%2), %%xmm5   \n\t"
	"addss %%xmm2, %%xmm2        \n\t"
	"mulss %%xmm5, %%xmm2        \n\t"
	"addss %%xmm1, %%xmm2        \n\t"
	"shufps $147, %%xmm7, %%xmm7 \n\t"
	"movss %%xmm7, %%xmm1        \n\t"
	"movss %%xmm7, %%xmm4        \n\t"
	"movaps 128+32(%2), %%xmm5   \n\t"
	"addss %%xmm2, %%xmm4        \n\t"
	"mulss %%xmm5, %%xmm4        \n\t"
	"addss %%xmm3, %%xmm4        \n\t"
	"shufps $147, %%xmm7, %%xmm7 \n\t"
	"movss %%xmm7, %%xmm0        \n\t"
	
	"addss %%xmm4, %%xmm0        \n\t"
	"movaps 128+48(%2), %%xmm5   \n\t"
	"mulss %%xmm5, %%xmm0        \n\t"
	"addss %%xmm1, %%xmm0        \n\t"

	"movss %%xmm4, %%xmm6        \n\t"
	"addss %%xmm6, %%xmm6        \n\t"	
	"mulss %%xmm5, %%xmm6        \n\t"
	"addss %%xmm6, %%xmm2        \n\t"
	
	"shufps $147, %%xmm7, %%xmm7 \n\t"
	"movss %%xmm2, %%xmm7        \n\t"	
	"shufps $147, %%xmm7, %%xmm7 \n\t"
	"movss %%xmm4, %%xmm7        \n\t"	
	"shufps $147, %%xmm7, %%xmm7 \n\t"
	"movss %%xmm0, %%xmm7        \n\t"	
	"shufps $147, %%xmm7, %%xmm7 \n\t"

	"movups %%xmm7, (%1, %5, 2)       \n\t"


	: "=r" (lof1), "=r" (lof2)
	: "r" (coeffs), "1" (in), "0" (iter), "r" (step) , "r" (step*3)
    );
}

static inline void analyze_inplace_vertical(float *in, float *coeffs, int step, int iter) {
    int lof1, lof2;

    asm volatile(
// A_Start
	"movaps (%2), %%xmm7                  \n\t"		

	"movaps (%1, %5, 4), %%xmm0           \n\t"
	"movaps  (%1, %5, 2), %%xmm1          \n\t"
	"addps  %%xmm0, %%xmm1                \n\t"
	"mulps  %%xmm7, %%xmm1                \n\t"	
	"addps  (%1, %6), %%xmm1              \n\t"	

	"movaps (%1), %%xmm4                  \n\t"
	"addps  (%1, %5, 2), %%xmm4           \n\t"
	"mulps  %%xmm7, %%xmm4                \n\t"	
	"addps  (%1, %5), %%xmm4              \n\t"	

	"movaps 16(%2), %%xmm7                \n\t"		

	"movaps %%xmm1, %%xmm2                \n\t"
	"addps  %%xmm4, %%xmm2                \n\t"
	"mulps  %%xmm7, %%xmm2                \n\t"	
	"addps  (%1, %5, 2), %%xmm2           \n\t"	

	"movaps %%xmm4, %%xmm5                \n\t"
	"addps  %%xmm4, %%xmm5                \n\t"
	"mulps  %%xmm7, %%xmm5                \n\t"	
	"addps  (%1), %%xmm5                  \n\t"	

	"movaps 32(%2), %%xmm7                \n\t"		

	"movaps %%xmm5, %%xmm3                \n\t"
	"addps  %%xmm2, %%xmm3                \n\t"
	"mulps  %%xmm7, %%xmm3                \n\t"	
	"addps  %%xmm4, %%xmm3                \n\t"	

	"movaps 48(%2), %%xmm7                \n\t"		

	"movaps %%xmm3, %%xmm6                \n\t"
	"addps  %%xmm3, %%xmm6                \n\t"
	"mulps  %%xmm7, %%xmm6                \n\t"	
	"addps  %%xmm5, %%xmm6                \n\t"	

	"movaps 64(%2), %%xmm4                \n\t"		
	"movaps 80(%2), %%xmm5                \n\t"		
	"mulps  %%xmm3, %%xmm4                \n\t"
	"mulps  %%xmm5, %%xmm6                \n\t"

	"movaps %%xmm6, (%1)                  \n\t"
	"movaps %%xmm4, (%1, %5)              \n\t"
	
	"addl %5, %1                          \n\t"
	"addl %5, %1                          \n\t"

// A_Main
	".balign 16                           \n\t"
	"1:                                   \n\t"

	VERT_MAIN((%1, %5, 4), (%1, %6), 0)

	"movaps 64(%2), %%xmm7                \n\t" 
	"movaps 80(%2), %%xmm6                \n\t" 

	"mulps  %%xmm7, %%xmm4                \n\t" 
	"mulps  %%xmm6, %%xmm5                \n\t"

	"movaps %%xmm5, (%1)                  \n\t"
	"movaps %%xmm4, (%1, %5)              \n\t"

	"addl %5, %1                          \n\t"
	"addl %5, %1                          \n\t"
	"subl  $2, %0                         \n\t"
	"jnz 1b                               \n\t"

// A_End
	VERT_MAIN(%%xmm0, (%1, %6), 0)

	"movaps 64(%2), %%xmm7                \n\t" 
	"movaps 80(%2), %%xmm6                \n\t" 
 
	"mulps  %%xmm7, %%xmm4                \n\t" 
	"mulps  %%xmm6, %%xmm5                \n\t"
	
	"movaps (%1, %5), %%xmm6              \n\t"
	"movaps %%xmm4, (%1, %5)              \n\t"
	"movaps (%1), %%xmm4                  \n\t"
	"movaps %%xmm5, (%1)                  \n\t"

	VERT_MAIN(%%xmm4, %%xmm6, 0)

	"movaps 64(%2), %%xmm7                \n\t" 
	"movaps 80(%2), %%xmm6                \n\t" 
 
	"mulps  %%xmm7, %%xmm4                \n\t" 
	"mulps  %%xmm6, %%xmm5                \n\t"

	"movaps %%xmm5, (%1, %5, 2)           \n\t"
	"movaps %%xmm4, (%1, %6)              \n\t"

	: "=r" (lof1), "=r" (lof2)
	: "r" (coeffs), "1" (in), "0" (iter), "r" (step) , "r" (step*3)
    );
}

static inline void synthetize_inplace_vertical(float *in, float *coeffs, int step, int iter) {
    int lof1, lof2;

    asm volatile(
// S_Start
	"movaps 128(%2), %%xmm7               \n\t"		
	"movaps 112(%2), %%xmm5               \n\t"		
	"movaps 96(%2), %%xmm4                \n\t"		
	"movaps %%xmm5, %%xmm0                \n\t"
	"movaps %%xmm4, %%xmm6                \n\t"
	"mulps  (%1, %5), %%xmm5              \n\t"
	"mulps  (%1, %6), %%xmm0              \n\t"
	"mulps  (%1, %5, 2), %%xmm4           \n\t"
	"mulps  (%1), %%xmm6                  \n\t"

	"movaps  %%xmm5, %%xmm1               \n\t"
	"addps  %%xmm0, %%xmm1                \n\t"
	"mulps  %%xmm7, %%xmm1                \n\t"	
	"addps  %%xmm4, %%xmm1                \n\t"	

	"movaps  %%xmm5, %%xmm4               \n\t"
	"addps  %%xmm4, %%xmm4                \n\t"
	"mulps  %%xmm7, %%xmm4                \n\t"	
	"addps  %%xmm6, %%xmm4                \n\t"	

	"movaps 128+16(%2), %%xmm7            \n\t"		

	"movaps %%xmm1, %%xmm2                \n\t"
	"addps  %%xmm4, %%xmm2                \n\t"
	"mulps  %%xmm7, %%xmm2                \n\t"	
	"addps  %%xmm5, %%xmm2                \n\t"	

	"movaps 128+32(%2), %%xmm7            \n\t"		

	"movaps %%xmm2, %%xmm3                \n\t"
	"addps  %%xmm3, %%xmm3                \n\t"
	"mulps  %%xmm7, %%xmm3                \n\t"	
	"addps  %%xmm4, %%xmm3                \n\t"	

	"movaps %%xmm3, (%1)                  \n\t"
	
	"addl %5, %1                          \n\t"

// S_Main
	".balign 16                           \n\t"
	"1:                                   \n\t"

	
	"movaps 96(%2), %%xmm6                \n\t"
	"movaps 112(%2), %%xmm4               \n\t"

	"mulps (%1, %5, 4), %%xmm4            \n\t"
	"mulps (%1, %6), %%xmm6               \n\t"
		
	VERT_MAIN(%%xmm4, %%xmm6, 128)

	"movaps %%xmm5, (%1)                  \n\t"
	"movaps %%xmm4, (%1, %5)              \n\t"

	"addl %5, %1                          \n\t"
	"addl %5, %1                          \n\t"
	"subl  $2, %0                         \n\t"
	"jnz 1b                               \n\t"

// S_End
	"movaps %%xmm1, %%xmm4                \n\t"
	"movaps 128+16(%2), %%xmm7            \n\t"		
	"addps  %%xmm1, %%xmm4                \n\t"
	"mulps  %%xmm7, %%xmm4                \n\t"
	"addps  %%xmm0, %%xmm4                \n\t"

	"movaps %%xmm2, %%xmm5                \n\t"
	"movaps 128+32(%2), %%xmm7            \n\t"		
	"addps  %%xmm4, %%xmm5                \n\t"
	"mulps  %%xmm7, %%xmm5                \n\t"
	"addps  %%xmm1, %%xmm5                \n\t"

	"movaps %%xmm3, %%xmm6                \n\t"
	"movaps 128+48(%2), %%xmm7            \n\t"		
	"addps  %%xmm5, %%xmm6                \n\t"
	"mulps  %%xmm7, %%xmm6                \n\t"
	"addps  %%xmm2, %%xmm6                \n\t"

	"movaps %%xmm5, %%xmm0                \n\t"
	"addps  %%xmm5, %%xmm0                \n\t"
	"mulps  %%xmm7, %%xmm0                \n\t"
	"addps  %%xmm4, %%xmm0                \n\t"

	"movaps %%xmm6, (%1)                  \n\t"
	"movaps %%xmm5, (%1, %5)              \n\t"
	"movaps %%xmm0, (%1, %5, 2)           \n\t"

	: "=r" (lof1), "=r" (lof2)
	: "r" (coeffs), "1" (in), "0" (iter), "r" (step) , "r" (step*3)
    );
}

int dwt1d(float *pict, int width, int height, int it) {
    int h;

    for(h=1;h<(1<<it);h<<=1) 
	analyze_inplace(pict, coeffs_cdf97_sse, 16*h, (width/16/h)-1);    
}

int idwt1d(float *pict, int width, int height, int it) {
    int h;

    for(h=(1<<(it-1));h>0;h>>=1) 
	synthetize_inplace(pict, coeffs_cdf97_sse, 16*h, (width/h/16)-1);
}

int dwt2d(float *pict, int width, int height, int it) {
    int i,h;
    float *p;

    for(h=1;h<(1<<it);h<<=1) {
    
	for(p=pict,i=0;i<height;i+=h, p+=width*h)
	    analyze_inplace(p, coeffs_cdf97_sse, 16*h, (width/16/h)-1);

	for(p=pict, i=0;i<width;i+=4*h, p+=4*h)
	    analyze_inplace_vertical(p, coeffs_cdf97_sse, width*4*h, (height/h)-6);
    }
}

int idwt2d(float *pict, int width, int height, int it) {
    int i, h;
    float *p;

    for(h=(1<<(it-1));h>0;h>>=1) {
        for(p=pict, i=0;i<width;i+=4*h, p+=4*h)
	    synthetize_inplace_vertical(p, coeffs_cdf97_sse, width*4*h, (height/h)-4);
    
	for(p=pict,i=0;i<height;i+=h, p+=width*h)
	    synthetize_inplace(p, coeffs_cdf97_sse, 16*h, (width/h/16)-1);
    }
}