Difference between revisions of "Openjpeg improvements"

From Second Life Wiki
Jump to navigation Jump to search
(new page)
 
(openjpeg article.)
Line 1: Line 1:
=The Reason=
OpenJPEG sucks.  It does not use fast 9/7 or 5/3 lifting DWT algorithms.  This page is all the information we have collected to help with improving openjpeg.
Says bushing:
<pre>
Analysis on OSX/PPC showed:
- 27.1% dwt_decode_1_real (OpenJPEG)
- 22.9% tcd_decode_tile (OpenJPEG)
- 5.6% t1_dec_sigpass_step (OpenJPEG)
- 5.4% t1_decode_cblks (OpenJPEG)
- 5.4% t1_dec_refpass_step (OpenJPEG)
- 4.7% mqc_decode (OpenJPEG)
tcd_decode_tile is really just a call to dwt_decode_real, so half of the
time is spent in dwt_decode_real and dwt_decode_1_real.
OpenJpeg: 1 min 25 sec
GeoJasPer: 1 min 34 sec
Kakadu: 19 sec (!)
</pre>
==Some Goodies==
From the author: Balatoni Denes, dbalatoni at programozo.hu
From the author: Balatoni Denes, dbalatoni at programozo.hu



Revision as of 12:40, 29 January 2007

The Reason

OpenJPEG sucks. It does not use fast 9/7 or 5/3 lifting DWT algorithms. This page is all the information we have collected to help with improving openjpeg.

Says bushing:

Analysis on OSX/PPC showed:

- 27.1% dwt_decode_1_real (OpenJPEG)
- 22.9% tcd_decode_tile (OpenJPEG)
- 5.6% t1_dec_sigpass_step (OpenJPEG)
- 5.4% t1_decode_cblks (OpenJPEG)
- 5.4% t1_dec_refpass_step (OpenJPEG)
- 4.7% mqc_decode (OpenJPEG)

tcd_decode_tile is really just a call to dwt_decode_real, so half of the
time is spent in dwt_decode_real and dwt_decode_1_real.

OpenJpeg: 1 min 25 sec
GeoJasPer: 1 min 34 sec
Kakadu: 19 sec (!)


Some Goodies

From the author: Balatoni Denes, dbalatoni at programozo.hu

I see that OpenJPEG uses a BSD licence. Yes, you can use that code under a BSD licence; thanks for asking, too :). I might not have time to do the integration myself, but of course I can answer questions if needed.

PS: this code cannot do a DWT on vectors of arbitrary size. If you have k levels of DWT, then (IIRC) you can only use vector sizes that are multiples of 2^k. Supporting arbitrary vector sizes needs more coding.

/* Expand one scalar into the four identical lanes of a 16-byte SSE row. */
#define CDF97_ROW(v) (v), (v), (v), (v)

/*
 * Constant table for the SSE CDF 9/7 kernels in this file.
 *
 * Twelve rows of four identical floats, so each row can be fetched with a
 * single aligned 16-byte load.  The kernels address the rows by byte offset
 * from the table base:
 *   offsets   0..48  : the four lifting multipliers read by the analyze_* kernels
 *   offsets  64, 80  : the two final multipliers applied by the analyze_* kernels
 *   offsets  96, 112 : the two initial multipliers applied by the synthetize_* kernels
 *   offsets 128..176 : the four lifting multipliers read by the synthetize_* kernels
 */
static float coeffs_cdf97_sse[12*4] __attribute__ ((__aligned__ (32))) = {
    CDF97_ROW(-1.586134342),   /* offset   0 */
    CDF97_ROW(-0.052980118),   /* offset  16 */
    CDF97_ROW( 0.882911075),   /* offset  32 */
    CDF97_ROW( 0.443506852),   /* offset  48 */
    CDF97_ROW(-1.230174105),   /* offset  64 */
    CDF97_ROW( 0.812893066),   /* offset  80 */
    CDF97_ROW( 1.230174105),   /* offset  96 */
    CDF97_ROW(-0.812893066),   /* offset 112 */
    CDF97_ROW(-0.443506852),   /* offset 128 */
    CDF97_ROW(-0.882911075),   /* offset 144 */
    CDF97_ROW( 0.052980118),   /* offset 160 */
    CDF97_ROW( 1.586134342)    /* offset 176 */
};

#undef CDF97_ROW


/*
 * HORIZ_LIFTING_STEP8 - asm text for one horizontal lifting step on a single
 * 4-float vector.
 *
 * Expands to string fragments meant to be pasted inside an asm statement
 * whose operand %2 is the coefficient table.  Register arguments are passed
 * already escaped (e.g. %%xmm2); `ofs` is a byte offset into the table.
 *
 * Emitted sequence:
 *   - xmm5 = coefficient row at ofs(%2)
 *   - o2   = i1; xmm7 and o2 are each rotated by one lane (shufps $147),
 *            then their low lanes are exchanged via xmm0, so o2 receives the
 *            value previously carried in xmm7 and xmm7 keeps the new carry
 *   - o1   = (o1 + o2b) * xmm5 + i2
 *
 * Overwrites xmm0 and xmm5; reads and updates xmm7.
 */
#define HORIZ_LIFTING_STEP8(i1, i2, o1, o2, ofs, o2b) \
	"movaps " #ofs "(%2), %%xmm5   \n\t" \
	"movaps " #i1 ", " #o2 "       \n\t" \
	"shufps $147, %%xmm7, %%xmm7   \n\t" \
	"shufps $147, " #o2 ", " #o2 " \n\t" \
	"movss  %%xmm7, %%xmm0         \n\t" \
	"movss  " #o2 ", %%xmm7        \n\t" \
	"movss  %%xmm0, " #o2 "        \n\t" \
	"addps  " #o2b ", " #o1 "      \n\t" \
	"mulps  %%xmm5, " #o1 "        \n\t" \
	"addps  " #i2 ", " #o1 "       \n\t" 

/*
 * HORIZ_LIFTING_STEP8_END - variant of the 4-float horizontal lifting step
 * that only consumes the carry register.
 *
 * Same operand conventions as the other HORIZ_LIFTING_* macros (operand %2 is
 * the coefficient table, `ofs` a byte offset, registers passed as %%xmmN).
 *
 * Emitted sequence:
 *   - xmm5 = coefficient row at ofs(%2)
 *   - o2   = i1; xmm7 and o2 are each rotated by one lane (shufps $147),
 *            then the low lane of xmm7 is copied into the low lane of o2
 *            (nothing is written back into xmm7's low lane)
 *   - o1   = (o1 + o2b) * xmm5 + i2
 *
 * Overwrites xmm5; rotates xmm7.  Does not touch xmm0.
 */
#define HORIZ_LIFTING_STEP8_END(i1, i2, o1, o2, ofs, o2b) \
	"movaps " #ofs "(%2), %%xmm5   \n\t" \
	"movaps " #i1 ", " #o2 "       \n\t" \
	"shufps $147, %%xmm7, %%xmm7   \n\t" \
	"shufps $147, " #o2 ", " #o2 " \n\t" \
	"movss  %%xmm7, " #o2 "        \n\t" \
	"addps  " #o2b ", " #o1 "      \n\t" \
	"mulps  %%xmm5, " #o1 "        \n\t" \
	"addps  " #i2 ", " #o1 "       \n\t" 

/*
 * HORIZ_LIFTING_STEP8_BEGIN - variant of the 4-float horizontal lifting step
 * that only produces a carry and patches o2 with a caller-chosen shuffle.
 *
 * Same operand conventions as the other HORIZ_LIFTING_* macros (operand %2 is
 * the coefficient table, `ofs` a byte offset, registers passed as %%xmmN).
 * `shu` is an immediate (e.g. $229) for a shufps of o2 with itself.
 *
 * Emitted sequence:
 *   - xmm5 = coefficient row at ofs(%2)
 *   - o2   = i1; xmm7 and o2 are each rotated by one lane (shufps $147),
 *            the low lane of o2 is saved into the low lane of xmm7,
 *            then o2 is re-shuffled in place with `shu`
 *            (presumably to synthesize the missing neighbour at the start
 *            of the signal - TODO confirm the intended boundary rule)
 *   - o1   = (o1 + o2b) * xmm5 + i2
 *
 * Overwrites xmm5; updates xmm7.  Does not touch xmm0.
 */
#define HORIZ_LIFTING_STEP8_BEGIN(i1, i2, o1, o2, ofs, o2b, shu) \
	"movaps " #ofs "(%2), %%xmm5       \n\t" \
	"movaps " #i1 ", " #o2 "           \n\t" \
	"shufps $147, %%xmm7, %%xmm7       \n\t" \
	"shufps $147, " #o2 ", " #o2 "     \n\t" \
	"movss  " #o2 ", %%xmm7            \n\t" \
	"shufps " #shu ", " #o2 ", " #o2 " \n\t" \
	"addps  " #o2b ", " #o1 "          \n\t" \
	"mulps  %%xmm5, " #o1 "            \n\t" \
	"addps  " #i2 ", " #o1 "           \n\t" 

/*
 * HORIZ_LIFTING_STEP16 - asm text for one horizontal lifting step on two
 * 4-float vectors at once.
 *
 * Same operand conventions as the HORIZ_LIFTING_STEP8* macros (operand %2 is
 * the coefficient table, `ofs` a byte offset, registers passed as %%xmmN).
 *
 * Emitted sequence:
 *   - o4 = i2, o3 = i1; xmm7, o4 and o3 are each rotated by one lane
 *     (shufps $147)
 *   - the low lanes are passed along a chain using xmm0 as a temporary:
 *       xmm7 (old) -> o3,   o3 -> o4,   o4 -> xmm7 (new carry)
 *   - xmm0 = coefficient row at ofs(%2)
 *   - o1 = (o1 + o3b) * xmm0 + i3
 *   - o2 = (o2 + o4b) * xmm0 + i4
 *
 * Overwrites xmm0; reads and updates xmm7.
 */
#define HORIZ_LIFTING_STEP16(i1, i2, i3, i4, o1, o2, o3, o4, ofs, o3b, o4b) \
	"movaps " #i2 ", " #o4 "       \n\t" \
	"movaps " #i1 ", " #o3 "       \n\t" \
	"shufps $147, %%xmm7, %%xmm7   \n\t" \
	"shufps $147, " #o4 ", " #o4 " \n\t" \
	"shufps $147, " #o3 ", " #o3 " \n\t" \
	"movss  %%xmm7, %%xmm0         \n\t" \
	"movss  " #o4 ", %%xmm7        \n\t" \
	"movss  " #o3 ", " #o4 "       \n\t" \
	"movss  %%xmm0, " #o3 "        \n\t" \
	"movaps " #ofs "(%2), %%xmm0   \n\t" \
	"addps  " #o3b ", " #o1 "      \n\t" \
	"addps  " #o4b ", " #o2 "      \n\t" \
	"mulps  %%xmm0, " #o1 "        \n\t" \
	"mulps  %%xmm0, " #o2 "        \n\t" \
	"addps  " #i3 ", " #o1 "       \n\t" \
	"addps  " #i4 ", " #o2 "       \n\t"	

/*
 * VERT_MAIN - asm text for four chained lifting steps of the vertical pass,
 * producing two output vectors.
 *
 * Operand %2 of the enclosing asm statement is the coefficient table; `ofs`
 * is the byte offset of the first of four consecutive 16-byte coefficient
 * rows (ofs, 16+ofs, 32+ofs, 48+ofs).  `in1` and `in2` are source operands
 * (a register such as %%xmm4 or a memory reference such as (%1, %6)).
 *
 * xmm0-xmm3 hold the state carried from the previous invocation.  With
 * c0..c3 the four coefficient rows, the macro computes:
 *     s1   = (xmm0 + in1) * c0 + in2
 *     s2   = (s1   + xmm1) * c1 + xmm0
 *     xmm4 = (s2   + xmm2) * c2 + xmm1      (result)
 *     xmm5 = (xmm4 + xmm3) * c3 + xmm2      (result)
 * and leaves the state for the next invocation as
 *     xmm0 = in1, xmm1 = s1, xmm2 = s2, xmm3 = xmm4.
 *
 * Overwrites xmm4-xmm7 (xmm6 and xmm7 are scratch).
 */

#define VERT_MAIN(in1, in2, ofs) \
	"movaps " #ofs "(%2), %%xmm7          \n\t" \
 \
	"movaps " #in1 ", %%xmm4              \n\t" \
	"movaps %%xmm0, %%xmm5                \n\t" \
	"addps  %%xmm4, %%xmm5                \n\t" \
	"mulps  %%xmm7, %%xmm5                \n\t" \
	"addps  " #in2 ", %%xmm5              \n\t" \
 \
	"movaps 16+" #ofs "(%2), %%xmm7       \n\t" \
 \
	"movaps %%xmm5, %%xmm6                \n\t" \
	"addps  %%xmm1, %%xmm6                \n\t" \
	"mulps  %%xmm7, %%xmm6                \n\t" \
	"addps  %%xmm0, %%xmm6                \n\t" \
 \
	"movaps 32+" #ofs "(%2), %%xmm7       \n\t" \
	"movaps %%xmm4, %%xmm0                \n\t" \
 \
	"movaps %%xmm6, %%xmm4                \n\t" \
	"addps  %%xmm2, %%xmm4                \n\t" \
	"mulps  %%xmm7, %%xmm4                \n\t" \
	"addps  %%xmm1, %%xmm4                \n\t" \
 \
	"movaps 48+" #ofs "(%2), %%xmm7       \n\t" \
	"movaps %%xmm5, %%xmm1                \n\t" \
 \
	"movaps %%xmm4, %%xmm5                \n\t" \
	"addps  %%xmm3, %%xmm5                \n\t" \
	"mulps  %%xmm7, %%xmm5                \n\t" \
	"addps  %%xmm2, %%xmm5                \n\t" \
 \
	"movaps %%xmm4, %%xmm3                \n\t" \
	"movaps %%xmm6, %%xmm2                \n\t" 


/*
 * analyze_inplace - horizontal analysis (forward) lifting pass, in place.
 *
 * in     : buffer to transform; read and written with aligned 16-byte
 *          accesses (movaps), so it must be 16-byte aligned.
 * coeffs : coefficient table; rows at byte offsets 0, 16, 32, 48 are used by
 *          the lifting steps and rows at 64 and 80 scale the two results.
 *          Must be 16-byte aligned.
 * step   : byte distance between consecutive 4-float vectors (used directly
 *          as an x86 index register).
 * iter   : number of main-loop iterations.  The loop is bottom-tested
 *          (subl $1 / jnz), so iter must be >= 1.
 *
 * asm operands: %0 = loop counter (starts as iter), %1 = running pointer
 * (starts as in), %2 = coeffs, %5 = step, %6 = step*3.
 *
 * NOTE(review): the pointer is advanced with 32-bit `addl`, so this is
 * 32-bit x86 code as written.
 */
static inline void analyze_inplace(float *in, float *coeffs, int step, int iter) {
    /* Dummy outputs: they only exist so %0/%1 can be modified by the asm. */
    int lof1, lof2;

    asm volatile(
// Analysis: begin - seed the carry register xmm7 with scalar steps on the
// first vector, then handle the first pair of vectors.
	"movaps (%1), %%xmm7       \n\t"
	"movaps (%1), %%xmm1       \n\t"
	"movaps (%1), %%xmm0       \n\t"
	"movaps (%1), %%xmm2       \n\t"
	"shufps   $1, %%xmm1, %%xmm1 \n\t"
	"shufps   $2, %%xmm7, %%xmm7 \n\t"

	"movaps (%2), %%xmm5         \n\t"
	"addss  %%xmm7, %%xmm0       \n\t"
	"mulss  %%xmm5, %%xmm0       \n\t"
	"addss  %%xmm1, %%xmm0       \n\t"

	"shufps $147, %%xmm7, %%xmm7 \n\t"
	"movss  %%xmm0, %%xmm7       \n\t"

	"movaps 16(%2), %%xmm5       \n\t"
	"addss  %%xmm0, %%xmm0       \n\t"
	"mulss  %%xmm5, %%xmm0       \n\t"
	"addss  %%xmm2, %%xmm0       \n\t"

	"shufps $147, %%xmm7, %%xmm7 \n\t"
	"movss  %%xmm0, %%xmm7       \n\t"
	"shufps $147, %%xmm7, %%xmm7 \n\t"

	// Split two vectors into odd lanes (shufps $221) and even lanes
	// (shufps $136).
	"movaps (%1, %5), %%xmm1       \n\t"
	"movaps (%1, %5, 2), %%xmm0       \n\t"
	"movaps %%xmm1, %%xmm2       \n\t"
	"movups 12(%1), %%xmm4       \n\t"
	"shufps $221, %%xmm0, %%xmm1 \n\t"
	"shufps $136, %%xmm0, %%xmm2 \n\t"
	"shufps $147, %%xmm1, %%xmm1 \n\t"
	"movss  %%xmm4, %%xmm1       \n\t"

	HORIZ_LIFTING_STEP8(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 0, %%xmm3)
	HORIZ_LIFTING_STEP8(%%xmm2, %%xmm3, %%xmm2, %%xmm1, 16, %%xmm1)
	HORIZ_LIFTING_STEP8(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 32, %%xmm3)
	HORIZ_LIFTING_STEP8_BEGIN(%%xmm2, %%xmm3, %%xmm1, %%xmm1, 48, %%xmm2, $229)

	// Scale the two result vectors with the rows at offsets 64 and 80.
	"movaps 64(%2), %%xmm6       \n\t"
	"mulps  %%xmm6, %%xmm2       \n\t"
	"movaps 80(%2), %%xmm5       \n\t"
	"mulps  %%xmm5, %%xmm1       \n\t"

	"movaps %%xmm1, (%1)       \n\t"
	"movaps %%xmm2, (%1, %5)       \n\t"

	"addl %5, %1                 \n\t"
	"addl %5, %1                 \n\t"

// Analysis: main loop - four vectors (16 floats) loaded and four stored per
// iteration; the pointer advances by 4*step.
	".balign 16                  \n\t"
	"1:                          \n\t"
	"movaps (%1, %5), %%xmm1       \n\t"
	"movaps (%1, %5, 2), %%xmm0       \n\t"
	"movaps (%1, %6), %%xmm4       \n\t"
	"movaps (%1, %5, 4), %%xmm6       \n\t"
	"movaps %%xmm1, %%xmm2       \n\t"
	"movaps %%xmm4, %%xmm5       \n\t"
	"shufps $221, %%xmm0, %%xmm1 \n\t"
	"shufps $221, %%xmm6, %%xmm4 \n\t"
	"shufps $136, %%xmm0, %%xmm2 \n\t"
	"shufps $136, %%xmm6, %%xmm5 \n\t"
	"shufps $147, %%xmm1, %%xmm1 \n\t"
	"shufps $147, %%xmm4, %%xmm4 \n\t"
	"movups 12(%1), %%xmm3       \n\t"
	"movss  %%xmm3, %%xmm1       \n\t"
	"movups 12(%1, %5, 2), %%xmm3       \n\t"
	"movss  %%xmm3, %%xmm4       \n\t"

	HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm1, %%xmm4, %%xmm2, %%xmm5, %%xmm3, %%xmm6, 0, %%xmm3, %%xmm6)
	HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm3, %%xmm6, %%xmm2, %%xmm5, %%xmm1, %%xmm4, 16, %%xmm1, %%xmm4)
	HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm1, %%xmm4, %%xmm2, %%xmm5, %%xmm3, %%xmm6, 32, %%xmm3, %%xmm6)
	HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm3, %%xmm6, %%xmm1, %%xmm4, %%xmm1, %%xmm4, 48, %%xmm2, %%xmm5)

	"movaps 64(%2), %%xmm0       \n\t"
	"movaps 80(%2), %%xmm6       \n\t"
	"mulps  %%xmm0, %%xmm2       \n\t"
	"mulps  %%xmm0, %%xmm5       \n\t"
	"mulps  %%xmm6, %%xmm1       \n\t"
	"mulps  %%xmm6, %%xmm4       \n\t"

	"movaps %%xmm1, (%1)       \n\t"
	"movaps %%xmm2, (%1, %5)       \n\t"
	"movaps %%xmm4, (%1, %5, 2)       \n\t"
	"movaps %%xmm5, (%1, %6)       \n\t"

	"addl %6, %1                \n\t"
	"addl %5, %1                \n\t"
	"subl  $1, %0                \n\t"
	"jnz 1b                      \n\t"

// Analysis: end - last vector; the second input is built by shuffling the
// same vector instead of loading a following one.

	"movaps 16-16(%1, %5), %%xmm1       \n\t"
	"movaps 16-16(%1, %5), %%xmm0       \n\t"
	"shufps  $6, %%xmm0, %%xmm0 \n\t" // was 27
	"movaps %%xmm1, %%xmm2       \n\t"
	"movups 12(%1), %%xmm4       \n\t"
	"shufps $221, %%xmm0, %%xmm1 \n\t"
	"shufps $136, %%xmm0, %%xmm2 \n\t"
	"shufps $147, %%xmm1, %%xmm1 \n\t"
	"movss  %%xmm4, %%xmm1       \n\t"

	HORIZ_LIFTING_STEP8_END(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 0, %%xmm3)
	HORIZ_LIFTING_STEP8_END(%%xmm2, %%xmm3, %%xmm2, %%xmm1, 16, %%xmm1)
	HORIZ_LIFTING_STEP8_END(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 32, %%xmm3)
	HORIZ_LIFTING_STEP8_END(%%xmm2, %%xmm3, %%xmm1, %%xmm1, 48, %%xmm2)

	"movaps 64(%2), %%xmm7       \n\t"
	"mulps  %%xmm7, %%xmm2       \n\t"
	"movaps 80(%2), %%xmm5       \n\t"
	"mulps  %%xmm5, %%xmm1       \n\t"

	"movaps %%xmm1,  0(%1)       \n\t"
	"movaps %%xmm2, 16-16(%1, %5)       \n\t"


	/* '&': %0 and %1 are modified while %2, %5 and %6 are still in use,
	   so they must not share a register with any other input. */
	: "=&r" (lof1), "=&r" (lof2)
	: "r" (coeffs), "1" (in), "0" (iter), "r" (step) , "r" (step*3)
	/* The asm writes through `in`, changes the flags (addl/subl) and
	   overwrites every SSE register it names. */
	: "memory", "cc",
	  "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
    );
}

/*
 * synthetize_inplace - horizontal synthesis (inverse) lifting pass, in place.
 *
 * in     : buffer to transform; accessed with aligned 16-byte loads/stores
 *          (movaps) plus one unaligned store at the very end, so it must be
 *          16-byte aligned.
 * coeffs : coefficient table; rows at byte offsets 96 and 112 scale the
 *          inputs, rows at 128, 144, 160, 176 are used by the lifting steps.
 *          Must be 16-byte aligned.
 * step   : byte distance between consecutive 4-float vectors (used directly
 *          as an x86 index register).
 * iter   : number of main-loop iterations.  The loop is bottom-tested
 *          (subl $1 / jnz), so iter must be >= 1.
 *
 * asm operands: %0 = loop counter (starts as iter), %1 = running pointer
 * (starts as in), %2 = coeffs, %5 = step, %6 = step*3.
 *
 * NOTE(review): the pointer is advanced with 32-bit `addl`, so this is
 * 32-bit x86 code as written.
 */
static inline void synthetize_inplace(float *in, float *coeffs, int step, int iter) {
    /* Dummy outputs: they only exist so %0/%1 can be modified by the asm. */
    int lof1, lof2;

    asm volatile(
// Synthesis: begin - first pair of vectors; the *_BEGIN steps also fill the
// carry register xmm7 one lane at a time.
	"movaps (%1), %%xmm1         \n\t"
	"movaps (%1, %5), %%xmm2     \n\t"

	"movaps 96(%2), %%xmm5       \n\t"
	"mulps  %%xmm5, %%xmm1       \n\t"
	"movaps 112(%2), %%xmm5      \n\t"
	"mulps  %%xmm5, %%xmm2       \n\t"

	HORIZ_LIFTING_STEP8_BEGIN(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 128, %%xmm3, $229)
	HORIZ_LIFTING_STEP8_BEGIN(%%xmm2, %%xmm3, %%xmm2, %%xmm1, 128+16, %%xmm1, $230)
	HORIZ_LIFTING_STEP8_BEGIN(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 128+32, %%xmm3, $228)
	HORIZ_LIFTING_STEP8_BEGIN(%%xmm2, %%xmm3, %%xmm2, %%xmm1, 128+48, %%xmm1, $228)

	"unpckhps %%xmm2, %%xmm1     \n\t"
	"movaps %%xmm1, (%1)       \n\t"

	"addl %5, %1                 \n\t"

// Synthesis: main loop - four vectors (16 floats) loaded and four stored per
// iteration; the pointer advances by 4*step.
	".balign 16                  \n\t"
	"1:                          \n\t"
	"movaps (%1, %5), %%xmm1       \n\t"
	"movaps (%1, %5, 2), %%xmm2       \n\t"
	"movaps (%1, %6), %%xmm4       \n\t"
	"movaps (%1, %5, 4), %%xmm5       \n\t"

	"movaps 96(%2), %%xmm0       \n\t"
	"mulps  %%xmm0, %%xmm1       \n\t"
	"mulps  %%xmm0, %%xmm4       \n\t"
	"movaps 112(%2), %%xmm0      \n\t"
	"mulps  %%xmm0, %%xmm2       \n\t"
	"mulps  %%xmm0, %%xmm5       \n\t"

	HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm1, %%xmm4, %%xmm2, %%xmm5, %%xmm3, %%xmm6, 128, %%xmm3, %%xmm6)
	HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm3, %%xmm6, %%xmm2, %%xmm5, %%xmm1, %%xmm4, 128+16, %%xmm1, %%xmm4)
	HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm1, %%xmm4, %%xmm2, %%xmm5, %%xmm3, %%xmm6, 128+32, %%xmm3, %%xmm6)
	HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm3, %%xmm6, %%xmm2, %%xmm5, %%xmm1, %%xmm4, 128+48, %%xmm1, %%xmm4)

	// Interleave the two result streams back together (unpcklps/unpckhps).
	"movaps %%xmm1, %%xmm3       \n\t"
	"movaps %%xmm4, %%xmm6       \n\t"

	"unpckhps %%xmm2, %%xmm3     \n\t"
	"unpcklps %%xmm2, %%xmm1     \n\t"
	"unpckhps %%xmm5, %%xmm6     \n\t"
	"unpcklps %%xmm5, %%xmm4     \n\t"
	"movaps %%xmm1, (%1)       \n\t"
	"movaps %%xmm3, (%1, %5)       \n\t"
	"movaps %%xmm4, (%1, %5, 2)       \n\t"
	"movaps %%xmm6, (%1, %6)       \n\t"

	"addl %5, %1                 \n\t"
	"addl %6, %1                 \n\t"
	"subl  $1, %0                \n\t"
	"jnz 1b                      \n\t"

// Synthesis: last pair of vectors
	"movaps (%1, %5), %%xmm1       \n\t"
	"movaps (%1, %5, 2), %%xmm2       \n\t"

	"movaps 96(%2), %%xmm5       \n\t"
	"mulps  %%xmm5, %%xmm1       \n\t"
	"movaps 112(%2), %%xmm5      \n\t"
	"mulps  %%xmm5, %%xmm2       \n\t"

	HORIZ_LIFTING_STEP8(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 128, %%xmm3)
	HORIZ_LIFTING_STEP8(%%xmm2, %%xmm3, %%xmm2, %%xmm1, 128+16, %%xmm1)
	HORIZ_LIFTING_STEP8(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 128+32, %%xmm3)
	HORIZ_LIFTING_STEP8(%%xmm2, %%xmm3, %%xmm2, %%xmm1, 128+48, %%xmm1)

	"movaps %%xmm1, %%xmm3       \n\t"

	"unpckhps %%xmm2, %%xmm3     \n\t"
	"unpcklps %%xmm2, %%xmm1     \n\t"
	"movaps %%xmm1, (%1)       \n\t"
	"movaps %%xmm3, (%1, %5)       \n\t"

// End - finish the values still carried in xmm7 with scalar steps, repack
// them into xmm7 and store that vector (unaligned) at in + 2*step.
	"shufps $147, %%xmm7, %%xmm7 \n\t"
	"movss %%xmm7, %%xmm1        \n\t"

	"shufps $147, %%xmm7, %%xmm7 \n\t"
	"movss %%xmm7, %%xmm2        \n\t"
	"movss %%xmm7, %%xmm3        \n\t"
	"movaps 128+16(%2), %%xmm5   \n\t"
	"addss %%xmm2, %%xmm2        \n\t"
	"mulss %%xmm5, %%xmm2        \n\t"
	"addss %%xmm1, %%xmm2        \n\t"
	"shufps $147, %%xmm7, %%xmm7 \n\t"
	"movss %%xmm7, %%xmm1        \n\t"
	"movss %%xmm7, %%xmm4        \n\t"
	"movaps 128+32(%2), %%xmm5   \n\t"
	"addss %%xmm2, %%xmm4        \n\t"
	"mulss %%xmm5, %%xmm4        \n\t"
	"addss %%xmm3, %%xmm4        \n\t"
	"shufps $147, %%xmm7, %%xmm7 \n\t"
	"movss %%xmm7, %%xmm0        \n\t"

	"addss %%xmm4, %%xmm0        \n\t"
	"movaps 128+48(%2), %%xmm5   \n\t"
	"mulss %%xmm5, %%xmm0        \n\t"
	"addss %%xmm1, %%xmm0        \n\t"

	"movss %%xmm4, %%xmm6        \n\t"
	"addss %%xmm6, %%xmm6        \n\t"
	"mulss %%xmm5, %%xmm6        \n\t"
	"addss %%xmm6, %%xmm2        \n\t"

	"shufps $147, %%xmm7, %%xmm7 \n\t"
	"movss %%xmm2, %%xmm7        \n\t"
	"shufps $147, %%xmm7, %%xmm7 \n\t"
	"movss %%xmm4, %%xmm7        \n\t"
	"shufps $147, %%xmm7, %%xmm7 \n\t"
	"movss %%xmm0, %%xmm7        \n\t"
	"shufps $147, %%xmm7, %%xmm7 \n\t"

	"movups %%xmm7, (%1, %5, 2)       \n\t"


	/* '&': %0 and %1 are modified while %2, %5 and %6 are still in use,
	   so they must not share a register with any other input. */
	: "=&r" (lof1), "=&r" (lof2)
	: "r" (coeffs), "1" (in), "0" (iter), "r" (step) , "r" (step*3)
	/* The asm writes through `in`, changes the flags (addl/subl) and
	   overwrites every SSE register it names. */
	: "memory", "cc",
	  "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
    );
}

/*
 * analyze_inplace_vertical - vertical analysis (forward) lifting pass on a
 * column of 4-float vectors, in place.
 *
 * in     : address of the first vector of the column; every access is an
 *          aligned 16-byte load/store, so in and step must keep 16-byte
 *          alignment.
 * coeffs : coefficient table; rows at byte offsets 0, 16, 32, 48 are used by
 *          the lifting steps and rows at 64 and 80 scale the two results.
 * step   : byte distance between vertically adjacent vectors (used directly
 *          as an x86 index register).
 * iter   : main-loop counter.  The loop subtracts 2 per iteration and exits
 *          on zero (subl $2 / jnz), so iter must be a positive even number.
 *
 * asm operands: %0 = loop counter (starts as iter), %1 = running pointer
 * (starts as in), %2 = coeffs, %5 = step, %6 = step*3.
 *
 * NOTE(review): the pointer is advanced with 32-bit `addl`, so this is
 * 32-bit x86 code as written.
 */
static inline void analyze_inplace_vertical(float *in, float *coeffs, int step, int iter) {
    /* Dummy outputs: they only exist so %0/%1 can be modified by the asm. */
    int lof1, lof2;

    asm volatile(
// Start - compute the first two output rows directly from rows 0..4 and set
// up xmm0-xmm3 as the running state expected by VERT_MAIN.
	"movaps (%2), %%xmm7                  \n\t"

	"movaps (%1, %5, 4), %%xmm0           \n\t"
	"movaps  (%1, %5, 2), %%xmm1          \n\t"
	"addps  %%xmm0, %%xmm1                \n\t"
	"mulps  %%xmm7, %%xmm1                \n\t"
	"addps  (%1, %6), %%xmm1              \n\t"

	"movaps (%1), %%xmm4                  \n\t"
	"addps  (%1, %5, 2), %%xmm4           \n\t"
	"mulps  %%xmm7, %%xmm4                \n\t"
	"addps  (%1, %5), %%xmm4              \n\t"

	"movaps 16(%2), %%xmm7                \n\t"

	"movaps %%xmm1, %%xmm2                \n\t"
	"addps  %%xmm4, %%xmm2                \n\t"
	"mulps  %%xmm7, %%xmm2                \n\t"
	"addps  (%1, %5, 2), %%xmm2           \n\t"

	"movaps %%xmm4, %%xmm5                \n\t"
	"addps  %%xmm4, %%xmm5                \n\t"
	"mulps  %%xmm7, %%xmm5                \n\t"
	"addps  (%1), %%xmm5                  \n\t"

	"movaps 32(%2), %%xmm7                \n\t"

	"movaps %%xmm5, %%xmm3                \n\t"
	"addps  %%xmm2, %%xmm3                \n\t"
	"mulps  %%xmm7, %%xmm3                \n\t"
	"addps  %%xmm4, %%xmm3                \n\t"

	"movaps 48(%2), %%xmm7                \n\t"

	"movaps %%xmm3, %%xmm6                \n\t"
	"addps  %%xmm3, %%xmm6                \n\t"
	"mulps  %%xmm7, %%xmm6                \n\t"
	"addps  %%xmm5, %%xmm6                \n\t"

	// Scale with the rows at offsets 64 and 80 and store rows 0 and 1.
	"movaps 64(%2), %%xmm4                \n\t"
	"movaps 80(%2), %%xmm5                \n\t"
	"mulps  %%xmm3, %%xmm4                \n\t"
	"mulps  %%xmm5, %%xmm6                \n\t"

	"movaps %%xmm6, (%1)                  \n\t"
	"movaps %%xmm4, (%1, %5)              \n\t"

	"addl %5, %1                          \n\t"
	"addl %5, %1                          \n\t"

// Main loop - reads the rows at +3*step and +4*step, writes two rows, and
// advances the pointer by 2*step.
	".balign 16                           \n\t"
	"1:                                   \n\t"

	VERT_MAIN((%1, %5, 4), (%1, %6), 0)

	"movaps 64(%2), %%xmm7                \n\t"
	"movaps 80(%2), %%xmm6                \n\t"

	"mulps  %%xmm7, %%xmm4                \n\t"
	"mulps  %%xmm6, %%xmm5                \n\t"

	"movaps %%xmm5, (%1)                  \n\t"
	"movaps %%xmm4, (%1, %5)              \n\t"

	"addl %5, %1                          \n\t"
	"addl %5, %1                          \n\t"
	"subl  $2, %0                         \n\t"
	"jnz 1b                               \n\t"

// End - two more VERT_MAIN rounds; instead of loading a row at +4*step, the
// first reuses xmm0 and the second reuses the rows read back from (%1) and
// (%1, %5) just before they are overwritten.
	VERT_MAIN(%%xmm0, (%1, %6), 0)

	"movaps 64(%2), %%xmm7                \n\t"
	"movaps 80(%2), %%xmm6                \n\t"

	"mulps  %%xmm7, %%xmm4                \n\t"
	"mulps  %%xmm6, %%xmm5                \n\t"

	"movaps (%1, %5), %%xmm6              \n\t"
	"movaps %%xmm4, (%1, %5)              \n\t"
	"movaps (%1), %%xmm4                  \n\t"
	"movaps %%xmm5, (%1)                  \n\t"

	VERT_MAIN(%%xmm4, %%xmm6, 0)

	"movaps 64(%2), %%xmm7                \n\t"
	"movaps 80(%2), %%xmm6                \n\t"

	"mulps  %%xmm7, %%xmm4                \n\t"
	"mulps  %%xmm6, %%xmm5                \n\t"

	"movaps %%xmm5, (%1, %5, 2)           \n\t"
	"movaps %%xmm4, (%1, %6)              \n\t"

	/* '&': %0 and %1 are modified while %2, %5 and %6 are still in use,
	   so they must not share a register with any other input. */
	: "=&r" (lof1), "=&r" (lof2)
	: "r" (coeffs), "1" (in), "0" (iter), "r" (step) , "r" (step*3)
	/* The asm writes through `in`, changes the flags (addl/subl) and
	   overwrites every SSE register it names. */
	: "memory", "cc",
	  "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
    );
}

/*
 * synthetize_inplace_vertical - vertical synthesis (inverse) lifting pass on
 * a column of 4-float vectors, in place.
 *
 * in     : address of the first vector of the column; every access is an
 *          aligned 16-byte load/store, so in and step must keep 16-byte
 *          alignment.
 * coeffs : coefficient table; rows at byte offsets 96 and 112 scale the
 *          inputs, rows at 128, 144, 160, 176 are used by the lifting steps.
 * step   : byte distance between vertically adjacent vectors (used directly
 *          as an x86 index register).
 * iter   : main-loop counter.  The loop subtracts 2 per iteration and exits
 *          on zero (subl $2 / jnz), so iter must be a positive even number.
 *
 * asm operands: %0 = loop counter (starts as iter), %1 = running pointer
 * (starts as in), %2 = coeffs, %5 = step, %6 = step*3.
 *
 * NOTE(review): the pointer is advanced with 32-bit `addl`, so this is
 * 32-bit x86 code as written.
 */
static inline void synthetize_inplace_vertical(float *in, float *coeffs, int step, int iter) {
    /* Dummy outputs: they only exist so %0/%1 can be modified by the asm. */
    int lof1, lof2;

    asm volatile(
// Start - scale rows 0..3, compute the first output row and set up
// xmm0-xmm3 as the running state expected by VERT_MAIN.
	"movaps 128(%2), %%xmm7               \n\t"
	"movaps 112(%2), %%xmm5               \n\t"
	"movaps 96(%2), %%xmm4                \n\t"
	"movaps %%xmm5, %%xmm0                \n\t"
	"movaps %%xmm4, %%xmm6                \n\t"
	"mulps  (%1, %5), %%xmm5              \n\t"
	"mulps  (%1, %6), %%xmm0              \n\t"
	"mulps  (%1, %5, 2), %%xmm4           \n\t"
	"mulps  (%1), %%xmm6                  \n\t"

	"movaps  %%xmm5, %%xmm1               \n\t"
	"addps  %%xmm0, %%xmm1                \n\t"
	"mulps  %%xmm7, %%xmm1                \n\t"
	"addps  %%xmm4, %%xmm1                \n\t"

	"movaps  %%xmm5, %%xmm4               \n\t"
	"addps  %%xmm4, %%xmm4                \n\t"
	"mulps  %%xmm7, %%xmm4                \n\t"
	"addps  %%xmm6, %%xmm4                \n\t"

	"movaps 128+16(%2), %%xmm7            \n\t"

	"movaps %%xmm1, %%xmm2                \n\t"
	"addps  %%xmm4, %%xmm2                \n\t"
	"mulps  %%xmm7, %%xmm2                \n\t"
	"addps  %%xmm5, %%xmm2                \n\t"

	"movaps 128+32(%2), %%xmm7            \n\t"

	"movaps %%xmm2, %%xmm3                \n\t"
	"addps  %%xmm3, %%xmm3                \n\t"
	"mulps  %%xmm7, %%xmm3                \n\t"
	"addps  %%xmm4, %%xmm3                \n\t"

	"movaps %%xmm3, (%1)                  \n\t"

	"addl %5, %1                          \n\t"

// Main loop - scales the rows at +3*step and +4*step, runs VERT_MAIN on
// them, writes two rows, and advances the pointer by 2*step.
	".balign 16                           \n\t"
	"1:                                   \n\t"


	"movaps 96(%2), %%xmm6                \n\t"
	"movaps 112(%2), %%xmm4               \n\t"

	"mulps (%1, %5, 4), %%xmm4            \n\t"
	"mulps (%1, %6), %%xmm6               \n\t"

	VERT_MAIN(%%xmm4, %%xmm6, 128)

	"movaps %%xmm5, (%1)                  \n\t"
	"movaps %%xmm4, (%1, %5)              \n\t"

	"addl %5, %1                          \n\t"
	"addl %5, %1                          \n\t"
	"subl  $2, %0                         \n\t"
	"jnz 1b                               \n\t"

// End - finish the remaining three output rows from the state left in
// xmm0-xmm3, without reading further input rows.
	"movaps %%xmm1, %%xmm4                \n\t"
	"movaps 128+16(%2), %%xmm7            \n\t"
	"addps  %%xmm1, %%xmm4                \n\t"
	"mulps  %%xmm7, %%xmm4                \n\t"
	"addps  %%xmm0, %%xmm4                \n\t"

	"movaps %%xmm2, %%xmm5                \n\t"
	"movaps 128+32(%2), %%xmm7            \n\t"
	"addps  %%xmm4, %%xmm5                \n\t"
	"mulps  %%xmm7, %%xmm5                \n\t"
	"addps  %%xmm1, %%xmm5                \n\t"

	"movaps %%xmm3, %%xmm6                \n\t"
	"movaps 128+48(%2), %%xmm7            \n\t"
	"addps  %%xmm5, %%xmm6                \n\t"
	"mulps  %%xmm7, %%xmm6                \n\t"
	"addps  %%xmm2, %%xmm6                \n\t"

	"movaps %%xmm5, %%xmm0                \n\t"
	"addps  %%xmm5, %%xmm0                \n\t"
	"mulps  %%xmm7, %%xmm0                \n\t"
	"addps  %%xmm4, %%xmm0                \n\t"

	"movaps %%xmm6, (%1)                  \n\t"
	"movaps %%xmm5, (%1, %5)              \n\t"
	"movaps %%xmm0, (%1, %5, 2)           \n\t"

	/* '&': %0 and %1 are modified while %2, %5 and %6 are still in use,
	   so they must not share a register with any other input. */
	: "=&r" (lof1), "=&r" (lof2)
	: "r" (coeffs), "1" (in), "0" (iter), "r" (step) , "r" (step*3)
	/* The asm writes through `in`, changes the flags (addl/subl) and
	   overwrites every SSE register it names. */
	: "memory", "cc",
	  "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
    );
}

/*
 * dwt1d - forward 1-D transform of one line, in place.
 *
 * Runs `it` horizontal analysis passes over `pict`, with h = 1, 2, 4, ...,
 * 2^(it-1).  Each pass calls analyze_inplace() with a vector stride of
 * 16*h bytes and (width/16/h) - 1 main-loop iterations.
 *
 * pict   : line to transform (passed straight to analyze_inplace()).
 * width  : number of floats in the line.
 * height : unused.
 * it     : number of passes; values < 1 do nothing.
 *
 * Returns 0.  (The original fell off the end of a non-void function.)
 */
int dwt1d(float *pict, int width, int height, int it) {
    int h;

    if (it < 1) {
        return 0;   /* also keeps 1 << it defined for negative it */
    }

    for (h = 1; h < (1 << it); h <<= 1) {
        analyze_inplace(pict, coeffs_cdf97_sse, 16*h, (width/16/h)-1);
    }

    return 0;
}

/*
 * idwt1d - inverse 1-D transform of one line, in place.
 *
 * Runs `it` horizontal synthesis passes over `pict`, with h = 2^(it-1), ...,
 * 4, 2, 1.  Each pass calls synthetize_inplace() with a vector stride of
 * 16*h bytes and (width/h/16) - 1 main-loop iterations.
 *
 * pict   : line to transform (passed straight to synthetize_inplace()).
 * width  : number of floats in the line.
 * height : unused.
 * it     : number of passes; values < 1 do nothing.
 *
 * Returns 0.  (The original fell off the end of a non-void function.)
 */
int idwt1d(float *pict, int width, int height, int it) {
    int h;

    if (it < 1) {
        return 0;   /* 1 << (it-1) would be a negative shift */
    }

    for (h = (1 << (it-1)); h > 0; h >>= 1) {
        synthetize_inplace(pict, coeffs_cdf97_sse, 16*h, (width/h/16)-1);
    }

    return 0;
}

/*
 * dwt2d - forward 2-D transform of a width x height float image, in place.
 *
 * For h = 1, 2, 4, ..., 2^(it-1):
 *   1. horizontal pass: analyze_inplace() on every h-th row
 *      (row pointer advances by width*h floats), stride 16*h bytes,
 *      (width/16/h) - 1 main-loop iterations;
 *   2. vertical pass: analyze_inplace_vertical() on column groups starting
 *      every 4*h floats, row stride width*4*h bytes, loop counter
 *      (height/h) - 6.
 *
 * pict   : image, `width` floats per row (passed to the SSE kernels).
 * width  : floats per row.
 * height : number of rows.
 * it     : number of levels; values < 1 do nothing.
 *
 * Returns 0.  (The original fell off the end of a non-void function.)
 */
int dwt2d(float *pict, int width, int height, int it) {
    int i, h;
    float *p;

    if (it < 1) {
        return 0;   /* also keeps 1 << it defined for negative it */
    }

    for (h = 1; h < (1 << it); h <<= 1) {
        /* rows */
        for (p = pict, i = 0; i < height; i += h, p += width*h) {
            analyze_inplace(p, coeffs_cdf97_sse, 16*h, (width/16/h)-1);
        }

        /* columns */
        for (p = pict, i = 0; i < width; i += 4*h, p += 4*h) {
            analyze_inplace_vertical(p, coeffs_cdf97_sse, width*4*h, (height/h)-6);
        }
    }

    return 0;
}

/*
 * idwt2d - inverse 2-D transform of a width x height float image, in place.
 *
 * For h = 2^(it-1), ..., 4, 2, 1:
 *   1. vertical pass: synthetize_inplace_vertical() on column groups
 *      starting every 4*h floats, row stride width*4*h bytes, loop counter
 *      (height/h) - 4;
 *   2. horizontal pass: synthetize_inplace() on every h-th row
 *      (row pointer advances by width*h floats), stride 16*h bytes,
 *      (width/h/16) - 1 main-loop iterations.
 *
 * pict   : image, `width` floats per row (passed to the SSE kernels).
 * width  : floats per row.
 * height : number of rows.
 * it     : number of levels; values < 1 do nothing.
 *
 * Returns 0.  (The original fell off the end of a non-void function.)
 */
int idwt2d(float *pict, int width, int height, int it) {
    int i, h;
    float *p;

    if (it < 1) {
        return 0;   /* 1 << (it-1) would be a negative shift */
    }

    for (h = (1 << (it-1)); h > 0; h >>= 1) {
        /* columns */
        for (p = pict, i = 0; i < width; i += 4*h, p += 4*h) {
            synthetize_inplace_vertical(p, coeffs_cdf97_sse, width*4*h, (height/h)-4);
        }

        /* rows */
        for (p = pict, i = 0; i < height; i += h, p += width*h) {
            synthetize_inplace(p, coeffs_cdf97_sse, 16*h, (width/h/16)-1);
        }
    }

    return 0;
}