Openjpeg improvements
The Reason
OpenJPEG sucks. It does not use fast 9/7 or 5/3 lifting DWT algorithms. This page is all the information we have collected to help with improving openjpeg.
Says bushing:
Analysis on OSX/PPC showed: - 27.1% dwt_decode_1_real (OpenJPEG) - 22.9% tcd_decode_tile (OpenJPEG) - 5.6% t1_dec_sigpass_step (OpenJPEG) - 5.4% t1_decode_cblks (OpenJPEG) - 5.4% t1_dec_refpass_step (OpenJPEG) - 4.7% mqc_decode (OpenJPEG) tcd_decode_tile is really just a call to dwt_decode_real, so half of the time is spent in dwt_decode_real and dwt_decode_1_real. OpenJpeg: 1 min 25 sec GeoJasPer: 1 min 34 sec Kakadu: 19 sec (!)
Some Goodies
From the author of the below code: Balatoni Denes, dbalatoni at programozo.hu
I see that openjpeg uses a BSD licence. Yes you can use that code under BSD licence, thanks for asking too :) . I might not have time to do the integration myself, but of course I can answer questions if needed.
ps: this code can not do dwt on arbitrary size vectors. if you have k levels of dwt, than you can only have IIRC vectorsizes of multiplies of 2^k. for arbitrary size vectors, more coding is needed.
static float coeffs_cdf97_sse[12*4] __attribute__ ((__aligned__ (32))) = { -1.586134342,-1.586134342,-1.586134342,-1.586134342, -0.052980118,-0.052980118,-0.052980118,-0.052980118, 0.882911075, 0.882911075, 0.882911075, 0.882911075, 0.443506852, 0.443506852, 0.443506852, 0.443506852, -1.230174105,-1.230174105,-1.230174105,-1.230174105, 0.812893066, 0.812893066, 0.812893066, 0.812893066, 1.230174105, 1.230174105, 1.230174105, 1.230174105, -0.812893066,-0.812893066,-0.812893066,-0.812893066, -0.443506852,-0.443506852,-0.443506852,-0.443506852, -0.882911075,-0.882911075,-0.882911075,-0.882911075, 0.052980118, 0.052980118, 0.052980118, 0.052980118, 1.586134342, 1.586134342, 1.586134342, 1.586134342 }; #define HORIZ_LIFTING_STEP8(i1, i2, o1, o2, ofs, o2b) \ "movaps " #ofs "(%2), %%xmm5 \n\t" \ "movaps " #i1 ", " #o2 " \n\t" \ "shufps $147, %%xmm7, %%xmm7 \n\t" \ "shufps $147, " #o2 ", " #o2 " \n\t" \ "movss %%xmm7, %%xmm0 \n\t" \ "movss " #o2 ", %%xmm7 \n\t" \ "movss %%xmm0, " #o2 " \n\t" \ "addps " #o2b ", " #o1 " \n\t" \ "mulps %%xmm5, " #o1 " \n\t" \ "addps " #i2 ", " #o1 " \n\t" #define HORIZ_LIFTING_STEP8_END(i1, i2, o1, o2, ofs, o2b) \ "movaps " #ofs "(%2), %%xmm5 \n\t" \ "movaps " #i1 ", " #o2 " \n\t" \ "shufps $147, %%xmm7, %%xmm7 \n\t" \ "shufps $147, " #o2 ", " #o2 " \n\t" \ "movss %%xmm7, " #o2 " \n\t" \ "addps " #o2b ", " #o1 " \n\t" \ "mulps %%xmm5, " #o1 " \n\t" \ "addps " #i2 ", " #o1 " \n\t" #define HORIZ_LIFTING_STEP8_BEGIN(i1, i2, o1, o2, ofs, o2b, shu) \ "movaps " #ofs "(%2), %%xmm5 \n\t" \ "movaps " #i1 ", " #o2 " \n\t" \ "shufps $147, %%xmm7, %%xmm7 \n\t" \ "shufps $147, " #o2 ", " #o2 " \n\t" \ "movss " #o2 ", %%xmm7 \n\t" \ "shufps " #shu ", " #o2 ", " #o2 " \n\t" \ "addps " #o2b ", " #o1 " \n\t" \ "mulps %%xmm5, " #o1 " \n\t" \ "addps " #i2 ", " #o1 " \n\t" #define HORIZ_LIFTING_STEP16(i1, i2, i3, i4, o1, o2, o3, o4, ofs, o3b, o4b) \ "movaps " #i2 ", " #o4 " \n\t" \ "movaps " #i1 ", " #o3 " \n\t" \ "shufps $147, %%xmm7, %%xmm7 \n\t" \ "shufps $147, " #o4 ", " #o4 " \n\t" \ "shufps $147, " #o3 ", " #o3 " \n\t" \ "movss %%xmm7, %%xmm0 \n\t" \ "movss " #o4 ", %%xmm7 \n\t" \ "movss " #o3 ", " #o4 " \n\t" \ "movss %%xmm0, " #o3 " \n\t" \ "movaps " #ofs "(%2), %%xmm0 \n\t" \ "addps " #o3b ", " #o1 " \n\t" \ "addps " #o4b ", " #o2 " \n\t" \ "mulps %%xmm0, " #o1 " \n\t" \ "mulps %%xmm0, " #o2 " \n\t" \ "addps " #i3 ", " #o1 " \n\t" \ "addps " #i4 ", " #o2 " \n\t" /* Calculate 4 lifting steps to get 2 samples vertical, xmm0-xmm3 : buffer */ #define VERT_MAIN(in1, in2, ofs) \ "movaps " #ofs "(%2), %%xmm7 \n\t" \ \ "movaps " #in1 ", %%xmm4 \n\t" \ "movaps %%xmm0, %%xmm5 \n\t" \ "addps %%xmm4, %%xmm5 \n\t" \ "mulps %%xmm7, %%xmm5 \n\t" \ "addps " #in2 ", %%xmm5 \n\t" \ \ "movaps 16+" #ofs "(%2), %%xmm7 \n\t" \ \ "movaps %%xmm5, %%xmm6 \n\t" \ "addps %%xmm1, %%xmm6 \n\t" \ "mulps %%xmm7, %%xmm6 \n\t" \ "addps %%xmm0, %%xmm6 \n\t" \ \ "movaps 32+" #ofs "(%2), %%xmm7 \n\t" \ "movaps %%xmm4, %%xmm0 \n\t" \ \ "movaps %%xmm6, %%xmm4 \n\t" \ "addps %%xmm2, %%xmm4 \n\t" \ "mulps %%xmm7, %%xmm4 \n\t" \ "addps %%xmm1, %%xmm4 \n\t" \ \ "movaps 48+" #ofs "(%2), %%xmm7 \n\t" \ "movaps %%xmm5, %%xmm1 \n\t" \ \ "movaps %%xmm4, %%xmm5 \n\t" \ "addps %%xmm3, %%xmm5 \n\t" \ "mulps %%xmm7, %%xmm5 \n\t" \ "addps %%xmm2, %%xmm5 \n\t" \ \ "movaps %%xmm4, %%xmm3 \n\t" \ "movaps %%xmm6, %%xmm2 \n\t" static inline void analyze_inplace(float *in, float *coeffs, int step, int iter) { int lof1, lof2; asm volatile( // Anal_begin "movaps (%1), %%xmm7 \n\t" "movaps (%1), %%xmm1 \n\t" "movaps (%1), %%xmm0 \n\t" "movaps (%1), %%xmm2 \n\t" "shufps $1, %%xmm1, %%xmm1 \n\t" "shufps $2, %%xmm7, %%xmm7 \n\t" "movaps (%2), %%xmm5 \n\t" "addss %%xmm7, %%xmm0 \n\t" "mulss %%xmm5, %%xmm0 \n\t" "addss %%xmm1, %%xmm0 \n\t" "shufps $147, %%xmm7, %%xmm7 \n\t" "movss %%xmm0, %%xmm7 \n\t" "movaps 16(%2), %%xmm5 \n\t" "addss %%xmm0, %%xmm0 \n\t" "mulss %%xmm5, %%xmm0 \n\t" "addss %%xmm2, %%xmm0 \n\t" "shufps $147, %%xmm7, %%xmm7 \n\t" "movss %%xmm0, %%xmm7 \n\t" "shufps $147, %%xmm7, %%xmm7 \n\t" "movaps (%1, %5), %%xmm1 \n\t" "movaps (%1, %5, 2), %%xmm0 \n\t" "movaps %%xmm1, %%xmm2 \n\t" "movups 12(%1), %%xmm4 \n\t" "shufps $221, %%xmm0, %%xmm1 \n\t" "shufps $136, %%xmm0, %%xmm2 \n\t" "shufps $147, %%xmm1, %%xmm1 \n\t" "movss %%xmm4, %%xmm1 \n\t" HORIZ_LIFTING_STEP8(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 0, %%xmm3) HORIZ_LIFTING_STEP8(%%xmm2, %%xmm3, %%xmm2, %%xmm1, 16, %%xmm1) HORIZ_LIFTING_STEP8(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 32, %%xmm3) HORIZ_LIFTING_STEP8_BEGIN(%%xmm2, %%xmm3, %%xmm1, %%xmm1, 48, %%xmm2, $229) "movaps 64(%2), %%xmm6 \n\t" "mulps %%xmm6, %%xmm2 \n\t" "movaps 80(%2), %%xmm5 \n\t" "mulps %%xmm5, %%xmm1 \n\t" "movaps %%xmm1, (%1) \n\t" "movaps %%xmm2, (%1, %5) \n\t" "addl %5, %1 \n\t" "addl %5, %1 \n\t" //Anal sse16 ".balign 16 \n\t" "1: \n\t" "movaps (%1, %5), %%xmm1 \n\t" "movaps (%1, %5, 2), %%xmm0 \n\t" "movaps (%1, %6), %%xmm4 \n\t" "movaps (%1, %5, 4), %%xmm6 \n\t" "movaps %%xmm1, %%xmm2 \n\t" "movaps %%xmm4, %%xmm5 \n\t" "shufps $221, %%xmm0, %%xmm1 \n\t" "shufps $221, %%xmm6, %%xmm4 \n\t" "shufps $136, %%xmm0, %%xmm2 \n\t" "shufps $136, %%xmm6, %%xmm5 \n\t" "shufps $147, %%xmm1, %%xmm1 \n\t" "shufps $147, %%xmm4, %%xmm4 \n\t" "movups 12(%1), %%xmm3 \n\t" "movss %%xmm3, %%xmm1 \n\t" "movups 12(%1, %5, 2), %%xmm3 \n\t" "movss %%xmm3, %%xmm4 \n\t" HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm1, %%xmm4, %%xmm2, %%xmm5, %%xmm3, %%xmm6, 0, %%xmm3, %%xmm6) HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm3, %%xmm6, %%xmm2, %%xmm5, %%xmm1, %%xmm4, 16, %%xmm1, %%xmm4) HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm1, %%xmm4, %%xmm2, %%xmm5, %%xmm3, %%xmm6, 32, %%xmm3, %%xmm6) HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm3, %%xmm6, %%xmm1, %%xmm4, %%xmm1, %%xmm4, 48, %%xmm2, %%xmm5) "movaps 64(%2), %%xmm0 \n\t" "movaps 80(%2), %%xmm6 \n\t" "mulps %%xmm0, %%xmm2 \n\t" "mulps %%xmm0, %%xmm5 \n\t" "mulps %%xmm6, %%xmm1 \n\t" "mulps %%xmm6, %%xmm4 \n\t" "movaps %%xmm1, (%1) \n\t" "movaps %%xmm2, (%1, %5) \n\t" "movaps %%xmm4, (%1, %5, 2) \n\t" "movaps %%xmm5, (%1, %6) \n\t" "addl %6, %1 \n\t" "addl %5, %1 \n\t" "subl $1, %0 \n\t" "jnz 1b \n\t" //Anal_end "movaps 16-16(%1, %5), %%xmm1 \n\t" "movaps 16-16(%1, %5), %%xmm0 \n\t" "shufps $6, %%xmm0, %%xmm0 \n\t" //27 volt "movaps %%xmm1, %%xmm2 \n\t" "movups 12(%1), %%xmm4 \n\t" "shufps $221, %%xmm0, %%xmm1 \n\t" "shufps $136, %%xmm0, %%xmm2 \n\t" "shufps $147, %%xmm1, %%xmm1 \n\t" "movss %%xmm4, %%xmm1 \n\t" HORIZ_LIFTING_STEP8_END(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 0, %%xmm3) HORIZ_LIFTING_STEP8_END(%%xmm2, %%xmm3, %%xmm2, %%xmm1, 16, %%xmm1) HORIZ_LIFTING_STEP8_END(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 32, %%xmm3) HORIZ_LIFTING_STEP8_END(%%xmm2, %%xmm3, %%xmm1, %%xmm1, 48, %%xmm2) "movaps 64(%2), %%xmm7 \n\t" "mulps %%xmm7, %%xmm2 \n\t" "movaps 80(%2), %%xmm5 \n\t" "mulps %%xmm5, %%xmm1 \n\t" "movaps %%xmm1, 0(%1) \n\t" "movaps %%xmm2, 16-16(%1, %5) \n\t" : "=r" (lof1), "=r" (lof2) : "r" (coeffs), "1" (in), "0" (iter), "r" (step) , "r" (step*3) ); } static inline void synthetize_inplace(float *in, float *coeffs, int step, int iter) { int lof1, lof2; asm volatile( //Synth_begin "movaps (%1), %%xmm1 \n\t" "movaps (%1, %5), %%xmm2 \n\t" "movaps 96(%2), %%xmm5 \n\t" "mulps %%xmm5, %%xmm1 \n\t" "movaps 112(%2), %%xmm5 \n\t" "mulps %%xmm5, %%xmm2 \n\t" HORIZ_LIFTING_STEP8_BEGIN(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 128, %%xmm3, $229) HORIZ_LIFTING_STEP8_BEGIN(%%xmm2, %%xmm3, %%xmm2, %%xmm1, 128+16, %%xmm1, $230) HORIZ_LIFTING_STEP8_BEGIN(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 128+32, %%xmm3, $228) HORIZ_LIFTING_STEP8_BEGIN(%%xmm2, %%xmm3, %%xmm2, %%xmm1, 128+48, %%xmm1, $228) "unpckhps %%xmm2, %%xmm1 \n\t" "movaps %%xmm1, (%1) \n\t" "addl %5, %1 \n\t" //Synth_sse16 ".balign 16 \n\t" "1: \n\t" "movaps (%1, %5), %%xmm1 \n\t" "movaps (%1, %5, 2), %%xmm2 \n\t" "movaps (%1, %6), %%xmm4 \n\t" "movaps (%1, %5, 4), %%xmm5 \n\t" "movaps 96(%2), %%xmm0 \n\t" "mulps %%xmm0, %%xmm1 \n\t" "mulps %%xmm0, %%xmm4 \n\t" "movaps 112(%2), %%xmm0 \n\t" "mulps %%xmm0, %%xmm2 \n\t" "mulps %%xmm0, %%xmm5 \n\t" HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm1, %%xmm4, %%xmm2, %%xmm5, %%xmm3, %%xmm6, 128, %%xmm3, %%xmm6) HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm3, %%xmm6, %%xmm2, %%xmm5, %%xmm1, %%xmm4, 128+16, %%xmm1, %%xmm4) HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm1, %%xmm4, %%xmm2, %%xmm5, %%xmm3, %%xmm6, 128+32, %%xmm3, %%xmm6) HORIZ_LIFTING_STEP16(%%xmm2, %%xmm5, %%xmm3, %%xmm6, %%xmm2, %%xmm5, %%xmm1, %%xmm4, 128+48, %%xmm1, %%xmm4) "movaps %%xmm1, %%xmm3 \n\t" "movaps %%xmm4, %%xmm6 \n\t" "unpckhps %%xmm2, %%xmm3 \n\t" "unpcklps %%xmm2, %%xmm1 \n\t" "unpckhps %%xmm5, %%xmm6 \n\t" "unpcklps %%xmm5, %%xmm4 \n\t" "movaps %%xmm1, (%1) \n\t" "movaps %%xmm3, (%1, %5) \n\t" "movaps %%xmm4, (%1, %5, 2) \n\t" "movaps %%xmm6, (%1, %6) \n\t" "addl %5, %1 \n\t" "addl %6, %1 \n\t" "subl $1, %0 \n\t" "jnz 1b \n\t" //Synthend_sse "movaps (%1, %5), %%xmm1 \n\t" "movaps (%1, %5, 2), %%xmm2 \n\t" "movaps 96(%2), %%xmm5 \n\t" "mulps %%xmm5, %%xmm1 \n\t" "movaps 112(%2), %%xmm5 \n\t" "mulps %%xmm5, %%xmm2 \n\t" HORIZ_LIFTING_STEP8(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 128, %%xmm3) HORIZ_LIFTING_STEP8(%%xmm2, %%xmm3, %%xmm2, %%xmm1, 128+16, %%xmm1) HORIZ_LIFTING_STEP8(%%xmm2, %%xmm1, %%xmm2, %%xmm3, 128+32, %%xmm3) HORIZ_LIFTING_STEP8(%%xmm2, %%xmm3, %%xmm2, %%xmm1, 128+48, %%xmm1) "movaps %%xmm1, %%xmm3 \n\t" "unpckhps %%xmm2, %%xmm3 \n\t" "unpcklps %%xmm2, %%xmm1 \n\t" "movaps %%xmm1, (%1) \n\t" "movaps %%xmm3, (%1, %5) \n\t" // Vege "shufps $147, %%xmm7, %%xmm7 \n\t" "movss %%xmm7, %%xmm1 \n\t" "shufps $147, %%xmm7, %%xmm7 \n\t" "movss %%xmm7, %%xmm2 \n\t" "movss %%xmm7, %%xmm3 \n\t" "movaps 128+16(%2), %%xmm5 \n\t" "addss %%xmm2, %%xmm2 \n\t" "mulss %%xmm5, %%xmm2 \n\t" "addss %%xmm1, %%xmm2 \n\t" "shufps $147, %%xmm7, %%xmm7 \n\t" "movss %%xmm7, %%xmm1 \n\t" "movss %%xmm7, %%xmm4 \n\t" "movaps 128+32(%2), %%xmm5 \n\t" "addss %%xmm2, %%xmm4 \n\t" "mulss %%xmm5, %%xmm4 \n\t" "addss %%xmm3, %%xmm4 \n\t" "shufps $147, %%xmm7, %%xmm7 \n\t" "movss %%xmm7, %%xmm0 \n\t" "addss %%xmm4, %%xmm0 \n\t" "movaps 128+48(%2), %%xmm5 \n\t" "mulss %%xmm5, %%xmm0 \n\t" "addss %%xmm1, %%xmm0 \n\t" "movss %%xmm4, %%xmm6 \n\t" "addss %%xmm6, %%xmm6 \n\t" "mulss %%xmm5, %%xmm6 \n\t" "addss %%xmm6, %%xmm2 \n\t" "shufps $147, %%xmm7, %%xmm7 \n\t" "movss %%xmm2, %%xmm7 \n\t" "shufps $147, %%xmm7, %%xmm7 \n\t" "movss %%xmm4, %%xmm7 \n\t" "shufps $147, %%xmm7, %%xmm7 \n\t" "movss %%xmm0, %%xmm7 \n\t" "shufps $147, %%xmm7, %%xmm7 \n\t" "movups %%xmm7, (%1, %5, 2) \n\t" : "=r" (lof1), "=r" (lof2) : "r" (coeffs), "1" (in), "0" (iter), "r" (step) , "r" (step*3) ); } static inline void analyze_inplace_vertical(float *in, float *coeffs, int step, int iter) { int lof1, lof2; asm volatile( // A_Start "movaps (%2), %%xmm7 \n\t" "movaps (%1, %5, 4), %%xmm0 \n\t" "movaps (%1, %5, 2), %%xmm1 \n\t" "addps %%xmm0, %%xmm1 \n\t" "mulps %%xmm7, %%xmm1 \n\t" "addps (%1, %6), %%xmm1 \n\t" "movaps (%1), %%xmm4 \n\t" "addps (%1, %5, 2), %%xmm4 \n\t" "mulps %%xmm7, %%xmm4 \n\t" "addps (%1, %5), %%xmm4 \n\t" "movaps 16(%2), %%xmm7 \n\t" "movaps %%xmm1, %%xmm2 \n\t" "addps %%xmm4, %%xmm2 \n\t" "mulps %%xmm7, %%xmm2 \n\t" "addps (%1, %5, 2), %%xmm2 \n\t" "movaps %%xmm4, %%xmm5 \n\t" "addps %%xmm4, %%xmm5 \n\t" "mulps %%xmm7, %%xmm5 \n\t" "addps (%1), %%xmm5 \n\t" "movaps 32(%2), %%xmm7 \n\t" "movaps %%xmm5, %%xmm3 \n\t" "addps %%xmm2, %%xmm3 \n\t" "mulps %%xmm7, %%xmm3 \n\t" "addps %%xmm4, %%xmm3 \n\t" "movaps 48(%2), %%xmm7 \n\t" "movaps %%xmm3, %%xmm6 \n\t" "addps %%xmm3, %%xmm6 \n\t" "mulps %%xmm7, %%xmm6 \n\t" "addps %%xmm5, %%xmm6 \n\t" "movaps 64(%2), %%xmm4 \n\t" "movaps 80(%2), %%xmm5 \n\t" "mulps %%xmm3, %%xmm4 \n\t" "mulps %%xmm5, %%xmm6 \n\t" "movaps %%xmm6, (%1) \n\t" "movaps %%xmm4, (%1, %5) \n\t" "addl %5, %1 \n\t" "addl %5, %1 \n\t" // A_Main ".balign 16 \n\t" "1: \n\t" VERT_MAIN((%1, %5, 4), (%1, %6), 0) "movaps 64(%2), %%xmm7 \n\t" "movaps 80(%2), %%xmm6 \n\t" "mulps %%xmm7, %%xmm4 \n\t" "mulps %%xmm6, %%xmm5 \n\t" "movaps %%xmm5, (%1) \n\t" "movaps %%xmm4, (%1, %5) \n\t" "addl %5, %1 \n\t" "addl %5, %1 \n\t" "subl $2, %0 \n\t" "jnz 1b \n\t" // A_End VERT_MAIN(%%xmm0, (%1, %6), 0) "movaps 64(%2), %%xmm7 \n\t" "movaps 80(%2), %%xmm6 \n\t" "mulps %%xmm7, %%xmm4 \n\t" "mulps %%xmm6, %%xmm5 \n\t" "movaps (%1, %5), %%xmm6 \n\t" "movaps %%xmm4, (%1, %5) \n\t" "movaps (%1), %%xmm4 \n\t" "movaps %%xmm5, (%1) \n\t" VERT_MAIN(%%xmm4, %%xmm6, 0) "movaps 64(%2), %%xmm7 \n\t" "movaps 80(%2), %%xmm6 \n\t" "mulps %%xmm7, %%xmm4 \n\t" "mulps %%xmm6, %%xmm5 \n\t" "movaps %%xmm5, (%1, %5, 2) \n\t" "movaps %%xmm4, (%1, %6) \n\t" : "=r" (lof1), "=r" (lof2) : "r" (coeffs), "1" (in), "0" (iter), "r" (step) , "r" (step*3) ); } static inline void synthetize_inplace_vertical(float *in, float *coeffs, int step, int iter) { int lof1, lof2; asm volatile( // S_Start "movaps 128(%2), %%xmm7 \n\t" "movaps 112(%2), %%xmm5 \n\t" "movaps 96(%2), %%xmm4 \n\t" "movaps %%xmm5, %%xmm0 \n\t" "movaps %%xmm4, %%xmm6 \n\t" "mulps (%1, %5), %%xmm5 \n\t" "mulps (%1, %6), %%xmm0 \n\t" "mulps (%1, %5, 2), %%xmm4 \n\t" "mulps (%1), %%xmm6 \n\t" "movaps %%xmm5, %%xmm1 \n\t" "addps %%xmm0, %%xmm1 \n\t" "mulps %%xmm7, %%xmm1 \n\t" "addps %%xmm4, %%xmm1 \n\t" "movaps %%xmm5, %%xmm4 \n\t" "addps %%xmm4, %%xmm4 \n\t" "mulps %%xmm7, %%xmm4 \n\t" "addps %%xmm6, %%xmm4 \n\t" "movaps 128+16(%2), %%xmm7 \n\t" "movaps %%xmm1, %%xmm2 \n\t" "addps %%xmm4, %%xmm2 \n\t" "mulps %%xmm7, %%xmm2 \n\t" "addps %%xmm5, %%xmm2 \n\t" "movaps 128+32(%2), %%xmm7 \n\t" "movaps %%xmm2, %%xmm3 \n\t" "addps %%xmm3, %%xmm3 \n\t" "mulps %%xmm7, %%xmm3 \n\t" "addps %%xmm4, %%xmm3 \n\t" "movaps %%xmm3, (%1) \n\t" "addl %5, %1 \n\t" // S_Main ".balign 16 \n\t" "1: \n\t" "movaps 96(%2), %%xmm6 \n\t" "movaps 112(%2), %%xmm4 \n\t" "mulps (%1, %5, 4), %%xmm4 \n\t" "mulps (%1, %6), %%xmm6 \n\t" VERT_MAIN(%%xmm4, %%xmm6, 128) "movaps %%xmm5, (%1) \n\t" "movaps %%xmm4, (%1, %5) \n\t" "addl %5, %1 \n\t" "addl %5, %1 \n\t" "subl $2, %0 \n\t" "jnz 1b \n\t" // S_End "movaps %%xmm1, %%xmm4 \n\t" "movaps 128+16(%2), %%xmm7 \n\t" "addps %%xmm1, %%xmm4 \n\t" "mulps %%xmm7, %%xmm4 \n\t" "addps %%xmm0, %%xmm4 \n\t" "movaps %%xmm2, %%xmm5 \n\t" "movaps 128+32(%2), %%xmm7 \n\t" "addps %%xmm4, %%xmm5 \n\t" "mulps %%xmm7, %%xmm5 \n\t" "addps %%xmm1, %%xmm5 \n\t" "movaps %%xmm3, %%xmm6 \n\t" "movaps 128+48(%2), %%xmm7 \n\t" "addps %%xmm5, %%xmm6 \n\t" "mulps %%xmm7, %%xmm6 \n\t" "addps %%xmm2, %%xmm6 \n\t" "movaps %%xmm5, %%xmm0 \n\t" "addps %%xmm5, %%xmm0 \n\t" "mulps %%xmm7, %%xmm0 \n\t" "addps %%xmm4, %%xmm0 \n\t" "movaps %%xmm6, (%1) \n\t" "movaps %%xmm5, (%1, %5) \n\t" "movaps %%xmm0, (%1, %5, 2) \n\t" : "=r" (lof1), "=r" (lof2) : "r" (coeffs), "1" (in), "0" (iter), "r" (step) , "r" (step*3) ); } int dwt1d(float *pict, int width, int height, int it) { int h; for(h=1;h<(1<<it);h<<=1) analyze_inplace(pict, coeffs_cdf97_sse, 16*h, (width/16/h)-1); } int idwt1d(float *pict, int width, int height, int it) { int h; for(h=(1<<(it-1));h>0;h>>=1) synthetize_inplace(pict, coeffs_cdf97_sse, 16*h, (width/h/16)-1); } int dwt2d(float *pict, int width, int height, int it) { int i,h; float *p; for(h=1;h<(1<<it);h<<=1) { for(p=pict,i=0;i<height;i+=h, p+=width*h) analyze_inplace(p, coeffs_cdf97_sse, 16*h, (width/16/h)-1); for(p=pict, i=0;i<width;i+=4*h, p+=4*h) analyze_inplace_vertical(p, coeffs_cdf97_sse, width*4*h, (height/h)-6); } } int idwt2d(float *pict, int width, int height, int it) { int i, h; float *p; for(h=(1<<(it-1));h>0;h>>=1) { for(p=pict, i=0;i<width;i+=4*h, p+=4*h) synthetize_inplace_vertical(p, coeffs_cdf97_sse, width*4*h, (height/h)-4); for(p=pict,i=0;i<height;i+=h, p+=width*h) synthetize_inplace(p, coeffs_cdf97_sse, 16*h, (width/h/16)-1); } }