00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
00026
00027 #if defined(CAN_COMPILE_MMX)
00028
00029
00030
/*
 * MMX_CALL(MMX_INSTRUCTIONS) -- inline-assembly variant.
 *
 * Runs one conversion step using two separate asm statements:
 *   1. loads 4 bytes from p_u into mm1, 4 bytes from p_v into mm2,
 *      8 bytes from p_y1 into mm0 and 8 bytes from p_y2 into mm3;
 *   2. executes MMX_INSTRUCTIONS (a string literal of instructions) with
 *      p_line1 bound to operand %0 and p_line2 bound to operand %1.
 * Afterwards the pointers are advanced: p_line1/p_line2 by 16,
 * p_y1/p_y2 by 8 and p_u/p_v by 4 elements.
 *
 * All six pointers are taken from the enclosing scope, not passed in.
 *
 * NOTE(review): the second asm statement consumes the mm0..mm3 values left
 * by the first one, yet neither statement declares register or "memory"
 * clobbers. This relies on the compiler not touching the MMX registers in
 * between -- confirm this is acceptable for the supported compilers.
 */
00031 #define MMX_CALL(MMX_INSTRUCTIONS) \
00032 do { \
00033 __asm__ __volatile__( \
00034 ".p2align 3 \n\t \
00035 movd (%0), %%mm1 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\
00036 movd (%1), %%mm2 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\
00037 movq (%2), %%mm0 # Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\
00038 movq (%3), %%mm3 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00039 " \
00040 : \
00041 : "r" (p_u), "r" (p_v), \
00042 "r" (p_y1), "r" (p_y2) ); \
00043 __asm__ __volatile__( \
00044 ".p2align 3 \n\t" \
00045 MMX_INSTRUCTIONS \
00046 : \
00047 : "r" (p_line1), "r" (p_line2) ); \
00048 p_line1 += 16; p_line2 += 16; \
00049 p_y1 += 8; p_y2 += 8; \
00050 p_u += 4; p_v += 4; \
00051 } while(0)
00052
/* MMX_END: issues the "emms" instruction (inline-assembly variant); use as a
 * statement once the MMX_CALL() invocations are finished. */
00053 #define MMX_END __asm__ __volatile__ ( "emms" )
00054
/*
 * MMX_YUV420_YUYV: instruction string meant to be passed to MMX_CALL().
 *
 * Expects mm0/mm3 to hold 8 luma bytes of the first/second line and
 * mm1/mm2 to hold 4 Cb / 4 Cr bytes (as loaded by MMX_CALL). Interleaves
 * Cb and Cr into mm1, then interleaves that with each luma register, and
 * stores 16 bytes in Y U Y V order at (%0) and 16 bytes at (%1).
 * Uses mm4 as scratch; mm0..mm3 are overwritten.
 */
00055 #define MMX_YUV420_YUYV " \n\
00056 punpcklbw %%mm2, %%mm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\
00057 movq %%mm0, %%mm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\
00058 punpcklbw %%mm1, %%mm2 # v1 y3 u1 y2 v0 y1 u0 y0 \n\
00059 movq %%mm2, (%0) # Store low YUYV \n\
00060 punpckhbw %%mm1, %%mm0 # v3 y7 u3 y6 v2 y5 u2 y4 \n\
00061 movq %%mm0, 8(%0) # Store high YUYV \n\
00062 movq %%mm3, %%mm4 # Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00063 punpcklbw %%mm1, %%mm4 # v1 Y3 u1 Y2 v0 Y1 u0 Y0 \n\
00064 movq %%mm4, (%1) # Store low YUYV \n\
00065 punpckhbw %%mm1, %%mm3 # v3 Y7 u3 Y6 v2 Y5 u2 Y4 \n\
00066 movq %%mm3, 8(%1) # Store high YUYV \n\
00067 "
00068
/*
 * MMX_YUV420_YVYU: instruction string meant to be passed to MMX_CALL().
 *
 * Same structure as the YUYV variant, but the chroma interleave is done
 * into mm2 with Cr in the low byte of each pair, so the 16 bytes stored at
 * (%0) and at (%1) are in Y V Y U order. mm1 is reused as scratch for the
 * first line and mm4 for the second line.
 * The "Store ... YUYV" remarks inside the string are inherited wording; the
 * data written is YVYU.
 */
00069 #define MMX_YUV420_YVYU " \n\
00070 punpcklbw %%mm1, %%mm2 # u3 v3 u2 v2 u1 v1 u0 v0 \n\
00071 movq %%mm0, %%mm1 # y7 y6 y5 y4 y3 y2 y1 y0 \n\
00072 punpcklbw %%mm2, %%mm1 # u1 y3 v1 y2 u0 y1 v0 y0 \n\
00073 movq %%mm1, (%0) # Store low YUYV \n\
00074 punpckhbw %%mm2, %%mm0 # u3 y7 v3 y6 u2 y5 v2 y4 \n\
00075 movq %%mm0, 8(%0) # Store high YUYV \n\
00076 movq %%mm3, %%mm4 # Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00077 punpcklbw %%mm2, %%mm4 # u1 Y3 v1 Y2 u0 Y1 v0 Y0 \n\
00078 movq %%mm4, (%1) # Store low YUYV \n\
00079 punpckhbw %%mm2, %%mm3 # u3 Y7 v3 Y6 u2 Y5 v2 Y4 \n\
00080 movq %%mm3, 8(%1) # Store high YUYV \n\
00081 "
00082
/*
 * MMX_YUV420_UYVY: instruction string meant to be passed to MMX_CALL().
 *
 * Interleaves Cb/Cr into mm1, then interleaves the chroma pairs with the
 * luma of each line with chroma in the low byte, so the 16 bytes stored at
 * (%0) and at (%1) are in U Y V Y order. mm2 and mm4 are scratch; mm1 is
 * consumed by the last unpack.
 */
00083 #define MMX_YUV420_UYVY " \n\
00084 punpcklbw %%mm2, %%mm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\
00085 movq %%mm1, %%mm2 # v3 u3 v2 u2 v1 u1 v0 u0 \n\
00086 punpcklbw %%mm0, %%mm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\
00087 movq %%mm2, (%0) # Store low UYVY \n\
00088 movq %%mm1, %%mm2 # u3 v3 u2 v2 u1 v1 u0 v0 \n\
00089 punpckhbw %%mm0, %%mm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\
00090 movq %%mm2, 8(%0) # Store high UYVY \n\
00091 movq %%mm1, %%mm4 # u3 v3 u2 v2 u1 v1 u0 v0 \n\
00092 punpcklbw %%mm3, %%mm4 # Y3 v1 Y2 u1 Y1 v0 Y0 u0 \n\
00093 movq %%mm4, (%1) # Store low UYVY \n\
00094 punpckhbw %%mm3, %%mm1 # Y7 v3 Y6 u3 Y5 v2 Y4 u2 \n\
00095 movq %%mm1, 8(%1) # Store high UYVY \n\
00096 "
00097
00098
/*
 * MMX_YUV420_Y211: instruction string for a Y211 conversion step.
 *
 * Loads chroma from operands %4 and %5, masks luma and chroma with the
 * external symbol i_00ffw to keep every other byte, packs them, subtracts
 * the external symbol i_80w from the chroma pairs, and stores 8 bytes at
 * (%0) and 8 bytes at (%1).
 *
 * NOTE(review): this string does not match the MMX_CALL() visible here:
 *   - it references operands %4 and %5, but MMX_CALL() gives the asm
 *     statement that expands the string only two operands (%0, %1);
 *   - per its embedded remarks it treats mm0 and mm1 as the two luma
 *     registers, whereas MMX_CALL() loads luma into mm0 and mm3;
 *   - i_00ffw and i_80w are not defined in the visible part of the file.
 * Presumably unused/disabled by callers -- confirm before relying on it.
 */
00099 #define MMX_YUV420_Y211 " \n\
00100 movd (%4), %%mm2 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\
00101 movd (%5), %%mm3 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\
00102 pand i_00ffw, %%mm0 # get Y even 00 Y6 00 Y4 00 Y2 00 Y0 \n\
00103 packuswb %%mm0, %%mm0 # pack Y y6 y4 y2 y0 y6 y4 y2 y0 \n\
00104 pand i_00ffw, %%mm2 # get U even 00 u6 00 u4 00 u2 00 u0 \n\
00105 packuswb %%mm2, %%mm2 # pack U 00 00 u2 u0 00 00 u2 u0 \n\
00106 pand i_00ffw, %%mm3 # get V even 00 v6 00 v4 00 v2 00 v0 \n\
00107 packuswb %%mm3, %%mm3 # pack V 00 00 v2 v0 00 00 v2 v0 \n\
00108 punpcklbw %%mm3, %%mm2 # 00 00 00 00 v2 u2 v0 u0 \n\
00109 psubsw i_80w, %%mm2 # U,V -= 128 \n\
00110 punpcklbw %%mm2, %%mm0 # v2 y6 u2 y4 v0 y2 u0 y0 \n\
00111 movq %%mm0, (%0) # Store YUYV \n\
00112 pand i_00ffw, %%mm1 # get Y even 00 Y6 00 Y4 00 Y2 00 Y0 \n\
00113 packuswb %%mm1, %%mm1 # pack Y Y6 Y4 Y2 Y0 Y6 Y4 Y2 Y0 \n\
00114 punpcklbw %%mm2, %%mm1 # v2 Y6 u2 Y4 v0 Y2 u0 Y0 \n\
00115 movq %%mm1, (%1) # Store YUYV \n\
00116 "
00117 #elif defined(HAVE_MMX_INTRINSICS)
00118
00119
00120
00121 #include <mmintrin.h>
00122
/*
 * MMX_CALL(MMX_INSTRUCTIONS) -- intrinsics variant.
 *
 * Opens a scope declaring the five __m64 temporaries (mm0..mm4) that the
 * MMX_YUV420_* statement lists operate on, expands MMX_INSTRUCTIONS inside
 * it, then steps the caller's pointers past the data just handled:
 * 16 elements per output line, 8 per luma line, 4 per chroma plane.
 */
#define MMX_CALL(MMX_INSTRUCTIONS)          \
    do {                                    \
        __m64 mm0, mm1, mm2, mm3, mm4;      \
        MMX_INSTRUCTIONS                    \
        p_u     += 4;                       \
        p_v     += 4;                       \
        p_y1    += 8;                       \
        p_y2    += 8;                       \
        p_line1 += 16;                      \
        p_line2 += 16;                      \
    } while(0)
00131
/* MMX_END: intrinsics counterpart of the asm "emms" variant; expands to a
 * call to _mm_empty(). */
00132 #define MMX_END _mm_empty()
00133
/*
 * MMX_YUV420_YUYV -- intrinsics statement list for MMX_CALL().
 *
 * Reads 4 bytes at p_u, 4 bytes at p_v and 8 bytes at each of p_y1/p_y2,
 * then writes 16 bytes at p_line1 and 16 bytes at p_line2 in Y U Y V order.
 * Both output lines share the same chroma samples. All loads happen before
 * the first store. Uses the mm0..mm4 temporaries of the enclosing scope.
 */
#define MMX_YUV420_YUYV                                                 \
    mm2 = _mm_cvtsi32_si64(*(int*)p_v);         /* 4 Cr bytes        */ \
    mm1 = _mm_cvtsi32_si64(*(int*)p_u);         /* 4 Cb bytes        */ \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);           /* u0 v0 u1 v1 ...   */ \
    mm0 = (__m64)*(uint64_t*)p_y1;              /* luma, first line  */ \
    mm3 = (__m64)*(uint64_t*)p_y2;              /* luma, second line */ \
    mm2 = _mm_unpacklo_pi8(mm0, mm1);                                   \
    mm0 = _mm_unpackhi_pi8(mm0, mm1);                                   \
    mm4 = _mm_unpacklo_pi8(mm3, mm1);                                   \
    mm3 = _mm_unpackhi_pi8(mm3, mm1);                                   \
    *(uint64_t*)p_line1       = (uint64_t)mm2;                          \
    *(uint64_t*)(p_line1 + 8) = (uint64_t)mm0;                          \
    *(uint64_t*)p_line2       = (uint64_t)mm4;                          \
    *(uint64_t*)(p_line2 + 8) = (uint64_t)mm3;
00150
/*
 * MMX_YUV420_YVYU -- intrinsics statement list for MMX_CALL().
 *
 * Reads 4 bytes at p_u, 4 bytes at p_v and 8 bytes at each of p_y1/p_y2,
 * then writes 16 bytes at p_line1 and 16 bytes at p_line2 in Y V Y U order
 * (Cr comes first in each chroma pair). All loads happen before the first
 * store. Uses the mm0..mm4 temporaries of the enclosing scope.
 */
#define MMX_YUV420_YVYU                                                 \
    mm1 = _mm_cvtsi32_si64(*(int*)p_v);         /* 4 Cr bytes        */ \
    mm2 = _mm_cvtsi32_si64(*(int*)p_u);         /* 4 Cb bytes        */ \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);           /* v0 u0 v1 u1 ...   */ \
    mm0 = (__m64)*(uint64_t*)p_y1;              /* luma, first line  */ \
    mm3 = (__m64)*(uint64_t*)p_y2;              /* luma, second line */ \
    mm2 = _mm_unpacklo_pi8(mm0, mm1);                                   \
    mm0 = _mm_unpackhi_pi8(mm0, mm1);                                   \
    mm4 = _mm_unpacklo_pi8(mm3, mm1);                                   \
    mm3 = _mm_unpackhi_pi8(mm3, mm1);                                   \
    *(uint64_t*)p_line1       = (uint64_t)mm2;                          \
    *(uint64_t*)(p_line1 + 8) = (uint64_t)mm0;                          \
    *(uint64_t*)p_line2       = (uint64_t)mm4;                          \
    *(uint64_t*)(p_line2 + 8) = (uint64_t)mm3;
00167
/*
 * MMX_YUV420_UYVY -- intrinsics statement list for MMX_CALL().
 *
 * Reads 4 bytes at p_u, 4 bytes at p_v and 8 bytes at each of p_y1/p_y2,
 * then writes 16 bytes at p_line1 and 16 bytes at p_line2 in U Y V Y order
 * (chroma precedes each luma sample). All loads happen before the first
 * store. Uses the mm0..mm4 temporaries of the enclosing scope.
 */
#define MMX_YUV420_UYVY                                                 \
    mm2 = _mm_cvtsi32_si64(*(int*)p_v);         /* 4 Cr bytes        */ \
    mm1 = _mm_cvtsi32_si64(*(int*)p_u);         /* 4 Cb bytes        */ \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);           /* u0 v0 u1 v1 ...   */ \
    mm0 = (__m64)*(uint64_t*)p_y1;              /* luma, first line  */ \
    mm3 = (__m64)*(uint64_t*)p_y2;              /* luma, second line */ \
    mm2 = _mm_unpacklo_pi8(mm1, mm0);                                   \
    *(uint64_t*)p_line1       = (uint64_t)mm2;                          \
    mm2 = _mm_unpackhi_pi8(mm1, mm0);                                   \
    *(uint64_t*)(p_line1 + 8) = (uint64_t)mm2;                          \
    mm4 = _mm_unpacklo_pi8(mm1, mm3);                                   \
    *(uint64_t*)p_line2       = (uint64_t)mm4;                          \
    mm1 = _mm_unpackhi_pi8(mm1, mm3);                                   \
    *(uint64_t*)(p_line2 + 8) = (uint64_t)mm1;
00185
00186 #endif
00187
00188 #elif defined( MODULE_NAME_IS_i420_yuy2_sse2 )
00189
00190 #if defined(CAN_COMPILE_SSE2)
00191
00192
00193
/*
 * SSE2_CALL(SSE2_INSTRUCTIONS) -- inline-assembly variant.
 *
 * Runs one conversion step using two separate asm statements:
 *   1. loads 8 bytes from p_u into xmm1 and 8 bytes from p_v into xmm2;
 *   2. executes SSE2_INSTRUCTIONS (a string literal of instructions) with
 *      operands %0 = p_line1, %1 = p_line2, %2 = p_y1, %3 = p_y2.
 * Afterwards the pointers are advanced: p_line1/p_line2 by 32,
 * p_y1/p_y2 by 16 and p_u/p_v by 8 elements.
 *
 * All six pointers are taken from the enclosing scope, not passed in.
 *
 * NOTE(review): the second asm statement consumes the xmm1/xmm2 values left
 * by the first one, yet neither statement declares register or "memory"
 * clobbers -- confirm this is acceptable for the supported compilers.
 */
00194 #define SSE2_CALL(SSE2_INSTRUCTIONS) \
00195 do { \
00196 __asm__ __volatile__( \
00197 ".p2align 3 \n\t \
00198 movq (%0), %%xmm1 # Load 8 Cb u7 u6 u5 u4 u3 u2 u1 u0 \n\
00199 movq (%1), %%xmm2 # Load 8 Cr v7 06 v5 v4 v3 v2 v1 v0 \n\
00200 " \
00201 : \
00202 : "r" (p_u), "r" (p_v) ); \
00203 __asm__ __volatile__( \
00204 ".p2align 3 \n\t" \
00205 SSE2_INSTRUCTIONS \
00206 : \
00207 : "r" (p_line1), "r" (p_line2), \
00208 "r" (p_y1), "r" (p_y2) ); \
00209 p_line1 += 32; p_line2 += 32; \
00210 p_y1 += 16; p_y2 += 16; \
00211 p_u += 8; p_v += 8; \
00212 } while(0)
00213
/* SSE2_END: issues an "sfence" instruction; the "memory" clobber tells the
 * compiler that memory may have changed across this statement. The *_ALIGNED
 * instruction strings in this file store with movntdq. */
00214 #define SSE2_END __asm__ __volatile__ ( "sfence" ::: "memory" )
00215
/*
 * SSE2_YUV420_YUYV_ALIGNED: instruction string meant for SSE2_CALL().
 *
 * Loads 16 luma bytes from (%2) and (%3) with movdqa, interleaves the Cb/Cr
 * already held in xmm1/xmm2, and stores 32 bytes in Y U Y V order at (%0)
 * and 32 bytes at (%1) using movntdq. movdqa and movntdq both require
 * 16-byte aligned addresses. xmm4 is scratch; xmm0..xmm3 are overwritten.
 */
00216 #define SSE2_YUV420_YUYV_ALIGNED " \n\
00217 movdqa (%2), %%xmm0 # Load 16 Y y15 y14 y13 .. y2 y1 y0 \n\
00218 movdqa (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00219 punpcklbw %%xmm2, %%xmm1 # v7 u7 v6 u6 .. u1 v0 u0 \n\
00220 movdqa %%xmm0, %%xmm2 # y15 y14 y13 .. y2 y1 y0 \n\
00221 punpcklbw %%xmm1, %%xmm2 # v3 y7 u3 .. v0 y1 u0 y0 \n\
00222 movntdq %%xmm2, (%0) # Store low YUYV \n\
00223 punpckhbw %%xmm1, %%xmm0 # v3 y7 u3 y6 v2 y5 u2 y4 \n\
00224 movntdq %%xmm0, 16(%0) # Store high YUYV \n\
00225 movdqa %%xmm3, %%xmm4 # Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00226 punpcklbw %%xmm1, %%xmm4 # v1 Y3 u1 Y2 v0 Y1 u0 Y0 \n\
00227 movntdq %%xmm4, (%1) # Store low YUYV \n\
00228 punpckhbw %%xmm1, %%xmm3 # v3 Y7 u3 Y6 v2 Y5 u2 Y4 \n\
00229 movntdq %%xmm3, 16(%1) # Store high YUYV \n\
00230 "
00231
/*
 * SSE2_YUV420_YUYV_UNALIGNED: instruction string meant for SSE2_CALL().
 *
 * Same data flow as the ALIGNED YUYV variant, but luma is loaded with
 * movdqu and results are stored with movdqu, so no alignment is required.
 * Issues prefetchnta on both output addresses before storing.
 */
00232 #define SSE2_YUV420_YUYV_UNALIGNED " \n\
00233 movdqu (%2), %%xmm0 # Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\
00234 movdqu (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00235 prefetchnta (%0) # Tell CPU not to cache output YUYV data \n\
00236 prefetchnta (%1) # Tell CPU not to cache output YUYV data \n\
00237 punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\
00238 movdqa %%xmm0, %%xmm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\
00239 punpcklbw %%xmm1, %%xmm2 # v1 y3 u1 y2 v0 y1 u0 y0 \n\
00240 movdqu %%xmm2, (%0) # Store low YUYV \n\
00241 punpckhbw %%xmm1, %%xmm0 # v3 y7 u3 y6 v2 y5 u2 y4 \n\
00242 movdqu %%xmm0, 16(%0) # Store high YUYV \n\
00243 movdqa %%xmm3, %%xmm4 # Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00244 punpcklbw %%xmm1, %%xmm4 # v1 Y3 u1 Y2 v0 Y1 u0 Y0 \n\
00245 movdqu %%xmm4, (%1) # Store low YUYV \n\
00246 punpckhbw %%xmm1, %%xmm3 # v3 Y7 u3 Y6 v2 Y5 u2 Y4 \n\
00247 movdqu %%xmm3, 16(%1) # Store high YUYV \n\
00248 "
00249
/*
 * SSE2_YUV420_YVYU_ALIGNED: instruction string meant for SSE2_CALL().
 *
 * Loads 16 luma bytes from (%2) and (%3) with movdqa, interleaves chroma
 * into xmm2 with Cr in the low byte of each pair, and stores 32 bytes in
 * Y V Y U order at (%0) and at (%1) using movntdq. movdqa and movntdq both
 * require 16-byte aligned addresses.
 * The "Store ... YUYV" remarks inside the string are inherited wording; the
 * data written is YVYU.
 */
00250 #define SSE2_YUV420_YVYU_ALIGNED " \n\
00251 movdqa (%2), %%xmm0 # Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\
00252 movdqa (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00253 punpcklbw %%xmm1, %%xmm2 # u3 v3 u2 v2 u1 v1 u0 v0 \n\
00254 movdqa %%xmm0, %%xmm1 # y7 y6 y5 y4 y3 y2 y1 y0 \n\
00255 punpcklbw %%xmm2, %%xmm1 # u1 y3 v1 y2 u0 y1 v0 y0 \n\
00256 movntdq %%xmm1, (%0) # Store low YUYV \n\
00257 punpckhbw %%xmm2, %%xmm0 # u3 y7 v3 y6 u2 y5 v2 y4 \n\
00258 movntdq %%xmm0, 16(%0) # Store high YUYV \n\
00259 movdqa %%xmm3, %%xmm4 # Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00260 punpcklbw %%xmm2, %%xmm4 # u1 Y3 v1 Y2 u0 Y1 v0 Y0 \n\
00261 movntdq %%xmm4, (%1) # Store low YUYV \n\
00262 punpckhbw %%xmm2, %%xmm3 # u3 Y7 v3 Y6 u2 Y5 v2 Y4 \n\
00263 movntdq %%xmm3, 16(%1) # Store high YUYV \n\
00264 "
00265
/*
 * SSE2_YUV420_YVYU_UNALIGNED: instruction string meant for SSE2_CALL().
 *
 * Same data flow as the ALIGNED YVYU variant, but every load, store and
 * register copy uses movdqu, so no alignment is required. Issues
 * prefetchnta on both output addresses before storing.
 */
00266 #define SSE2_YUV420_YVYU_UNALIGNED " \n\
00267 movdqu (%2), %%xmm0 # Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\
00268 movdqu (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00269 prefetchnta (%0) # Tell CPU not to cache output YVYU data \n\
00270 prefetchnta (%1) # Tell CPU not to cache output YVYU data \n\
00271 punpcklbw %%xmm1, %%xmm2 # u3 v3 u2 v2 u1 v1 u0 v0 \n\
00272 movdqu %%xmm0, %%xmm1 # y7 y6 y5 y4 y3 y2 y1 y0 \n\
00273 punpcklbw %%xmm2, %%xmm1 # u1 y3 v1 y2 u0 y1 v0 y0 \n\
00274 movdqu %%xmm1, (%0) # Store low YUYV \n\
00275 punpckhbw %%xmm2, %%xmm0 # u3 y7 v3 y6 u2 y5 v2 y4 \n\
00276 movdqu %%xmm0, 16(%0) # Store high YUYV \n\
00277 movdqu %%xmm3, %%xmm4 # Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00278 punpcklbw %%xmm2, %%xmm4 # u1 Y3 v1 Y2 u0 Y1 v0 Y0 \n\
00279 movdqu %%xmm4, (%1) # Store low YUYV \n\
00280 punpckhbw %%xmm2, %%xmm3 # u3 Y7 v3 Y6 u2 Y5 v2 Y4 \n\
00281 movdqu %%xmm3, 16(%1) # Store high YUYV \n\
00282 "
00283
/*
 * SSE2_YUV420_UYVY_ALIGNED: instruction string meant for SSE2_CALL().
 *
 * Loads 16 luma bytes from (%2) and (%3) with movdqa, interleaves Cb/Cr into
 * xmm1, then interleaves those pairs with luma (chroma in the low byte) and
 * stores 32 bytes in U Y V Y order at (%0) and at (%1) using movntdq.
 * movdqa and movntdq both require 16-byte aligned addresses.
 */
00284 #define SSE2_YUV420_UYVY_ALIGNED " \n\
00285 movdqa (%2), %%xmm0 # Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\
00286 movdqa (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00287 punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\
00288 movdqa %%xmm1, %%xmm2 # v3 u3 v2 u2 v1 u1 v0 u0 \n\
00289 punpcklbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\
00290 movntdq %%xmm2, (%0) # Store low UYVY \n\
00291 movdqa %%xmm1, %%xmm2 # u3 v3 u2 v2 u1 v1 u0 v0 \n\
00292 punpckhbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\
00293 movntdq %%xmm2, 16(%0) # Store high UYVY \n\
00294 movdqa %%xmm1, %%xmm4 # u3 v3 u2 v2 u1 v1 u0 v0 \n\
00295 punpcklbw %%xmm3, %%xmm4 # Y3 v1 Y2 u1 Y1 v0 Y0 u0 \n\
00296 movntdq %%xmm4, (%1) # Store low UYVY \n\
00297 punpckhbw %%xmm3, %%xmm1 # Y7 v3 Y6 u3 Y5 v2 Y4 u2 \n\
00298 movntdq %%xmm1, 16(%1) # Store high UYVY \n\
00299 "
00300
/*
 * SSE2_YUV420_UYVY_UNALIGNED: instruction string meant for SSE2_CALL().
 *
 * Same data flow as the ALIGNED UYVY variant, but every load, store and
 * register copy uses movdqu, so no alignment is required. Issues
 * prefetchnta on both output addresses before storing.
 */
00301 #define SSE2_YUV420_UYVY_UNALIGNED " \n\
00302 movdqu (%2), %%xmm0 # Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\
00303 movdqu (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00304 prefetchnta (%0) # Tell CPU not to cache output UYVY data \n\
00305 prefetchnta (%1) # Tell CPU not to cache output UYVY data \n\
00306 punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\
00307 movdqu %%xmm1, %%xmm2 # v3 u3 v2 u2 v1 u1 v0 u0 \n\
00308 punpcklbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\
00309 movdqu %%xmm2, (%0) # Store low UYVY \n\
00310 movdqu %%xmm1, %%xmm2 # u3 v3 u2 v2 u1 v1 u0 v0 \n\
00311 punpckhbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\
00312 movdqu %%xmm2, 16(%0) # Store high UYVY \n\
00313 movdqu %%xmm1, %%xmm4 # u3 v3 u2 v2 u1 v1 u0 v0 \n\
00314 punpcklbw %%xmm3, %%xmm4 # Y3 v1 Y2 u1 Y1 v0 Y0 u0 \n\
00315 movdqu %%xmm4, (%1) # Store low UYVY \n\
00316 punpckhbw %%xmm3, %%xmm1 # Y7 v3 Y6 u3 Y5 v2 Y4 u2 \n\
00317 movdqu %%xmm1, 16(%1) # Store high UYVY \n\
00318 "
00319
00320 #elif defined(HAVE_SSE2_INTRINSICS)
00321
00322
00323
00324 #include <emmintrin.h>
00325
/*
 * SSE2_CALL(SSE2_INSTRUCTIONS) -- intrinsics variant.
 *
 * Opens a scope declaring the five __m128i temporaries (xmm0..xmm4) that
 * the SSE2_YUV420_* statement lists operate on, expands SSE2_INSTRUCTIONS
 * inside it, then steps the caller's pointers past the data just handled:
 * 32 elements per output line, 16 per luma line, 8 per chroma plane.
 */
#define SSE2_CALL(SSE2_INSTRUCTIONS)            \
    do {                                        \
        __m128i xmm0, xmm1, xmm2, xmm3, xmm4;   \
        SSE2_INSTRUCTIONS                       \
        p_u     += 8;                           \
        p_v     += 8;                           \
        p_y1    += 16;                          \
        p_y2    += 16;                          \
        p_line1 += 32;                          \
        p_line2 += 32;                          \
    } while(0)
00334
/* SSE2_END: intrinsics counterpart of the asm "sfence" variant; expands to a
 * call to _mm_sfence(). The *_ALIGNED statement lists in this file store
 * with _mm_stream_si128(). */
00335 #define SSE2_END _mm_sfence()
00336
/*
 * SSE2_YUV420_YUYV_ALIGNED -- intrinsics statement list for SSE2_CALL().
 *
 * Reads 8 bytes at p_u, 8 bytes at p_v and 16 bytes at each of p_y1/p_y2,
 * then writes 32 bytes at p_line1 and 32 bytes at p_line2 in Y U Y V order.
 * Both output lines share the same chroma samples.
 *
 * p_y1, p_y2, p_line1 and p_line2 must be 16-byte aligned: the luma loads
 * use _mm_load_si128() and the stores use _mm_stream_si128().
 * Uses the xmm0..xmm4 temporaries of the enclosing scope.
 *
 * Fix: the high half of the second line is now stored at p_line2+16; it was
 * previously written to p_line1+16, overwriting the first line's data.
 */
#define SSE2_YUV420_YUYV_ALIGNED                                        \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);                             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);                             \
    xmm0 = _mm_load_si128((__m128i *)p_y1);                             \
    xmm3 = _mm_load_si128((__m128i *)p_y2);                             \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);       /* u0 v0 u1 v1 ... */   \
    xmm2 = xmm0;                                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);       /* line 1, low half */  \
    _mm_stream_si128((__m128i*)(p_line1), xmm2);                        \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);       /* line 1, high half */ \
    _mm_stream_si128((__m128i*)(p_line1+16), xmm0);                     \
    xmm4 = xmm3;                                                        \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);       /* line 2, low half */  \
    _mm_stream_si128((__m128i*)(p_line2), xmm4);                        \
    xmm3 = _mm_unpackhi_epi8(xmm3, xmm1);       /* line 2, high half */ \
    _mm_stream_si128((__m128i*)(p_line2+16), xmm3);
00353
/*
 * SSE2_YUV420_YUYV_UNALIGNED -- intrinsics statement list for SSE2_CALL().
 *
 * Reads 8 bytes at p_u, 8 bytes at p_v and 16 bytes at each of p_y1/p_y2,
 * then writes 32 bytes at p_line1 and 32 bytes at p_line2 in Y U Y V order.
 * Luma loads and all stores use the unaligned intrinsics, so no alignment
 * is required. Both output addresses are prefetched with _MM_HINT_NTA.
 * Uses the xmm0..xmm4 temporaries of the enclosing scope.
 *
 * Fix: the high half of the second line is now stored at p_line2+16; it was
 * previously written to p_line1+16, overwriting the first line's data.
 */
#define SSE2_YUV420_YUYV_UNALIGNED                                      \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);                             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);                             \
    xmm0 = _mm_loadu_si128((__m128i *)p_y1);                            \
    xmm3 = _mm_loadu_si128((__m128i *)p_y2);                            \
    _mm_prefetch(p_line1, _MM_HINT_NTA);                                \
    _mm_prefetch(p_line2, _MM_HINT_NTA);                                \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);       /* u0 v0 u1 v1 ... */   \
    xmm2 = xmm0;                                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);       /* line 1, low half */  \
    _mm_storeu_si128((__m128i*)(p_line1), xmm2);                        \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);       /* line 1, high half */ \
    _mm_storeu_si128((__m128i*)(p_line1+16), xmm0);                     \
    xmm4 = xmm3;                                                        \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);       /* line 2, low half */  \
    _mm_storeu_si128((__m128i*)(p_line2), xmm4);                        \
    xmm3 = _mm_unpackhi_epi8(xmm3, xmm1);       /* line 2, high half */ \
    _mm_storeu_si128((__m128i*)(p_line2+16), xmm3);
00372
/*
 * SSE2_YUV420_YVYU_ALIGNED -- intrinsics statement list for SSE2_CALL().
 *
 * Reads 8 bytes at p_v, 8 bytes at p_u and 16 bytes at each of p_y1/p_y2,
 * then writes 32 bytes at p_line1 and 32 bytes at p_line2 in Y V Y U order
 * (Cr comes first in each chroma pair).
 *
 * p_y1, p_y2, p_line1 and p_line2 must be 16-byte aligned: the luma loads
 * use _mm_load_si128() and the stores use _mm_stream_si128().
 * Uses the xmm0..xmm4 temporaries of the enclosing scope.
 *
 * Fix: the high half of the second line is now stored at p_line2+16; it was
 * previously written to p_line1+16, overwriting the first line's data.
 */
#define SSE2_YUV420_YVYU_ALIGNED                                        \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);                             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_u);                             \
    xmm0 = _mm_load_si128((__m128i *)p_y1);                             \
    xmm3 = _mm_load_si128((__m128i *)p_y2);                             \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);       /* v0 u0 v1 u1 ... */   \
    xmm2 = xmm0;                                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);       /* line 1, low half */  \
    _mm_stream_si128((__m128i*)(p_line1), xmm2);                        \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);       /* line 1, high half */ \
    _mm_stream_si128((__m128i*)(p_line1+16), xmm0);                     \
    xmm4 = xmm3;                                                        \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);       /* line 2, low half */  \
    _mm_stream_si128((__m128i*)(p_line2), xmm4);                        \
    xmm3 = _mm_unpackhi_epi8(xmm3, xmm1);       /* line 2, high half */ \
    _mm_stream_si128((__m128i*)(p_line2+16), xmm3);
00389
/*
 * SSE2_YUV420_YVYU_UNALIGNED -- intrinsics statement list for SSE2_CALL().
 *
 * Reads 8 bytes at p_v, 8 bytes at p_u and 16 bytes at each of p_y1/p_y2,
 * then writes 32 bytes at p_line1 and 32 bytes at p_line2 in Y V Y U order
 * (Cr comes first in each chroma pair). Luma loads and all stores use the
 * unaligned intrinsics, so no alignment is required. Both output addresses
 * are prefetched with _MM_HINT_NTA.
 * Uses the xmm0..xmm4 temporaries of the enclosing scope.
 *
 * Fix: the high half of the second line is now stored at p_line2+16; it was
 * previously written to p_line1+16, overwriting the first line's data.
 */
#define SSE2_YUV420_YVYU_UNALIGNED                                      \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);                             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_u);                             \
    xmm0 = _mm_loadu_si128((__m128i *)p_y1);                            \
    xmm3 = _mm_loadu_si128((__m128i *)p_y2);                            \
    _mm_prefetch(p_line1, _MM_HINT_NTA);                                \
    _mm_prefetch(p_line2, _MM_HINT_NTA);                                \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);       /* v0 u0 v1 u1 ... */   \
    xmm2 = xmm0;                                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);       /* line 1, low half */  \
    _mm_storeu_si128((__m128i*)(p_line1), xmm2);                        \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);       /* line 1, high half */ \
    _mm_storeu_si128((__m128i*)(p_line1+16), xmm0);                     \
    xmm4 = xmm3;                                                        \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);       /* line 2, low half */  \
    _mm_storeu_si128((__m128i*)(p_line2), xmm4);                        \
    xmm3 = _mm_unpackhi_epi8(xmm3, xmm1);       /* line 2, high half */ \
    _mm_storeu_si128((__m128i*)(p_line2+16), xmm3);
00408
/*
 * SSE2_YUV420_UYVY_ALIGNED -- intrinsics statement list for SSE2_CALL().
 *
 * Reads 8 bytes at p_u, 8 bytes at p_v and 16 bytes at each of p_y1/p_y2,
 * then writes 32 bytes at p_line1 and 32 bytes at p_line2 in U Y V Y order
 * (chroma precedes each luma sample).
 *
 * p_y1, p_y2, p_line1 and p_line2 must be 16-byte aligned: the luma loads
 * use _mm_load_si128() and the stores use _mm_stream_si128().
 * Uses the xmm0..xmm4 temporaries of the enclosing scope.
 *
 * Fix: the high half of the second line is now stored at p_line2+16; it was
 * previously written to p_line1+16, overwriting the first line's data.
 */
#define SSE2_YUV420_UYVY_ALIGNED                                        \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);                             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);                             \
    xmm0 = _mm_load_si128((__m128i *)p_y1);                             \
    xmm3 = _mm_load_si128((__m128i *)p_y2);                             \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);       /* u0 v0 u1 v1 ... */   \
    xmm2 = xmm1;                                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm0);       /* line 1, low half */  \
    _mm_stream_si128((__m128i*)(p_line1), xmm2);                        \
    xmm2 = xmm1;                                                        \
    xmm2 = _mm_unpackhi_epi8(xmm2, xmm0);       /* line 1, high half */ \
    _mm_stream_si128((__m128i*)(p_line1+16), xmm2);                     \
    xmm4 = xmm1;                                                        \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm3);       /* line 2, low half */  \
    _mm_stream_si128((__m128i*)(p_line2), xmm4);                        \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm3);       /* line 2, high half */ \
    _mm_stream_si128((__m128i*)(p_line2+16), xmm1);
00426
/*
 * SSE2_YUV420_UYVY_UNALIGNED -- intrinsics statement list for SSE2_CALL().
 *
 * Reads 8 bytes at p_u, 8 bytes at p_v and 16 bytes at each of p_y1/p_y2,
 * then writes 32 bytes at p_line1 and 32 bytes at p_line2 in U Y V Y order
 * (chroma precedes each luma sample). Luma loads and all stores use the
 * unaligned intrinsics, so no alignment is required. Both output addresses
 * are prefetched with _MM_HINT_NTA.
 * Uses the xmm0..xmm4 temporaries of the enclosing scope.
 *
 * Fix: the high half of the second line is now stored at p_line2+16; it was
 * previously written to p_line1+16, overwriting the first line's data.
 */
#define SSE2_YUV420_UYVY_UNALIGNED                                      \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);                             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);                             \
    xmm0 = _mm_loadu_si128((__m128i *)p_y1);                            \
    xmm3 = _mm_loadu_si128((__m128i *)p_y2);                            \
    _mm_prefetch(p_line1, _MM_HINT_NTA);                                \
    _mm_prefetch(p_line2, _MM_HINT_NTA);                                \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);       /* u0 v0 u1 v1 ... */   \
    xmm2 = xmm1;                                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm0);       /* line 1, low half */  \
    _mm_storeu_si128((__m128i*)(p_line1), xmm2);                        \
    xmm2 = xmm1;                                                        \
    xmm2 = _mm_unpackhi_epi8(xmm2, xmm0);       /* line 1, high half */ \
    _mm_storeu_si128((__m128i*)(p_line1+16), xmm2);                     \
    xmm4 = xmm1;                                                        \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm3);       /* line 2, low half */  \
    _mm_storeu_si128((__m128i*)(p_line2), xmm4);                        \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm3);       /* line 2, high half */ \
    _mm_storeu_si128((__m128i*)(p_line2+16), xmm1);
00446
00447 #endif
00448
00449 #endif
00450
00451
00452
/*
 * C_YUV420_YVYU( ) -- plain C conversion of two pixels on two lines.
 *
 * Consumes two luma bytes from each of p_y1/p_y2 and one byte from each of
 * p_v/p_u, and emits four bytes (Y V Y U) to p_line1 and to p_line2; both
 * lines receive the same chroma bytes. Every pointer used is
 * post-incremented, so consecutive invocations walk through the buffers.
 *
 * Wrapped in do { } while(0) so that the macro behaves as a single
 * statement (safe under an unbraced if/else); invoke it followed by ';'.
 */
#define C_YUV420_YVYU( )                                            \
    do {                                                            \
        *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;         \
        *(p_line1)++ = *(p_line2)++ = *(p_v)++;                     \
        *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;         \
        *(p_line1)++ = *(p_line2)++ = *(p_u)++;                     \
    } while(0)

/*
 * C_YUV420_Y211( ) -- plain C conversion step producing Y211 output.
 *
 * Emits four bytes to p_line1 and to p_line2: a luma byte, *p_u - 0x80,
 * the luma byte two positions further on, and *p_v - 0x80. Luma is taken
 * from every other position (p_y1/p_y2 advance by 4 in total), and
 * p_u/p_v each advance by 2; p_line1/p_line2 advance by 4.
 *
 * Wrapped in do { } while(0) so that the macro behaves as a single
 * statement (safe under an unbraced if/else); invoke it followed by ';'.
 */
#define C_YUV420_Y211( )                                            \
    do {                                                            \
        *(p_line1)++ = *(p_y1); p_y1 += 2;                          \
        *(p_line2)++ = *(p_y2); p_y2 += 2;                          \
        *(p_line1)++ = *(p_line2)++ = *(p_u) - 0x80; p_u += 2;      \
        *(p_line1)++ = *(p_y1); p_y1 += 2;                          \
        *(p_line2)++ = *(p_y2); p_y2 += 2;                          \
        *(p_line1)++ = *(p_line2)++ = *(p_v) - 0x80; p_v += 2;      \
    } while(0)

00467
/*
 * C_YUV420_YUYV( ) -- plain C conversion of two pixels on two lines.
 *
 * Consumes two luma bytes from each of p_y1/p_y2 and one byte from each of
 * p_u/p_v, and emits four bytes (Y U Y V) to p_line1 and to p_line2; both
 * lines receive the same chroma bytes. Every pointer used is
 * post-incremented, so consecutive invocations walk through the buffers.
 *
 * Wrapped in do { } while(0) so that the macro behaves as a single
 * statement (safe under an unbraced if/else); invoke it followed by ';'.
 */
#define C_YUV420_YUYV( )                                            \
    do {                                                            \
        *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;         \
        *(p_line1)++ = *(p_line2)++ = *(p_u)++;                     \
        *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;         \
        *(p_line1)++ = *(p_line2)++ = *(p_v)++;                     \
    } while(0)

/*
 * C_YUV420_UYVY( ) -- plain C conversion of two pixels on two lines.
 *
 * Consumes two luma bytes from each of p_y1/p_y2 and one byte from each of
 * p_u/p_v, and emits four bytes (U Y V Y) to p_line1 and to p_line2; both
 * lines receive the same chroma bytes. Every pointer used is
 * post-incremented, so consecutive invocations walk through the buffers.
 *
 * Wrapped in do { } while(0) so that the macro behaves as a single
 * statement (safe under an unbraced if/else); invoke it followed by ';'.
 */
#define C_YUV420_UYVY( )                                            \
    do {                                                            \
        *(p_line1)++ = *(p_line2)++ = *(p_u)++;                     \
        *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;         \
        *(p_line1)++ = *(p_line2)++ = *(p_v)++;                     \
        *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;         \
    } while(0)