00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #ifdef MODULE_NAME_IS_i422_yuy2_mmx
00026
00027 #if defined(CAN_COMPILE_MMX)
00028
00029
00030
/*
 * MMX_CALL: run one inline-assembly MMX conversion step, then advance the
 * working pointers.
 *
 * MMX_INSTRUCTIONS is an assembler template string. Inside it the operands
 * are: %0 = p_line, %1 = p_y, %2 = p_u, %3 = p_v.
 * The variables p_line, p_y, p_u and p_v must be in scope at the call site.
 * After the asm statement p_line advances by 16 bytes, p_y by 8 and
 * p_u / p_v by 4 each.
 *
 * The clobber list declares mm0-mm2 (overwritten by the templates in this
 * file) and "memory" (the templates store through %0, which is only passed
 * as a register input), so the compiler does not keep stale values cached
 * across the statement.
 */
#define MMX_CALL(MMX_INSTRUCTIONS)                  \
    do {                                            \
        __asm__ __volatile__(                       \
            ".p2align 3 \n\t"                       \
            MMX_INSTRUCTIONS                        \
            :                                       \
            : "r" (p_line), "r" (p_y),              \
              "r" (p_u), "r" (p_v)                  \
            : "mm0", "mm1", "mm2", "memory" );      \
        p_line += 16; p_y += 8;                     \
        p_u += 4; p_v += 4;                         \
    } while(0)

/* MMX_END: issue "emms" once the MMX work is finished. */
#define MMX_END __asm__ __volatile__ ( "emms" )
00044
/*
 * MMX template: interleave 8 Y, 4 U and 4 V bytes into 16 bytes of
 * packed Y U Y V.
 * Operands: %0 = destination, %1 = Y, %2 = U, %3 = V.
 * Register diagrams list bytes from most to least significant.
 * Written as adjacent string literals (one instruction each) instead of a
 * single literal continued with backslash-newline.
 */
#define MMX_YUV422_YUYV                                                      \
    "movq        (%1), %%mm0  \n\t" /* mm0 = y7 y6 y5 y4 y3 y2 y1 y0 */      \
    "movd        (%2), %%mm1  \n\t" /* mm1 = 00 00 00 00 u3 u2 u1 u0 */      \
    "movd        (%3), %%mm2  \n\t" /* mm2 = 00 00 00 00 v3 v2 v1 v0 */      \
    "punpcklbw  %%mm2, %%mm1  \n\t" /* mm1 = v3 u3 v2 u2 v1 u1 v0 u0 */      \
    "movq       %%mm0, %%mm2  \n\t" /* mm2 = y7 y6 y5 y4 y3 y2 y1 y0 */      \
    "punpcklbw  %%mm1, %%mm2  \n\t" /* mm2 = v1 y3 u1 y2 v0 y1 u0 y0 */      \
    "movq       %%mm2, (%0)   \n\t" /* store output bytes 0-7        */      \
    "punpckhbw  %%mm1, %%mm0  \n\t" /* mm0 = v3 y7 u3 y6 v2 y5 u2 y4 */      \
    "movq       %%mm0, 8(%0)  \n\t" /* store output bytes 8-15       */
00056
/*
 * MMX template: interleave 8 Y, 4 U and 4 V bytes into 16 bytes of
 * packed Y V Y U (U and V swapped relative to YUYV).
 * Operands: %0 = destination, %1 = Y, %2 = U, %3 = V.
 * Register diagrams list bytes from most to least significant.
 */
#define MMX_YUV422_YVYU                                                      \
    "movq        (%1), %%mm0  \n\t" /* mm0 = y7 y6 y5 y4 y3 y2 y1 y0 */      \
    "movd        (%2), %%mm2  \n\t" /* mm2 = 00 00 00 00 u3 u2 u1 u0 */      \
    "movd        (%3), %%mm1  \n\t" /* mm1 = 00 00 00 00 v3 v2 v1 v0 */      \
    "punpcklbw  %%mm2, %%mm1  \n\t" /* mm1 = u3 v3 u2 v2 u1 v1 u0 v0 */      \
    "movq       %%mm0, %%mm2  \n\t" /* mm2 = y7 y6 y5 y4 y3 y2 y1 y0 */      \
    "punpcklbw  %%mm1, %%mm2  \n\t" /* mm2 = u1 y3 v1 y2 u0 y1 v0 y0 */      \
    "movq       %%mm2, (%0)   \n\t" /* store YVYU output bytes 0-7   */      \
    "punpckhbw  %%mm1, %%mm0  \n\t" /* mm0 = u3 y7 v3 y6 u2 y5 v2 y4 */      \
    "movq       %%mm0, 8(%0)  \n\t" /* store YVYU output bytes 8-15  */
00068
/*
 * MMX template: interleave 8 Y, 4 U and 4 V bytes into 16 bytes of
 * packed U Y V Y (chroma first).
 * Operands: %0 = destination, %1 = Y, %2 = U, %3 = V.
 * Register diagrams list bytes from most to least significant.
 */
#define MMX_YUV422_UYVY                                                      \
    "movq        (%1), %%mm0  \n\t" /* mm0 = y7 y6 y5 y4 y3 y2 y1 y0 */      \
    "movd        (%2), %%mm1  \n\t" /* mm1 = 00 00 00 00 u3 u2 u1 u0 */      \
    "movd        (%3), %%mm2  \n\t" /* mm2 = 00 00 00 00 v3 v2 v1 v0 */      \
    "punpcklbw  %%mm2, %%mm1  \n\t" /* mm1 = v3 u3 v2 u2 v1 u1 v0 u0 */      \
    "movq       %%mm1, %%mm2  \n\t" /* mm2 = v3 u3 v2 u2 v1 u1 v0 u0 */      \
    "punpcklbw  %%mm0, %%mm2  \n\t" /* mm2 = y3 v1 y2 u1 y1 v0 y0 u0 */      \
    "movq       %%mm2, (%0)   \n\t" /* store output bytes 0-7        */      \
    "punpckhbw  %%mm0, %%mm1  \n\t" /* mm1 = y7 v3 y6 u3 y5 v2 y4 u2 */      \
    "movq       %%mm1, 8(%0)  \n\t" /* store output bytes 8-15       */
00080
/* Placeholder template for Y211: it contains no instructions. */
#define MMX_YUV422_Y211 " \n"
00083
00084 #elif defined(HAVE_MMX_INTRINSICS)
00085
00086
00087
00088 #include <mmintrin.h>
00089
/*
 * MMX_CALL (intrinsics flavour): declare the three __m64 temporaries that
 * the instruction macros work on, expand MMX_INSTRUCTIONS, then advance the
 * pointers: p_line by 16, p_y by 8, p_u and p_v by 4 each.
 * p_line, p_y, p_u and p_v must be in scope at the call site.
 */
#define MMX_CALL(MMX_INSTRUCTIONS)      \
    do {                                \
        __m64 mm0, mm1, mm2;            \
        MMX_INSTRUCTIONS                \
        p_line += 16;                   \
        p_y    += 8;                    \
        p_u    += 4;                    \
        p_v    += 4;                    \
    } while(0)

/* MMX_END: leave MMX state via _mm_empty(). */
#define MMX_END _mm_empty()
00099
/*
 * Intrinsics version of the YUYV step: reads 8 bytes at p_y and 4 bytes
 * each at p_u and p_v, writes 16 bytes of Y U Y V at p_line.
 * Uses the temporaries mm0, mm1, mm2 declared by MMX_CALL.
 */
#define MMX_YUV422_YUYV                                                  \
    mm0 = (__m64)*(uint64_t*)p_y;           /* 8 luma bytes          */  \
    mm1 = _mm_cvtsi32_si64(*(int*)p_u);     /* 4 U bytes, zero-ext.  */  \
    mm2 = _mm_cvtsi32_si64(*(int*)p_v);     /* 4 V bytes, zero-ext.  */  \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);       /* u0 v0 u1 v1 ...       */  \
    mm2 = _mm_unpacklo_pi8(mm0, mm1);       /* y0 u0 y1 v0 ... (low) */  \
    mm0 = _mm_unpackhi_pi8(mm0, mm1);       /* y4 u2 y5 v2 ... (high)*/  \
    *(uint64_t*)p_line     = (uint64_t)mm2;                              \
    *(uint64_t*)(p_line+8) = (uint64_t)mm0;
00110
/*
 * Intrinsics version of the YVYU step: reads 8 bytes at p_y and 4 bytes
 * each at p_u and p_v, writes 16 bytes of Y V Y U at p_line.
 * Uses the temporaries mm0, mm1, mm2 declared by MMX_CALL.
 */
#define MMX_YUV422_YVYU                                                  \
    mm0 = (__m64)*(uint64_t*)p_y;           /* 8 luma bytes          */  \
    mm2 = _mm_cvtsi32_si64(*(int*)p_u);     /* 4 U bytes, zero-ext.  */  \
    mm1 = _mm_cvtsi32_si64(*(int*)p_v);     /* 4 V bytes, zero-ext.  */  \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);       /* v0 u0 v1 u1 ...       */  \
    mm2 = _mm_unpacklo_pi8(mm0, mm1);       /* y0 v0 y1 u0 ... (low) */  \
    mm0 = _mm_unpackhi_pi8(mm0, mm1);       /* y4 v2 y5 u2 ... (high)*/  \
    *(uint64_t*)p_line     = (uint64_t)mm2;                              \
    *(uint64_t*)(p_line+8) = (uint64_t)mm0;
00121
/*
 * Intrinsics version of the UYVY step: reads 8 bytes at p_y and 4 bytes
 * each at p_u and p_v, writes 16 bytes of U Y V Y at p_line.
 * Uses the temporaries mm0, mm1, mm2 declared by MMX_CALL.
 */
#define MMX_YUV422_UYVY                                                  \
    mm0 = (__m64)*(uint64_t*)p_y;           /* 8 luma bytes          */  \
    mm1 = _mm_cvtsi32_si64(*(int*)p_u);     /* 4 U bytes, zero-ext.  */  \
    mm2 = _mm_cvtsi32_si64(*(int*)p_v);     /* 4 V bytes, zero-ext.  */  \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);       /* u0 v0 u1 v1 ...       */  \
    mm2 = _mm_unpacklo_pi8(mm1, mm0);       /* u0 y0 v0 y1 ... (low) */  \
    mm1 = _mm_unpackhi_pi8(mm1, mm0);       /* u2 y4 v2 y5 ... (high)*/  \
    *(uint64_t*)p_line     = (uint64_t)mm2;                              \
    *(uint64_t*)(p_line+8) = (uint64_t)mm1;
00132
00133 #endif
00134
00135 #elif defined( MODULE_NAME_IS_i422_yuy2_sse2 )
00136
00137 #if defined(CAN_COMPILE_SSE2)
00138
00139
00140
/*
 * SSE2_CALL: run one inline-assembly SSE2 conversion step, then advance the
 * working pointers.
 *
 * SSE2_INSTRUCTIONS is an assembler template string. Inside it the operands
 * are: %0 = p_line, %1 = p_y, %2 = p_u, %3 = p_v.
 * The variables p_line, p_y, p_u and p_v must be in scope at the call site.
 * After the asm statement p_line advances by 32 bytes, p_y by 16 and
 * p_u / p_v by 8 each.
 *
 * The clobber list declares xmm0-xmm2 (overwritten by the templates in this
 * file) and "memory" (the templates store through %0, which is only passed
 * as a register input).
 */
#define SSE2_CALL(SSE2_INSTRUCTIONS)                \
    do {                                            \
        __asm__ __volatile__(                       \
            ".p2align 3 \n\t"                       \
            SSE2_INSTRUCTIONS                       \
            :                                       \
            : "r" (p_line), "r" (p_y),              \
              "r" (p_u), "r" (p_v)                  \
            : "xmm0", "xmm1", "xmm2", "memory" );   \
        p_line += 32; p_y += 16;                    \
        p_u += 8; p_v += 8;                         \
    } while(0)

/* SSE2_END: store fence (with a compiler memory barrier) after the
 * conversion; the aligned templates use non-temporal movntdq stores. */
#define SSE2_END __asm__ __volatile__ ( "sfence" ::: "memory" )
00154
/*
 * SSE2 template: interleave 16 Y, 8 U and 8 V bytes into 32 bytes of
 * packed Y U Y V, using aligned loads (movdqa) and non-temporal stores
 * (movntdq). %1 and %0 must therefore be 16-byte aligned.
 * Operands: %0 = destination, %1 = Y, %2 = U, %3 = V.
 * Register diagrams list bytes from most to least significant.
 */
#define SSE2_YUV422_YUYV_ALIGNED                                                  \
    "movdqa       (%1), %%xmm0  \n\t" /* xmm0 = y15 ... y0                    */  \
    "movq         (%2), %%xmm1  \n\t" /* xmm1 = 0 (x8) u7 ... u0              */  \
    "movq         (%3), %%xmm2  \n\t" /* xmm2 = 0 (x8) v7 ... v0              */  \
    "punpcklbw  %%xmm2, %%xmm1  \n\t" /* xmm1 = v7 u7 ... v0 u0               */  \
    "movdqa     %%xmm0, %%xmm2  \n\t" /* xmm2 = y15 ... y0                    */  \
    "punpcklbw  %%xmm1, %%xmm2  \n\t" /* xmm2 = v3 y7 u3 y6 ... v0 y1 u0 y0   */  \
    "movntdq    %%xmm2, (%0)    \n\t" /* store output bytes 0-15              */  \
    "punpckhbw  %%xmm1, %%xmm0  \n\t" /* xmm0 = v7 y15 u7 y14 ... v4 y9 u4 y8 */  \
    "movntdq    %%xmm0, 16(%0)  \n\t" /* store output bytes 16-31             */
00166
/*
 * SSE2 template: interleave 16 Y, 8 U and 8 V bytes into 32 bytes of
 * packed Y U Y V, using unaligned loads and stores (movdqu).
 * Operands: %0 = destination, %1 = Y, %2 = U, %3 = V.
 * Register diagrams list bytes from most to least significant.
 */
#define SSE2_YUV422_YUYV_UNALIGNED                                                \
    "movdqu       (%1), %%xmm0  \n\t" /* xmm0 = y15 ... y0                    */  \
    "movq         (%2), %%xmm1  \n\t" /* xmm1 = 0 (x8) u7 ... u0              */  \
    "movq         (%3), %%xmm2  \n\t" /* xmm2 = 0 (x8) v7 ... v0              */  \
    "prefetchnta  (%0)          \n\t" /* non-temporal prefetch hint on output */  \
    "punpcklbw  %%xmm2, %%xmm1  \n\t" /* xmm1 = v7 u7 ... v0 u0               */  \
    "movdqa     %%xmm0, %%xmm2  \n\t" /* xmm2 = y15 ... y0                    */  \
    "punpcklbw  %%xmm1, %%xmm2  \n\t" /* xmm2 = v3 y7 u3 y6 ... v0 y1 u0 y0   */  \
    "movdqu     %%xmm2, (%0)    \n\t" /* store output bytes 0-15              */  \
    "punpckhbw  %%xmm1, %%xmm0  \n\t" /* xmm0 = v7 y15 u7 y14 ... v4 y9 u4 y8 */  \
    "movdqu     %%xmm0, 16(%0)  \n\t" /* store output bytes 16-31             */
00179
/*
 * SSE2 template: interleave 16 Y, 8 U and 8 V bytes into 32 bytes of
 * packed Y V Y U, using aligned loads (movdqa) and non-temporal stores
 * (movntdq). %1 and %0 must therefore be 16-byte aligned.
 * Operands: %0 = destination, %1 = Y, %2 = U, %3 = V.
 * Register diagrams list bytes from most to least significant.
 */
#define SSE2_YUV422_YVYU_ALIGNED                                                  \
    "movdqa       (%1), %%xmm0  \n\t" /* xmm0 = y15 ... y0                    */  \
    "movq         (%2), %%xmm2  \n\t" /* xmm2 = 0 (x8) u7 ... u0              */  \
    "movq         (%3), %%xmm1  \n\t" /* xmm1 = 0 (x8) v7 ... v0              */  \
    "punpcklbw  %%xmm2, %%xmm1  \n\t" /* xmm1 = u7 v7 ... u0 v0               */  \
    "movdqa     %%xmm0, %%xmm2  \n\t" /* xmm2 = y15 ... y0                    */  \
    "punpcklbw  %%xmm1, %%xmm2  \n\t" /* xmm2 = u3 y7 v3 y6 ... u0 y1 v0 y0   */  \
    "movntdq    %%xmm2, (%0)    \n\t" /* store YVYU output bytes 0-15         */  \
    "punpckhbw  %%xmm1, %%xmm0  \n\t" /* xmm0 = u7 y15 v7 y14 ... u4 y9 v4 y8 */  \
    "movntdq    %%xmm0, 16(%0)  \n\t" /* store YVYU output bytes 16-31        */
00191
/*
 * SSE2 template: interleave 16 Y, 8 U and 8 V bytes into 32 bytes of
 * packed Y V Y U, using unaligned loads and stores (movdqu).
 * Operands: %0 = destination, %1 = Y, %2 = U, %3 = V.
 * Register diagrams list bytes from most to least significant.
 */
#define SSE2_YUV422_YVYU_UNALIGNED                                                \
    "movdqu       (%1), %%xmm0  \n\t" /* xmm0 = y15 ... y0                    */  \
    "movq         (%2), %%xmm2  \n\t" /* xmm2 = 0 (x8) u7 ... u0              */  \
    "movq         (%3), %%xmm1  \n\t" /* xmm1 = 0 (x8) v7 ... v0              */  \
    "prefetchnta  (%0)          \n\t" /* non-temporal prefetch hint on output */  \
    "punpcklbw  %%xmm2, %%xmm1  \n\t" /* xmm1 = u7 v7 ... u0 v0               */  \
    "movdqa     %%xmm0, %%xmm2  \n\t" /* xmm2 = y15 ... y0                    */  \
    "punpcklbw  %%xmm1, %%xmm2  \n\t" /* xmm2 = u3 y7 v3 y6 ... u0 y1 v0 y0   */  \
    "movdqu     %%xmm2, (%0)    \n\t" /* store YVYU output bytes 0-15         */  \
    "punpckhbw  %%xmm1, %%xmm0  \n\t" /* xmm0 = u7 y15 v7 y14 ... u4 y9 v4 y8 */  \
    "movdqu     %%xmm0, 16(%0)  \n\t" /* store YVYU output bytes 16-31        */
00204
/*
 * SSE2 template: interleave 16 Y, 8 U and 8 V bytes into 32 bytes of
 * packed U Y V Y, using aligned loads (movdqa) and non-temporal stores
 * (movntdq). %1 and %0 must therefore be 16-byte aligned.
 * Operands: %0 = destination, %1 = Y, %2 = U, %3 = V.
 * Register diagrams list bytes from most to least significant.
 */
#define SSE2_YUV422_UYVY_ALIGNED                                                  \
    "movdqa       (%1), %%xmm0  \n\t" /* xmm0 = y15 ... y0                    */  \
    "movq         (%2), %%xmm1  \n\t" /* xmm1 = 0 (x8) u7 ... u0              */  \
    "movq         (%3), %%xmm2  \n\t" /* xmm2 = 0 (x8) v7 ... v0              */  \
    "punpcklbw  %%xmm2, %%xmm1  \n\t" /* xmm1 = v7 u7 ... v0 u0               */  \
    "movdqa     %%xmm1, %%xmm2  \n\t" /* xmm2 = v7 u7 ... v0 u0               */  \
    "punpcklbw  %%xmm0, %%xmm2  \n\t" /* xmm2 = y7 v3 y6 u3 ... y1 v0 y0 u0   */  \
    "movntdq    %%xmm2, (%0)    \n\t" /* store output bytes 0-15              */  \
    "punpckhbw  %%xmm0, %%xmm1  \n\t" /* xmm1 = y15 v7 y14 u7 ... y9 v4 y8 u4 */  \
    "movntdq    %%xmm1, 16(%0)  \n\t" /* store output bytes 16-31             */
00216
/*
 * SSE2 template: interleave 16 Y, 8 U and 8 V bytes into 32 bytes of
 * packed U Y V Y, using unaligned loads and stores (movdqu).
 * Operands: %0 = destination, %1 = Y, %2 = U, %3 = V.
 * Register diagrams list bytes from most to least significant.
 */
#define SSE2_YUV422_UYVY_UNALIGNED                                                \
    "movdqu       (%1), %%xmm0  \n\t" /* xmm0 = y15 ... y0                    */  \
    "movq         (%2), %%xmm1  \n\t" /* xmm1 = 0 (x8) u7 ... u0              */  \
    "movq         (%3), %%xmm2  \n\t" /* xmm2 = 0 (x8) v7 ... v0              */  \
    "prefetchnta  (%0)          \n\t" /* non-temporal prefetch hint on output */  \
    "punpcklbw  %%xmm2, %%xmm1  \n\t" /* xmm1 = v7 u7 ... v0 u0               */  \
    "movdqa     %%xmm1, %%xmm2  \n\t" /* xmm2 = v7 u7 ... v0 u0               */  \
    "punpcklbw  %%xmm0, %%xmm2  \n\t" /* xmm2 = y7 v3 y6 u3 ... y1 v0 y0 u0   */  \
    "movdqu     %%xmm2, (%0)    \n\t" /* store output bytes 0-15              */  \
    "punpckhbw  %%xmm0, %%xmm1  \n\t" /* xmm1 = y15 v7 y14 u7 ... y9 v4 y8 u4 */  \
    "movdqu     %%xmm1, 16(%0)  \n\t" /* store output bytes 16-31             */
00229
00230 #elif defined(HAVE_SSE2_INTRINSICS)
00231
00232
00233
00234 #include <emmintrin.h>
00235
/*
 * SSE2_CALL (intrinsics flavour): declare the three __m128i temporaries
 * that the instruction macros work on, expand SSE2_INSTRUCTIONS, then
 * advance the pointers: p_line by 32, p_y by 16, p_u and p_v by 8 each.
 * p_line, p_y, p_u and p_v must be in scope at the call site.
 */
#define SSE2_CALL(SSE2_INSTRUCTIONS)    \
    do {                                \
        __m128i xmm0, xmm1, xmm2;       \
        SSE2_INSTRUCTIONS               \
        p_line += 32;                   \
        p_y    += 16;                   \
        p_u    += 8;                    \
        p_v    += 8;                    \
    } while(0)

/* SSE2_END: store fence via _mm_sfence(). */
#define SSE2_END _mm_sfence()
00245
/*
 * Intrinsics YUYV step, aligned: 16 bytes from p_y (aligned load), 8 bytes
 * each from p_u and p_v, 32 bytes of Y U Y V streamed to p_line
 * (non-temporal stores). Uses xmm0-xmm2 declared by SSE2_CALL.
 */
#define SSE2_YUV422_YUYV_ALIGNED                                            \
    xmm0 = _mm_load_si128((__m128i *)p_y);          /* 16 luma bytes    */  \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);         /* 8 U bytes        */  \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);         /* 8 V bytes        */  \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           /* u0 v0 u1 v1 ...  */  \
    xmm2 = _mm_unpacklo_epi8(xmm0, xmm1);           /* low 16 outputs   */  \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);           /* high 16 outputs  */  \
    _mm_stream_si128((__m128i*)(p_line), xmm2);                             \
    _mm_stream_si128((__m128i*)(p_line+16), xmm0);
00256
/*
 * Intrinsics YUYV step, unaligned: 16 bytes from p_y, 8 bytes each from
 * p_u and p_v, 32 bytes of Y U Y V written to p_line with unaligned
 * loads/stores. Uses xmm0-xmm2 declared by SSE2_CALL.
 */
#define SSE2_YUV422_YUYV_UNALIGNED                                          \
    xmm0 = _mm_loadu_si128((__m128i *)p_y);         /* 16 luma bytes    */  \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);         /* 8 U bytes        */  \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);         /* 8 V bytes        */  \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           /* u0 v0 u1 v1 ...  */  \
    xmm2 = _mm_unpacklo_epi8(xmm0, xmm1);           /* low 16 outputs   */  \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);           /* high 16 outputs  */  \
    _mm_storeu_si128((__m128i*)(p_line), xmm2);                             \
    _mm_storeu_si128((__m128i*)(p_line+16), xmm0);
00267
/*
 * Intrinsics YVYU step, aligned: 16 bytes from p_y (aligned load), 8 bytes
 * each from p_u and p_v, 32 bytes of Y V Y U streamed to p_line
 * (non-temporal stores). Uses xmm0-xmm2 declared by SSE2_CALL.
 */
#define SSE2_YUV422_YVYU_ALIGNED                                            \
    xmm0 = _mm_load_si128((__m128i *)p_y);          /* 16 luma bytes    */  \
    xmm2 = _mm_loadl_epi64((__m128i *)p_u);         /* 8 U bytes        */  \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);         /* 8 V bytes        */  \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           /* v0 u0 v1 u1 ...  */  \
    xmm2 = _mm_unpacklo_epi8(xmm0, xmm1);           /* low 16 outputs   */  \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);           /* high 16 outputs  */  \
    _mm_stream_si128((__m128i*)(p_line), xmm2);                             \
    _mm_stream_si128((__m128i*)(p_line+16), xmm0);
00278
/*
 * Intrinsics YVYU step, unaligned: 16 bytes from p_y, 8 bytes each from
 * p_u and p_v, 32 bytes of Y V Y U written to p_line with unaligned
 * loads/stores. Uses xmm0-xmm2 declared by SSE2_CALL.
 */
#define SSE2_YUV422_YVYU_UNALIGNED                                          \
    xmm0 = _mm_loadu_si128((__m128i *)p_y);         /* 16 luma bytes    */  \
    xmm2 = _mm_loadl_epi64((__m128i *)p_u);         /* 8 U bytes        */  \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);         /* 8 V bytes        */  \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           /* v0 u0 v1 u1 ...  */  \
    xmm2 = _mm_unpacklo_epi8(xmm0, xmm1);           /* low 16 outputs   */  \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);           /* high 16 outputs  */  \
    _mm_storeu_si128((__m128i*)(p_line), xmm2);                             \
    _mm_storeu_si128((__m128i*)(p_line+16), xmm0);
00289
/*
 * Intrinsics UYVY step, aligned: 16 bytes from p_y (aligned load), 8 bytes
 * each from p_u and p_v, 32 bytes of U Y V Y streamed to p_line
 * (non-temporal stores). Uses xmm0-xmm2 declared by SSE2_CALL.
 */
#define SSE2_YUV422_UYVY_ALIGNED                                            \
    xmm0 = _mm_load_si128((__m128i *)p_y);          /* 16 luma bytes    */  \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);         /* 8 U bytes        */  \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);         /* 8 V bytes        */  \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           /* u0 v0 u1 v1 ...  */  \
    xmm2 = _mm_unpacklo_epi8(xmm1, xmm0);           /* low 16 outputs   */  \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm0);           /* high 16 outputs  */  \
    _mm_stream_si128((__m128i*)(p_line), xmm2);                             \
    _mm_stream_si128((__m128i*)(p_line+16), xmm1);
00300
/*
 * Intrinsics UYVY step, unaligned: 16 bytes from p_y, 8 bytes each from
 * p_u and p_v, 32 bytes of U Y V Y written to p_line with unaligned
 * loads/stores. Uses xmm0-xmm2 declared by SSE2_CALL.
 */
#define SSE2_YUV422_UYVY_UNALIGNED                                          \
    xmm0 = _mm_loadu_si128((__m128i *)p_y);         /* 16 luma bytes    */  \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);         /* 8 U bytes        */  \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);         /* 8 V bytes        */  \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           /* u0 v0 u1 v1 ...  */  \
    xmm2 = _mm_unpacklo_epi8(xmm1, xmm0);           /* low 16 outputs   */  \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm0);           /* high 16 outputs  */  \
    _mm_storeu_si128((__m128i*)(p_line), xmm2);                             \
    _mm_storeu_si128((__m128i*)(p_line+16), xmm1);
00311
00312 #endif
00313
00314 #endif
00315
/*
 * Plain C step: emit one packed Y U Y V group (4 bytes) at p_line from two
 * bytes at p_y and one byte each at p_u and p_v. Every pointer argument is
 * post-incremented past the bytes it consumed or produced.
 * Expands to several statements; the arguments are evaluated more than once.
 * The final line deliberately carries no line continuation, so the macro
 * ends here regardless of what follows it in the file.
 */
#define C_YUV422_YUYV( p_line, p_y, p_u, p_v )      \
    *(p_line)++ = *(p_y)++;     /* Y0 */            \
    *(p_line)++ = *(p_u)++;     /* U  */            \
    *(p_line)++ = *(p_y)++;     /* Y1 */            \
    *(p_line)++ = *(p_v)++;     /* V  */
00321
/*
 * Plain C step: emit one packed Y V Y U group (4 bytes) at p_line from two
 * bytes at p_y and one byte each at p_v and p_u. Every pointer argument is
 * post-incremented past the bytes it consumed or produced.
 * Expands to several statements; the arguments are evaluated more than once.
 * The final line deliberately carries no line continuation, so the macro
 * ends here regardless of what follows it in the file.
 */
#define C_YUV422_YVYU( p_line, p_y, p_u, p_v )      \
    *(p_line)++ = *(p_y)++;     /* Y0 */            \
    *(p_line)++ = *(p_v)++;     /* V  */            \
    *(p_line)++ = *(p_y)++;     /* Y1 */            \
    *(p_line)++ = *(p_u)++;     /* U  */
00327
/*
 * Plain C step: emit one packed U Y V Y group (4 bytes) at p_line from one
 * byte each at p_u and p_v and two bytes at p_y. Every pointer argument is
 * post-incremented past the bytes it consumed or produced.
 * Expands to several statements; the arguments are evaluated more than once.
 * The final line deliberately carries no line continuation, so the macro
 * ends here regardless of what follows it in the file.
 */
#define C_YUV422_UYVY( p_line, p_y, p_u, p_v )      \
    *(p_line)++ = *(p_u)++;     /* U  */            \
    *(p_line)++ = *(p_y)++;     /* Y0 */            \
    *(p_line)++ = *(p_v)++;     /* V  */            \
    *(p_line)++ = *(p_y)++;     /* Y1 */
00333
/*
 * Plain C step for Y211: emit 4 bytes at p_line in the order
 * Y, U - 0x80, Y, V - 0x80.
 * Each source pointer is advanced by 2 per byte read, so p_y moves 4 bytes
 * in total and p_u / p_v move 2 each; p_line moves 4.
 * Expands to several statements; the arguments are evaluated more than once.
 * The pointer arguments are parenthesised in the "+=" updates as well, and
 * the final line carries no line continuation.
 */
#define C_YUV422_Y211( p_line, p_y, p_u, p_v )              \
    *(p_line)++ = *(p_y);          (p_y) += 2;              \
    *(p_line)++ = *(p_u) - 0x80;   (p_u) += 2;              \
    *(p_line)++ = *(p_y);          (p_y) += 2;              \
    *(p_line)++ = *(p_v) - 0x80;   (p_v) += 2;
00339
00340