i420_yuy2.h

Go to the documentation of this file.
00001 /*****************************************************************************
00002  * i420_yuy2.h : YUV to YUV conversion module for vlc
00003  *****************************************************************************
00004  * Copyright (C) 2000, 2001 the VideoLAN team
00005  * $Id$
00006  *
00007  * Authors: Samuel Hocevar <sam@zoy.org>
00008  *          Damien Fouilleul <damien@videolan.org>
00009  *
00010  * This program is free software; you can redistribute it and/or modify
00011  * it under the terms of the GNU General Public License as published by
00012  * the Free Software Foundation; either version 2 of the License, or
00013  * (at your option) any later version.
00014  *
00015  * This program is distributed in the hope that it will be useful,
00016  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00017  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00018  * GNU General Public License for more details.
00019  *
00020  * You should have received a copy of the GNU General Public License
00021  * along with this program; if not, write to the Free Software
00022  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
00023  *****************************************************************************/
00024 
00025 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
00026 
00027 #if defined(CAN_COMPILE_MMX)
00028 
00029 /* MMX assembly */
00030  
/*
 * MMX_CALL (inline-assembly flavour): run one conversion step.
 *
 * The first asm statement loads 4 bytes from p_u into mm1, 4 bytes from p_v
 * into mm2, 8 bytes from p_y1 into mm0 and 8 bytes from p_y2 into mm3.
 * The second asm statement executes MMX_INSTRUCTIONS (an assembly string)
 * with p_line1 bound to operand %0 and p_line2 bound to operand %1.
 * Afterwards the pointers are advanced: p_line1/p_line2 by 16,
 * p_y1/p_y2 by 8 and p_u/p_v by 4.
 *
 * NOTE(review): neither asm statement declares register or "memory"
 * clobbers, and the second statement depends on mm0-mm3 still holding the
 * values loaded by the first one -- confirm this is safe with the compilers
 * in use.
 */
00031 #define MMX_CALL(MMX_INSTRUCTIONS)          \
00032     do {                                    \
00033     __asm__ __volatile__(                   \
00034         ".p2align 3 \n\t                    \
00035 movd       (%0), %%mm1  # Load 4 Cb           00 00 00 00 u3 u2 u1 u0     \n\
00036 movd       (%1), %%mm2  # Load 4 Cr           00 00 00 00 v3 v2 v1 v0     \n\
00037 movq       (%2), %%mm0  # Load 8 Y            y7 y6 y5 y4 y3 y2 y1 y0     \n\
00038 movq       (%3), %%mm3  # Load 8 Y            Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
00039 " \
00040         :                                   \
00041         : "r" (p_u), "r" (p_v),             \
00042           "r" (p_y1), "r" (p_y2) );         \
00043     __asm__ __volatile__(                   \
00044         ".p2align 3 \n\t"                   \
00045         MMX_INSTRUCTIONS                    \
00046         :                                   \
00047         : "r" (p_line1), "r" (p_line2) );   \
00048         p_line1 += 16; p_line2 += 16;       \
00049         p_y1 += 8; p_y2 += 8;               \
00050         p_u += 4; p_v += 4;                 \
00051     } while(0)
00052 
/* MMX_END: issue "emms" once the MMX work is finished. */
00053 #define MMX_END __asm__ __volatile__ ( "emms" )
00054 
/*
 * MMX_YUV420_YUYV: assembly string meant to be passed to MMX_CALL.
 *
 * Interleaves the Cb bytes in mm1 with the Cr bytes in mm2, then merges that
 * chroma with the luma held in mm0 (first row) and mm3 (second row) so each
 * output row has the byte order Y U Y V.  16 bytes are stored at (%0) and
 * 16 bytes at (%1); both rows receive the same chroma.
 * mm4 is used as scratch and mm0-mm3 are overwritten.
 */
00055 #define MMX_YUV420_YUYV "                                                 \n\
00056 punpcklbw %%mm2, %%mm1  #                     v3 u3 v2 u2 v1 u1 v0 u0     \n\
00057 movq      %%mm0, %%mm2  #                     y7 y6 y5 y4 y3 y2 y1 y0     \n\
00058 punpcklbw %%mm1, %%mm2  #                     v1 y3 u1 y2 v0 y1 u0 y0     \n\
00059 movq      %%mm2, (%0)   # Store low YUYV                                  \n\
00060 punpckhbw %%mm1, %%mm0  #                     v3 y7 u3 y6 v2 y5 u2 y4     \n\
00061 movq      %%mm0, 8(%0)  # Store high YUYV                                 \n\
00062 movq      %%mm3, %%mm4  #                     Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
00063 punpcklbw %%mm1, %%mm4  #                     v1 Y3 u1 Y2 v0 Y1 u0 Y0     \n\
00064 movq      %%mm4, (%1)   # Store low YUYV                                  \n\
00065 punpckhbw %%mm1, %%mm3  #                     v3 Y7 u3 Y6 v2 Y5 u2 Y4     \n\
00066 movq      %%mm3, 8(%1)  # Store high YUYV                                 \n\
00067 "
00068 
/*
 * MMX_YUV420_YVYU: assembly string meant to be passed to MMX_CALL.
 *
 * Same structure as the YUYV variant but the chroma is interleaved into mm2
 * with Cr first, and mm1 is reused for the first-row luma copy, giving the
 * byte order Y V Y U.  16 bytes are stored at (%0) and 16 bytes at (%1).
 * mm4 is used as scratch and mm0-mm3 are overwritten.
 *
 * NOTE(review): the "Store low/high YUYV" annotations inside the string
 * appear to be copied from the YUYV variant; the data stored is YVYU.
 */
00069 #define MMX_YUV420_YVYU "                                                 \n\
00070 punpcklbw %%mm1, %%mm2  #                     u3 v3 u2 v2 u1 v1 u0 v0     \n\
00071 movq      %%mm0, %%mm1  #                     y7 y6 y5 y4 y3 y2 y1 y0     \n\
00072 punpcklbw %%mm2, %%mm1  #                     u1 y3 v1 y2 u0 y1 v0 y0     \n\
00073 movq      %%mm1, (%0)   # Store low YUYV                                  \n\
00074 punpckhbw %%mm2, %%mm0  #                     u3 y7 v3 y6 u2 y5 v2 y4     \n\
00075 movq      %%mm0, 8(%0)  # Store high YUYV                                 \n\
00076 movq      %%mm3, %%mm4  #                     Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
00077 punpcklbw %%mm2, %%mm4  #                     u1 Y3 v1 Y2 u0 Y1 v0 Y0     \n\
00078 movq      %%mm4, (%1)   # Store low YUYV                                  \n\
00079 punpckhbw %%mm2, %%mm3  #                     u3 Y7 v3 Y6 u2 Y5 v2 Y4     \n\
00080 movq      %%mm3, 8(%1)  # Store high YUYV                                 \n\
00081 "
00082 
/*
 * MMX_YUV420_UYVY: assembly string meant to be passed to MMX_CALL.
 *
 * Interleaves Cb (mm1) with Cr (mm2) into mm1, then merges that chroma with
 * the luma in mm0 (first row) and mm3 (second row) with the chroma byte
 * first, giving the byte order U Y V Y.  16 bytes are stored at (%0) and
 * 16 bytes at (%1).  mm2 and mm4 are used as scratch; mm1 is overwritten by
 * the final unpack.
 *
 * NOTE(review): the lane annotation on the first punpckhbw line repeats the
 * low-half layout; for a high unpack it presumably should read
 * "y7 v3 y6 u3 y5 v2 y4 u2" -- annotation only, the instruction is unaffected.
 */
00083 #define MMX_YUV420_UYVY "                                                 \n\
00084 punpcklbw %%mm2, %%mm1  #                     v3 u3 v2 u2 v1 u1 v0 u0     \n\
00085 movq      %%mm1, %%mm2  #                     v3 u3 v2 u2 v1 u1 v0 u0     \n\
00086 punpcklbw %%mm0, %%mm2  #                     y3 v1 y2 u1 y1 v0 y0 u0     \n\
00087 movq      %%mm2, (%0)   # Store low UYVY                                  \n\
00088 movq      %%mm1, %%mm2  #                     u3 v3 u2 v2 u1 v1 u0 v0     \n\
00089 punpckhbw %%mm0, %%mm2  #                     y3 v1 y2 u1 y1 v0 y0 u0     \n\
00090 movq      %%mm2, 8(%0)  # Store high UYVY                                 \n\
00091 movq      %%mm1, %%mm4  #                     u3 v3 u2 v2 u1 v1 u0 v0     \n\
00092 punpcklbw %%mm3, %%mm4  #                     Y3 v1 Y2 u1 Y1 v0 Y0 u0     \n\
00093 movq      %%mm4, (%1)   # Store low UYVY                                  \n\
00094 punpckhbw %%mm3, %%mm1  #                     Y7 v3 Y6 u3 Y5 v2 Y4 u2     \n\
00095 movq      %%mm1, 8(%1)  # Store high UYVY                                 \n\
00096 "
00097 
/*
 * MMX_YUV420_Y211: assembly string for a Y211 conversion step (flagged as
 * broken by the FIXME below).
 *
 * It masks and packs the even bytes of the luma and chroma registers,
 * subtracts i_80w from the interleaved chroma and stores 8 bytes at (%0)
 * and 8 bytes at (%1).
 *
 * NOTE(review): the string references operands %4 and %5 and the symbols
 * i_00ffw / i_80w.  The MMX_CALL defined in this file binds only %0 and %1
 * in the asm statement that receives this string, and no definition of
 * those symbols is visible here.  It also treats mm1 as second-row luma and
 * reloads chroma into mm2/mm3, which differs from the register assignment
 * used by MMX_CALL's loads.  Confirm the intended calling convention before
 * trying to use or repair it.
 */
00098 /* FIXME: this code does not work ! Chroma seems to be wrong. */
00099 #define MMX_YUV420_Y211 "                                                 \n\
00100 movd       (%4), %%mm2  # Load 4 Cb           00 00 00 00 u3 u2 u1 u0     \n\
00101 movd       (%5), %%mm3  # Load 4 Cr           00 00 00 00 v3 v2 v1 v0     \n\
00102 pand    i_00ffw, %%mm0  # get Y even          00 Y6 00 Y4 00 Y2 00 Y0     \n\
00103 packuswb  %%mm0, %%mm0  # pack Y              y6 y4 y2 y0 y6 y4 y2 y0     \n\
00104 pand    i_00ffw, %%mm2  # get U even          00 u6 00 u4 00 u2 00 u0     \n\
00105 packuswb  %%mm2, %%mm2  # pack U              00 00 u2 u0 00 00 u2 u0     \n\
00106 pand    i_00ffw, %%mm3  # get V even          00 v6 00 v4 00 v2 00 v0     \n\
00107 packuswb  %%mm3, %%mm3  # pack V              00 00 v2 v0 00 00 v2 v0     \n\
00108 punpcklbw %%mm3, %%mm2  #                     00 00 00 00 v2 u2 v0 u0     \n\
00109 psubsw    i_80w, %%mm2  # U,V -= 128                                      \n\
00110 punpcklbw %%mm2, %%mm0  #                     v2 y6 u2 y4 v0 y2 u0 y0     \n\
00111 movq      %%mm0, (%0)   # Store YUYV                                      \n\
00112 pand    i_00ffw, %%mm1  # get Y even          00 Y6 00 Y4 00 Y2 00 Y0     \n\
00113 packuswb  %%mm1, %%mm1  # pack Y              Y6 Y4 Y2 Y0 Y6 Y4 Y2 Y0     \n\
00114 punpcklbw %%mm2, %%mm1  #                     v2 Y6 u2 Y4 v0 Y2 u0 Y0     \n\
00115 movq      %%mm1, (%1)   # Store YUYV                                      \n\
00116 "
00117 #elif defined(HAVE_MMX_INTRINSICS)
00118 
00119 /* MMX intrinsics */
00120 
00121 #include <mmintrin.h>
00122 
/*
 * MMX_CALL (intrinsics flavour): run one conversion step.
 *
 * Declares the five __m64 working variables mm0..mm4, expands MMX_BODY
 * (a sequence of statements that may use them together with the p_*
 * pointers in the caller's scope), then advances the pointers:
 * p_u/p_v by 4, p_y1/p_y2 by 8 and p_line1/p_line2 by 16.
 */
#define MMX_CALL(MMX_BODY)                      \
    do {                                        \
        __m64 mm0, mm1, mm2, mm3, mm4;          \
        MMX_BODY                                \
        p_u     += 4;                           \
        p_v     += 4;                           \
        p_y1    += 8;                           \
        p_y2    += 8;                           \
        p_line1 += 16;                          \
        p_line2 += 16;                          \
    } while (0)

/* MMX_END: leave MMX state (wraps _mm_empty) once conversion is finished. */
#define MMX_END _mm_empty()
00133  
/*
 * MMX_YUV420_YUYV (intrinsics): statement list meant to be passed to MMX_CALL.
 *
 * Reads 4 bytes from p_u and p_v and 8 bytes from p_y1 and p_y2, and writes
 * 16 bytes to p_line1 and 16 bytes to p_line2 in byte order Y U Y V.  Both
 * output rows share the same chroma.  Uses the mm0..mm4 variables from the
 * enclosing scope and does not advance any pointer.
 */
#define MMX_YUV420_YUYV                                 \
    mm1 = _mm_cvtsi32_si64(*(int*)p_u);                 \
    mm2 = _mm_cvtsi32_si64(*(int*)p_v);                 \
    mm0 = (__m64)*(uint64_t*)p_y1;                      \
    mm3 = (__m64)*(uint64_t*)p_y2;                      \
    /* mm1 <- U0 V0 U1 V1 U2 V2 U3 V3 */                \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);                   \
    /* first row: low then high half */                 \
    mm2 = _mm_unpacklo_pi8(mm0, mm1);                   \
    mm0 = _mm_unpackhi_pi8(mm0, mm1);                   \
    *(uint64_t*)p_line1       = (uint64_t)mm2;          \
    *(uint64_t*)(p_line1 + 8) = (uint64_t)mm0;          \
    /* second row: low then high half */                \
    mm4 = _mm_unpacklo_pi8(mm3, mm1);                   \
    mm3 = _mm_unpackhi_pi8(mm3, mm1);                   \
    *(uint64_t*)p_line2       = (uint64_t)mm4;          \
    *(uint64_t*)(p_line2 + 8) = (uint64_t)mm3;
00150 
/*
 * MMX_YUV420_YVYU (intrinsics): statement list meant to be passed to MMX_CALL.
 *
 * Same as the YUYV variant except that Cr is loaded into mm1 and Cb into
 * mm2, so the interleaved chroma starts with V and each output row has the
 * byte order Y V Y U.  Writes 16 bytes to p_line1 and 16 bytes to p_line2;
 * does not advance any pointer.
 */
#define MMX_YUV420_YVYU                                 \
    mm2 = _mm_cvtsi32_si64(*(int*)p_u);                 \
    mm1 = _mm_cvtsi32_si64(*(int*)p_v);                 \
    mm0 = (__m64)*(uint64_t*)p_y1;                      \
    mm3 = (__m64)*(uint64_t*)p_y2;                      \
    /* mm1 <- V0 U0 V1 U1 V2 U2 V3 U3 */                \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);                   \
    /* first row: low then high half */                 \
    mm2 = _mm_unpacklo_pi8(mm0, mm1);                   \
    mm0 = _mm_unpackhi_pi8(mm0, mm1);                   \
    *(uint64_t*)p_line1       = (uint64_t)mm2;          \
    *(uint64_t*)(p_line1 + 8) = (uint64_t)mm0;          \
    /* second row: low then high half */                \
    mm4 = _mm_unpacklo_pi8(mm3, mm1);                   \
    mm3 = _mm_unpackhi_pi8(mm3, mm1);                   \
    *(uint64_t*)p_line2       = (uint64_t)mm4;          \
    *(uint64_t*)(p_line2 + 8) = (uint64_t)mm3;
00167 
/*
 * MMX_YUV420_UYVY (intrinsics): statement list meant to be passed to MMX_CALL.
 *
 * Reads 4 bytes from p_u and p_v and 8 bytes from p_y1 and p_y2, and writes
 * 16 bytes to p_line1 and 16 bytes to p_line2 with the chroma byte first
 * (byte order U Y V Y).  Both rows share the same chroma.  Uses mm0..mm4
 * from the enclosing scope; does not advance any pointer.
 */
#define MMX_YUV420_UYVY                                 \
    mm1 = _mm_cvtsi32_si64(*(int*)p_u);                 \
    mm2 = _mm_cvtsi32_si64(*(int*)p_v);                 \
    mm0 = (__m64)*(uint64_t*)p_y1;                      \
    mm3 = (__m64)*(uint64_t*)p_y2;                      \
    /* mm1 <- U0 V0 U1 V1 U2 V2 U3 V3 */                \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);                   \
    /* first row: low half, then high half via mm2 */   \
    mm2 = _mm_unpacklo_pi8(mm1, mm0);                   \
    *(uint64_t*)p_line1       = (uint64_t)mm2;          \
    mm2 = _mm_unpackhi_pi8(mm1, mm0);                   \
    *(uint64_t*)(p_line1 + 8) = (uint64_t)mm2;          \
    /* second row: low half in mm4, high half in mm1 */ \
    mm4 = _mm_unpacklo_pi8(mm1, mm3);                   \
    mm1 = _mm_unpackhi_pi8(mm1, mm3);                   \
    *(uint64_t*)p_line2       = (uint64_t)mm4;          \
    *(uint64_t*)(p_line2 + 8) = (uint64_t)mm1;
00185 
00186 #endif
00187 
00188 #elif defined( MODULE_NAME_IS_i420_yuy2_sse2 )
00189 
00190 #if defined(CAN_COMPILE_SSE2)
00191 
00192 /* SSE2 assembly */
00193 
/*
 * SSE2_CALL (inline-assembly flavour): run one conversion step.
 *
 * The first asm statement loads 8 bytes from p_u into xmm1 and 8 bytes from
 * p_v into xmm2.  The second asm statement executes SSE2_INSTRUCTIONS (an
 * assembly string) with p_line1 as operand %0, p_line2 as %1, p_y1 as %2
 * and p_y2 as %3.  Afterwards the pointers are advanced: p_line1/p_line2 by
 * 32, p_y1/p_y2 by 16 and p_u/p_v by 8.
 *
 * NOTE(review): neither asm statement declares register or "memory"
 * clobbers, and the second statement depends on xmm1/xmm2 still holding the
 * values loaded by the first one -- confirm this is safe with the compilers
 * in use.
 * NOTE(review): the Cr lane annotation reads "v7 06 v5 ..."; presumably
 * "v7 v6 v5 ..." was meant (annotation only).
 */
00194 #define SSE2_CALL(SSE2_INSTRUCTIONS)    \
00195     do {                                \
00196     __asm__ __volatile__(               \
00197         ".p2align 3 \n\t                \
00198 movq        (%0), %%xmm1  # Load 8 Cb         u7 u6 u5 u4 u3 u2 u1 u0     \n\
00199 movq        (%1), %%xmm2  # Load 8 Cr         v7 06 v5 v4 v3 v2 v1 v0     \n\
00200 " \
00201         :                               \
00202         : "r" (p_u),  "r" (p_v) );      \
00203     __asm__ __volatile__(               \
00204         ".p2align 3 \n\t"               \
00205         SSE2_INSTRUCTIONS               \
00206         :                               \
00207         : "r" (p_line1), "r" (p_line2), \
00208           "r" (p_y1),  "r" (p_y2) );    \
00209         p_line1 += 32; p_line2 += 32;   \
00210         p_y1 += 16; p_y2 += 16;         \
00211         p_u += 8; p_v += 8;             \
00212     } while(0)
00213 
/* SSE2_END: issue "sfence", declared with a "memory" clobber. */
00214 #define SSE2_END  __asm__ __volatile__ ( "sfence" ::: "memory" )
00215 
/*
 * SSE2_YUV420_YUYV_ALIGNED: assembly string meant to be passed to SSE2_CALL.
 *
 * Loads 16 luma bytes from (%2) into xmm0 and from (%3) into xmm3 with
 * movdqa, interleaves Cb (xmm1) with Cr (xmm2), merges chroma with each luma
 * row in byte order Y U Y V, and stores 32 bytes per row with movntdq at
 * (%0)/16(%0) and (%1)/16(%1).  xmm4 is used as scratch.
 *
 * NOTE(review): the lane annotations list at most 8 lanes although the
 * registers hold 16 bytes; treat them as abbreviated.
 */
00216 #define SSE2_YUV420_YUYV_ALIGNED "                                        \n\
00217 movdqa      (%2), %%xmm0  # Load 16 Y         y15 y14 y13 .. y2 y1 y0     \n\
00218 movdqa      (%3), %%xmm3  # Load 16 Y         Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
00219 punpcklbw %%xmm2, %%xmm1  #                   v7 u7 v6 u6 .. u1 v0 u0     \n\
00220 movdqa    %%xmm0, %%xmm2  #                   y15 y14 y13 .. y2 y1 y0     \n\
00221 punpcklbw %%xmm1, %%xmm2  #                   v3 y7 u3 .. v0 y1 u0 y0     \n\
00222 movntdq   %%xmm2, (%0)    # Store low YUYV                                \n\
00223 punpckhbw %%xmm1, %%xmm0  #                   v3 y7 u3 y6 v2 y5 u2 y4     \n\
00224 movntdq   %%xmm0, 16(%0)  # Store high YUYV                               \n\
00225 movdqa    %%xmm3, %%xmm4  #                   Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
00226 punpcklbw %%xmm1, %%xmm4  #                   v1 Y3 u1 Y2 v0 Y1 u0 Y0     \n\
00227 movntdq   %%xmm4, (%1)    # Store low YUYV                                \n\
00228 punpckhbw %%xmm1, %%xmm3  #                   v3 Y7 u3 Y6 v2 Y5 u2 Y4     \n\
00229 movntdq   %%xmm3, 16(%1)  # Store high YUYV                               \n\
00230 "
00231 
/*
 * SSE2_YUV420_YUYV_UNALIGNED: assembly string meant to be passed to SSE2_CALL.
 *
 * Same data flow as the aligned YUYV variant, but the luma rows are loaded
 * from (%2)/(%3) with movdqu, the results are stored with movdqu, and
 * prefetchnta is issued on both output addresses (%0) and (%1) first.
 * 32 bytes are written per output row in byte order Y U Y V.
 *
 * NOTE(review): the lane annotations list 8 lanes although the registers
 * hold 16 bytes; treat them as abbreviated.
 */
00232 #define SSE2_YUV420_YUYV_UNALIGNED "                                      \n\
00233 movdqu      (%2), %%xmm0  # Load 16 Y         y7 y6 y5 y4 y3 y2 y1 y0     \n\
00234 movdqu      (%3), %%xmm3  # Load 16 Y         Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
00235 prefetchnta (%0)          # Tell CPU not to cache output YUYV data        \n\
00236 prefetchnta (%1)          # Tell CPU not to cache output YUYV data        \n\
00237 punpcklbw %%xmm2, %%xmm1  #                   v3 u3 v2 u2 v1 u1 v0 u0     \n\
00238 movdqa    %%xmm0, %%xmm2  #                   y7 y6 y5 y4 y3 y2 y1 y0     \n\
00239 punpcklbw %%xmm1, %%xmm2  #                   v1 y3 u1 y2 v0 y1 u0 y0     \n\
00240 movdqu    %%xmm2, (%0)    # Store low YUYV                                \n\
00241 punpckhbw %%xmm1, %%xmm0  #                   v3 y7 u3 y6 v2 y5 u2 y4     \n\
00242 movdqu    %%xmm0, 16(%0)  # Store high YUYV                               \n\
00243 movdqa    %%xmm3, %%xmm4  #                   Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
00244 punpcklbw %%xmm1, %%xmm4  #                   v1 Y3 u1 Y2 v0 Y1 u0 Y0     \n\
00245 movdqu    %%xmm4, (%1)    # Store low YUYV                                \n\
00246 punpckhbw %%xmm1, %%xmm3  #                   v3 Y7 u3 Y6 v2 Y5 u2 Y4     \n\
00247 movdqu    %%xmm3, 16(%1)  # Store high YUYV                               \n\
00248 "
00249 
/*
 * SSE2_YUV420_YVYU_ALIGNED: assembly string meant to be passed to SSE2_CALL.
 *
 * Loads 16 luma bytes from (%2) into xmm0 and from (%3) into xmm3 with
 * movdqa, interleaves the chroma into xmm2 with Cr first, reuses xmm1 for
 * the first-row luma copy, and stores 32 bytes per row with movntdq at
 * (%0)/16(%0) and (%1)/16(%1) in byte order Y V Y U.  xmm4 is scratch.
 *
 * NOTE(review): the "Store low/high YUYV" annotations appear to be copied
 * from the YUYV variant, and the lane annotations list only 8 of 16 lanes.
 */
00250 #define SSE2_YUV420_YVYU_ALIGNED "                                        \n\
00251 movdqa      (%2), %%xmm0  # Load 16 Y           y7 y6 y5 y4 y3 y2 y1 y0   \n\
00252 movdqa      (%3), %%xmm3  # Load 16 Y           Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0   \n\
00253 punpcklbw %%xmm1, %%xmm2  #                     u3 v3 u2 v2 u1 v1 u0 v0   \n\
00254 movdqa    %%xmm0, %%xmm1  #                     y7 y6 y5 y4 y3 y2 y1 y0   \n\
00255 punpcklbw %%xmm2, %%xmm1  #                     u1 y3 v1 y2 u0 y1 v0 y0   \n\
00256 movntdq   %%xmm1, (%0)    # Store low YUYV                                \n\
00257 punpckhbw %%xmm2, %%xmm0  #                     u3 y7 v3 y6 u2 y5 v2 y4   \n\
00258 movntdq   %%xmm0, 16(%0)  # Store high YUYV                               \n\
00259 movdqa    %%xmm3, %%xmm4  #                     Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0   \n\
00260 punpcklbw %%xmm2, %%xmm4  #                     u1 Y3 v1 Y2 u0 Y1 v0 Y0   \n\
00261 movntdq   %%xmm4, (%1)    # Store low YUYV                                \n\
00262 punpckhbw %%xmm2, %%xmm3  #                     u3 Y7 v3 Y6 u2 Y5 v2 Y4   \n\
00263 movntdq   %%xmm3, 16(%1)  # Store high YUYV                               \n\
00264 "
00265 
/*
 * SSE2_YUV420_YVYU_UNALIGNED: assembly string meant to be passed to SSE2_CALL.
 *
 * Same data flow as the aligned YVYU variant, but every load, store and
 * register copy uses movdqu, and prefetchnta is issued on both output
 * addresses (%0) and (%1) first.  32 bytes are written per output row in
 * byte order Y V Y U.
 *
 * NOTE(review): the "Store low/high YUYV" annotations appear to be copied
 * from the YUYV variant, and the lane annotations list only 8 of 16 lanes.
 */
00266 #define SSE2_YUV420_YVYU_UNALIGNED "                                      \n\
00267 movdqu      (%2), %%xmm0  # Load 16 Y           y7 y6 y5 y4 y3 y2 y1 y0   \n\
00268 movdqu      (%3), %%xmm3  # Load 16 Y           Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0   \n\
00269 prefetchnta (%0)          # Tell CPU not to cache output YVYU data        \n\
00270 prefetchnta (%1)          # Tell CPU not to cache output YVYU data        \n\
00271 punpcklbw %%xmm1, %%xmm2  #                     u3 v3 u2 v2 u1 v1 u0 v0   \n\
00272 movdqu    %%xmm0, %%xmm1  #                     y7 y6 y5 y4 y3 y2 y1 y0   \n\
00273 punpcklbw %%xmm2, %%xmm1  #                     u1 y3 v1 y2 u0 y1 v0 y0   \n\
00274 movdqu    %%xmm1, (%0)    # Store low YUYV                                \n\
00275 punpckhbw %%xmm2, %%xmm0  #                     u3 y7 v3 y6 u2 y5 v2 y4   \n\
00276 movdqu    %%xmm0, 16(%0)  # Store high YUYV                               \n\
00277 movdqu    %%xmm3, %%xmm4  #                     Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0   \n\
00278 punpcklbw %%xmm2, %%xmm4  #                     u1 Y3 v1 Y2 u0 Y1 v0 Y0   \n\
00279 movdqu    %%xmm4, (%1)    # Store low YUYV                                \n\
00280 punpckhbw %%xmm2, %%xmm3  #                     u3 Y7 v3 Y6 u2 Y5 v2 Y4   \n\
00281 movdqu    %%xmm3, 16(%1)  # Store high YUYV                               \n\
00282 "
00283 
/*
 * SSE2_YUV420_UYVY_ALIGNED: assembly string meant to be passed to SSE2_CALL.
 *
 * Loads 16 luma bytes from (%2) into xmm0 and from (%3) into xmm3 with
 * movdqa, interleaves Cb (xmm1) with Cr (xmm2) into xmm1, merges that chroma
 * with each luma row with the chroma byte first (byte order U Y V Y), and
 * stores 32 bytes per row with movntdq at (%0)/16(%0) and (%1)/16(%1).
 * xmm2 and xmm4 are scratch; xmm1 is overwritten by the final unpack.
 *
 * NOTE(review): the lane annotation on the first punpckhbw line repeats the
 * low-half layout, and all lane annotations list only 8 of 16 lanes.
 */
00284 #define SSE2_YUV420_UYVY_ALIGNED "                                        \n\
00285 movdqa      (%2), %%xmm0  # Load 16 Y           y7 y6 y5 y4 y3 y2 y1 y0   \n\
00286 movdqa      (%3), %%xmm3  # Load 16 Y           Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0   \n\
00287 punpcklbw %%xmm2, %%xmm1  #                     v3 u3 v2 u2 v1 u1 v0 u0   \n\
00288 movdqa    %%xmm1, %%xmm2  #                     v3 u3 v2 u2 v1 u1 v0 u0   \n\
00289 punpcklbw %%xmm0, %%xmm2  #                     y3 v1 y2 u1 y1 v0 y0 u0   \n\
00290 movntdq   %%xmm2, (%0)    # Store low UYVY                                \n\
00291 movdqa    %%xmm1, %%xmm2  #                     u3 v3 u2 v2 u1 v1 u0 v0   \n\
00292 punpckhbw %%xmm0, %%xmm2  #                     y3 v1 y2 u1 y1 v0 y0 u0   \n\
00293 movntdq   %%xmm2, 16(%0)  # Store high UYVY                               \n\
00294 movdqa    %%xmm1, %%xmm4  #                     u3 v3 u2 v2 u1 v1 u0 v0   \n\
00295 punpcklbw %%xmm3, %%xmm4  #                     Y3 v1 Y2 u1 Y1 v0 Y0 u0   \n\
00296 movntdq   %%xmm4, (%1)    # Store low UYVY                                \n\
00297 punpckhbw %%xmm3, %%xmm1  #                     Y7 v3 Y6 u3 Y5 v2 Y4 u2   \n\
00298 movntdq   %%xmm1, 16(%1)  # Store high UYVY                               \n\
00299 "
00300 
/*
 * SSE2_YUV420_UYVY_UNALIGNED: assembly string meant to be passed to SSE2_CALL.
 *
 * Same data flow as the aligned UYVY variant, but every load, store and
 * register copy uses movdqu, and prefetchnta is issued on both output
 * addresses (%0) and (%1) first.  32 bytes are written per output row in
 * byte order U Y V Y.
 *
 * NOTE(review): the lane annotation on the first punpckhbw line repeats the
 * low-half layout, and all lane annotations list only 8 of 16 lanes.
 */
00301 #define SSE2_YUV420_UYVY_UNALIGNED "                                      \n\
00302 movdqu      (%2), %%xmm0  # Load 16 Y           y7 y6 y5 y4 y3 y2 y1 y0   \n\
00303 movdqu      (%3), %%xmm3  # Load 16 Y           Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0   \n\
00304 prefetchnta (%0)          # Tell CPU not to cache output UYVY data        \n\
00305 prefetchnta (%1)          # Tell CPU not to cache output UYVY data        \n\
00306 punpcklbw %%xmm2, %%xmm1  #                     v3 u3 v2 u2 v1 u1 v0 u0   \n\
00307 movdqu    %%xmm1, %%xmm2  #                     v3 u3 v2 u2 v1 u1 v0 u0   \n\
00308 punpcklbw %%xmm0, %%xmm2  #                     y3 v1 y2 u1 y1 v0 y0 u0   \n\
00309 movdqu    %%xmm2, (%0)    # Store low UYVY                                \n\
00310 movdqu    %%xmm1, %%xmm2  #                     u3 v3 u2 v2 u1 v1 u0 v0   \n\
00311 punpckhbw %%xmm0, %%xmm2  #                     y3 v1 y2 u1 y1 v0 y0 u0   \n\
00312 movdqu    %%xmm2, 16(%0)  # Store high UYVY                               \n\
00313 movdqu    %%xmm1, %%xmm4  #                     u3 v3 u2 v2 u1 v1 u0 v0   \n\
00314 punpcklbw %%xmm3, %%xmm4  #                     Y3 v1 Y2 u1 Y1 v0 Y0 u0   \n\
00315 movdqu    %%xmm4, (%1)    # Store low UYVY                                \n\
00316 punpckhbw %%xmm3, %%xmm1  #                     Y7 v3 Y6 u3 Y5 v2 Y4 u2   \n\
00317 movdqu    %%xmm1, 16(%1)  # Store high UYVY                               \n\
00318 "
00319 
00320 #elif defined(HAVE_SSE2_INTRINSICS)
00321 
00322 /* SSE2 intrinsics */
00323 
00324 #include <emmintrin.h>
00325 
/*
 * SSE2_CALL (intrinsics flavour): run one conversion step.
 *
 * Declares the five __m128i working variables xmm0..xmm4, expands SSE2_BODY
 * (a sequence of statements that may use them together with the p_*
 * pointers in the caller's scope), then advances the pointers:
 * p_u/p_v by 8, p_y1/p_y2 by 16 and p_line1/p_line2 by 32.
 */
#define SSE2_CALL(SSE2_BODY)                        \
    do {                                            \
        __m128i xmm0, xmm1, xmm2, xmm3, xmm4;       \
        SSE2_BODY                                   \
        p_u     += 8;                               \
        p_v     += 8;                               \
        p_y1    += 16;                              \
        p_y2    += 16;                              \
        p_line1 += 32;                              \
        p_line2 += 32;                              \
    } while (0)

/* SSE2_END: store fence (wraps _mm_sfence) issued once conversion is finished. */
#define SSE2_END  _mm_sfence()
00336 
/*
 * SSE2_YUV420_YUYV_ALIGNED (intrinsics): statement list meant to be passed
 * to SSE2_CALL.
 *
 * Reads 8 bytes from p_u and p_v and 16 bytes from p_y1 and p_y2 (aligned
 * loads), and writes 32 bytes to p_line1 and 32 bytes to p_line2 with
 * non-temporal stores, in byte order Y U Y V.  Both output rows share the
 * same chroma.  Uses the xmm0..xmm4 variables from the enclosing scope and
 * does not advance any pointer.
 */
#define SSE2_YUV420_YUYV_ALIGNED                    \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);         \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);         \
    xmm0 = _mm_load_si128((__m128i *)p_y1);         \
    xmm3 = _mm_load_si128((__m128i *)p_y2);         \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           \
    xmm2 = xmm0;                                    \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);           \
    _mm_stream_si128((__m128i*)(p_line1), xmm2);    \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);           \
    _mm_stream_si128((__m128i*)(p_line1+16), xmm0); \
    xmm4 = xmm3;                                    \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);           \
    _mm_stream_si128((__m128i*)(p_line2), xmm4);    \
    xmm3 = _mm_unpackhi_epi8(xmm3, xmm1);           \
    /* upper half of the SECOND row: must go to p_line2, not p_line1 */ \
    _mm_stream_si128((__m128i*)(p_line2+16), xmm3);
00353 
/*
 * SSE2_YUV420_YUYV_UNALIGNED (intrinsics): statement list meant to be passed
 * to SSE2_CALL.
 *
 * Reads 8 bytes from p_u and p_v and 16 bytes from p_y1 and p_y2 (unaligned
 * loads), prefetches both output addresses with the NTA hint, and writes
 * 32 bytes to p_line1 and 32 bytes to p_line2 with unaligned stores, in
 * byte order Y U Y V.  Both output rows share the same chroma.  Uses the
 * xmm0..xmm4 variables from the enclosing scope and does not advance any
 * pointer.
 */
#define SSE2_YUV420_YUYV_UNALIGNED                  \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);         \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);         \
    xmm0 = _mm_loadu_si128((__m128i *)p_y1);        \
    xmm3 = _mm_loadu_si128((__m128i *)p_y2);        \
    _mm_prefetch(p_line1, _MM_HINT_NTA);            \
    _mm_prefetch(p_line2, _MM_HINT_NTA);            \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           \
    xmm2 = xmm0;                                    \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);           \
    _mm_storeu_si128((__m128i*)(p_line1), xmm2);    \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);           \
    _mm_storeu_si128((__m128i*)(p_line1+16), xmm0); \
    xmm4 = xmm3;                                    \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);           \
    _mm_storeu_si128((__m128i*)(p_line2), xmm4);    \
    xmm3 = _mm_unpackhi_epi8(xmm3, xmm1);           \
    /* upper half of the SECOND row: must go to p_line2, not p_line1 */ \
    _mm_storeu_si128((__m128i*)(p_line2+16), xmm3);
00372 
/*
 * SSE2_YUV420_YVYU_ALIGNED (intrinsics): statement list meant to be passed
 * to SSE2_CALL.
 *
 * Reads 8 bytes from p_v (into xmm1) and p_u (into xmm2) and 16 bytes from
 * p_y1 and p_y2 (aligned loads), and writes 32 bytes to p_line1 and 32 bytes
 * to p_line2 with non-temporal stores, in byte order Y V Y U.  Both output
 * rows share the same chroma.  Uses the xmm0..xmm4 variables from the
 * enclosing scope and does not advance any pointer.
 */
#define SSE2_YUV420_YVYU_ALIGNED                    \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);         \
    xmm2 = _mm_loadl_epi64((__m128i *)p_u);         \
    xmm0 = _mm_load_si128((__m128i *)p_y1);         \
    xmm3 = _mm_load_si128((__m128i *)p_y2);         \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           \
    xmm2 = xmm0;                                    \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);           \
    _mm_stream_si128((__m128i*)(p_line1), xmm2);    \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);           \
    _mm_stream_si128((__m128i*)(p_line1+16), xmm0); \
    xmm4 = xmm3;                                    \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);           \
    _mm_stream_si128((__m128i*)(p_line2), xmm4);    \
    xmm3 = _mm_unpackhi_epi8(xmm3, xmm1);           \
    /* upper half of the SECOND row: must go to p_line2, not p_line1 */ \
    _mm_stream_si128((__m128i*)(p_line2+16), xmm3);
00389 
/*
 * SSE2_YUV420_YVYU_UNALIGNED (intrinsics): statement list meant to be passed
 * to SSE2_CALL.
 *
 * Reads 8 bytes from p_v (into xmm1) and p_u (into xmm2) and 16 bytes from
 * p_y1 and p_y2 (unaligned loads), prefetches both output addresses with the
 * NTA hint, and writes 32 bytes to p_line1 and 32 bytes to p_line2 with
 * unaligned stores, in byte order Y V Y U.  Both output rows share the same
 * chroma.  Uses the xmm0..xmm4 variables from the enclosing scope and does
 * not advance any pointer.
 */
#define SSE2_YUV420_YVYU_UNALIGNED                  \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);         \
    xmm2 = _mm_loadl_epi64((__m128i *)p_u);         \
    xmm0 = _mm_loadu_si128((__m128i *)p_y1);        \
    xmm3 = _mm_loadu_si128((__m128i *)p_y2);        \
    _mm_prefetch(p_line1, _MM_HINT_NTA);            \
    _mm_prefetch(p_line2, _MM_HINT_NTA);            \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           \
    xmm2 = xmm0;                                    \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);           \
    _mm_storeu_si128((__m128i*)(p_line1), xmm2);    \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);           \
    _mm_storeu_si128((__m128i*)(p_line1+16), xmm0); \
    xmm4 = xmm3;                                    \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);           \
    _mm_storeu_si128((__m128i*)(p_line2), xmm4);    \
    xmm3 = _mm_unpackhi_epi8(xmm3, xmm1);           \
    /* upper half of the SECOND row: must go to p_line2, not p_line1 */ \
    _mm_storeu_si128((__m128i*)(p_line2+16), xmm3);
00408 
/*
 * SSE2_YUV420_UYVY_ALIGNED (intrinsics): statement list meant to be passed
 * to SSE2_CALL.
 *
 * Reads 8 bytes from p_u and p_v and 16 bytes from p_y1 and p_y2 (aligned
 * loads), and writes 32 bytes to p_line1 and 32 bytes to p_line2 with
 * non-temporal stores, chroma byte first (byte order U Y V Y).  Both output
 * rows share the same chroma.  Uses the xmm0..xmm4 variables from the
 * enclosing scope and does not advance any pointer.
 */
#define SSE2_YUV420_UYVY_ALIGNED                    \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);         \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);         \
    xmm0 = _mm_load_si128((__m128i *)p_y1);         \
    xmm3 = _mm_load_si128((__m128i *)p_y2);         \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           \
    xmm2 = xmm1;                                    \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm0);           \
    _mm_stream_si128((__m128i*)(p_line1), xmm2);    \
    xmm2 = xmm1;                                    \
    xmm2 = _mm_unpackhi_epi8(xmm2, xmm0);           \
    _mm_stream_si128((__m128i*)(p_line1+16), xmm2); \
    xmm4 = xmm1;                                    \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm3);           \
    _mm_stream_si128((__m128i*)(p_line2), xmm4);    \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm3);           \
    /* upper half of the SECOND row: must go to p_line2, not p_line1 */ \
    _mm_stream_si128((__m128i*)(p_line2+16), xmm1);
00426 
/*
 * SSE2_YUV420_UYVY_UNALIGNED (intrinsics): statement list meant to be passed
 * to SSE2_CALL.
 *
 * Reads 8 bytes from p_u and p_v and 16 bytes from p_y1 and p_y2 (unaligned
 * loads), prefetches both output addresses with the NTA hint, and writes
 * 32 bytes to p_line1 and 32 bytes to p_line2 with unaligned stores, chroma
 * byte first (byte order U Y V Y).  Both output rows share the same chroma.
 * Uses the xmm0..xmm4 variables from the enclosing scope and does not
 * advance any pointer.
 */
#define SSE2_YUV420_UYVY_UNALIGNED                  \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);         \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);         \
    xmm0 = _mm_loadu_si128((__m128i *)p_y1);        \
    xmm3 = _mm_loadu_si128((__m128i *)p_y2);        \
    _mm_prefetch(p_line1, _MM_HINT_NTA);            \
    _mm_prefetch(p_line2, _MM_HINT_NTA);            \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           \
    xmm2 = xmm1;                                    \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm0);           \
    _mm_storeu_si128((__m128i*)(p_line1), xmm2);    \
    xmm2 = xmm1;                                    \
    xmm2 = _mm_unpackhi_epi8(xmm2, xmm0);           \
    _mm_storeu_si128((__m128i*)(p_line1+16), xmm2); \
    xmm4 = xmm1;                                    \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm3);           \
    _mm_storeu_si128((__m128i*)(p_line2), xmm4);    \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm3);           \
    /* upper half of the SECOND row: must go to p_line2, not p_line1 */ \
    _mm_storeu_si128((__m128i*)(p_line2+16), xmm1);
00446 
00447 #endif
00448 
00449 #endif
00450 
00451 /* Used in both accelerated and C modules */
00452 
/*
 * C_YUV420_YVYU: plain-C conversion of two horizontally adjacent pixels on
 * two rows.
 *
 * Copies two luma bytes from p_y1 to p_line1 and two from p_y2 to p_line2,
 * inserting one byte from p_v after the first luma and one byte from p_u
 * after the second; the same chroma byte is written to both rows.
 * Post-increments: p_line1/p_line2 by 4, p_y1/p_y2 by 2, p_u/p_v by 1.
 *
 * Expands to several statements (not wrapped in do/while), so it must be
 * used inside a braced block.  The definition ends on its last statement
 * without a trailing line-continuation so it cannot absorb the next line.
 */
#define C_YUV420_YVYU( )                                                    \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;                     \
    *(p_line1)++ =            *(p_line2)++ = *(p_v)++;                      \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;                     \
    *(p_line1)++ =            *(p_line2)++ = *(p_u)++;
00458 
/*
 * C_YUV420_Y211: plain-C Y211 conversion step.
 *
 * Writes four bytes to each of p_line1 and p_line2: a luma byte, the byte
 * at p_u minus 0x80, the luma byte two positions further on, and the byte
 * at p_v minus 0x80.  The luma comes from p_y1 for p_line1 and from p_y2
 * for p_line2; the chroma bytes are shared by both rows.
 * Pointer advance: p_line1/p_line2 by 4, p_y1/p_y2 by 4 (every other luma
 * byte is skipped), p_u/p_v by 2.
 *
 * Expands to several statements (not wrapped in do/while), so it must be
 * used inside a braced block.  The definition ends on its last statement
 * without a trailing line-continuation so it cannot absorb the next line.
 */
#define C_YUV420_Y211( )                                                    \
    *(p_line1)++ = *(p_y1); p_y1 += 2;                                      \
    *(p_line2)++ = *(p_y2); p_y2 += 2;                                      \
    *(p_line1)++ = *(p_line2)++ = *(p_u) - 0x80; p_u += 2;                  \
    *(p_line1)++ = *(p_y1); p_y1 += 2;                                      \
    *(p_line2)++ = *(p_y2); p_y2 += 2;                                      \
    *(p_line1)++ = *(p_line2)++ = *(p_v) - 0x80; p_v += 2;
00466 
00467 
/*
 * C_YUV420_YUYV: plain-C conversion of two horizontally adjacent pixels on
 * two rows.
 *
 * Copies two luma bytes from p_y1 to p_line1 and two from p_y2 to p_line2,
 * inserting one byte from p_u after the first luma and one byte from p_v
 * after the second; the same chroma byte is written to both rows.
 * Post-increments: p_line1/p_line2 by 4, p_y1/p_y2 by 2, p_u/p_v by 1.
 *
 * Expands to several statements (not wrapped in do/while), so it must be
 * used inside a braced block.  The definition ends on its last statement
 * without a trailing line-continuation so it cannot absorb the next line.
 */
#define C_YUV420_YUYV( )                                                    \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;                     \
    *(p_line1)++ =            *(p_line2)++ = *(p_u)++;                      \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;                     \
    *(p_line1)++ =            *(p_line2)++ = *(p_v)++;
00473 
/*
 * C_YUV420_UYVY: plain-C conversion of two horizontally adjacent pixels on
 * two rows.
 *
 * Writes one byte from p_u, a luma byte, one byte from p_v and a second
 * luma byte to each output row.  The luma comes from p_y1 for p_line1 and
 * from p_y2 for p_line2; the same chroma byte is written to both rows.
 * Post-increments: p_line1/p_line2 by 4, p_y1/p_y2 by 2, p_u/p_v by 1.
 *
 * Expands to several statements (not wrapped in do/while), so it must be
 * used inside a braced block.  The definition ends on its last statement
 * without a trailing line-continuation so it cannot absorb the next line.
 */
#define C_YUV420_UYVY( )                                                    \
    *(p_line1)++ =            *(p_line2)++ = *(p_u)++;                      \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;                     \
    *(p_line1)++ =            *(p_line2)++ = *(p_v)++;                      \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;
00479 

Generated on Wed Aug 13 08:02:38 2008 for VLC by  doxygen 1.5.1