i420_rgb_mmx.h

Go to the documentation of this file.
00001 /*****************************************************************************
00002  * i420_rgb_mmx.h: MMX YUV transformation assembly
00003  *****************************************************************************
00004  * Copyright (C) 1999-2007 the VideoLAN team
00005  * $Id: 587ba33ea6502434d06af1bb355b698319d3dae2 $
00006  *
00007  * Authors: Olie Lho <ollie@sis.com.tw>
00008  *          Gaël Hendryckx <jimmy@via.ecp.fr>
00009  *          Samuel Hocevar <sam@zoy.org>
00010  *          Damien Fouilleul <damienf@videolan.org>
00011  *
00012  * This program is free software; you can redistribute it and/or modify
00013  * it under the terms of the GNU General Public License as published by
00014  * the Free Software Foundation; either version 2 of the License, or
00015  * (at your option) any later version.
00016  *
00017  * This program is distributed in the hope that it will be useful,
00018  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00019  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00020  * GNU General Public License for more details.
00021  *
00022  * You should have received a copy of the GNU General Public License
00023  * along with this program; if not, write to the Free Software
00024  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
00025  *****************************************************************************/
00026 
00027 #ifdef MODULE_NAME_IS_i420_rgb_mmx
00028 
00029 /*
 * 64-bit constants that the MMX code in this file uses as memory operands.
 *
 * USED_U64(foo) declares a static const uint64_t whose assembler-level symbol
 * name is forced to "foo" through an asm label.  With GCC 3.3 or newer the
 * object is tagged __attribute__((used)); older compilers get
 * __attribute__((unused)) instead.
 *
 * Nothing below requests a particular alignment: the original note only
 * expressed the hope that these values end up cache line aligned.
 */
00030 #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
00031 #define USED_U64(foo) \
00032     static const uint64_t foo __asm__ (#foo) __attribute__((used))
00033 #else
00034 #define USED_U64(foo) \
00035     static const uint64_t foo __asm__ (#foo) __attribute__((unused))
00036 #endif
00037 USED_U64(mmx_80w)     = 0x0080008000800080ULL; /* 4 x 0x0080 (16-bit words); referenced as %4 in inline asm */
00038 USED_U64(mmx_10w)     = 0x1010101010101010ULL; /* 8 x 0x10 (bytes); referenced as %5 */
00039 USED_U64(mmx_00ffw)   = 0x00ff00ff00ff00ffULL; /* 4 x 0x00ff (16-bit words); referenced as %6 */
00040 USED_U64(mmx_Y_coeff) = 0x253f253f253f253fULL; /* 4 x 0x253f (16-bit words); referenced as %7 */
00041 
00042 USED_U64(mmx_U_green) = 0xf37df37df37df37dULL; /* 4 x 0xf37d (16-bit words); referenced as %8 */
00043 USED_U64(mmx_U_blue)  = 0x4093409340934093ULL; /* 4 x 0x4093 (16-bit words); referenced as %9 */
00044 USED_U64(mmx_V_red)   = 0x3312331233123312ULL; /* 4 x 0x3312 (16-bit words); referenced as %10 */
00045 USED_U64(mmx_V_green) = 0xe5fce5fce5fce5fcULL; /* 4 x 0xe5fc (16-bit words); referenced as %11 */
00046 
00047 USED_U64(mmx_mask_f8) = 0xf8f8f8f8f8f8f8f8ULL; /* 8 x 0xf8 (bytes); referenced as %12 */
00048 USED_U64(mmx_mask_fc) = 0xfcfcfcfcfcfcfcfcULL; /* 8 x 0xfc (bytes); referenced as %13 */
00049 #undef USED_U64
00050 
00051 #if defined(CAN_COMPILE_MMX)
00052 
00053 /* MMX assembly */
00054  
/*
 * Wrap a sequence of the assembly fragments defined below into a single
 * inline-asm statement.
 *
 * Operand numbering seen by the fragments:
 *   %0 = p_y, %1 = p_u, %2 = p_v, %3 = p_buffer  (pointers held in registers)
 *   %4 = mmx_80w, %5 = mmx_10w, %6 = mmx_00ffw, %7 = mmx_Y_coeff,
 *   %8 = mmx_U_green, %9 = mmx_U_blue, %10 = mmx_V_red, %11 = mmx_V_green,
 *   %12 = mmx_mask_f8, %13 = mmx_mask_fc          (memory operands)
 *
 * p_y, p_u, p_v and p_buffer must be in scope at the expansion site.
 *
 * The fragments read pixels through %0..%2 and store through %3, i.e. they
 * touch memory that is not described by any operand, so the statement
 * carries a "memory" clobber: the compiler must not cache such values in
 * registers across the asm nor reorder its own accesses to them around it.
 */
#define MMX_CALL(MMX_INSTRUCTIONS)      \
    do {                                \
    __asm__ __volatile__(               \
        ".p2align 3 \n\t"               \
        MMX_INSTRUCTIONS                \
        :                               \
        : "r" (p_y), "r" (p_u),         \
          "r" (p_v), "r" (p_buffer),    \
          "m" (mmx_80w), "m" (mmx_10w), \
          "m" (mmx_00ffw), "m" (mmx_Y_coeff), \
          "m" (mmx_U_green), "m" (mmx_U_blue), \
          "m" (mmx_V_red), "m" (mmx_V_green), \
          "m" (mmx_mask_f8), "m" (mmx_mask_fc) \
        : "memory" );                   \
    } while(0)
00069 
/* Executes the emms instruction, which empties the MMX state.  Presumably
 * meant to be issued once the MMX_CALL sequences are done -- confirm against
 * the callers. */
00070 #define MMX_END __asm__ __volatile__ ( "emms" )
00071 
/*
 * Load the inputs for one group of eight pixels.  With MMX_CALL's operand
 * order (%0 = p_y, %1 = p_u, %2 = p_v):
 * mm0 <- 4 Cb bytes, mm1 <- 4 Cr bytes, mm4 <- 0, mm6 <- 8 Y bytes.
 */
00072 #define MMX_INIT_16 "                                                       \n\
00073 movd       (%1), %%mm0      # Load 4 Cb       00 00 00 00 u3 u2 u1 u0       \n\
00074 movd       (%2), %%mm1      # Load 4 Cr       00 00 00 00 v3 v2 v1 v0       \n\
00075 pxor      %%mm4, %%mm4      # zero mm4                                      \n\
00076 movq       (%0), %%mm6      # Load 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0       \n\
00077 "
00078 
/*
 * Grayscale variant of the init step: only the 8 Y bytes at (%0) are loaded
 * into mm6.  The second line begins with '#', the marker this file uses for
 * its in-line assembler comments, so the store to (%3) is disabled.
 */
00079 #define MMX_INIT_16_GRAY "                                                  \n\
00080 movq      (%0), %%mm6       # Load 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0       \n\
00081 #movl      $0, (%3)         # cache preload for image                       \n\
00082 "
00083 
/*
 * Same loads as MMX_INIT_16 (mm0 <- 4 Cb, mm1 <- 4 Cr, mm4 <- 0,
 * mm6 <- 8 Y), plus a 32-bit store of zero to (%3), i.e. to p_buffer in
 * MMX_CALL's operand order; the in-line note describes it as a cache preload.
 */
00084 #define MMX_INIT_32 "                                                       \n\
00085 movd      (%1), %%mm0       # Load 4 Cb       00 00 00 00 u3 u2 u1 u0       \n\
00086 movl        $0, (%3)        # cache preload for image                       \n\
00087 movd      (%2), %%mm1       # Load 4 Cr       00 00 00 00 v3 v2 v1 v0       \n\
00088 pxor     %%mm4, %%mm4       # zero mm4                                      \n\
00089 movq      (%0), %%mm6       # Load 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0       \n\
00090 "
00091 
00092 /*
00093  * Do the multiply part of the conversion for even and odd pixels,
00094  * register usage:
00095  * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
00096  * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd  pixels,
00097  * mm6 -> Y even, mm7 -> Y odd
 *
 * On entry: mm0 = 4 Cb bytes, mm1 = 4 Cr bytes, mm4 = 0, mm6 = 8 Y bytes.
 * On exit:  mm0 = Cblue, mm1 = Cred, mm2 = Cgreen, mm6 = Y even, mm7 = Y odd
 *           (four 16-bit words each).  mm3 is only a temporary for the
 *           Cr green product here; mm4 and mm5 are not written by this
 *           fragment (the odd-pixel copies are made in MMX_YUV_ADD).
 * Operands follow MMX_CALL: %4 = mmx_80w, %5 = mmx_10w, %6 = mmx_00ffw,
 * %7 = mmx_Y_coeff, %8 = mmx_U_green, %9 = mmx_U_blue, %10 = mmx_V_red,
 * %11 = mmx_V_green.
00098  */
00099 
00100 #define MMX_YUV_MUL "                                                       \n\
00101 # convert the chroma part                                                   \n\
00102 punpcklbw %%mm4, %%mm0          # scatter 4 Cb    00 u3 00 u2 00 u1 00 u0   \n\
00103 punpcklbw %%mm4, %%mm1          # scatter 4 Cr    00 v3 00 v2 00 v1 00 v0   \n\
00104 psubsw    %4, %%mm0     # Cb -= 128                                 \n\
00105 psubsw    %4, %%mm1     # Cr -= 128                                 \n\
00106 psllw     $3, %%mm0             # Promote precision                         \n\
00107 psllw     $3, %%mm1             # Promote precision                         \n\
00108 movq      %%mm0, %%mm2          # Copy 4 Cb       00 u3 00 u2 00 u1 00 u0   \n\
00109 movq      %%mm1, %%mm3          # Copy 4 Cr       00 v3 00 v2 00 v1 00 v0   \n\
00110 pmulhw    %8, %%mm2 # Mul Cb with green coeff -> Cb green       \n\
00111 pmulhw    %11, %%mm3 # Mul Cr with green coeff -> Cr green       \n\
00112 pmulhw    %9, %%mm0  # Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0   \n\
00113 pmulhw    %10, %%mm1   # Mul Cr -> Cred  00 r3 00 r2 00 r1 00 r0   \n\
00114 paddsw    %%mm3, %%mm2          # Cb green + Cr green -> Cgreen             \n\
00115                                                                             \n\
00116 # convert the luma part                                                     \n\
00117 psubusb   %5, %%mm6     # Y -= 16                                   \n\
00118 movq      %%mm6, %%mm7          # Copy 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0   \n\
00119 pand      %6, %%mm6   # get Y even      00 Y6 00 Y4 00 Y2 00 Y0   \n\
00120 psrlw     $8, %%mm7             # get Y odd       00 Y7 00 Y5 00 Y3 00 Y1   \n\
00121 psllw     $3, %%mm6             # Promote precision                         \n\
00122 psllw     $3, %%mm7             # Promote precision                         \n\
00123 pmulhw    %7, %%mm6 # Mul 4 Y even    00 y6 00 y4 00 y2 00 y0   \n\
00124 pmulhw    %7, %%mm7 # Mul 4 Y odd     00 y7 00 y5 00 y3 00 y1   \n\
00125 "
00126 
00127 /*
00128  * Do the addition part of the conversion for even and odd pixels,
00129  * register usage:
00130  * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
00131  * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd  pixels,
00132  * mm6 -> Y even, mm7 -> Y odd
 *
 * On entry: mm0/mm1/mm2 = Cblue/Cred/Cgreen, mm6/mm7 = Y even/odd (words).
 * On exit:  mm0 = B7..B0, mm1 = R7..R0, mm2 = G7..G0 as unsigned bytes
 *           (packuswb saturates each sum to 0..255); mm3, mm4 and mm5 are
 *           overwritten, mm6 and mm7 are only read.
 * Note: the first in-line heading mentions scaling, but the instructions
 * under it only copy the chroma terms and add the luma terms to them.
00133  */
00134 
00135 #define MMX_YUV_ADD "                                                       \n\
00136 # Do horizontal and vertical scaling                                        \n\
00137 movq      %%mm0, %%mm3          # Copy Cblue                                \n\
00138 movq      %%mm1, %%mm4          # Copy Cred                                 \n\
00139 movq      %%mm2, %%mm5          # Copy Cgreen                               \n\
00140 paddsw    %%mm6, %%mm0          # Y even + Cblue  00 B6 00 B4 00 B2 00 B0   \n\
00141 paddsw    %%mm7, %%mm3          # Y odd  + Cblue  00 B7 00 B5 00 B3 00 B1   \n\
00142 paddsw    %%mm6, %%mm1          # Y even + Cred   00 R6 00 R4 00 R2 00 R0   \n\
00143 paddsw    %%mm7, %%mm4          # Y odd  + Cred   00 R7 00 R5 00 R3 00 R1   \n\
00144 paddsw    %%mm6, %%mm2          # Y even + Cgreen 00 G6 00 G4 00 G2 00 G0   \n\
00145 paddsw    %%mm7, %%mm5          # Y odd  + Cgreen 00 G7 00 G5 00 G3 00 G1   \n\
00146                                                                             \n\
00147 # Limit RGB even to 0..255                                                  \n\
00148 packuswb  %%mm0, %%mm0          # B6 B4 B2 B0 / B6 B4 B2 B0                 \n\
00149 packuswb  %%mm1, %%mm1          # R6 R4 R2 R0 / R6 R4 R2 R0                 \n\
00150 packuswb  %%mm2, %%mm2          # G6 G4 G2 G0 / G6 G4 G2 G0                 \n\
00151                                                                             \n\
00152 # Limit RGB odd to 0..255                                                   \n\
00153 packuswb  %%mm3, %%mm3          # B7 B5 B3 B1 / B7 B5 B3 B1                 \n\
00154 packuswb  %%mm4, %%mm4          # R7 R5 R3 R1 / R7 R5 R3 R1                 \n\
00155 packuswb  %%mm5, %%mm5          # G7 G5 G3 G1 / G7 G5 G3 G1                 \n\
00156                                                                             \n\
00157 # Interleave RGB even and odd                                               \n\
00158 punpcklbw %%mm3, %%mm0          #                 B7 B6 B5 B4 B3 B2 B1 B0   \n\
00159 punpcklbw %%mm4, %%mm1          #                 R7 R6 R5 R4 R3 R2 R1 R0   \n\
00160 punpcklbw %%mm5, %%mm2          #                 G7 G6 G5 G4 G3 G2 G1 G0   \n\
00161 "
00162 
00163 /*
00164  * Grayscale case, only use Y
 *
 * On entry: mm6 = 8 Y bytes.
 * On exit:  mm6 = 8 converted luma bytes (Y7..Y0 order), mm7 is scratch.
 * Runs the same luma sequence as MMX_YUV_MUL (subtract %5 with unsigned
 * saturation, split even/odd with %6, shift left by 3, pmulhw with %7),
 * then saturates to bytes and re-interleaves even and odd samples.
 * In MMX_CALL's operand list %5 = mmx_10w, %6 = mmx_00ffw, %7 = mmx_Y_coeff.
00165  */
00166 
00167 #define MMX_YUV_GRAY "                                                      \n\
00168 # convert the luma part                                                     \n\
00169 psubusb   %5, %%mm6                                                 \n\
00170 movq      %%mm6, %%mm7                                                      \n\
00171 pand      %6, %%mm6                                               \n\
00172 psrlw     $8, %%mm7                                                         \n\
00173 psllw     $3, %%mm6                                                         \n\
00174 psllw     $3, %%mm7                                                         \n\
00175 pmulhw    %7, %%mm6                                             \n\
00176 pmulhw    %7, %%mm7                                             \n\
00177 packuswb  %%mm6, %%mm6                                                      \n\
00178 packuswb  %%mm7, %%mm7                                                      \n\
00179 punpcklbw %%mm7, %%mm6                                                      \n\
00180 "
00181 
/*
 * Turn the 8 gray bytes in mm6 into eight 16-bit pixels.
 * Per pixel: bits 15..11 come from the byte masked with %12 (mmx_mask_f8),
 * bits 10..5 from the byte masked with %13 (mmx_mask_fc) shifted left by 3,
 * bits 4..0 from the f8-masked byte shifted right by 3.
 * Pixels 0-3 are stored at (%3), pixels 4-7 at 8(%3).
 * mm0, mm2, mm3, mm5 and mm7 are overwritten; mm6 is reloaded with the
 * 8 bytes at 8(%0) before the second store.
 */
00182 #define MMX_UNPACK_16_GRAY "                                                \n\
00183 movq      %%mm6, %%mm5                                                      \n\
00184 pand      %12, %%mm6                                             \n\
00185 pand      %13, %%mm5                                             \n\
00186 movq      %%mm6, %%mm7                                                      \n\
00187 psrlw     $3, %%mm7                                                         \n\
00188 pxor      %%mm3, %%mm3                                                      \n\
00189 movq      %%mm7, %%mm2                                                      \n\
00190 movq      %%mm5, %%mm0                                                      \n\
00191 punpcklbw %%mm3, %%mm5                                                      \n\
00192 punpcklbw %%mm6, %%mm7                                                      \n\
00193 psllw     $3, %%mm5                                                         \n\
00194 por       %%mm5, %%mm7                                                      \n\
00195 movq      %%mm7, (%3)                                                       \n\
00196 punpckhbw %%mm3, %%mm0                                                      \n\
00197 punpckhbw %%mm6, %%mm2                                                      \n\
00198 psllw     $3, %%mm0                                                         \n\
00199 movq      8(%0), %%mm6                                                      \n\
00200 por       %%mm0, %%mm2                                                      \n\
00201 movq      %%mm2, 8(%3)                                                      \n\
00202 "
00203 
00204 
00205 /*
00206  * convert RGB plane to RGB 15 bits,
00207  * mm0 -> B, mm1 -> R, mm2 -> G,
00208  * mm4 -> zero, mm5 -> copy of B, mm7 -> copy of G (used for pixel 4-7),
00209  * mm6 -> reloaded with the 8 Y bytes at 8(%0)
 *
 * Each pixel becomes one 16-bit word: r in bits 14..10, g in bits 9..5,
 * b in bits 4..0.  Pixels 0-3 are stored at (%3), pixels 4-7 at 8(%3).
 * Besides the Y reload, mm0 and mm1 are reloaded from 4(%1) and 4(%2)
 * (4 Cb / 4 Cr bytes).  %12 is mmx_mask_f8 in MMX_CALL's operand list.
 * Note: the second in-line heading says "rgb16 ... pixel 0-3", but that
 * section builds the rgb15 words for pixels 4-7.
00210  */
00211 
00212 #define MMX_UNPACK_15 "                                                     \n\
00213 # mask unneeded bits off                                                    \n\
00214 pand      %12, %%mm0 # b7b6b5b4 b3______ b7b6b5b4 b3______       \n\
00215 psrlw     $3,%%mm0              # ______b7 b6b5b4b3 ______b7 b6b5b4b3       \n\
00216 pand      %12, %%mm2 # g7g6g5g4 g3______ g7g6g5g4 g3______       \n\
00217 pand      %12, %%mm1 # r7r6r5r4 r3______ r7r6r5r4 r3______       \n\
00218 psrlw     $1,%%mm1              # __r7r6r5 r4r3____ __r7r6r5 r4r3____       \n\
00219 pxor      %%mm4, %%mm4          # zero mm4                                  \n\
00220 movq      %%mm0, %%mm5          # Copy B7-B0                                \n\
00221 movq      %%mm2, %%mm7          # Copy G7-G0                                \n\
00222                                                                             \n\
00223 # convert rgb24 plane to rgb15 pack for pixel 0-3                           \n\
00224 punpcklbw %%mm4, %%mm2          # ________ ________ g7g6g5g4 g3______       \n\
00225 punpcklbw %%mm1, %%mm0          # r7r6r5r4 r3______ ______b7 b6b5b4b3       \n\
00226 psllw     $2,%%mm2              # ________ ____g7g6 g5g4g3__ ________       \n\
00227 por       %%mm2, %%mm0          # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3       \n\
00228 movq      8(%0), %%mm6          # Load 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0   \n\
00229 movq      %%mm0, (%3)           # store pixel 0-3                           \n\
00230                                                                             \n\
00231 # convert rgb24 plane to rgb16 pack for pixel 0-3                           \n\
00232 punpckhbw %%mm4, %%mm7          # ________ ________ g7g6g5g4 g3______       \n\
00233 punpckhbw %%mm1, %%mm5          # r7r6r5r4 r3______ ______b7 b6b5b4b3       \n\
00234 psllw     $2,%%mm7              # ________ ____g7g6 g5g4g3__ ________       \n\
00235 movd      4(%1), %%mm0          # Load 4 Cb       __ __ __ __ u3 u2 u1 u0   \n\
00236 por       %%mm7, %%mm5          # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3       \n\
00237 movd      4(%2), %%mm1          # Load 4 Cr       __ __ __ __ v3 v2 v1 v0   \n\
00238 movq      %%mm5, 8(%3)          # store pixel 4-7                           \n\
00239 "
00240 
00241 /*
00242  * convert RGB plane to RGB 16 bits,
00243  * mm0 -> B, mm1 -> R, mm2 -> G,
00244  * mm4 -> zero, mm5 -> copy of B, mm7 -> copy of G (used for pixel 4-7),
00245  * mm6 -> reloaded with the 8 Y bytes at 8(%0)
 *
 * Each pixel becomes one 16-bit word: r in bits 15..11, g in bits 10..5,
 * b in bits 4..0.  Pixels 0-3 are stored at (%3), pixels 4-7 at 8(%3).
 * Besides the Y reload, mm0 and mm1 are reloaded from 4(%1) and 4(%2)
 * (4 Cb / 4 Cr bytes).  In MMX_CALL's operand list %12 is mmx_mask_f8 and
 * %13 is mmx_mask_fc.
 * Note: the second in-line heading says "pixel 0-3", but that section
 * builds the words for pixels 4-7.
00246  */
00247 
00248 #define MMX_UNPACK_16 "                                                     \n\
00249 # mask unneeded bits off                                                    \n\
00250 pand      %12, %%mm0 # b7b6b5b4 b3______ b7b6b5b4 b3______       \n\
00251 pand      %13, %%mm2 # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____       \n\
00252 pand      %12, %%mm1 # r7r6r5r4 r3______ r7r6r5r4 r3______       \n\
00253 psrlw     $3,%%mm0              # ______b7 b6b5b4b3 ______b7 b6b5b4b3       \n\
00254 pxor      %%mm4, %%mm4          # zero mm4                                  \n\
00255 movq      %%mm0, %%mm5          # Copy B7-B0                                \n\
00256 movq      %%mm2, %%mm7          # Copy G7-G0                                \n\
00257                                                                             \n\
00258 # convert rgb24 plane to rgb16 pack for pixel 0-3                           \n\
00259 punpcklbw %%mm4, %%mm2          # ________ ________ g7g6g5g4 g3g2____       \n\
00260 punpcklbw %%mm1, %%mm0          # r7r6r5r4 r3______ ______b7 b6b5b4b3       \n\
00261 psllw     $3,%%mm2              # ________ __g7g6g5 g4g3g2__ ________       \n\
00262 por       %%mm2, %%mm0          # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3       \n\
00263 movq      8(%0), %%mm6          # Load 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0   \n\
00264 movq      %%mm0, (%3)           # store pixel 0-3                           \n\
00265                                                                             \n\
00266 # convert rgb24 plane to rgb16 pack for pixel 0-3                           \n\
00267 punpckhbw %%mm4, %%mm7          # ________ ________ g7g6g5g4 g3g2____       \n\
00268 punpckhbw %%mm1, %%mm5          # r7r6r5r4 r3______ ______b7 b6b5b4b3       \n\
00269 psllw     $3,%%mm7              # ________ __g7g6g5 g4g3g2__ ________       \n\
00270 movd      4(%1), %%mm0          # Load 4 Cb       __ __ __ __ u3 u2 u1 u0   \n\
00271 por       %%mm7, %%mm5          # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3       \n\
00272 movd      4(%2), %%mm1          # Load 4 Cr       __ __ __ __ v3 v2 v1 v0   \n\
00273 movq      %%mm5, 8(%3)          # store pixel 4-7                           \n\
00274 "
00275 
00276 /*
00277  * convert RGB plane to RGB packed format,
00278  * mm0 -> B, mm1 -> R, mm2 -> G
 *
 * Stores eight 32-bit pixels at (%3), 8(%3), 16(%3) and 24(%3), two per
 * store.  Read as a 32-bit value each pixel is 00 R G B from the most to the
 * least significant byte (blue is interleaved as the lowest byte, then
 * green, then red, then the zero from mm3).
 * Note: the in-line diagrams on the punpcklwd/punpckhwd lines show B and G
 * swapped relative to what the interleave actually produces.
 * mm3, mm4, mm5 and mm6 are used as scratch; mm0 and mm1 are overwritten.
00279  */
00280 
00281 #define MMX_UNPACK_32_ARGB "                                                \n\
00282 pxor      %%mm3, %%mm3  # zero mm3                                          \n\
00283 movq      %%mm0, %%mm4  #                 B7 B6 B5 B4 B3 B2 B1 B0           \n\
00284 punpcklbw %%mm2, %%mm4  #                 G3 B3 G2 B2 G1 B1 G0 B0           \n\
00285 movq      %%mm1, %%mm5  #                 R7 R6 R5 R4 R3 R2 R1 R0           \n\
00286 punpcklbw %%mm3, %%mm5  #                 00 R3 00 R2 00 R1 00 R0           \n\
00287 movq      %%mm4, %%mm6  #                 G3 B3 G2 B2 G1 B1 G0 B0           \n\
00288 punpcklwd %%mm5, %%mm4  #                 00 R1 B1 G1 00 R0 B0 G0           \n\
00289 movq      %%mm4, (%3)   # Store ARGB1 ARGB0                                 \n\
00290 punpckhwd %%mm5, %%mm6  #                 00 R3 B3 G3 00 R2 B2 G2           \n\
00291 movq      %%mm6, 8(%3)  # Store ARGB3 ARGB2                                 \n\
00292 punpckhbw %%mm2, %%mm0  #                 G7 B7 G6 B6 G5 B5 G4 B4           \n\
00293 punpckhbw %%mm3, %%mm1  #                 00 R7 00 R6 00 R5 00 R4           \n\
00294 movq      %%mm0, %%mm5  #                 G7 B7 G6 B6 G5 B5 G4 B4           \n\
00295 punpcklwd %%mm1, %%mm5  #                 00 R5 B5 G5 00 R4 B4 G4           \n\
00296 movq      %%mm5, 16(%3) # Store ARGB5 ARGB4                                 \n\
00297 punpckhwd %%mm1, %%mm0  #                 00 R7 B7 G7 00 R6 B6 G6           \n\
00298 movq      %%mm0, 24(%3) # Store ARGB7 ARGB6                                 \n\
00299 "
00300 
/*
 * Pack mm0 = B, mm1 = R, mm2 = G into eight 32-bit pixels stored at (%3),
 * 8(%3), 16(%3) and 24(%3).  Read as a 32-bit value each pixel is R G B 00
 * from the most to the least significant byte.
 * mm3..mm6 are scratch; mm0 and mm2 are overwritten.
 * Note: the in-line diagram on the "movq %%mm3, %%mm5" line is labelled with
 * R, but mm3 holds the blue bytes at that point (B3 00 B2 00 B1 00 B0 00).
 */
00301 #define MMX_UNPACK_32_RGBA "                                                \n\
00302 pxor      %%mm3, %%mm3  # zero mm3                                          \n\
00303 movq      %%mm2, %%mm4  #                 G7 G6 G5 G4 G3 G2 G1 G0           \n\
00304 punpcklbw %%mm1, %%mm4  #                 R3 G3 R2 G2 R1 G1 R0 G0           \n\
00305 punpcklbw %%mm0, %%mm3  #                 B3 00 B2 00 B1 00 B0 00           \n\
00306 movq      %%mm3, %%mm5  #                 R3 00 R2 00 R1 00 R0 00           \n\
00307 punpcklwd %%mm4, %%mm3  #                 R1 G1 B1 00 R0 G0 B0 00           \n\
00308 movq      %%mm3, (%3)   # Store RGBA1 RGBA0                                 \n\
00309 punpckhwd %%mm4, %%mm5  #                 R3 G3 B3 00 R2 G2 B2 00           \n\
00310 movq      %%mm5, 8(%3)  # Store RGBA3 RGBA2                                 \n\
00311 pxor      %%mm6, %%mm6  # zero mm6                                          \n\
00312 punpckhbw %%mm1, %%mm2  #                 R7 G7 R6 G6 R5 G5 R4 G4           \n\
00313 punpckhbw %%mm0, %%mm6  #                 B7 00 B6 00 B5 00 B4 00           \n\
00314 movq      %%mm6, %%mm0  #                 B7 00 B6 00 B5 00 B4 00           \n\
00315 punpcklwd %%mm2, %%mm6  #                 R5 G5 B5 00 R4 G4 B4 00           \n\
00316 movq      %%mm6, 16(%3) # Store RGBA5 RGBA4                                 \n\
00317 punpckhwd %%mm2, %%mm0  #                 R7 G7 B7 00 R6 G6 B6 00           \n\
00318 movq      %%mm0, 24(%3) # Store RGBA7 RGBA6                                 \n\
00319 "
00320 
/*
 * Pack mm0 = B, mm1 = R, mm2 = G into eight 32-bit pixels stored at (%3),
 * 8(%3), 16(%3) and 24(%3).  Read as a 32-bit value each pixel is B G R 00
 * from the most to the least significant byte.
 * mm3..mm6 are scratch; mm0 and mm2 are overwritten.
 */
00321 #define MMX_UNPACK_32_BGRA "                                                \n\
00322 pxor      %%mm3, %%mm3  # zero mm3                                          \n\
00323 movq      %%mm2, %%mm4  #                 G7 G6 G5 G4 G3 G2 G1 G0           \n\
00324 punpcklbw %%mm0, %%mm4  #                 B3 G3 B2 G2 B1 G1 B0 G0           \n\
00325 punpcklbw %%mm1, %%mm3  #                 R3 00 R2 00 R1 00 R0 00           \n\
00326 movq      %%mm3, %%mm5  #                 R3 00 R2 00 R1 00 R0 00           \n\
00327 punpcklwd %%mm4, %%mm3  #                 B1 G1 R1 00 B0 G0 R0 00           \n\
00328 movq      %%mm3, (%3)   # Store BGRA1 BGRA0                                 \n\
00329 punpckhwd %%mm4, %%mm5  #                 B3 G3 R3 00 B2 G2 R2 00           \n\
00330 movq      %%mm5, 8(%3)  # Store BGRA3 BGRA2                                 \n\
00331 pxor      %%mm6, %%mm6  # zero mm6                                          \n\
00332 punpckhbw %%mm0, %%mm2  #                 B7 G7 B6 G6 B5 G5 B4 G4           \n\
00333 punpckhbw %%mm1, %%mm6  #                 R7 00 R6 00 R5 00 R4 00           \n\
00334 movq      %%mm6, %%mm0  #                 R7 00 R6 00 R5 00 R4 00           \n\
00335 punpcklwd %%mm2, %%mm6  #                 B5 G5 R5 00 B4 G4 R4 00           \n\
00336 movq      %%mm6, 16(%3) # Store BGRA5 BGRA4                                 \n\
00337 punpckhwd %%mm2, %%mm0  #                 B7 G7 R7 00 B6 G6 R6 00           \n\
00338 movq      %%mm0, 24(%3) # Store BGRA7 BGRA6                                 \n\
00339 "
00340 
/*
 * Pack mm0 = B, mm1 = R, mm2 = G into eight 32-bit pixels stored at (%3),
 * 8(%3), 16(%3) and 24(%3).  Read as a 32-bit value each pixel is 00 B G R
 * from the most to the least significant byte.
 * mm3..mm6 are scratch; mm0, mm1 and mm2 are overwritten.
 * Note: the in-line diagram on the last punpckhwd line is shifted by one
 * byte; like the other three stores it yields 00 B7 G7 R7 00 B6 G6 R6.
 */
00341 #define MMX_UNPACK_32_ABGR "                                                \n\
00342 pxor      %%mm3, %%mm3  # zero mm3                                          \n\
00343 movq      %%mm1, %%mm4  #                 R7 R6 R5 R4 R3 R2 R1 R0           \n\
00344 punpcklbw %%mm2, %%mm4  #                 G3 R3 G2 R2 G1 R1 G0 R0           \n\
00345 movq      %%mm0, %%mm5  #                 B7 B6 B5 B4 B3 B2 B1 B0           \n\
00346 punpcklbw %%mm3, %%mm5  #                 00 B3 00 B2 00 B1 00 B0           \n\
00347 movq      %%mm4, %%mm6  #                 G3 R3 G2 R2 G1 R1 G0 R0           \n\
00348 punpcklwd %%mm5, %%mm4  #                 00 B1 G1 R1 00 B0 G0 R0           \n\
00349 movq      %%mm4, (%3)   # Store ABGR1 ABGR0                                 \n\
00350 punpckhwd %%mm5, %%mm6  #                 00 B3 G3 R3 00 B2 G2 R2           \n\
00351 movq      %%mm6, 8(%3)  # Store ABGR3 ABGR2                                 \n\
00352 punpckhbw %%mm2, %%mm1  #                 G7 R7 G6 R6 G5 R5 G4 R4           \n\
00353 punpckhbw %%mm3, %%mm0  #                 00 B7 00 B6 00 B5 00 B4           \n\
00354 movq      %%mm1, %%mm2  #                 G7 R7 G6 R6 G5 R5 G4 R4           \n\
00355 punpcklwd %%mm0, %%mm1  #                 00 B5 G5 R5 00 B4 G4 R4           \n\
00356 movq      %%mm1, 16(%3) # Store ABGR5 ABGR4                                 \n\
00357 punpckhwd %%mm0, %%mm2  #                 B7 G7 R7 00 B6 G6 R6 00           \n\
00358 movq      %%mm2, 24(%3) # Store ABGR7 ABGR6                                 \n\
00359 "
00360 
00361 #elif defined(HAVE_MMX_INTRINSICS)
00362 
00363 /* MMX intrinsics */
00364 
00365 #include <mmintrin.h>
00366 
/*
 * Intrinsics flavour of MMX_CALL: opens a block that declares eight __m64
 * locals named mm0..mm7 and expands the given statement fragments inside it.
 * The locals are not initialised here, and they do not outlive the block.
 */
#define MMX_CALL(MMX_INSTRUCTIONS)          \
    do                                      \
    {                                       \
        __m64 mm0, mm1, mm2, mm3;           \
        __m64 mm4, mm5, mm6, mm7;           \
        MMX_INSTRUCTIONS                    \
    } while( 0 )
00373 
/* Intrinsics flavour of MMX_END: expands to a call to _mm_empty(), the
 * intrinsic form of the emms instruction (it empties the MMX state). */
00374 #define MMX_END _mm_empty()
00375  
/*
 * Load the inputs for one group of eight pixels:
 * mm4 <- 0, mm0 <- 32 bits read from p_u, mm1 <- 32 bits read from p_v,
 * mm6 <- 64 bits read from p_y.
 */
#define MMX_INIT_16                                 \
    mm4 = _mm_setzero_si64();                       \
    mm0 = _mm_cvtsi32_si64( *(int *)p_u );          \
    mm1 = _mm_cvtsi32_si64( *(int *)p_v );          \
    mm6 = (__m64)*(uint64_t *)p_y;
00381 
/*
 * Same loads as MMX_INIT_16 (mm4 <- 0, mm0 <- 32 bits from p_u,
 * mm1 <- 32 bits from p_v, mm6 <- 64 bits from p_y), plus a 16-bit store of
 * zero through p_buffer placed between the two chroma loads.
 */
#define MMX_INIT_32                                 \
    mm4 = _mm_setzero_si64();                       \
    mm0 = _mm_cvtsi32_si64( *(int *)p_u );          \
    *(uint16_t *)p_buffer = 0;                      \
    mm1 = _mm_cvtsi32_si64( *(int *)p_v );          \
    mm6 = (__m64)*(uint64_t *)p_y;
00388 
/*
 * Multiply step, intrinsics flavour.
 * On entry: mm0 / mm1 = 4 Cb / 4 Cr bytes, mm4 = 0, mm6 = 8 Y bytes.
 * On exit:  mm0 = Cblue, mm1 = Cred, mm2 = Cgreen (sum of the Cb and Cr
 *           green products), mm6 = Y even, mm7 = Y odd; mm3 is a temporary.
 * Chroma is widened to 16 bits and has mmx_80w subtracted (signed
 * saturation); luma has mmx_10w subtracted per byte (unsigned saturation)
 * and is split into even/odd samples.  Every term is shifted left by 3
 * before the high-half 16-bit multiply with its coefficient.
 */
00389 #define MMX_YUV_MUL                                 \
00390     mm0 = _mm_unpacklo_pi8(mm0, mm4);               \
00391     mm1 = _mm_unpacklo_pi8(mm1, mm4);               \
00392     mm0 = _mm_subs_pi16(mm0, (__m64)mmx_80w);       \
00393     mm1 = _mm_subs_pi16(mm1, (__m64)mmx_80w);       \
00394     mm0 = _mm_slli_pi16(mm0, 3);                    \
00395     mm1 = _mm_slli_pi16(mm1, 3);                    \
00396     mm2 = mm0;                                      \
00397     mm3 = mm1;                                      \
00398     mm2 = _mm_mulhi_pi16(mm2, (__m64)mmx_U_green);  \
00399     mm3 = _mm_mulhi_pi16(mm3, (__m64)mmx_V_green);  \
00400     mm0 = _mm_mulhi_pi16(mm0, (__m64)mmx_U_blue);   \
00401     mm1 = _mm_mulhi_pi16(mm1, (__m64)mmx_V_red);    \
00402     mm2 = _mm_adds_pi16(mm2, mm3);                  \
00403     \
00404     mm6 = _mm_subs_pu8(mm6, (__m64)mmx_10w);        \
00405     mm7 = mm6;                                      \
00406     mm6 = _mm_and_si64(mm6, (__m64)mmx_00ffw);      \
00407     mm7 = _mm_srli_pi16(mm7, 8);                    \
00408     mm6 = _mm_slli_pi16(mm6, 3);                    \
00409     mm7 = _mm_slli_pi16(mm7, 3);                    \
00410     mm6 = _mm_mulhi_pi16(mm6, (__m64)mmx_Y_coeff);  \
00411     mm7 = _mm_mulhi_pi16(mm7, (__m64)mmx_Y_coeff);
00412 
/*
 * Addition step, intrinsics flavour.
 * On entry: mm0 / mm1 / mm2 = Cblue / Cred / Cgreen, mm6 / mm7 = Y even / odd
 *           (16-bit words).
 * On exit:  mm0 = B, mm1 = R, mm2 = G, eight unsigned bytes each with even
 *           and odd pixels re-interleaved; mm3, mm4 and mm5 held the
 *           odd-pixel sums and are overwritten.
 * _mm_packs_pu16 saturates every sum to the 0..255 range.
 */
00413 #define MMX_YUV_ADD                     \
00414     mm3 = mm0;                          \
00415     mm4 = mm1;                          \
00416     mm5 = mm2;                          \
00417     mm0 = _mm_adds_pi16(mm0, mm6);      \
00418     mm3 = _mm_adds_pi16(mm3, mm7);      \
00419     mm1 = _mm_adds_pi16(mm1, mm6);      \
00420     mm4 = _mm_adds_pi16(mm4, mm7);      \
00421     mm2 = _mm_adds_pi16(mm2, mm6);      \
00422     mm5 = _mm_adds_pi16(mm5, mm7);      \
00423     \
00424     mm0 = _mm_packs_pu16(mm0, mm0);     \
00425     mm1 = _mm_packs_pu16(mm1, mm1);     \
00426     mm2 = _mm_packs_pu16(mm2, mm2);     \
00427     \
00428     mm3 = _mm_packs_pu16(mm3, mm3);     \
00429     mm4 = _mm_packs_pu16(mm4, mm4);     \
00430     mm5 = _mm_packs_pu16(mm5, mm5);     \
00431     \
00432     mm0 = _mm_unpacklo_pi8(mm0, mm3);   \
00433     mm1 = _mm_unpacklo_pi8(mm1, mm4);   \
00434     mm2 = _mm_unpacklo_pi8(mm2, mm5);
00435 
/*
 * Pack mm0 = B, mm1 = R, mm2 = G into eight 16-bit pixels with r in bits
 * 14..10, g in bits 9..5 and b in bits 4..0.
 * Pixels 0-3 are written through p_buffer, pixels 4-7 through p_buffer + 4
 * (assumes p_buffer points to 16-bit elements -- TODO confirm at the
 * expansion site).
 * Side loads: mm6 <- 64 bits at p_y + 8, mm0 <- 32 bits at p_u + 4,
 * mm1 <- 32 bits at p_v + 4.  mm4, mm5 and mm7 are scratch.
 */
00436 #define MMX_UNPACK_15                               \
00437     mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8);    \
00438     mm0 = _mm_srli_pi16(mm0, 3);                    \
00439     mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_f8);    \
00440     mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8);    \
00441     mm1 = _mm_srli_pi16(mm1, 1);                    \
00442     mm4 = _mm_setzero_si64();                       \
00443     mm5 = mm0;                                      \
00444     mm7 = mm2;                                      \
00445     \
00446     mm2 = _mm_unpacklo_pi8(mm2, mm4);               \
00447     mm0 = _mm_unpacklo_pi8(mm0, mm1);               \
00448     mm2 = _mm_slli_pi16(mm2, 2);                    \
00449     mm0 = _mm_or_si64(mm0, mm2);                    \
00450     mm6 = (__m64)*(uint64_t *)(p_y + 8);            \
00451     *(uint64_t *)p_buffer = (uint64_t)mm0;          \
00452     \
00453     mm7 = _mm_unpackhi_pi8(mm7, mm4);               \
00454     mm5 = _mm_unpackhi_pi8(mm5, mm1);               \
00455     mm7 = _mm_slli_pi16(mm7, 2);                    \
00456     mm0 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_u + 4)); \
00457     mm5 = _mm_or_si64(mm5, mm7);                    \
00458     mm1 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_v + 4)); \
00459     *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;
00460 
/*
 * Pack mm0 = B, mm1 = R, mm2 = G into eight 16-bit pixels with r in bits
 * 15..11, g in bits 10..5 and b in bits 4..0.
 * Pixels 0-3 are written through p_buffer, pixels 4-7 through p_buffer + 4
 * (assumes p_buffer points to 16-bit elements -- TODO confirm at the
 * expansion site).
 * Side loads: mm6 <- 64 bits at p_y + 8, mm0 <- 32 bits at p_u + 4,
 * mm1 <- 32 bits at p_v + 4.  mm4, mm5 and mm7 are scratch.
 */
00461 #define MMX_UNPACK_16                               \
00462     mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8);    \
00463     mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_fc);    \
00464     mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8);    \
00465     mm0 = _mm_srli_pi16(mm0, 3);                    \
00466     mm4 = _mm_setzero_si64();                       \
00467     mm5 = mm0;                                      \
00468     mm7 = mm2;                                      \
00469     \
00470     mm2 = _mm_unpacklo_pi8(mm2, mm4);               \
00471     mm0 = _mm_unpacklo_pi8(mm0, mm1);               \
00472     mm2 = _mm_slli_pi16(mm2, 3);                    \
00473     mm0 = _mm_or_si64(mm0, mm2);                    \
00474     mm6 = (__m64)*(uint64_t *)(p_y + 8);            \
00475     *(uint64_t *)p_buffer = (uint64_t)mm0;          \
00476     \
00477     mm7 = _mm_unpackhi_pi8(mm7, mm4);               \
00478     mm5 = _mm_unpackhi_pi8(mm5, mm1);               \
00479     mm7 = _mm_slli_pi16(mm7, 3);                    \
00480     mm0 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_u + 4)); \
00481     mm5 = _mm_or_si64(mm5, mm7);                    \
00482     mm1 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_v + 4)); \
00483     *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;
00484 
/*
 * Pack mm0 = B, mm1 = R, mm2 = G into eight 32-bit pixels; read as a 32-bit
 * value each pixel is 00 R G B from the most to the least significant byte.
 * Two pixels are written per 64-bit store, at p_buffer, p_buffer + 2,
 * p_buffer + 4 and p_buffer + 6 (assumes p_buffer points to 32-bit
 * elements -- TODO confirm at the expansion site).
 * mm3..mm6 are scratch; mm0 and mm1 are overwritten.
 */
00485 #define MMX_UNPACK_32_ARGB                      \
00486     mm3 = _mm_setzero_si64();                   \
00487     mm4 = mm0;                                  \
00488     mm4 = _mm_unpacklo_pi8(mm4, mm2);           \
00489     mm5 = mm1;                                  \
00490     mm5 = _mm_unpacklo_pi8(mm5, mm3);           \
00491     mm6 = mm4;                                  \
00492     mm4 = _mm_unpacklo_pi16(mm4, mm5);          \
00493     *(uint64_t *)p_buffer = (uint64_t)mm4;      \
00494     mm6 = _mm_unpackhi_pi16(mm6, mm5);          \
00495     *(uint64_t *)(p_buffer + 2) = (uint64_t)mm6;\
00496     mm0 = _mm_unpackhi_pi8(mm0, mm2);           \
00497     mm1 = _mm_unpackhi_pi8(mm1, mm3);           \
00498     mm5 = mm0;                                  \
00499     mm5 = _mm_unpacklo_pi16(mm5, mm1);          \
00500     *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;\
00501     mm0 = _mm_unpackhi_pi16(mm0, mm1);          \
00502     *(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
00503 
/*
 * Pack mm0 = B, mm1 = R, mm2 = G into eight 32-bit pixels; read as a 32-bit
 * value each pixel is R G B 00 from the most to the least significant byte.
 * Two pixels are written per 64-bit store, at p_buffer, p_buffer + 2,
 * p_buffer + 4 and p_buffer + 6 (assumes p_buffer points to 32-bit
 * elements -- TODO confirm at the expansion site).
 * mm3..mm6 are scratch; mm0 and mm2 are overwritten.
 */
00504 #define MMX_UNPACK_32_RGBA                      \
00505     mm3 = _mm_setzero_si64();                   \
00506     mm4 = mm2;                                  \
00507     mm4 = _mm_unpacklo_pi8(mm4, mm1);           \
00508     mm3 = _mm_unpacklo_pi8(mm3, mm0);           \
00509     mm5 = mm3;                                  \
00510     mm3 = _mm_unpacklo_pi16(mm3, mm4);          \
00511     *(uint64_t *)p_buffer = (uint64_t)mm3;      \
00512     mm5 = _mm_unpackhi_pi16(mm5, mm4);          \
00513     *(uint64_t *)(p_buffer + 2) = (uint64_t)mm5;\
00514     mm6 = _mm_setzero_si64();                   \
00515     mm2 = _mm_unpackhi_pi8(mm2, mm1);           \
00516     mm6 = _mm_unpackhi_pi8(mm6, mm0);           \
00517     mm0 = mm6;                                  \
00518     mm6 = _mm_unpacklo_pi16(mm6, mm2);          \
00519     *(uint64_t *)(p_buffer + 4) = (uint64_t)mm6;\
00520     mm0 = _mm_unpackhi_pi16(mm0, mm2);          \
00521     *(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
00522 
/*
 * Pack mm0 = B, mm1 = R, mm2 = G into eight 32-bit pixels; read as a 32-bit
 * value each pixel is B G R 00 from the most to the least significant byte.
 * Two pixels are written per 64-bit store, at p_buffer, p_buffer + 2,
 * p_buffer + 4 and p_buffer + 6 (assumes p_buffer points to 32-bit
 * elements -- TODO confirm at the expansion site).
 * mm3..mm6 are scratch; mm0 and mm2 are overwritten.
 */
00523 #define MMX_UNPACK_32_BGRA                      \
00524     mm3 = _mm_setzero_si64();                   \
00525     mm4 = mm2;                                  \
00526     mm4 = _mm_unpacklo_pi8(mm4, mm0);           \
00527     mm3 = _mm_unpacklo_pi8(mm3, mm1);           \
00528     mm5 = mm3;                                  \
00529     mm3 = _mm_unpacklo_pi16(mm3, mm4);          \
00530     *(uint64_t *)p_buffer = (uint64_t)mm3;      \
00531     mm5 = _mm_unpackhi_pi16(mm5, mm4);          \
00532     *(uint64_t *)(p_buffer + 2) = (uint64_t)mm5;\
00533     mm6 = _mm_setzero_si64();                   \
00534     mm2 = _mm_unpackhi_pi8(mm2, mm0);           \
00535     mm6 = _mm_unpackhi_pi8(mm6, mm1);           \
00536     mm0 = mm6;                                  \
00537     mm6 = _mm_unpacklo_pi16(mm6, mm2);          \
00538     *(uint64_t *)(p_buffer + 4) = (uint64_t)mm6;\
00539     mm0 = _mm_unpackhi_pi16(mm0, mm2);          \
00540     *(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
00541 
/*
 * MMX_UNPACK_32_ABGR (intrinsics version): interleave the byte planes held
 * in mm0, mm1 and mm2 with a zero byte and write four 64-bit words to
 * p_buffer, p_buffer + 2, p_buffer + 4 and p_buffer + 6.
 * Byte order of each 4-byte group in memory (lowest address first):
 *     mm1 byte, mm2 byte, mm0 byte, 00
 * NOTE(review): mm0/mm1/mm2 presumably hold blue/red/green, as xmm0/xmm1/xmm2
 * do in the SSE2 macros of this file - confirm against the macro that
 * produces them.
 * NOTE(review): the + 2 / + 4 / + 6 offsets are consecutive 8-byte slots only
 * if p_buffer points to 32-bit elements - TODO confirm.
 * Uses mm3..mm6 as temporaries and overwrites mm0, mm1 and mm2; all of them
 * are variables that must already be declared at the expansion site.
 */
00542 #define MMX_UNPACK_32_ABGR                      \
00543     mm3 = _mm_setzero_si64();                   \
00544     mm4 = mm1;                                  \
00545     mm4 = _mm_unpacklo_pi8(mm4, mm2);           \
00546     mm5 = mm0;                                  \
00547     mm5 = _mm_unpacklo_pi8(mm5, mm3);           \
00548     mm6 = mm4;                                  \
00549     mm4 = _mm_unpacklo_pi16(mm4, mm5);          \
00550     *(uint64_t *)p_buffer = (uint64_t)mm4;      \
00551     mm6 = _mm_unpackhi_pi16(mm6, mm5);          \
00552     *(uint64_t *)(p_buffer + 2) = (uint64_t)mm6;\
00553     mm1 = _mm_unpackhi_pi8(mm1, mm2);           \
00554     mm0 = _mm_unpackhi_pi8(mm0, mm3);           \
00555     mm2 = mm1;                                  \
00556     mm1 = _mm_unpacklo_pi16(mm1, mm0);          \
00557     *(uint64_t *)(p_buffer + 4) = (uint64_t)mm1;\
00558     mm2 = _mm_unpackhi_pi16(mm2, mm0);          \
00559     *(uint64_t *)(p_buffer + 6) = (uint64_t)mm2;
00560 
00561 #endif
00562 
00563 #elif defined( MODULE_NAME_IS_i420_rgb_sse2 )
00564 
00565 #if defined(CAN_COMPILE_SSE2)
00566 
00567 /* SSE2 assembly */
00568 
/*
 * SSE2_CALL (inline-asm version): emit one asm statement built from the
 * SSE2_* string macros of this file.
 * Operands: %0 = p_y, %1 = p_u, %2 = p_v, %3 = p_buffer, each in a general
 * register; those names must be in scope at the expansion site.
 * Clobbers:
 *   - eax: scratch register the instruction macros use to build constants;
 *   - xmm0-xmm7: every one of them is written by the instruction macros, so
 *     the compiler must not keep live values there across the statement;
 *   - memory: the asm reads the source planes and writes the output buffer
 *     through the pointer operands, which "r" constraints do not express.
 * NOTE(review): naming xmm registers as clobbers requires a compiler that
 * knows them for the target; this block is only built under CAN_COMPILE_SSE2.
 */
00569 #define SSE2_CALL(SSE2_INSTRUCTIONS)    \
00570     do {                                \
00571     __asm__ __volatile__(               \
00572         ".p2align 3 \n\t"               \
00573         SSE2_INSTRUCTIONS               \
00574         :                               \
00575         : "r" (p_y), "r" (p_u),         \
00576           "r" (p_v), "r" (p_buffer)     \
00577         : "eax", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); \
00578     } while(0)
00579 
/*
 * SSE2_END (inline-asm version): issue an SFENCE store fence. The "memory"
 * clobber additionally keeps the compiler from moving memory accesses across
 * the statement.
 * NOTE(review): presumably meant to follow the non-temporal movntdq stores
 * done by the *_ALIGNED unpack macros - confirm against the callers.
 */
00580 #define SSE2_END  __asm__ __volatile__ ( "sfence" ::: "memory" )
00581 
/*
 * SSE2_INIT_16_ALIGNED (asm string): load the inputs for one conversion step.
 * With the operand binding of SSE2_CALL (%0 = p_y, %1 = p_u, %2 = p_v):
 *   xmm0 <- 8 bytes from p_u (movq, upper half cleared)
 *   xmm1 <- 8 bytes from p_v (movq, upper half cleared)
 *   xmm4 <- 0
 *   xmm6 <- 16 bytes from p_y using movdqa, which requires p_y to be
 *           16-byte aligned
 * Unlike the _UNALIGNED variant, no prefetch hint is issued.
 */
00582 #define SSE2_INIT_16_ALIGNED "                                              \n\
00583 movq        (%1), %%xmm0    # Load 8 Cb       00 00 00 00 u3 u2 u1 u0       \n\
00584 movq        (%2), %%xmm1    # Load 8 Cr       00 00 00 00 v3 v2 v1 v0       \n\
00585 pxor      %%xmm4, %%xmm4    # zero mm4                                      \n\
00586 movdqa      (%0), %%xmm6    # Load 16 Y       Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0       \n\
00587 "
00588 
/*
 * SSE2_INIT_16_UNALIGNED (asm string): load the inputs for one conversion
 * step without any alignment requirement on the luma pointer.
 * With the operand binding of SSE2_CALL (%0 = p_y, %1 = p_u, %2 = p_v,
 * %3 = p_buffer):
 *   xmm0 <- 8 bytes from p_u (movq, upper half cleared)
 *   xmm1 <- 8 bytes from p_v (movq, upper half cleared)
 *   xmm4 <- 0
 *   xmm6 <- 16 bytes from p_y using movdqu (unaligned load)
 * and a prefetchnta (non-temporal prefetch hint) is issued for p_buffer.
 */
00589 #define SSE2_INIT_16_UNALIGNED "                                            \n\
00590 movq        (%1), %%xmm0    # Load 8 Cb       00 00 00 00 u3 u2 u1 u0       \n\
00591 movq        (%2), %%xmm1    # Load 8 Cr       00 00 00 00 v3 v2 v1 v0       \n\
00592 pxor      %%xmm4, %%xmm4    # zero mm4                                      \n\
00593 movdqu      (%0), %%xmm6    # Load 16 Y       Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0       \n\
00594 prefetchnta (%3)            # Tell CPU not to cache output RGB data         \n\
00595 "
00596 
/*
 * SSE2_INIT_32_ALIGNED (asm string): same instruction sequence as
 * SSE2_INIT_16_ALIGNED.
 * With the operand binding of SSE2_CALL (%0 = p_y, %1 = p_u, %2 = p_v):
 *   xmm0 <- 8 bytes from p_u, xmm1 <- 8 bytes from p_v, xmm4 <- 0,
 *   xmm6 <- 16 bytes from p_y using movdqa (p_y must be 16-byte aligned).
 */
00597 #define SSE2_INIT_32_ALIGNED "                                              \n\
00598 movq        (%1), %%xmm0    # Load 8 Cb       00 00 00 00 u3 u2 u1 u0       \n\
00599 movq        (%2), %%xmm1    # Load 8 Cr       00 00 00 00 v3 v2 v1 v0       \n\
00600 pxor      %%xmm4, %%xmm4    # zero mm4                                      \n\
00601 movdqa      (%0), %%xmm6    # Load 16 Y       Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0       \n\
00602 "
00603 
/*
 * SSE2_INIT_32_UNALIGNED (asm string): same instruction sequence as
 * SSE2_INIT_16_UNALIGNED.
 * With the operand binding of SSE2_CALL (%0 = p_y, %1 = p_u, %2 = p_v,
 * %3 = p_buffer):
 *   xmm0 <- 8 bytes from p_u, xmm1 <- 8 bytes from p_v, xmm4 <- 0,
 *   xmm6 <- 16 bytes from p_y using movdqu (unaligned load),
 * followed by a prefetchnta hint on p_buffer.
 */
00604 #define SSE2_INIT_32_UNALIGNED "                                            \n\
00605 movq        (%1), %%xmm0    # Load 8 Cb       00 00 00 00 u3 u2 u1 u0       \n\
00606 movq        (%2), %%xmm1    # Load 8 Cr       00 00 00 00 v3 v2 v1 v0       \n\
00607 pxor      %%xmm4, %%xmm4    # zero mm4                                      \n\
00608 movdqu      (%0), %%xmm6    # Load 16 Y       Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0       \n\
00609 prefetchnta (%3)            # Tell CPU not to cache output RGB data         \n\
00610 "
00611 
/*
 * SSE2_YUV_MUL (asm string): turn the loaded Y/Cb/Cr bytes into signed
 * 16-bit colour terms.
 * Expects: xmm0 = 8 Cb bytes (low half), xmm1 = 8 Cr bytes (low half),
 *          xmm4 = 0, xmm6 = 16 Y bytes (as left by the SSE2_INIT_* macros).
 * Steps:
 *   - Cb and Cr are widened to words, 128 is subtracted and the result is
 *     shifted left by 3;
 *   - pmulhw (high word of the signed product) by 0xf37d / 0xe5fc gives the
 *     two green contributions, by 0x4093 the blue one and by 0x3312 the red
 *     one;
 *   - 16 is subtracted from every Y byte with unsigned saturation, Y is split
 *     into even and odd samples, shifted left by 3 and multiplied (pmulhw)
 *     by 0x253f.
 * Produces: xmm0 = blue term, xmm1 = red term, xmm2 = green term,
 *           xmm6 = scaled even Y, xmm7 = scaled odd Y.
 * Scratch: xmm3, xmm5 and eax. Each constant is built with
 * movl imm -> eax, movd -> xmm5, pshufd $0 (broadcast), so no memory operand
 * is needed.
 */
00612 #define SSE2_YUV_MUL "                                                      \n\
00613 # convert the chroma part                                                   \n\
00614 punpcklbw %%xmm4, %%xmm0        # scatter 8 Cb    00 u3 00 u2 00 u1 00 u0   \n\
00615 punpcklbw %%xmm4, %%xmm1        # scatter 8 Cr    00 v3 00 v2 00 v1 00 v0   \n\
00616 movl      $0x00800080, %%eax    #                                           \n\
00617 movd      %%eax, %%xmm5         #                                           \n\
00618 pshufd    $0, %%xmm5, %%xmm5    # Set xmm5 to     0080 0080 ... 0080 0080   \n\
00619 psubsw    %%xmm5, %%xmm0        # Cb -= 128                                 \n\
00620 psubsw    %%xmm5, %%xmm1        # Cr -= 128                                 \n\
00621 psllw     $3, %%xmm0            # Promote precision                         \n\
00622 psllw     $3, %%xmm1            # Promote precision                         \n\
00623 movdqa    %%xmm0, %%xmm2        # Copy 8 Cb       00 u3 00 u2 00 u1 00 u0   \n\
00624 movdqa    %%xmm1, %%xmm3        # Copy 8 Cr       00 v3 00 v2 00 v1 00 v0   \n\
00625 movl      $0xf37df37d, %%eax    #                                           \n\
00626 movd      %%eax, %%xmm5         #                                           \n\
00627 pshufd    $0, %%xmm5, %%xmm5    # Set xmm5 to     f37d f37d ... f37d f37d   \n\
00628 pmulhw    %%xmm5, %%xmm2        # Mul Cb with green coeff -> Cb green       \n\
00629 movl      $0xe5fce5fc, %%eax    #                                           \n\
00630 movd      %%eax, %%xmm5         #                                           \n\
00631 pshufd    $0, %%xmm5, %%xmm5    # Set xmm5 to     e5fc e5fc ... e5fc e5fc   \n\
00632 pmulhw    %%xmm5, %%xmm3        # Mul Cr with green coeff -> Cr green       \n\
00633 movl      $0x40934093, %%eax    #                                           \n\
00634 movd      %%eax, %%xmm5         #                                           \n\
00635 pshufd    $0, %%xmm5, %%xmm5    # Set xmm5 to     4093 4093 ... 4093 4093   \n\
00636 pmulhw    %%xmm5, %%xmm0        # Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0   \n\
00637 movl      $0x33123312, %%eax    #                                           \n\
00638 movd      %%eax, %%xmm5         #                                           \n\
00639 pshufd    $0, %%xmm5, %%xmm5    # Set xmm5 to     3312 3312 ... 3312 3312   \n\
00640 pmulhw    %%xmm5, %%xmm1        # Mul Cr -> Cred  00 r3 00 r2 00 r1 00 r0   \n\
00641 paddsw    %%xmm3, %%xmm2        # Cb green + Cr green -> Cgreen             \n\
00642                                                                             \n\
00643 # convert the luma part                                                     \n\
00644 movl      $0x10101010, %%eax    #                                           \n\
00645 movd      %%eax, %%xmm5         #                                           \n\
00646 pshufd    $0, %%xmm5, %%xmm5    # Set xmm5 to   1010 1010 ... 1010 1010     \n\
00647 psubusb   %%xmm5, %%xmm6        # Y -= 16                                   \n\
00648 movdqa    %%xmm6, %%xmm7        # Copy 16 Y       Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0   \n\
00649 movl      $0x00ff00ff, %%eax    #                                           \n\
00650 movd      %%eax, %%xmm5         #                                           \n\
00651 pshufd    $0, %%xmm5, %%xmm5    # set xmm5 to     00ff 00ff ... 00ff 00ff   \n\
00652 pand      %%xmm5, %%xmm6        # get Y even      00 Y6 00 Y4 00 Y2 00 Y0   \n\
00653 psrlw     $8, %%xmm7            # get Y odd       00 Y7 00 Y5 00 Y3 00 Y1   \n\
00654 psllw     $3, %%xmm6            # Promote precision                         \n\
00655 psllw     $3, %%xmm7            # Promote precision                         \n\
00656 movl      $0x253f253f, %%eax    #                                           \n\
00657 movd      %%eax, %%xmm5         #                                           \n\
00658 pshufd    $0, %%xmm5, %%xmm5    # set xmm5 to     253f 253f ... 253f 253f   \n\
00659 pmulhw    %%xmm5, %%xmm6        # Mul 8 Y even    00 y6 00 y4 00 y2 00 y0   \n\
00660 pmulhw    %%xmm5, %%xmm7        # Mul 8 Y odd     00 y7 00 y5 00 y3 00 y1   \n\
00661 "
00662 
/*
 * SSE2_YUV_ADD (asm string): combine the luma and chroma terms into byte
 * planes.
 * Expects (as produced by SSE2_YUV_MUL): xmm0 = blue term, xmm1 = red term,
 * xmm2 = green term, xmm6 = even Y, xmm7 = odd Y, all as signed words.
 * Each chroma term is shared by one even and one odd luma sample: the term is
 * copied, added (signed saturation) to even and odd Y separately, packed to
 * bytes with unsigned saturation (clamping to 0..255) and the even/odd bytes
 * are interleaved again.
 * Produces: xmm0 = 16 blue bytes, xmm1 = 16 red bytes, xmm2 = 16 green bytes,
 * in pixel order. Scratch: xmm3, xmm4, xmm5.
 * The lane diagrams inside the string show only 8 of the 16 lanes.
 */
00663 #define SSE2_YUV_ADD "                                                      \n\
00664 # Do horizontal and vertical scaling                                        \n\
00665 movdqa    %%xmm0, %%xmm3        # Copy Cblue                                \n\
00666 movdqa    %%xmm1, %%xmm4        # Copy Cred                                 \n\
00667 movdqa    %%xmm2, %%xmm5        # Copy Cgreen                               \n\
00668 paddsw    %%xmm6, %%xmm0        # Y even + Cblue  00 B6 00 B4 00 B2 00 B0   \n\
00669 paddsw    %%xmm7, %%xmm3        # Y odd  + Cblue  00 B7 00 B5 00 B3 00 B1   \n\
00670 paddsw    %%xmm6, %%xmm1        # Y even + Cred   00 R6 00 R4 00 R2 00 R0   \n\
00671 paddsw    %%xmm7, %%xmm4        # Y odd  + Cred   00 R7 00 R5 00 R3 00 R1   \n\
00672 paddsw    %%xmm6, %%xmm2        # Y even + Cgreen 00 G6 00 G4 00 G2 00 G0   \n\
00673 paddsw    %%xmm7, %%xmm5        # Y odd  + Cgreen 00 G7 00 G5 00 G3 00 G1   \n\
00674                                                                             \n\
00675 # Limit RGB even to 0..255                                                  \n\
00676 packuswb  %%xmm0, %%xmm0        # B6 B4 B2 B0 / B6 B4 B2 B0                 \n\
00677 packuswb  %%xmm1, %%xmm1        # R6 R4 R2 R0 / R6 R4 R2 R0                 \n\
00678 packuswb  %%xmm2, %%xmm2        # G6 G4 G2 G0 / G6 G4 G2 G0                 \n\
00679                                                                             \n\
00680 # Limit RGB odd to 0..255                                                   \n\
00681 packuswb  %%xmm3, %%xmm3        # B7 B5 B3 B1 / B7 B5 B3 B1                 \n\
00682 packuswb  %%xmm4, %%xmm4        # R7 R5 R3 R1 / R7 R5 R3 R1                 \n\
00683 packuswb  %%xmm5, %%xmm5        # G7 G5 G3 G1 / G7 G5 G3 G1                 \n\
00684                                                                             \n\
00685 # Interleave RGB even and odd                                               \n\
00686 punpcklbw %%xmm3, %%xmm0        #                 B7 B6 B5 B4 B3 B2 B1 B0   \n\
00687 punpcklbw %%xmm4, %%xmm1        #                 R7 R6 R5 R4 R3 R2 R1 R0   \n\
00688 punpcklbw %%xmm5, %%xmm2        #                 G7 G6 G5 G4 G3 G2 G1 G0   \n\
00689 "
00690 
/*
 * SSE2_UNPACK_15_ALIGNED (asm string): pack 16 pixels into 15-bit RGB words
 * and store them with non-temporal stores.
 * Expects: xmm0 = 16 blue bytes, xmm1 = 16 red bytes, xmm2 = 16 green bytes.
 * Each component keeps its top 5 bits; the resulting 16-bit pixel is
 *     0 r7..r3 g7..g3 b7..b3   (bit 15 .. bit 0)
 * Pixels 0-7 are written to (%3) and pixels 8-15 to 16(%3) with movntdq,
 * which requires the destination to be 16-byte aligned.
 * Scratch: xmm4, xmm5, xmm7 and eax; xmm0, xmm1, xmm2 are overwritten.
 */
00691 #define SSE2_UNPACK_15_ALIGNED "                                            \n\
00692 # mask unneeded bits off                                                    \n\
00693 movl      $0xf8f8f8f8, %%eax    #                                           \n\
00694 movd      %%eax, %%xmm5         #                                           \n\
00695 pshufd    $0, %%xmm5, %%xmm5    # set xmm5 to     f8f8 f8f8 ... f8f8 f8f8   \n\
00696 pand      %%xmm5, %%xmm0        # b7b6b5b4 b3______ b7b6b5b4 b3______       \n\
00697 psrlw     $3,%%xmm0             # ______b7 b6b5b4b3 ______b7 b6b5b4b3       \n\
00698 pand      %%xmm5, %%xmm2        # g7g6g5g4 g3______ g7g6g5g4 g3______       \n\
00699 pand      %%xmm5, %%xmm1        # r7r6r5r4 r3______ r7r6r5r4 r3______       \n\
00700 psrlw     $1,%%xmm1             # __r7r6r5 r4r3____ __r7r6r5 r4r3____       \n\
00701 pxor      %%xmm4, %%xmm4        # zero xmm4                                 \n\
00702 movdqa    %%xmm0, %%xmm5        # Copy B15-B0                               \n\
00703 movdqa    %%xmm2, %%xmm7        # Copy G15-G0                               \n\
00704                                                                             \n\
00705 # convert rgb24 plane to rgb15 pack for pixel 0-7                           \n\
00706 punpcklbw %%xmm4, %%xmm2        # ________ ________ g7g6g5g4 g3______       \n\
00707 punpcklbw %%xmm1, %%xmm0        # r7r6r5r4 r3______ ______b7 b6b5b4b3       \n\
00708 psllw     $2,%%xmm2             # ________ ____g7g6 g5g4g3__ ________       \n\
00709 por       %%xmm2, %%xmm0        # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3       \n\
00710 movntdq   %%xmm0, (%3)          # store pixel 0-7                           \n\
00711                                                                             \n\
00712 # convert rgb24 plane to rgb15 pack for pixel 8-15                          \n\
00713 punpckhbw %%xmm4, %%xmm7        # ________ ________ g7g6g5g4 g3______       \n\
00714 punpckhbw %%xmm1, %%xmm5        # r7r6r5r4 r3______ ______b7 b6b5b4b3       \n\
00715 psllw     $2,%%xmm7             # ________ ____g7g6 g5g4g3__ ________       \n\
00716 por       %%xmm7, %%xmm5        # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3       \n\
00717 movntdq   %%xmm5, 16(%3)        # store pixel 8-15                          \n\
00718 "
00719 
/*
 * SSE2_UNPACK_15_UNALIGNED (asm string): pack 16 pixels into 15-bit RGB
 * words and store them with unaligned stores.
 * Expects: xmm0 = 16 blue bytes, xmm1 = 16 red bytes, xmm2 = 16 green bytes.
 * Each component keeps its top 5 bits; the resulting 16-bit pixel is
 *     0 r7..r3 g7..g3 b7..b3   (bit 15 .. bit 0)
 * Pixels 0-7 are written to (%3) and pixels 8-15 to 16(%3) with movdqu, so
 * the destination needs no particular alignment.
 * Scratch: xmm4, xmm5, xmm7 and eax; xmm0, xmm1, xmm2 are overwritten.
 */
00720 #define SSE2_UNPACK_15_UNALIGNED "                                          \n\
00721 # mask unneeded bits off                                                    \n\
00722 movl      $0xf8f8f8f8, %%eax    #                                           \n\
00723 movd      %%eax, %%xmm5         #                                           \n\
00724 pshufd    $0, %%xmm5, %%xmm5    # set xmm5 to     f8f8 f8f8 ... f8f8 f8f8   \n\
00725 pand      %%xmm5, %%xmm0        # b7b6b5b4 b3______ b7b6b5b4 b3______       \n\
00726 psrlw     $3,%%xmm0             # ______b7 b6b5b4b3 ______b7 b6b5b4b3       \n\
00727 pand      %%xmm5, %%xmm2        # g7g6g5g4 g3______ g7g6g5g4 g3______       \n\
00728 pand      %%xmm5, %%xmm1        # r7r6r5r4 r3______ r7r6r5r4 r3______       \n\
00729 psrlw     $1,%%xmm1             # __r7r6r5 r4r3____ __r7r6r5 r4r3____       \n\
00730 pxor      %%xmm4, %%xmm4        # zero xmm4                                 \n\
00731 movdqa    %%xmm0, %%xmm5        # Copy B15-B0                               \n\
00732 movdqa    %%xmm2, %%xmm7        # Copy G15-G0                               \n\
00733                                                                             \n\
00734 # convert rgb24 plane to rgb15 pack for pixel 0-7                           \n\
00735 punpcklbw %%xmm4, %%xmm2        # ________ ________ g7g6g5g4 g3______       \n\
00736 punpcklbw %%xmm1, %%xmm0        # r7r6r5r4 r3______ ______b7 b6b5b4b3       \n\
00737 psllw     $2,%%xmm2             # ________ ____g7g6 g5g4g3__ ________       \n\
00738 por       %%xmm2, %%xmm0        # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3       \n\
00739 movdqu    %%xmm0, (%3)          # store pixel 0-7                           \n\
00740                                                                             \n\
00741 # convert rgb24 plane to rgb15 pack for pixel 8-15                          \n\
00742 punpckhbw %%xmm4, %%xmm7        # ________ ________ g7g6g5g4 g3______       \n\
00743 punpckhbw %%xmm1, %%xmm5        # r7r6r5r4 r3______ ______b7 b6b5b4b3       \n\
00744 psllw     $2,%%xmm7             # ________ ____g7g6 g5g4g3__ ________       \n\
00745 por       %%xmm7, %%xmm5        # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3       \n\
00746 movdqu    %%xmm5, 16(%3)        # store pixel 8-15                          \n\
00747 "
00748 
/*
 * SSE2_UNPACK_16_ALIGNED (asm string): pack 16 pixels into 16-bit RGB words
 * and store them with non-temporal stores.
 * Expects: xmm0 = 16 blue bytes, xmm1 = 16 red bytes, xmm2 = 16 green bytes.
 * Blue and red keep their top 5 bits (mask 0xf8), green keeps its top 6 bits
 * (mask 0xfc); the resulting 16-bit pixel is
 *     r7..r3 g7..g2 b7..b3   (bit 15 .. bit 0)
 * Pixels 0-7 are written to (%3) and pixels 8-15 to 16(%3) with movntdq,
 * which requires the destination to be 16-byte aligned.
 * Scratch: xmm4, xmm5, xmm7 and eax; xmm0, xmm1, xmm2 are overwritten.
 */
00749 #define SSE2_UNPACK_16_ALIGNED "                                            \n\
00750 # mask unneeded bits off                                                    \n\
00751 movl      $0xf8f8f8f8, %%eax    #                                           \n\
00752 movd      %%eax, %%xmm5         #                                           \n\
00753 pshufd    $0, %%xmm5, %%xmm5    # set xmm5 to     f8f8 f8f8 ... f8f8 f8f8   \n\
00754 pand      %%xmm5, %%xmm0        # b7b6b5b4 b3______ b7b6b5b4 b3______       \n\
00755 pand      %%xmm5, %%xmm1        # r7r6r5r4 r3______ r7r6r5r4 r3______       \n\
00756 movl      $0xfcfcfcfc, %%eax    #                                           \n\
00757 movd      %%eax, %%xmm5         #                                           \n\
00758 pshufd    $0, %%xmm5, %%xmm5    # set xmm5 to     fcfc fcfc ... fcfc fcfc   \n\
00759 pand      %%xmm5, %%xmm2        # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____       \n\
00760 psrlw     $3,%%xmm0             # ______b7 b6b5b4b3 ______b7 b6b5b4b3       \n\
00761 pxor      %%xmm4, %%xmm4        # zero xmm4                                 \n\
00762 movdqa    %%xmm0, %%xmm5        # Copy B15-B0                               \n\
00763 movdqa    %%xmm2, %%xmm7        # Copy G15-G0                               \n\
00764                                                                             \n\
00765 # convert rgb24 plane to rgb16 pack for pixel 0-7                           \n\
00766 punpcklbw %%xmm4, %%xmm2        # ________ ________ g7g6g5g4 g3g2____       \n\
00767 punpcklbw %%xmm1, %%xmm0        # r7r6r5r4 r3______ ______b7 b6b5b4b3       \n\
00768 psllw     $3,%%xmm2             # ________ __g7g6g5 g4g3g2__ ________       \n\
00769 por       %%xmm2, %%xmm0        # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3       \n\
00770 movntdq   %%xmm0, (%3)          # store pixel 0-7                           \n\
00771                                                                             \n\
00772 # convert rgb24 plane to rgb16 pack for pixel 8-15                          \n\
00773 punpckhbw %%xmm4, %%xmm7        # ________ ________ g7g6g5g4 g3g2____       \n\
00774 punpckhbw %%xmm1, %%xmm5        # r7r6r5r4 r3______ ______b7 b6b5b4b3       \n\
00775 psllw     $3,%%xmm7             # ________ __g7g6g5 g4g3g2__ ________       \n\
00776 por       %%xmm7, %%xmm5        # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3       \n\
00777 movntdq   %%xmm5, 16(%3)        # store pixel 8-15                          \n\
00778 "
00779 
/*
 * SSE2_UNPACK_16_UNALIGNED (asm string): pack 16 pixels into 16-bit RGB
 * words and store them with unaligned stores.
 * Expects: xmm0 = 16 blue bytes, xmm1 = 16 red bytes, xmm2 = 16 green bytes.
 * Blue and red keep their top 5 bits (mask 0xf8), green keeps its top 6 bits
 * (mask 0xfc); the resulting 16-bit pixel is
 *     r7..r3 g7..g2 b7..b3   (bit 15 .. bit 0)
 * Pixels 0-7 are written to (%3) and pixels 8-15 to 16(%3) with movdqu, so
 * the destination needs no particular alignment.
 * Scratch: xmm4, xmm5, xmm7 and eax; xmm0, xmm1, xmm2 are overwritten.
 */
00780 #define SSE2_UNPACK_16_UNALIGNED "                                          \n\
00781 # mask unneeded bits off                                                    \n\
00782 movl      $0xf8f8f8f8, %%eax    #                                           \n\
00783 movd      %%eax, %%xmm5         #                                           \n\
00784 pshufd    $0, %%xmm5, %%xmm5    # set xmm5 to     f8f8 f8f8 ... f8f8 f8f8   \n\
00785 pand      %%xmm5, %%xmm0        # b7b6b5b4 b3______ b7b6b5b4 b3______       \n\
00786 pand      %%xmm5, %%xmm1        # r7r6r5r4 r3______ r7r6r5r4 r3______       \n\
00787 movl      $0xfcfcfcfc, %%eax    #                                           \n\
00788 movd      %%eax, %%xmm5         #                                           \n\
00789 pshufd    $0, %%xmm5, %%xmm5    # set xmm5 to     fcfc fcfc ... fcfc fcfc   \n\
00790 pand      %%xmm5, %%xmm2        # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____       \n\
00791 psrlw     $3,%%xmm0             # ______b7 b6b5b4b3 ______b7 b6b5b4b3       \n\
00792 pxor      %%xmm4, %%xmm4        # zero xmm4                                 \n\
00793 movdqa    %%xmm0, %%xmm5        # Copy B15-B0                               \n\
00794 movdqa    %%xmm2, %%xmm7        # Copy G15-G0                               \n\
00795                                                                             \n\
00796 # convert rgb24 plane to rgb16 pack for pixel 0-7                           \n\
00797 punpcklbw %%xmm4, %%xmm2        # ________ ________ g7g6g5g4 g3g2____       \n\
00798 punpcklbw %%xmm1, %%xmm0        # r7r6r5r4 r3______ ______b7 b6b5b4b3       \n\
00799 psllw     $3,%%xmm2             # ________ __g7g6g5 g4g3g2__ ________       \n\
00800 por       %%xmm2, %%xmm0        # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3       \n\
00801 movdqu    %%xmm0, (%3)          # store pixel 0-7                           \n\
00802                                                                             \n\
00803 # convert rgb24 plane to rgb16 pack for pixel 8-15                          \n\
00804 punpckhbw %%xmm4, %%xmm7        # ________ ________ g7g6g5g4 g3g2____       \n\
00805 punpckhbw %%xmm1, %%xmm5        # r7r6r5r4 r3______ ______b7 b6b5b4b3       \n\
00806 psllw     $3,%%xmm7             # ________ __g7g6g5 g4g3g2__ ________       \n\
00807 por       %%xmm7, %%xmm5        # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3       \n\
00808 movdqu    %%xmm5, 16(%3)        # store pixel 8-15                          \n\
00809 "
00810 
/*
 * SSE2_UNPACK_32_ARGB_ALIGNED (asm string): interleave the three byte planes
 * into 16 four-byte pixels and store them with non-temporal stores.
 * Expects: xmm0 = 16 blue bytes, xmm1 = 16 red bytes, xmm2 = 16 green bytes.
 * Byte order of each pixel in memory (lowest address first): B, G, R, 00.
 * Four 16-byte movntdq stores go to (%3), 16(%3), 32(%3) and 48(%3); the
 * destination must be 16-byte aligned.
 * Scratch: xmm3, xmm4, xmm5, xmm6; xmm0 and xmm1 are overwritten.
 * NOTE(review): the lane diagrams inside the string show 8 of the 16 lanes,
 * and the diagrams after punpcklwd/punpckhwd list "B G" where the
 * instructions actually produce "G B" (high byte first).
 */
00811 #define SSE2_UNPACK_32_ARGB_ALIGNED "                                       \n\
00812 pxor      %%xmm3, %%xmm3  # zero xmm3                                       \n\
00813 movdqa    %%xmm0, %%xmm4  #               B7 B6 B5 B4 B3 B2 B1 B0           \n\
00814 punpcklbw %%xmm2, %%xmm4  #               G3 B3 G2 B2 G1 B1 G0 B0           \n\
00815 movdqa    %%xmm1, %%xmm5  #               R7 R6 R5 R4 R3 R2 R1 R0           \n\
00816 punpcklbw %%xmm3, %%xmm5  #               00 R3 00 R2 00 R1 00 R0           \n\
00817 movdqa    %%xmm4, %%xmm6  #               G3 B3 G2 B2 G1 B1 G0 B0           \n\
00818 punpcklwd %%xmm5, %%xmm4  #               00 R1 B1 G1 00 R0 B0 G0           \n\
00819 movntdq   %%xmm4, (%3)    # Store ARGB3 ARGB2 ARGB1 ARGB0                   \n\
00820 punpckhwd %%xmm5, %%xmm6  #               00 R3 B3 G3 00 R2 B2 G2           \n\
00821 movntdq   %%xmm6, 16(%3)  # Store ARGB7 ARGB6 ARGB5 ARGB4                   \n\
00822 punpckhbw %%xmm2, %%xmm0  #               G7 B7 G6 B6 G5 B5 G4 B4           \n\
00823 punpckhbw %%xmm3, %%xmm1  #               00 R7 00 R6 00 R5 00 R4           \n\
00824 movdqa    %%xmm0, %%xmm5  #               G7 B7 G6 B6 G5 B5 G4 B4           \n\
00825 punpcklwd %%xmm1, %%xmm5  #               00 R5 B5 G5 00 R4 B4 G4           \n\
00826 movntdq   %%xmm5, 32(%3)  # Store ARGB11 ARGB10 ARGB9 ARGB8                 \n\
00827 punpckhwd %%xmm1, %%xmm0  #               00 R7 B7 G7 00 R6 B6 G6           \n\
00828 movntdq   %%xmm0, 48(%3)  # Store ARGB15 ARGB14 ARGB13 ARGB12               \n\
00829 "
00830 
/*
 * SSE2_UNPACK_32_ARGB_UNALIGNED (asm string): interleave the three byte
 * planes into 16 four-byte pixels and store them with unaligned stores.
 * Expects: xmm0 = 16 blue bytes, xmm1 = 16 red bytes, xmm2 = 16 green bytes.
 * Byte order of each pixel in memory (lowest address first): B, G, R, 00.
 * Four 16-byte movdqu stores go to (%3), 16(%3), 32(%3) and 48(%3); no
 * destination alignment is required.
 * Scratch: xmm3, xmm4, xmm5, xmm6; xmm0 and xmm1 are overwritten.
 * NOTE(review): the lane diagrams inside the string show 8 of the 16 lanes,
 * and the diagrams after punpcklwd/punpckhwd list "B G" where the
 * instructions actually produce "G B" (high byte first).
 */
00831 #define SSE2_UNPACK_32_ARGB_UNALIGNED "                                     \n\
00832 pxor      %%xmm3, %%xmm3  # zero xmm3                                       \n\
00833 movdqa    %%xmm0, %%xmm4  #               B7 B6 B5 B4 B3 B2 B1 B0           \n\
00834 punpcklbw %%xmm2, %%xmm4  #               G3 B3 G2 B2 G1 B1 G0 B0           \n\
00835 movdqa    %%xmm1, %%xmm5  #               R7 R6 R5 R4 R3 R2 R1 R0           \n\
00836 punpcklbw %%xmm3, %%xmm5  #               00 R3 00 R2 00 R1 00 R0           \n\
00837 movdqa    %%xmm4, %%xmm6  #               G3 B3 G2 B2 G1 B1 G0 B0           \n\
00838 punpcklwd %%xmm5, %%xmm4  #               00 R1 B1 G1 00 R0 B0 G0           \n\
00839 movdqu    %%xmm4, (%3)    # Store ARGB3 ARGB2 ARGB1 ARGB0                   \n\
00840 punpckhwd %%xmm5, %%xmm6  #               00 R3 B3 G3 00 R2 B2 G2           \n\
00841 movdqu    %%xmm6, 16(%3)  # Store ARGB7 ARGB6 ARGB5 ARGB4                   \n\
00842 punpckhbw %%xmm2, %%xmm0  #               G7 B7 G6 B6 G5 B5 G4 B4           \n\
00843 punpckhbw %%xmm3, %%xmm1  #               00 R7 00 R6 00 R5 00 R4           \n\
00844 movdqa    %%xmm0, %%xmm5  #               G7 B7 G6 B6 G5 B5 G4 B4           \n\
00845 punpcklwd %%xmm1, %%xmm5  #               00 R5 B5 G5 00 R4 B4 G4           \n\
00846 movdqu    %%xmm5, 32(%3)  # Store ARGB11 ARGB10 ARGB9 ARGB8                 \n\
00847 punpckhwd %%xmm1, %%xmm0  #               00 R7 B7 G7 00 R6 B6 G6           \n\
00848 movdqu    %%xmm0, 48(%3)  # Store ARGB15 ARGB14 ARGB13 ARGB12               \n\
00849 "
00850 
/*
 * SSE2_UNPACK_32_RGBA_ALIGNED (asm string): interleave the three byte planes
 * into 16 four-byte pixels and store them with non-temporal stores.
 * Expects: xmm0 = 16 blue bytes, xmm1 = 16 red bytes, xmm2 = 16 green bytes.
 * Byte order of each pixel in memory (lowest address first): 00, B, G, R.
 * Four 16-byte movntdq stores go to (%3), 16(%3), 32(%3) and 48(%3); the
 * destination must be 16-byte aligned.
 * Scratch: xmm3, xmm4, xmm5, xmm6; xmm0 and xmm2 are overwritten.
 * NOTE(review): some in-string comments are stale - the copy of xmm3 holds
 * "B3 00 ..." (not "R3 00 ..."), the first punpcklwd yields
 * "R1 G1 B1 00 R0 G0 B0 00", and the store at 32(%3) is labelled BGRA
 * although it writes pixels 8-11 in the same layout as the other stores.
 */
00851 #define SSE2_UNPACK_32_RGBA_ALIGNED "                                       \n\
00852 pxor      %%xmm3, %%xmm3  # zero mm3                                        \n\
00853 movdqa    %%xmm2, %%xmm4  #                 G7 G6 G5 G4 G3 G2 G1 G0         \n\
00854 punpcklbw %%xmm1, %%xmm4  #                 R3 G3 R2 G2 R1 G1 R0 G0         \n\
00855 punpcklbw %%xmm0, %%xmm3  #                 B3 00 B2 00 B1 00 B0 00         \n\
00856 movdqa    %%xmm3, %%xmm5  #                 R3 00 R2 00 R1 00 R0 00         \n\
00857 punpcklwd %%xmm4, %%xmm3  #                 R1 G1 B1 00 R0 B0 G0 00         \n\
00858 movntdq   %%xmm3, (%3)    # Store RGBA3 RGBA2 RGBA1 RGBA0                   \n\
00859 punpckhwd %%xmm4, %%xmm5  #                 R3 G3 B3 00 R2 G2 B2 00         \n\
00860 movntdq   %%xmm5, 16(%3)  # Store RGBA7 RGBA6 RGBA5 RGBA4                   \n\
00861 pxor      %%xmm6, %%xmm6  # zero mm6                                        \n\
00862 punpckhbw %%xmm1, %%xmm2  #                 R7 G7 R6 G6 R5 G5 R4 G4         \n\
00863 punpckhbw %%xmm0, %%xmm6  #                 B7 00 B6 00 B5 00 B4 00         \n\
00864 movdqa    %%xmm6, %%xmm0  #                 B7 00 B6 00 B5 00 B4 00         \n\
00865 punpcklwd %%xmm2, %%xmm6  #                 R5 G5 B5 00 R4 G4 B4 00         \n\
00866 movntdq   %%xmm6, 32(%3)  # Store BGRA11 BGRA10 BGRA9 RGBA8                 \n\
00867 punpckhwd %%xmm2, %%xmm0  #                 R7 G7 B7 00 R6 G6 B6 00         \n\
00868 movntdq   %%xmm0, 48(%3)  # Store RGBA15 RGBA14 RGBA13 RGBA12               \n\
00869 "
00870 
/*
 * SSE2_UNPACK_32_RGBA_UNALIGNED (asm string): interleave the three byte
 * planes into 16 four-byte pixels and store them with unaligned stores.
 * Expects: xmm0 = 16 blue bytes, xmm1 = 16 red bytes, xmm2 = 16 green bytes.
 * Byte order of each pixel in memory (lowest address first): 00, B, G, R.
 * Four 16-byte movdqu stores go to (%3), 16(%3), 32(%3) and 48(%3); no
 * destination alignment is required.
 * Scratch: xmm3, xmm4, xmm5, xmm6; xmm0 and xmm2 are overwritten.
 * NOTE(review): some in-string comments are stale - the copy of xmm3 holds
 * "B3 00 ..." (not "R3 00 ..."), and the first punpcklwd yields
 * "R1 G1 B1 00 R0 G0 B0 00".
 */
00871 #define SSE2_UNPACK_32_RGBA_UNALIGNED "                                     \n\
00872 pxor      %%xmm3, %%xmm3  # zero mm3                                        \n\
00873 movdqa    %%xmm2, %%xmm4  #                 G7 G6 G5 G4 G3 G2 G1 G0         \n\
00874 punpcklbw %%xmm1, %%xmm4  #                 R3 G3 R2 G2 R1 G1 R0 G0         \n\
00875 punpcklbw %%xmm0, %%xmm3  #                 B3 00 B2 00 B1 00 B0 00         \n\
00876 movdqa    %%xmm3, %%xmm5  #                 R3 00 R2 00 R1 00 R0 00         \n\
00877 punpcklwd %%xmm4, %%xmm3  #                 R1 G1 B1 00 R0 B0 G0 00         \n\
00878 movdqu    %%xmm3, (%3)    # Store RGBA3 RGBA2 RGBA1 RGBA0                   \n\
00879 punpckhwd %%xmm4, %%xmm5  #                 R3 G3 B3 00 R2 G2 B2 00         \n\
00880 movdqu    %%xmm5, 16(%3)  # Store RGBA7 RGBA6 RGBA5 RGBA4                   \n\
00881 pxor      %%xmm6, %%xmm6  # zero mm6                                        \n\
00882 punpckhbw %%xmm1, %%xmm2  #                 R7 G7 R6 G6 R5 G5 R4 G4         \n\
00883 punpckhbw %%xmm0, %%xmm6  #                 B7 00 B6 00 B5 00 B4 00         \n\
00884 movdqa    %%xmm6, %%xmm0  #                 B7 00 B6 00 B5 00 B4 00         \n\
00885 punpcklwd %%xmm2, %%xmm6  #                 R5 G5 B5 00 R4 G4 B4 00         \n\
00886 movdqu    %%xmm6, 32(%3)  # Store RGBA11 RGBA10 RGBA9 RGBA8                 \n\
00887 punpckhwd %%xmm2, %%xmm0  #                 R7 G7 B7 00 R6 G6 B6 00         \n\
00888 movdqu    %%xmm0, 48(%3)  # Store RGBA15 RGBA14 RGBA13 RGBA12               \n\
00889 "
00890 
/*
 * SSE2_UNPACK_32_BGRA_ALIGNED (asm string): interleave the three byte planes
 * into 16 four-byte pixels and store them with non-temporal stores.
 * Expects: xmm0 = 16 blue bytes, xmm1 = 16 red bytes, xmm2 = 16 green bytes.
 * Byte order of each pixel in memory (lowest address first): 00, R, G, B.
 * Four 16-byte movntdq stores go to (%3), 16(%3), 32(%3) and 48(%3); the
 * destination must be 16-byte aligned.
 * Scratch: xmm3, xmm4, xmm5, xmm6; xmm0 and xmm2 are overwritten.
 * The lane diagrams inside the string show 8 of the 16 lanes.
 */
00891 #define SSE2_UNPACK_32_BGRA_ALIGNED "                                       \n\
00892 pxor      %%xmm3, %%xmm3  # zero mm3                                        \n\
00893 movdqa    %%xmm2, %%xmm4  #                 G7 G6 G5 G4 G3 G2 G1 G0         \n\
00894 punpcklbw %%xmm0, %%xmm4  #                 B3 G3 B2 G2 B1 G1 B0 G0         \n\
00895 punpcklbw %%xmm1, %%xmm3  #                 R3 00 R2 00 R1 00 R0 00         \n\
00896 movdqa    %%xmm3, %%xmm5  #                 R3 00 R2 00 R1 00 R0 00         \n\
00897 punpcklwd %%xmm4, %%xmm3  #                 B1 G1 R1 00 B0 G0 R0 00         \n\
00898 movntdq   %%xmm3, (%3)    # Store BGRA3 BGRA2 BGRA1 BGRA0                   \n\
00899 punpckhwd %%xmm4, %%xmm5  #                 B3 G3 R3 00 B2 G2 R2 00         \n\
00900 movntdq   %%xmm5, 16(%3)  # Store BGRA7 BGRA6 BGRA5 BGRA4                   \n\
00901 pxor      %%xmm6, %%xmm6  # zero mm6                                        \n\
00902 punpckhbw %%xmm0, %%xmm2  #                 B7 G7 B6 G6 B5 G5 B4 G4         \n\
00903 punpckhbw %%xmm1, %%xmm6  #                 R7 00 R6 00 R5 00 R4 00         \n\
00904 movdqa    %%xmm6, %%xmm0  #                 R7 00 R6 00 R5 00 R4 00         \n\
00905 punpcklwd %%xmm2, %%xmm6  #                 B5 G5 R5 00 B4 G4 R4 00         \n\
00906 movntdq   %%xmm6, 32(%3)  # Store BGRA11 BGRA10 BGRA9 BGRA8                 \n\
00907 punpckhwd %%xmm2, %%xmm0  #                 B7 G7 R7 00 B6 G6 R6 00         \n\
00908 movntdq   %%xmm0, 48(%3)  # Store BGRA15 BGRA14 BGRA13 BGRA12               \n\
00909 "
00910 
/*
 * SSE2_UNPACK_32_BGRA_UNALIGNED (asm string): interleave the three byte
 * planes into 16 four-byte pixels and store them with unaligned stores.
 * Expects: xmm0 = 16 blue bytes, xmm1 = 16 red bytes, xmm2 = 16 green bytes.
 * Byte order of each pixel in memory (lowest address first): 00, R, G, B.
 * Four 16-byte movdqu stores go to (%3), 16(%3), 32(%3) and 48(%3); no
 * destination alignment is required.
 * Scratch: xmm3, xmm4, xmm5, xmm6; xmm0 and xmm2 are overwritten.
 * The lane diagrams inside the string show 8 of the 16 lanes.
 */
00911 #define SSE2_UNPACK_32_BGRA_UNALIGNED "                                     \n\
00912 pxor      %%xmm3, %%xmm3  # zero mm3                                        \n\
00913 movdqa    %%xmm2, %%xmm4  #                 G7 G6 G5 G4 G3 G2 G1 G0         \n\
00914 punpcklbw %%xmm0, %%xmm4  #                 B3 G3 B2 G2 B1 G1 B0 G0         \n\
00915 punpcklbw %%xmm1, %%xmm3  #                 R3 00 R2 00 R1 00 R0 00         \n\
00916 movdqa    %%xmm3, %%xmm5  #                 R3 00 R2 00 R1 00 R0 00         \n\
00917 punpcklwd %%xmm4, %%xmm3  #                 B1 G1 R1 00 B0 G0 R0 00         \n\
00918 movdqu    %%xmm3, (%3)    # Store BGRA3 BGRA2 BGRA1 BGRA0                   \n\
00919 punpckhwd %%xmm4, %%xmm5  #                 B3 G3 R3 00 B2 G2 R2 00         \n\
00920 movdqu    %%xmm5, 16(%3)  # Store BGRA7 BGRA6 BGRA5 BGRA4                   \n\
00921 pxor      %%xmm6, %%xmm6  # zero mm6                                        \n\
00922 punpckhbw %%xmm0, %%xmm2  #                 B7 G7 B6 G6 B5 G5 B4 G4         \n\
00923 punpckhbw %%xmm1, %%xmm6  #                 R7 00 R6 00 R5 00 R4 00         \n\
00924 movdqa    %%xmm6, %%xmm0  #                 R7 00 R6 00 R5 00 R4 00         \n\
00925 punpcklwd %%xmm2, %%xmm6  #                 B5 G5 R5 00 B4 G4 R4 00         \n\
00926 movdqu    %%xmm6, 32(%3)  # Store BGRA11 BGRA10 BGRA9 BGRA8                 \n\
00927 punpckhwd %%xmm2, %%xmm0  #                 B7 G7 R7 00 B6 G6 R6 00         \n\
00928 movdqu    %%xmm0, 48(%3)  # Store BGRA15 BGRA14 BGRA13 BGRA12               \n\
00929 "
00930 
/*
 * SSE2_UNPACK_32_ABGR_ALIGNED (asm string): interleave the three byte planes
 * into 16 four-byte pixels and store them with non-temporal stores.
 * Expects: xmm0 = 16 blue bytes, xmm1 = 16 red bytes, xmm2 = 16 green bytes.
 * Byte order of each pixel in memory (lowest address first): R, G, B, 00.
 * Four 16-byte movntdq stores go to (%3), 16(%3), 32(%3) and 48(%3); the
 * destination must be 16-byte aligned.
 * Scratch: xmm3, xmm4, xmm5, xmm6; xmm0, xmm1 and xmm2 are overwritten.
 * NOTE(review): the diagram on the last punpckhwd is stale - the result is
 * "00 B7 G7 R7 00 B6 G6 R6", matching the three earlier word interleaves.
 */
00931 #define SSE2_UNPACK_32_ABGR_ALIGNED "                                       \n\
00932 pxor      %%xmm3, %%xmm3  # zero mm3                                        \n\
00933 movdqa    %%xmm1, %%xmm4  #                 R7 R6 R5 R4 R3 R2 R1 R0         \n\
00934 punpcklbw %%xmm2, %%xmm4  #                 G3 R3 G2 R2 G1 R1 G0 R0         \n\
00935 movdqa    %%xmm0, %%xmm5  #                 B7 B6 B5 B4 B3 B2 B1 B0         \n\
00936 punpcklbw %%xmm3, %%xmm5  #                 00 B3 00 B2 00 B1 00 B0         \n\
00937 movdqa    %%xmm4, %%xmm6  #                 G3 R3 G2 R2 G1 R1 G0 R0         \n\
00938 punpcklwd %%xmm5, %%xmm4  #                 00 B1 G1 R1 00 B0 G0 R0         \n\
00939 movntdq   %%xmm4, (%3)    # Store ABGR3 ABGR2 ABGR1 ABGR0                   \n\
00940 punpckhwd %%xmm5, %%xmm6  #                 00 B3 G3 R3 00 B2 G2 R2         \n\
00941 movntdq   %%xmm6, 16(%3)  # Store ABGR7 ABGR6 ABGR5 ABGR4                   \n\
00942 punpckhbw %%xmm2, %%xmm1  #                 G7 R7 G6 R6 G5 R5 G4 R4         \n\
00943 punpckhbw %%xmm3, %%xmm0  #                 00 B7 00 B6 00 B5 00 B4         \n\
00944 movdqa    %%xmm1, %%xmm2  #                 G7 R7 G6 R6 G5 R5 G4 R4         \n\
00945 punpcklwd %%xmm0, %%xmm1  #                 00 B5 G5 R5 00 B4 G4 R4         \n\
00946 movntdq   %%xmm1, 32(%3)  # Store ABGR11 ABGR10 ABGR9 ABGR8                 \n\
00947 punpckhwd %%xmm0, %%xmm2  #                 B7 G7 R7 00 B6 G6 R6 00         \n\
00948 movntdq   %%xmm2, 48(%3)  # Store ABGR15 ABGR14 ABGR13 ABGR12               \n\
00949 "
00950 
/*
 * SSE2_UNPACK_32_ABGR_UNALIGNED (asm string): interleave the three byte
 * planes into 16 four-byte pixels and store them with unaligned stores.
 * Expects: xmm0 = 16 blue bytes, xmm1 = 16 red bytes, xmm2 = 16 green bytes.
 * Byte order of each pixel in memory (lowest address first): R, G, B, 00.
 * Four 16-byte movdqu stores go to (%3), 16(%3), 32(%3) and 48(%3); no
 * destination alignment is required.
 * Scratch: xmm3, xmm4, xmm5, xmm6; xmm0, xmm1 and xmm2 are overwritten.
 * NOTE(review): two in-string diagrams are stale - the copy of xmm1 into
 * xmm2 holds "G7 R7 G6 R6 ..." (not "R7 00 R6 00 ..."), and the last
 * punpckhwd yields "00 B7 G7 R7 00 B6 G6 R6".
 */
00951 #define SSE2_UNPACK_32_ABGR_UNALIGNED "                                     \n\
00952 pxor      %%xmm3, %%xmm3  # zero mm3                                        \n\
00953 movdqa    %%xmm1, %%xmm4  #                 R7 R6 R5 R4 R3 R2 R1 R0         \n\
00954 punpcklbw %%xmm2, %%xmm4  #                 G3 R3 G2 R2 G1 R1 G0 R0         \n\
00955 movdqa    %%xmm0, %%xmm5  #                 B7 B6 B5 B4 B3 B2 B1 B0         \n\
00956 punpcklbw %%xmm3, %%xmm5  #                 00 B3 00 B2 00 B1 00 B0         \n\
00957 movdqa    %%xmm4, %%xmm6  #                 G3 R3 G2 R2 G1 R1 G0 R0         \n\
00958 punpcklwd %%xmm5, %%xmm4  #                 00 B1 G1 R1 00 B0 G0 R0         \n\
00959 movdqu    %%xmm4, (%3)    # Store ABGR3 ABGR2 ABGR1 ABGR0                   \n\
00960 punpckhwd %%xmm5, %%xmm6  #                 00 B3 G3 R3 00 B2 G2 R2         \n\
00961 movdqu    %%xmm6, 16(%3)  # Store ABGR7 ABGR6 ABGR5 ABGR4                   \n\
00962 punpckhbw %%xmm2, %%xmm1  #                 G7 R7 G6 R6 G5 R5 G4 R4         \n\
00963 punpckhbw %%xmm3, %%xmm0  #                 00 B7 00 B6 00 B5 00 B4         \n\
00964 movdqa    %%xmm1, %%xmm2  #                 R7 00 R6 00 R5 00 R4 00         \n\
00965 punpcklwd %%xmm0, %%xmm1  #                 00 B5 G5 R5 00 B4 G4 R4         \n\
00966 movdqu    %%xmm1, 32(%3)  # Store ABGR11 ABGR10 ABGR9 ABGR8                 \n\
00967 punpckhwd %%xmm0, %%xmm2  #                 B7 G7 R7 00 B6 G6 R6 00         \n\
00968 movdqu    %%xmm2, 48(%3)  # Store ABGR15 ABGR14 ABGR13 ABGR12               \n\
00969 "
00970 
00971 #elif defined(HAVE_SSE2_INTRINSICS)
00972 
00973 /* SSE2 intrinsics */
00974 
00975 #include <emmintrin.h>
00976 
/* Wrapper for a sequence of the SSE2_* intrinsic macros: opens a
 * do { } while(0) scope, declares the eight __m128i locals xmm0..xmm7 that
 * those macros read and assign, and expands SSE2_INSTRUCTIONS inside it.
 * The expansion carries no trailing semicolon; the caller supplies it. */
00977 #define SSE2_CALL(SSE2_INSTRUCTIONS)        \
00978     do {                                    \
00979         __m128i xmm0, xmm1, xmm2, xmm3,     \
00980                 xmm4, xmm5, xmm6, xmm7;     \
00981         SSE2_INSTRUCTIONS                   \
00982     } while(0)
00983 
/* Issues a store fence (_mm_sfence). The *_ALIGNED unpack macros of this
 * branch write with non-temporal _mm_stream_si128 stores; presumably this is
 * invoked once after the conversion to order them -- confirm at call sites. */
00984 #define SSE2_END  _mm_sfence()
00985 
/* Load one batch of input into the working registers: the 64 bits at p_u
 * into the low half of xmm0, the 64 bits at p_v into the low half of xmm1,
 * zero into xmm4, and the 128 bits at p_y into xmm6 using the aligned load
 * (p_y must be 16-byte aligned). p_u, p_v and p_y are taken from the scope
 * in which the macro is expanded. */
00986 #define SSE2_INIT_16_ALIGNED                \
00987     xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
00988     xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
00989     xmm4 = _mm_setzero_si128();             \
00990     xmm6 = _mm_load_si128((__m128i *)p_y);
00991 
/* Same loads as SSE2_INIT_16_ALIGNED, except that the 128 bits at p_y are
 * read with _mm_loadu_si128 (no alignment requirement), and the destination
 * p_buffer is additionally prefetched with the non-temporal hint.
 * p_u, p_v, p_y and p_buffer come from the expansion site. */
00992 #define SSE2_INIT_16_UNALIGNED              \
00993     xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
00994     xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
00995     xmm4 = _mm_setzero_si128();             \
00996     xmm6 = _mm_loadu_si128((__m128i *)p_y); \
00997     _mm_prefetch(p_buffer, _MM_HINT_NTA);
00998 
/* Input load for the 32-bit output paths; the body is the same as
 * SSE2_INIT_16_ALIGNED: 64 bits from p_u into xmm0, 64 bits from p_v into
 * xmm1, xmm4 cleared, and an aligned 128-bit load from p_y into xmm6
 * (p_y must be 16-byte aligned). */
00999 #define SSE2_INIT_32_ALIGNED                \
01000     xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
01001     xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
01002     xmm4 = _mm_setzero_si128();             \
01003     xmm6 = _mm_load_si128((__m128i *)p_y);
01004 
/* Input load for the 32-bit output paths; the body is the same as
 * SSE2_INIT_16_UNALIGNED: 64 bits from p_u into xmm0, 64 bits from p_v into
 * xmm1, xmm4 cleared, an unaligned 128-bit load from p_y into xmm6, and a
 * non-temporal prefetch hint for p_buffer. */
01005 #define SSE2_INIT_32_UNALIGNED              \
01006     xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
01007     xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
01008     xmm4 = _mm_setzero_si128();             \
01009     xmm6 = _mm_loadu_si128((__m128i *)p_y); \
01010     _mm_prefetch(p_buffer, _MM_HINT_NTA);
01011 
/* Multiply stage of the conversion (intrinsics version).
 * Inputs: xmm0, xmm1 and xmm6 as left by an SSE2_INIT_* macro (8 bytes read
 * from p_u, 8 bytes from p_v, 16 bytes from p_y) and xmm4 == 0.
 * Each xmm0/xmm1 byte is widened to 16 bits, has 128 subtracted, is
 * multiplied by 8 and then by a signed 16-bit coefficient keeping the high
 * 16 bits, i.e. an effective factor of coefficient/8192:
 *   xmm0 = (u - 128) *  2.018                     (0x4093)
 *   xmm1 = (v - 128) *  1.596                     (0x3312)
 *   xmm2 = (u - 128) * -0.391 + (v - 128) * -0.813 (0xf37d, 0xe5fc)
 * Each xmm6 byte has 16 subtracted (clamped at 0) and is scaled by about
 * 1.164 (0x253f/8192); xmm6 ends up with the even-indexed samples and xmm7
 * with the odd-indexed ones, as 16-bit lanes. xmm3 and xmm5 are scratch.
 * NOTE(review): these factors match the usual video-range YCbCr->RGB
 * coefficients, so xmm0/xmm1/xmm2 are presumably the blue/red/green chroma
 * terms -- confirm against the callers. */
01012 #define SSE2_YUV_MUL                        \
01013     xmm0 = _mm_unpacklo_epi8(xmm0, xmm4);   /* zero-extend low 8 bytes (xmm4 is 0) */ \
01014     xmm1 = _mm_unpacklo_epi8(xmm1, xmm4);   \
01015     xmm5 = _mm_set1_epi32(0x00800080UL);    /* 128 in every 16-bit lane */ \
01016     xmm0 = _mm_subs_epi16(xmm0, xmm5);      \
01017     xmm1 = _mm_subs_epi16(xmm1, xmm5);      \
01018     xmm0 = _mm_slli_epi16(xmm0, 3);         /* pre-scale by 8 before the high multiply */ \
01019     xmm1 = _mm_slli_epi16(xmm1, 3);         \
01020     xmm2 = xmm0;                            \
01021     xmm3 = xmm1;                            \
01022     xmm5 = _mm_set1_epi32(0xf37df37dUL);    \
01023     xmm2 = _mm_mulhi_epi16(xmm2, xmm5);     \
01024     xmm5 = _mm_set1_epi32(0xe5fce5fcUL);    \
01025     xmm3 = _mm_mulhi_epi16(xmm3, xmm5);     \
01026     xmm5 = _mm_set1_epi32(0x40934093UL);    \
01027     xmm0 = _mm_mulhi_epi16(xmm0, xmm5);     \
01028     xmm1 = _mm_mulhi_epi16(xmm1, xmm5);     \
01029     xmm1 = _mm_mulhi_epi16(xmm1, xmm5);     \
01030     xmm2 = _mm_adds_epi16(xmm2, xmm3);      /* combine the two negative-coefficient terms */ \
01031     /* second half: the 16 bytes loaded from p_y */ \
01032     xmm5 = _mm_set1_epi32(0x10101010UL);    \
01033     xmm6 = _mm_subs_epu8(xmm6, xmm5);       /* subtract 16 per byte, unsigned saturating */ \
01034     xmm7 = xmm6;                            \
01035     xmm5 = _mm_set1_epi32(0x00ff00ffUL);    \
01036     xmm6 = _mm_and_si128(xmm6, xmm5);       /* even-indexed bytes as 16-bit lanes */ \
01037     xmm7 = _mm_srli_epi16(xmm7, 8);         /* odd-indexed bytes as 16-bit lanes */ \
01038     xmm6 = _mm_slli_epi16(xmm6, 3);         \
01039     xmm7 = _mm_slli_epi16(xmm7, 3);         \
01040     xmm5 = _mm_set1_epi32(0x253f253fUL);    \
01041     xmm6 = _mm_mulhi_epi16(xmm6, xmm5);     \
01042     xmm7 = _mm_mulhi_epi16(xmm7, xmm5);
01043 
/* Add stage: combines the outputs of SSE2_YUV_MUL into three vectors of 16
 * unsigned bytes. Each term in xmm0, xmm1, xmm2 is added (signed saturating)
 * to the even-sample values in xmm6 and, through the copies in xmm3, xmm4,
 * xmm5, to the odd-sample values in xmm7, so one 16-bit lane of xmm0..xmm2
 * serves two neighbouring samples. The sums are clamped to 0..255 by
 * _mm_packus_epi16, and the even/odd results are interleaved again, leaving
 * xmm0, xmm1 and xmm2 with their bytes in sample order 0..15. */
01044 #define SSE2_YUV_ADD                        \
01045     xmm3 = xmm0;                            \
01046     xmm4 = xmm1;                            \
01047     xmm5 = xmm2;                            \
01048     xmm0 = _mm_adds_epi16(xmm0, xmm6);      \
01049     xmm3 = _mm_adds_epi16(xmm3, xmm7);      \
01050     xmm1 = _mm_adds_epi16(xmm1, xmm6);      \
01051     xmm4 = _mm_adds_epi16(xmm4, xmm7);      \
01052     xmm2 = _mm_adds_epi16(xmm2, xmm6);      \
01053     xmm5 = _mm_adds_epi16(xmm5, xmm7);      \
01054     /* clamp even-sample results to unsigned bytes */ \
01055     xmm0 = _mm_packus_epi16(xmm0, xmm0);    \
01056     xmm1 = _mm_packus_epi16(xmm1, xmm1);    \
01057     xmm2 = _mm_packus_epi16(xmm2, xmm2);    \
01058     /* clamp odd-sample results to unsigned bytes */ \
01059     xmm3 = _mm_packus_epi16(xmm3, xmm3);    \
01060     xmm4 = _mm_packus_epi16(xmm4, xmm4);    \
01061     xmm5 = _mm_packus_epi16(xmm5, xmm5);    \
01062     /* interleave even and odd bytes back into sample order */ \
01063     xmm0 = _mm_unpacklo_epi8(xmm0, xmm3);   \
01064     xmm1 = _mm_unpacklo_epi8(xmm1, xmm4);   \
01065     xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
01066 
/* Pack the 16-byte component vectors xmm0, xmm1 and xmm2 into sixteen 15-bit
 * (5-5-5) pixels and write them with two non-temporal stores
 * (_mm_stream_si128, which requires 16-byte aligned addresses).
 * Each 16-bit pixel is (xmm1 >> 3) << 10 | (xmm2 >> 3) << 5 | (xmm0 >> 3),
 * taken byte by byte. Clobbers xmm0, xmm1, xmm2, xmm4, xmm5 and xmm7.
 * NOTE(review): the +8 offset assumes p_buffer points to 16-bit pixels, so
 * the second store lands 16 bytes after the first -- confirm at the
 * expansion site. */
01067 #define SSE2_UNPACK_15_ALIGNED                      \
01068     xmm5 = _mm_set1_epi32(0xf8f8f8f8UL);            /* keep the top 5 bits of every byte */ \
01069     xmm0 = _mm_and_si128(xmm0, xmm5);               \
01070     xmm0 = _mm_srli_epi16(xmm0, 3);                 /* bytes now hold 5-bit values in bits 0..4 */ \
01071     xmm2 = _mm_and_si128(xmm2, xmm5);               \
01072     xmm1 = _mm_and_si128(xmm1, xmm5);               \
01073     xmm1 = _mm_srli_epi16(xmm1, 1);                 /* 5-bit values in bits 2..6 (10..14 once in the high byte) */ \
01074     xmm4 = _mm_setzero_si128();                     \
01075     xmm5 = xmm0;                                    \
01076     xmm7 = xmm2;                                    \
01077     /* pixels 0..7 */ \
01078     xmm2 = _mm_unpacklo_epi8(xmm2, xmm4);           \
01079     xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);           \
01080     xmm2 = _mm_slli_epi16(xmm2, 2);                 /* middle field into bits 5..9 */ \
01081     xmm0 = _mm_or_si128(xmm0, xmm2);                \
01082     _mm_stream_si128((__m128i*)p_buffer, xmm0);     \
01083     /* pixels 8..15, from the saved copies */ \
01084     xmm7 = _mm_unpackhi_epi8(xmm7, xmm4);           \
01085     xmm5 = _mm_unpackhi_epi8(xmm5, xmm1);           \
01086     xmm7 = _mm_slli_epi16(xmm7, 2);                 \
01087     xmm5 = _mm_or_si128(xmm5, xmm7);                \
01088     _mm_stream_si128((__m128i*)(p_buffer+8), xmm5);
01089 
/* Pack the 16-byte component vectors xmm0, xmm1 and xmm2 into sixteen 15-bit
 * (5-5-5) pixels and write them to p_buffer with two unaligned stores.
 * Each 16-bit pixel is (xmm1 >> 3) << 10 | (xmm2 >> 3) << 5 | (xmm0 >> 3),
 * taken byte by byte. Clobbers xmm0, xmm1, xmm2, xmm4, xmm5 and xmm7; all of
 * them, and p_buffer, come from the scope in which the macro is expanded.
 *
 * The second store goes to p_buffer+8, directly behind the first eight
 * pixels, exactly as in SSE2_UNPACK_15_ALIGNED and both SSE2_UNPACK_16_*
 * macros. It used to target p_buffer+16, which left pixels 8..15 unwritten
 * and wrote eight pixels beyond the 16-pixel window instead.
 * The offset is counted in elements: p_buffer is expected to point to 16-bit
 * pixels, so +8 is 16 bytes. */
#define SSE2_UNPACK_15_UNALIGNED                    \
    xmm5 = _mm_set1_epi32(0xf8f8f8f8UL);            /* keep the top 5 bits of every byte */ \
    xmm0 = _mm_and_si128(xmm0, xmm5);               \
    xmm0 = _mm_srli_epi16(xmm0, 3);                 /* 5-bit values in bits 0..4 of each byte */ \
    xmm2 = _mm_and_si128(xmm2, xmm5);               \
    xmm1 = _mm_and_si128(xmm1, xmm5);               \
    xmm1 = _mm_srli_epi16(xmm1, 1);                 /* 5-bit values in bits 2..6 of each byte */ \
    xmm4 = _mm_setzero_si128();                     \
    xmm5 = xmm0;                                    \
    xmm7 = xmm2;                                    \
    /* pixels 0..7 */                               \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm4);           \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);           \
    xmm2 = _mm_slli_epi16(xmm2, 2);                 /* middle field into bits 5..9 */ \
    xmm0 = _mm_or_si128(xmm0, xmm2);                \
    _mm_storeu_si128((__m128i*)p_buffer, xmm0);     \
    /* pixels 8..15, from the saved copies */       \
    xmm7 = _mm_unpackhi_epi8(xmm7, xmm4);           \
    xmm5 = _mm_unpackhi_epi8(xmm5, xmm1);           \
    xmm7 = _mm_slli_epi16(xmm7, 2);                 \
    xmm5 = _mm_or_si128(xmm5, xmm7);                \
    _mm_storeu_si128((__m128i*)(p_buffer+8), xmm5);
01112 
/* Pack the 16-byte component vectors xmm0, xmm1 and xmm2 into sixteen 16-bit
 * (5-6-5) pixels and write them with two non-temporal stores
 * (_mm_stream_si128, which requires 16-byte aligned addresses).
 * Each 16-bit pixel is (xmm1 >> 3) << 11 | (xmm2 >> 2) << 5 | (xmm0 >> 3),
 * taken byte by byte. Clobbers xmm0, xmm1, xmm2, xmm4, xmm5 and xmm7.
 * NOTE(review): the +8 offset assumes p_buffer points to 16-bit pixels, so
 * the second store lands 16 bytes after the first -- confirm at the
 * expansion site. */
01113 #define SSE2_UNPACK_16_ALIGNED                      \
01114     xmm5 = _mm_set1_epi32(0xf8f8f8f8UL);            /* 5-bit mask for xmm0 and xmm1 */ \
01115     xmm0 = _mm_and_si128(xmm0, xmm5);               \
01116     xmm1 = _mm_and_si128(xmm1, xmm5);               /* stays in place: becomes bits 11..15 as the high byte */ \
01117     xmm5 = _mm_set1_epi32(0xfcfcfcfcUL);            /* 6-bit mask for xmm2 */ \
01118     xmm2 = _mm_and_si128(xmm2, xmm5);               \
01119     xmm0 = _mm_srli_epi16(xmm0, 3);                 /* 5-bit values in bits 0..4 of each byte */ \
01120     xmm4 = _mm_setzero_si128();                     \
01121     xmm5 = xmm0;                                    \
01122     xmm7 = xmm2;                                    \
01123     /* pixels 0..7 */ \
01124     xmm2 = _mm_unpacklo_epi8(xmm2, xmm4);           \
01125     xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);           \
01126     xmm2 = _mm_slli_epi16(xmm2, 3);                 /* middle field into bits 5..10 */ \
01127     xmm0 = _mm_or_si128(xmm0, xmm2);                \
01128     _mm_stream_si128((__m128i*)p_buffer, xmm0);     \
01129     /* pixels 8..15, from the saved copies */ \
01130     xmm7 = _mm_unpackhi_epi8(xmm7, xmm4);           \
01131     xmm5 = _mm_unpackhi_epi8(xmm5, xmm1);           \
01132     xmm7 = _mm_slli_epi16(xmm7, 3);                 \
01133     xmm5 = _mm_or_si128(xmm5, xmm7);                \
01134     _mm_stream_si128((__m128i*)(p_buffer+8), xmm5);
01135 
/* Pack the 16-byte component vectors xmm0, xmm1 and xmm2 into sixteen 16-bit
 * (5-6-5) pixels and write them with two unaligned stores
 * (_mm_storeu_si128).
 * Each 16-bit pixel is (xmm1 >> 3) << 11 | (xmm2 >> 2) << 5 | (xmm0 >> 3),
 * taken byte by byte. Clobbers xmm0, xmm1, xmm2, xmm4, xmm5 and xmm7.
 * NOTE(review): the +8 offset assumes p_buffer points to 16-bit pixels, so
 * the second store lands 16 bytes after the first -- confirm at the
 * expansion site. */
01136 #define SSE2_UNPACK_16_UNALIGNED                    \
01137     xmm5 = _mm_set1_epi32(0xf8f8f8f8UL);            /* 5-bit mask for xmm0 and xmm1 */ \
01138     xmm0 = _mm_and_si128(xmm0, xmm5);               \
01139     xmm1 = _mm_and_si128(xmm1, xmm5);               /* stays in place: becomes bits 11..15 as the high byte */ \
01140     xmm5 = _mm_set1_epi32(0xfcfcfcfcUL);            /* 6-bit mask for xmm2 */ \
01141     xmm2 = _mm_and_si128(xmm2, xmm5);               \
01142     xmm0 = _mm_srli_epi16(xmm0, 3);                 /* 5-bit values in bits 0..4 of each byte */ \
01143     xmm4 = _mm_setzero_si128();                     \
01144     xmm5 = xmm0;                                    \
01145     xmm7 = xmm2;                                    \
01146     /* pixels 0..7 */ \
01147     xmm2 = _mm_unpacklo_epi8(xmm2, xmm4);           \
01148     xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);           \
01149     xmm2 = _mm_slli_epi16(xmm2, 3);                 /* middle field into bits 5..10 */ \
01150     xmm0 = _mm_or_si128(xmm0, xmm2);                \
01151     _mm_storeu_si128((__m128i*)p_buffer, xmm0);     \
01152     /* pixels 8..15, from the saved copies */ \
01153     xmm7 = _mm_unpackhi_epi8(xmm7, xmm4);           \
01154     xmm5 = _mm_unpackhi_epi8(xmm5, xmm1);           \
01155     xmm7 = _mm_slli_epi16(xmm7, 3);                 \
01156     xmm5 = _mm_or_si128(xmm5, xmm7);                \
01157     _mm_storeu_si128((__m128i*)(p_buffer+8), xmm5);
01158 
/* Interleave the byte vectors xmm0, xmm2, xmm1 and a zero byte into sixteen
 * 4-byte pixels whose bytes in memory are, in order: xmm0, xmm2, xmm1, 0x00.
 * The pixels are written with four non-temporal stores (_mm_stream_si128,
 * which requires 16-byte aligned addresses).
 * Clobbers xmm0, xmm1 and xmm3..xmm6.
 * NOTE(review): the +4/+8/+12 offsets assume p_buffer points to 32-bit
 * pixels, and the ARGB name suggests xmm0/xmm2/xmm1 carry blue/green/red --
 * confirm at the expansion site. */
01159 #define SSE2_UNPACK_32_ARGB_ALIGNED                 \
01160     xmm3 = _mm_setzero_si128();                     \
01161     xmm4 = xmm0;                                    \
01162     xmm4 = _mm_unpacklo_epi8(xmm4, xmm2);           /* 16-bit lanes: xmm0 low byte, xmm2 high byte */ \
01163     xmm5 = xmm1;                                    \
01164     xmm5 = _mm_unpacklo_epi8(xmm5, xmm3);           /* 16-bit lanes: xmm1 low byte, zero high byte */ \
01165     xmm6 = xmm4;                                    \
01166     xmm4 = _mm_unpacklo_epi16(xmm4, xmm5);          \
01167     _mm_stream_si128((__m128i*)(p_buffer), xmm4);   /* pixels 0..3 */ \
01168     xmm6 = _mm_unpackhi_epi16(xmm6, xmm5);          \
01169     _mm_stream_si128((__m128i*)(p_buffer+4), xmm6); /* pixels 4..7 */ \
01170     xmm0 = _mm_unpackhi_epi8(xmm0, xmm2);           \
01171     xmm1 = _mm_unpackhi_epi8(xmm1, xmm3);           \
01172     xmm5 = xmm0;                                    \
01173     xmm5 = _mm_unpacklo_epi16(xmm5, xmm1);          \
01174     _mm_stream_si128((__m128i*)(p_buffer+8), xmm5); /* pixels 8..11 */ \
01175     xmm0 = _mm_unpackhi_epi16(xmm0, xmm1);          \
01176     _mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
01177 
/* Interleave the byte vectors xmm0, xmm2, xmm1 and a zero byte into sixteen
 * 4-byte pixels whose bytes in memory are, in order: xmm0, xmm2, xmm1, 0x00.
 * The pixels are written with four unaligned stores (_mm_storeu_si128).
 * Clobbers xmm0, xmm1 and xmm3..xmm6.
 * NOTE(review): the +4/+8/+12 offsets assume p_buffer points to 32-bit
 * pixels, and the ARGB name suggests xmm0/xmm2/xmm1 carry blue/green/red --
 * confirm at the expansion site. */
01178 #define SSE2_UNPACK_32_ARGB_UNALIGNED               \
01179     xmm3 = _mm_setzero_si128();                     \
01180     xmm4 = xmm0;                                    \
01181     xmm4 = _mm_unpacklo_epi8(xmm4, xmm2);           /* 16-bit lanes: xmm0 low byte, xmm2 high byte */ \
01182     xmm5 = xmm1;                                    \
01183     xmm5 = _mm_unpacklo_epi8(xmm5, xmm3);           /* 16-bit lanes: xmm1 low byte, zero high byte */ \
01184     xmm6 = xmm4;                                    \
01185     xmm4 = _mm_unpacklo_epi16(xmm4, xmm5);          \
01186     _mm_storeu_si128((__m128i*)(p_buffer), xmm4);   /* pixels 0..3 */ \
01187     xmm6 = _mm_unpackhi_epi16(xmm6, xmm5);          \
01188     _mm_storeu_si128((__m128i*)(p_buffer+4), xmm6); /* pixels 4..7 */ \
01189     xmm0 = _mm_unpackhi_epi8(xmm0, xmm2);           \
01190     xmm1 = _mm_unpackhi_epi8(xmm1, xmm3);           \
01191     xmm5 = xmm0;                                    \
01192     xmm5 = _mm_unpacklo_epi16(xmm5, xmm1);          \
01193     _mm_storeu_si128((__m128i*)(p_buffer+8), xmm5); /* pixels 8..11 */ \
01194     xmm0 = _mm_unpackhi_epi16(xmm0, xmm1);          \
01195     _mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
01196 
/* Interleave a zero byte and the byte vectors xmm0, xmm2, xmm1 into sixteen
 * 4-byte pixels whose bytes in memory are, in order: 0x00, xmm0, xmm2, xmm1.
 * The pixels are written with four non-temporal stores (_mm_stream_si128,
 * which requires 16-byte aligned addresses).
 * Clobbers xmm0, xmm2 and xmm3..xmm6.
 * NOTE(review): the +4/+8/+12 offsets assume p_buffer points to 32-bit
 * pixels, and the RGBA name suggests xmm0/xmm2/xmm1 carry blue/green/red --
 * confirm at the expansion site. */
01197 #define SSE2_UNPACK_32_RGBA_ALIGNED                 \
01198     xmm3 = _mm_setzero_si128();                     \
01199     xmm4 = xmm2;                                    \
01200     xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);           /* 16-bit lanes: xmm2 low byte, xmm1 high byte */ \
01201     xmm3 = _mm_unpacklo_epi8(xmm3, xmm0);           /* 16-bit lanes: zero low byte, xmm0 high byte */ \
01202     xmm5 = xmm3;                                    \
01203     xmm3 = _mm_unpacklo_epi16(xmm3, xmm4);          \
01204     _mm_stream_si128((__m128i*)(p_buffer), xmm3);   /* pixels 0..3 */ \
01205     xmm5 = _mm_unpackhi_epi16(xmm5, xmm4);          \
01206     _mm_stream_si128((__m128i*)(p_buffer+4), xmm5); /* pixels 4..7 */ \
01207     xmm6 = _mm_setzero_si128();                     \
01208     xmm2 = _mm_unpackhi_epi8(xmm2, xmm1);           \
01209     xmm6 = _mm_unpackhi_epi8(xmm6, xmm0);           \
01210     xmm0 = xmm6;                                    \
01211     xmm6 = _mm_unpacklo_epi16(xmm6, xmm2);          \
01212     _mm_stream_si128((__m128i*)(p_buffer+8), xmm6); /* pixels 8..11 */ \
01213     xmm0 = _mm_unpackhi_epi16(xmm0, xmm2);          \
01214     _mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
01215 
/* Interleave a zero byte and the byte vectors xmm0, xmm2, xmm1 into sixteen
 * 4-byte pixels whose bytes in memory are, in order: 0x00, xmm0, xmm2, xmm1.
 * The pixels are written with four unaligned stores (_mm_storeu_si128).
 * Clobbers xmm0, xmm2 and xmm3..xmm6.
 * NOTE(review): the +4/+8/+12 offsets assume p_buffer points to 32-bit
 * pixels, and the RGBA name suggests xmm0/xmm2/xmm1 carry blue/green/red --
 * confirm at the expansion site. */
01216 #define SSE2_UNPACK_32_RGBA_UNALIGNED               \
01217     xmm3 = _mm_setzero_si128();                     \
01218     xmm4 = xmm2;                                    \
01219     xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);           /* 16-bit lanes: xmm2 low byte, xmm1 high byte */ \
01220     xmm3 = _mm_unpacklo_epi8(xmm3, xmm0);           /* 16-bit lanes: zero low byte, xmm0 high byte */ \
01221     xmm5 = xmm3;                                    \
01222     xmm3 = _mm_unpacklo_epi16(xmm3, xmm4);          \
01223     _mm_storeu_si128((__m128i*)(p_buffer), xmm3);   /* pixels 0..3 */ \
01224     xmm5 = _mm_unpackhi_epi16(xmm5, xmm4);          \
01225     _mm_storeu_si128((__m128i*)(p_buffer+4), xmm5); /* pixels 4..7 */ \
01226     xmm6 = _mm_setzero_si128();                     \
01227     xmm2 = _mm_unpackhi_epi8(xmm2, xmm1);           \
01228     xmm6 = _mm_unpackhi_epi8(xmm6, xmm0);           \
01229     xmm0 = xmm6;                                    \
01230     xmm6 = _mm_unpacklo_epi16(xmm6, xmm2);          \
01231     _mm_storeu_si128((__m128i*)(p_buffer+8), xmm6); /* pixels 8..11 */ \
01232     xmm0 = _mm_unpackhi_epi16(xmm0, xmm2);          \
01233     _mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
01234 
/* Interleave a zero byte and the byte vectors xmm1, xmm2, xmm0 into sixteen
 * 4-byte pixels whose bytes in memory are, in order: 0x00, xmm1, xmm2, xmm0.
 * The pixels are written with four non-temporal stores (_mm_stream_si128,
 * which requires 16-byte aligned addresses).
 * Clobbers xmm0, xmm2 and xmm3..xmm6.
 * NOTE(review): the +4/+8/+12 offsets assume p_buffer points to 32-bit
 * pixels, and the BGRA name suggests xmm0/xmm2/xmm1 carry blue/green/red --
 * confirm at the expansion site. */
01235 #define SSE2_UNPACK_32_BGRA_ALIGNED                 \
01236     xmm3 = _mm_setzero_si128();                     \
01237     xmm4 = xmm2;                                    \
01238     xmm4 = _mm_unpacklo_epi8(xmm4, xmm0);           /* 16-bit lanes: xmm2 low byte, xmm0 high byte */ \
01239     xmm3 = _mm_unpacklo_epi8(xmm3, xmm1);           /* 16-bit lanes: zero low byte, xmm1 high byte */ \
01240     xmm5 = xmm3;                                    \
01241     xmm3 = _mm_unpacklo_epi16(xmm3, xmm4);          \
01242     _mm_stream_si128((__m128i*)(p_buffer), xmm3);   /* pixels 0..3 */ \
01243     xmm5 = _mm_unpackhi_epi16(xmm5, xmm4);          \
01244     _mm_stream_si128((__m128i*)(p_buffer+4), xmm5); /* pixels 4..7 */ \
01245     xmm6 = _mm_setzero_si128();                     \
01246     xmm2 = _mm_unpackhi_epi8(xmm2, xmm0);           /* last read of the original xmm0 */ \
01247     xmm6 = _mm_unpackhi_epi8(xmm6, xmm1);           \
01248     xmm0 = xmm6;                                    \
01249     xmm6 = _mm_unpacklo_epi16(xmm6, xmm2);          \
01250     _mm_stream_si128((__m128i*)(p_buffer+8), xmm6); /* pixels 8..11 */ \
01251     xmm0 = _mm_unpackhi_epi16(xmm0, xmm2);          \
01252     _mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
01253 
/* Interleave a zero byte and the byte vectors xmm1, xmm2, xmm0 into sixteen
 * 4-byte pixels whose bytes in memory are, in order: 0x00, xmm1, xmm2, xmm0.
 * The pixels are written with four unaligned stores (_mm_storeu_si128).
 * Clobbers xmm0, xmm2 and xmm3..xmm6.
 * NOTE(review): the +4/+8/+12 offsets assume p_buffer points to 32-bit
 * pixels, and the BGRA name suggests xmm0/xmm2/xmm1 carry blue/green/red --
 * confirm at the expansion site. */
01254 #define SSE2_UNPACK_32_BGRA_UNALIGNED               \
01255     xmm3 = _mm_setzero_si128();                     \
01256     xmm4 = xmm2;                                    \
01257     xmm4 = _mm_unpacklo_epi8(xmm4, xmm0);           /* 16-bit lanes: xmm2 low byte, xmm0 high byte */ \
01258     xmm3 = _mm_unpacklo_epi8(xmm3, xmm1);           /* 16-bit lanes: zero low byte, xmm1 high byte */ \
01259     xmm5 = xmm3;                                    \
01260     xmm3 = _mm_unpacklo_epi16(xmm3, xmm4);          \
01261     _mm_storeu_si128((__m128i*)(p_buffer), xmm3);   /* pixels 0..3 */ \
01262     xmm5 = _mm_unpackhi_epi16(xmm5, xmm4);          \
01263     _mm_storeu_si128((__m128i*)(p_buffer+4), xmm5); /* pixels 4..7 */ \
01264     xmm6 = _mm_setzero_si128();                     \
01265     xmm2 = _mm_unpackhi_epi8(xmm2, xmm0);           /* last read of the original xmm0 */ \
01266     xmm6 = _mm_unpackhi_epi8(xmm6, xmm1);           \
01267     xmm0 = xmm6;                                    \
01268     xmm6 = _mm_unpacklo_epi16(xmm6, xmm2);          \
01269     _mm_storeu_si128((__m128i*)(p_buffer+8), xmm6); /* pixels 8..11 */ \
01270     xmm0 = _mm_unpackhi_epi16(xmm0, xmm2);          \
01271     _mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
01272 
/* Interleave the byte vectors xmm1, xmm2, xmm0 and a zero byte into sixteen
 * 4-byte pixels whose bytes in memory are, in order: xmm1, xmm2, xmm0, 0x00.
 * The pixels are written with four non-temporal stores (_mm_stream_si128,
 * which requires 16-byte aligned addresses).
 * Clobbers xmm0, xmm1, xmm2 and xmm3..xmm6.
 * NOTE(review): the +4/+8/+12 offsets assume p_buffer points to 32-bit
 * pixels, and the ABGR name suggests xmm0/xmm2/xmm1 carry blue/green/red --
 * confirm at the expansion site. */
01273 #define SSE2_UNPACK_32_ABGR_ALIGNED                 \
01274     xmm3 = _mm_setzero_si128();                     \
01275     xmm4 = xmm1;                                    \
01276     xmm4 = _mm_unpacklo_epi8(xmm4, xmm2);           /* 16-bit lanes: xmm1 low byte, xmm2 high byte */ \
01277     xmm5 = xmm0;                                    \
01278     xmm5 = _mm_unpacklo_epi8(xmm5, xmm3);           /* 16-bit lanes: xmm0 low byte, zero high byte */ \
01279     xmm6 = xmm4;                                    \
01280     xmm4 = _mm_unpacklo_epi16(xmm4, xmm5);          \
01281     _mm_stream_si128((__m128i*)(p_buffer), xmm4);   /* pixels 0..3 */ \
01282     xmm6 = _mm_unpackhi_epi16(xmm6, xmm5);          \
01283     _mm_stream_si128((__m128i*)(p_buffer+4), xmm6); /* pixels 4..7 */ \
01284     xmm1 = _mm_unpackhi_epi8(xmm1, xmm2);           /* last read of the original xmm2 */ \
01285     xmm0 = _mm_unpackhi_epi8(xmm0, xmm3);           \
01286     xmm2 = xmm1;                                    \
01287     xmm1 = _mm_unpacklo_epi16(xmm1, xmm0);          \
01288     _mm_stream_si128((__m128i*)(p_buffer+8), xmm1); /* pixels 8..11 */ \
01289     xmm2 = _mm_unpackhi_epi16(xmm2, xmm0);          \
01290     _mm_stream_si128((__m128i*)(p_buffer+12), xmm2);
01291 
/* Interleave the byte vectors xmm1, xmm2, xmm0 and a zero byte into sixteen
 * 4-byte pixels whose bytes in memory are, in order: xmm1, xmm2, xmm0, 0x00.
 * The pixels are written with four unaligned stores (_mm_storeu_si128).
 * Clobbers xmm0, xmm1, xmm2 and xmm3..xmm6.
 * NOTE(review): the +4/+8/+12 offsets assume p_buffer points to 32-bit
 * pixels, and the ABGR name suggests xmm0/xmm2/xmm1 carry blue/green/red --
 * confirm at the expansion site. */
01292 #define SSE2_UNPACK_32_ABGR_UNALIGNED               \
01293     xmm3 = _mm_setzero_si128();                     \
01294     xmm4 = xmm1;                                    \
01295     xmm4 = _mm_unpacklo_epi8(xmm4, xmm2);           /* 16-bit lanes: xmm1 low byte, xmm2 high byte */ \
01296     xmm5 = xmm0;                                    \
01297     xmm5 = _mm_unpacklo_epi8(xmm5, xmm3);           /* 16-bit lanes: xmm0 low byte, zero high byte */ \
01298     xmm6 = xmm4;                                    \
01299     xmm4 = _mm_unpacklo_epi16(xmm4, xmm5);          \
01300     _mm_storeu_si128((__m128i*)(p_buffer), xmm4);   /* pixels 0..3 */ \
01301     xmm6 = _mm_unpackhi_epi16(xmm6, xmm5);          \
01302     _mm_storeu_si128((__m128i*)(p_buffer+4), xmm6); /* pixels 4..7 */ \
01303     xmm1 = _mm_unpackhi_epi8(xmm1, xmm2);           /* last read of the original xmm2 */ \
01304     xmm0 = _mm_unpackhi_epi8(xmm0, xmm3);           \
01305     xmm2 = xmm1;                                    \
01306     xmm1 = _mm_unpacklo_epi16(xmm1, xmm0);          \
01307     _mm_storeu_si128((__m128i*)(p_buffer+8), xmm1); /* pixels 8..11 */ \
01308     xmm2 = _mm_unpackhi_epi16(xmm2, xmm0);          \
01309     _mm_storeu_si128((__m128i*)(p_buffer+12), xmm2);
01310 
01311 #endif
01312 
01313 #endif

Generated on Tue May 25 08:04:59 2010 for VLC by  doxygen 1.5.6