00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027 #ifdef MODULE_NAME_IS_i420_rgb_mmx
00028
00029
/*
 * 64-bit constants consumed by the MMX conversion macros in this file.
 *
 * USED_U64(foo) declares a file-local `const uint64_t foo` whose assembler
 * name is forced to the identifier itself via __asm__(#foo). With GCC >= 3.3
 * the object is tagged "used"; older compilers get "unused" instead.
 * The helper macro is undefined again once the constants are declared.
 *
 * How the constants are consumed by the macros in this file:
 *   mmx_80w      subtracted from the widened Cb/Cr words
 *   mmx_10w      subtracted (unsigned, saturating) from the Y bytes
 *   mmx_00ffw    AND mask selecting the even Y bytes
 *   mmx_Y_coeff  multiplier applied to the luma words
 *   mmx_U_green, mmx_U_blue   multipliers applied to Cb
 *   mmx_V_red, mmx_V_green    multipliers applied to Cr
 *   mmx_mask_f8, mmx_mask_fc  per-byte masks used when packing 15/16-bit pixels
 */
00030 #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
00031 #define USED_U64(foo) \
00032 static const uint64_t foo __asm__ (#foo) __attribute__((used))
00033 #else
00034 #define USED_U64(foo) \
00035 static const uint64_t foo __asm__ (#foo) __attribute__((unused))
00036 #endif
00037 USED_U64(mmx_80w) = 0x0080008000800080ULL;
00038 USED_U64(mmx_10w) = 0x1010101010101010ULL;
00039 USED_U64(mmx_00ffw) = 0x00ff00ff00ff00ffULL;
00040 USED_U64(mmx_Y_coeff) = 0x253f253f253f253fULL;
00041
00042 USED_U64(mmx_U_green) = 0xf37df37df37df37dULL;
00043 USED_U64(mmx_U_blue) = 0x4093409340934093ULL;
00044 USED_U64(mmx_V_red) = 0x3312331233123312ULL;
00045 USED_U64(mmx_V_green) = 0xe5fce5fce5fce5fcULL;
00046
00047 USED_U64(mmx_mask_f8) = 0xf8f8f8f8f8f8f8f8ULL;
00048 USED_U64(mmx_mask_fc) = 0xfcfcfcfcfcfcfcfcULL;
00049 #undef USED_U64
00050
00051 #if defined(CAN_COMPILE_MMX)
00052
00053
00054
/*
 * Emit one inline-asm statement made of the concatenated MMX instruction
 * strings passed as MMX_INSTRUCTIONS.
 *
 * Operand numbering seen by the instruction strings:
 *   %0 = p_y, %1 = p_u, %2 = p_v, %3 = p_buffer          (registers)
 *   %4 = mmx_80w,     %5 = mmx_10w,    %6 = mmx_00ffw,
 *   %7 = mmx_Y_coeff, %8 = mmx_U_green, %9 = mmx_U_blue,
 *   %10 = mmx_V_red,  %11 = mmx_V_green,
 *   %12 = mmx_mask_f8, %13 = mmx_mask_fc                 (memory)
 *
 * The caller must have p_y, p_u, p_v and p_buffer in scope.
 *
 * The unpack strings store through (%3), but p_buffer is only an input
 * operand, so the statement carries a "memory" clobber to tell the compiler
 * that memory is written behind its back.
 *
 * NOTE(review): the MMX registers touched by the instruction strings are not
 * listed as clobbers - confirm whether that is intentional.
 */
00055 #define MMX_CALL(MMX_INSTRUCTIONS) \
00056 do { \
00057 __asm__ __volatile__( \
00058 ".p2align 3 \n\t" \
00059 MMX_INSTRUCTIONS \
00060 : \
00061 : "r" (p_y), "r" (p_u), \
00062 "r" (p_v), "r" (p_buffer), \
00063 "m" (mmx_80w), "m" (mmx_10w), \
00064 "m" (mmx_00ffw), "m" (mmx_Y_coeff), \
00065 "m" (mmx_U_green), "m" (mmx_U_blue), \
00066 "m" (mmx_V_red), "m" (mmx_V_green), \
00067 "m" (mmx_mask_f8), "m" (mmx_mask_fc) : "memory" ); \
00068 } while(0)
00069
/*
 * Issues a single `emms` instruction.
 * Presumably meant to run once after the last MMX_CALL - confirm with callers.
 */
00070 #define MMX_END __asm__ __volatile__ ( "emms" )
00071
/*
 * Instruction string: load the inputs for one 8-pixel step.
 *   mm0 <- 4 Cb bytes from (%1), mm1 <- 4 Cr bytes from (%2),
 *   mm4 <- 0, mm6 <- 8 Y bytes from (%0).
 */
00072 #define MMX_INIT_16 " \n\
00073 movd (%1), %%mm0 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\
00074 movd (%2), %%mm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\
00075 pxor %%mm4, %%mm4 # zero mm4 \n\
00076 movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00077 "
00078
/*
 * Instruction string: luma-only variant of the 16-bit init.
 *   mm6 <- 8 Y bytes from (%0).
 * The second line starts with '#', the same character the other strings use
 * to introduce assembler comments, so the store to (%3) appears to be
 * disabled - confirm.
 */
00079 #define MMX_INIT_16_GRAY " \n\
00080 movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00081 #movl $0, (%3) # cache preload for image \n\
00082 "
00083
/*
 * Instruction string: load the inputs for one 8-pixel step (32-bit output).
 *   mm0 <- 4 Cb bytes from (%1), mm1 <- 4 Cr bytes from (%2),
 *   mm4 <- 0, mm6 <- 8 Y bytes from (%0).
 * Additionally writes a 32-bit zero to (%3); the in-string comment describes
 * this as a cache preload of the destination.
 */
00084 #define MMX_INIT_32 " \n\
00085 movd (%1), %%mm0 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\
00086 movl $0, (%3) # cache preload for image \n\
00087 movd (%2), %%mm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\
00088 pxor %%mm4, %%mm4 # zero mm4 \n\
00089 movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00090 "
00091
00092
00093
00094
00095
00096
00097
00098
00099
/*
 * Instruction string: multiply stage of the YUV -> RGB conversion.
 *
 * Expects: mm0 = 4 Cb bytes, mm1 = 4 Cr bytes, mm4 = 0, mm6 = 8 Y bytes.
 * Uses memory operands %4 (mmx_80w), %5 (mmx_10w), %6 (mmx_00ffw),
 * %7 (mmx_Y_coeff), %8 (mmx_U_green), %9 (mmx_U_blue), %10 (mmx_V_red),
 * %11 (mmx_V_green).
 *
 * Leaves: mm0 = blue chroma term, mm1 = red chroma term,
 *         mm2 = green chroma term (Cb part + Cr part),
 *         mm6 = scaled even-indexed Y, mm7 = scaled odd-indexed Y.
 * mm3 is used as scratch.
 */
00100 #define MMX_YUV_MUL " \n\
00101 # convert the chroma part \n\
00102 punpcklbw %%mm4, %%mm0 # scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 \n\
00103 punpcklbw %%mm4, %%mm1 # scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 \n\
00104 psubsw %4, %%mm0 # Cb -= 128 \n\
00105 psubsw %4, %%mm1 # Cr -= 128 \n\
00106 psllw $3, %%mm0 # Promote precision \n\
00107 psllw $3, %%mm1 # Promote precision \n\
00108 movq %%mm0, %%mm2 # Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 \n\
00109 movq %%mm1, %%mm3 # Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 \n\
00110 pmulhw %8, %%mm2 # Mul Cb with green coeff -> Cb green \n\
00111 pmulhw %11, %%mm3 # Mul Cr with green coeff -> Cr green \n\
00112 pmulhw %9, %%mm0 # Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 \n\
00113 pmulhw %10, %%mm1 # Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 \n\
00114 paddsw %%mm3, %%mm2 # Cb green + Cr green -> Cgreen \n\
00115 \n\
00116 # convert the luma part \n\
00117 psubusb %5, %%mm6 # Y -= 16 \n\
00118 movq %%mm6, %%mm7 # Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00119 pand %6, %%mm6 # get Y even 00 Y6 00 Y4 00 Y2 00 Y0 \n\
00120 psrlw $8, %%mm7 # get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 \n\
00121 psllw $3, %%mm6 # Promote precision \n\
00122 psllw $3, %%mm7 # Promote precision \n\
00123 pmulhw %7, %%mm6 # Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 \n\
00124 pmulhw %7, %%mm7 # Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 \n\
00125 "
00126
00127
00128
00129
00130
00131
00132
00133
00134
/*
 * Instruction string: add stage of the YUV -> RGB conversion.
 *
 * Expects the register layout left by MMX_YUV_MUL (mm0/mm1/mm2 = blue/red/
 * green chroma terms, mm6/mm7 = even/odd luma).
 * Adds luma to each chroma term with signed saturation, packs the words to
 * unsigned bytes (saturating to 0..255) and interleaves even and odd pixels.
 *
 * Leaves: mm0 = B7..B0, mm1 = R7..R0, mm2 = G7..G0.
 * mm3, mm4 and mm5 are used as scratch.
 */
00135 #define MMX_YUV_ADD " \n\
00136 # Do horizontal and vertical scaling \n\
00137 movq %%mm0, %%mm3 # Copy Cblue \n\
00138 movq %%mm1, %%mm4 # Copy Cred \n\
00139 movq %%mm2, %%mm5 # Copy Cgreen \n\
00140 paddsw %%mm6, %%mm0 # Y even + Cblue 00 B6 00 B4 00 B2 00 B0 \n\
00141 paddsw %%mm7, %%mm3 # Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 \n\
00142 paddsw %%mm6, %%mm1 # Y even + Cred 00 R6 00 R4 00 R2 00 R0 \n\
00143 paddsw %%mm7, %%mm4 # Y odd + Cred 00 R7 00 R5 00 R3 00 R1 \n\
00144 paddsw %%mm6, %%mm2 # Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 \n\
00145 paddsw %%mm7, %%mm5 # Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 \n\
00146 \n\
00147 # Limit RGB even to 0..255 \n\
00148 packuswb %%mm0, %%mm0 # B6 B4 B2 B0 / B6 B4 B2 B0 \n\
00149 packuswb %%mm1, %%mm1 # R6 R4 R2 R0 / R6 R4 R2 R0 \n\
00150 packuswb %%mm2, %%mm2 # G6 G4 G2 G0 / G6 G4 G2 G0 \n\
00151 \n\
00152 # Limit RGB odd to 0..255 \n\
00153 packuswb %%mm3, %%mm3 # B7 B5 B3 B1 / B7 B5 B3 B1 \n\
00154 packuswb %%mm4, %%mm4 # R7 R5 R3 R1 / R7 R5 R3 R1 \n\
00155 packuswb %%mm5, %%mm5 # G7 G5 G3 G1 / G7 G5 G3 G1 \n\
00156 \n\
00157 # Interleave RGB even and odd \n\
00158 punpcklbw %%mm3, %%mm0 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
00159 punpcklbw %%mm4, %%mm1 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
00160 punpcklbw %%mm5, %%mm2 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
00161 "
00162
00163
00164
00165
00166
/*
 * Instruction string: luma-only conversion (no chroma terms).
 *
 * Expects: mm6 = 8 Y bytes.
 * Applies the same luma steps as MMX_YUV_MUL (subtract %5, split even/odd
 * with %6, shift left by 3, multiply by %7), then packs both halves to
 * unsigned bytes and interleaves them.
 *
 * Leaves: mm6 = 8 converted luma bytes. mm7 is used as scratch.
 */
00167 #define MMX_YUV_GRAY " \n\
00168 # convert the luma part \n\
00169 psubusb %5, %%mm6 \n\
00170 movq %%mm6, %%mm7 \n\
00171 pand %6, %%mm6 \n\
00172 psrlw $8, %%mm7 \n\
00173 psllw $3, %%mm6 \n\
00174 psllw $3, %%mm7 \n\
00175 pmulhw %7, %%mm6 \n\
00176 pmulhw %7, %%mm7 \n\
00177 packuswb %%mm6, %%mm6 \n\
00178 packuswb %%mm7, %%mm7 \n\
00179 punpcklbw %%mm7, %%mm6 \n\
00180 "
00181
/*
 * Instruction string: expand 8 gray bytes in mm6 into eight 16-bit pixels.
 *
 * The gray value is masked with %12 (mmx_mask_f8) for the outer two fields
 * and with %13 (mmx_mask_fc) for the middle field, mirroring the masks
 * MMX_UNPACK_16 applies to blue/red and green respectively.
 * Stores 16 bytes: pixels 0-3 at (%3) and pixels 4-7 at 8(%3).
 * Also reloads mm6 with the next 8 Y bytes from 8(%0).
 * mm0, mm2, mm3, mm5 and mm7 are used as scratch.
 */
00182 #define MMX_UNPACK_16_GRAY " \n\
00183 movq %%mm6, %%mm5 \n\
00184 pand %12, %%mm6 \n\
00185 pand %13, %%mm5 \n\
00186 movq %%mm6, %%mm7 \n\
00187 psrlw $3, %%mm7 \n\
00188 pxor %%mm3, %%mm3 \n\
00189 movq %%mm7, %%mm2 \n\
00190 movq %%mm5, %%mm0 \n\
00191 punpcklbw %%mm3, %%mm5 \n\
00192 punpcklbw %%mm6, %%mm7 \n\
00193 psllw $3, %%mm5 \n\
00194 por %%mm5, %%mm7 \n\
00195 movq %%mm7, (%3) \n\
00196 punpckhbw %%mm3, %%mm0 \n\
00197 punpckhbw %%mm6, %%mm2 \n\
00198 psllw $3, %%mm0 \n\
00199 movq 8(%0), %%mm6 \n\
00200 por %%mm0, %%mm2 \n\
00201 movq %%mm2, 8(%3) \n\
00202 "
00203
00204
00205
00206
00207
00208
00209
00210
00211
/*
 * Instruction string: pack 8 pixels into 15-bit RGB (5 bits per component).
 *
 * Expects: mm0 = B7..B0, mm1 = R7..R0, mm2 = G7..G0 (bytes).
 * All three components are masked with %12 (mmx_mask_f8).
 * Stores 16 bytes: pixels 0-3 at (%3) and pixels 4-7 at 8(%3).
 *
 * Interleaved with the packing, it also loads data for a following step:
 * mm6 <- 8 Y bytes from 8(%0), mm0 <- 4 Cb bytes from 4(%1),
 * mm1 <- 4 Cr bytes from 4(%2).
 * mm4 is zeroed; mm2, mm5 and mm7 are used as scratch.
 */
00212 #define MMX_UNPACK_15 " \n\
00213 # mask unneeded bits off \n\
00214 pand %12, %%mm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
00215 psrlw $3,%%mm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
00216 pand %12, %%mm2 # g7g6g5g4 g3______ g7g6g5g4 g3______ \n\
00217 pand %12, %%mm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
00218 psrlw $1,%%mm1 # __r7r6r5 r4r3____ __r7r6r5 r4r3____ \n\
00219 pxor %%mm4, %%mm4 # zero mm4 \n\
00220 movq %%mm0, %%mm5 # Copy B7-B0 \n\
00221 movq %%mm2, %%mm7 # Copy G7-G0 \n\
00222 \n\
00223 # convert rgb24 plane to rgb15 pack for pixel 0-3 \n\
00224 punpcklbw %%mm4, %%mm2 # ________ ________ g7g6g5g4 g3______ \n\
00225 punpcklbw %%mm1, %%mm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
00226 psllw $2,%%mm2 # ________ ____g7g6 g5g4g3__ ________ \n\
00227 por %%mm2, %%mm0 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
00228 movq 8(%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00229 movq %%mm0, (%3) # store pixel 0-3 \n\
00230 \n\
00231 # convert rgb24 plane to rgb16 pack for pixel 0-3 \n\
00232 punpckhbw %%mm4, %%mm7 # ________ ________ g7g6g5g4 g3______ \n\
00233 punpckhbw %%mm1, %%mm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
00234 psllw $2,%%mm7 # ________ ____g7g6 g5g4g3__ ________ \n\
00235 movd 4(%1), %%mm0 # Load 4 Cb __ __ __ __ u3 u2 u1 u0 \n\
00236 por %%mm7, %%mm5 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
00237 movd 4(%2), %%mm1 # Load 4 Cr __ __ __ __ v3 v2 v1 v0 \n\
00238 movq %%mm5, 8(%3) # store pixel 4-7 \n\
00239 "
00240
00241
00242
00243
00244
00245
00246
00247
/*
 * Instruction string: pack 8 pixels into 16-bit RGB (5/6/5 bits).
 *
 * Expects: mm0 = B7..B0, mm1 = R7..R0, mm2 = G7..G0 (bytes).
 * Blue and red are masked with %12 (mmx_mask_f8), green with %13
 * (mmx_mask_fc).
 * Stores 16 bytes: pixels 0-3 at (%3) and pixels 4-7 at 8(%3).
 *
 * Interleaved with the packing, it also loads data for a following step:
 * mm6 <- 8 Y bytes from 8(%0), mm0 <- 4 Cb bytes from 4(%1),
 * mm1 <- 4 Cr bytes from 4(%2).
 * mm4 is zeroed; mm2, mm5 and mm7 are used as scratch.
 */
00248 #define MMX_UNPACK_16 " \n\
00249 # mask unneeded bits off \n\
00250 pand %12, %%mm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
00251 pand %13, %%mm2 # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____ \n\
00252 pand %12, %%mm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
00253 psrlw $3,%%mm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
00254 pxor %%mm4, %%mm4 # zero mm4 \n\
00255 movq %%mm0, %%mm5 # Copy B7-B0 \n\
00256 movq %%mm2, %%mm7 # Copy G7-G0 \n\
00257 \n\
00258 # convert rgb24 plane to rgb16 pack for pixel 0-3 \n\
00259 punpcklbw %%mm4, %%mm2 # ________ ________ g7g6g5g4 g3g2____ \n\
00260 punpcklbw %%mm1, %%mm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
00261 psllw $3,%%mm2 # ________ __g7g6g5 g4g3g2__ ________ \n\
00262 por %%mm2, %%mm0 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
00263 movq 8(%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00264 movq %%mm0, (%3) # store pixel 0-3 \n\
00265 \n\
00266 # convert rgb24 plane to rgb16 pack for pixel 0-3 \n\
00267 punpckhbw %%mm4, %%mm7 # ________ ________ g7g6g5g4 g3g2____ \n\
00268 punpckhbw %%mm1, %%mm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
00269 psllw $3,%%mm7 # ________ __g7g6g5 g4g3g2__ ________ \n\
00270 movd 4(%1), %%mm0 # Load 4 Cb __ __ __ __ u3 u2 u1 u0 \n\
00271 por %%mm7, %%mm5 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
00272 movd 4(%2), %%mm1 # Load 4 Cr __ __ __ __ v3 v2 v1 v0 \n\
00273 movq %%mm5, 8(%3) # store pixel 4-7 \n\
00274 "
00275
00276
00277
00278
00279
00280
/*
 * Instruction string: write 8 pixels as 32-bit "ARGB" values.
 *
 * Expects: mm0 = B7..B0, mm1 = R7..R0, mm2 = G7..G0 (bytes).
 * Each pixel is built from the B, G and R bytes plus one zero byte
 * (taken from mm3).
 * Stores 32 bytes, two pixels per store, at (%3), 8(%3), 16(%3), 24(%3).
 * Overwrites mm0, mm1, mm3, mm4, mm5 and mm6.
 */
00281 #define MMX_UNPACK_32_ARGB " \n\
00282 pxor %%mm3, %%mm3 # zero mm3 \n\
00283 movq %%mm0, %%mm4 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
00284 punpcklbw %%mm2, %%mm4 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
00285 movq %%mm1, %%mm5 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
00286 punpcklbw %%mm3, %%mm5 # 00 R3 00 R2 00 R1 00 R0 \n\
00287 movq %%mm4, %%mm6 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
00288 punpcklwd %%mm5, %%mm4 # 00 R1 B1 G1 00 R0 B0 G0 \n\
00289 movq %%mm4, (%3) # Store ARGB1 ARGB0 \n\
00290 punpckhwd %%mm5, %%mm6 # 00 R3 B3 G3 00 R2 B2 G2 \n\
00291 movq %%mm6, 8(%3) # Store ARGB3 ARGB2 \n\
00292 punpckhbw %%mm2, %%mm0 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
00293 punpckhbw %%mm3, %%mm1 # 00 R7 00 R6 00 R5 00 R4 \n\
00294 movq %%mm0, %%mm5 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
00295 punpcklwd %%mm1, %%mm5 # 00 R5 B5 G5 00 R4 B4 G4 \n\
00296 movq %%mm5, 16(%3) # Store ARGB5 ARGB4 \n\
00297 punpckhwd %%mm1, %%mm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\
00298 movq %%mm0, 24(%3) # Store ARGB7 ARGB6 \n\
00299 "
00300
/*
 * Instruction string: write 8 pixels as 32-bit "RGBA" values.
 *
 * Expects: mm0 = B7..B0, mm1 = R7..R0, mm2 = G7..G0 (bytes).
 * Each pixel is built from one zero byte followed by the B, G and R bytes.
 * Stores 32 bytes, two pixels per store, at (%3), 8(%3), 16(%3), 24(%3).
 * Overwrites mm0, mm2, mm3, mm4, mm5 and mm6.
 */
00301 #define MMX_UNPACK_32_RGBA " \n\
00302 pxor %%mm3, %%mm3 # zero mm3 \n\
00303 movq %%mm2, %%mm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
00304 punpcklbw %%mm1, %%mm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\
00305 punpcklbw %%mm0, %%mm3 # B3 00 B2 00 B1 00 B0 00 \n\
00306 movq %%mm3, %%mm5 # R3 00 R2 00 R1 00 R0 00 \n\
00307 punpcklwd %%mm4, %%mm3 # R1 G1 B1 00 R0 G0 B0 00 \n\
00308 movq %%mm3, (%3) # Store RGBA1 RGBA0 \n\
00309 punpckhwd %%mm4, %%mm5 # R3 G3 B3 00 R2 G2 B2 00 \n\
00310 movq %%mm5, 8(%3) # Store RGBA3 RGBA2 \n\
00311 pxor %%mm6, %%mm6 # zero mm6 \n\
00312 punpckhbw %%mm1, %%mm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\
00313 punpckhbw %%mm0, %%mm6 # B7 00 B6 00 B5 00 B4 00 \n\
00314 movq %%mm6, %%mm0 # B7 00 B6 00 B5 00 B4 00 \n\
00315 punpcklwd %%mm2, %%mm6 # R5 G5 B5 00 R4 G4 B4 00 \n\
00316 movq %%mm6, 16(%3) # Store RGBA5 RGBA4 \n\
00317 punpckhwd %%mm2, %%mm0 # R7 G7 B7 00 R6 G6 B6 00 \n\
00318 movq %%mm0, 24(%3) # Store RGBA7 RGBA6 \n\
00319 "
00320
/*
 * Instruction string: write 8 pixels as 32-bit "BGRA" values.
 *
 * Expects: mm0 = B7..B0, mm1 = R7..R0, mm2 = G7..G0 (bytes).
 * Same structure as MMX_UNPACK_32_RGBA with the roles of the blue (mm0) and
 * red (mm1) registers exchanged.
 * Stores 32 bytes, two pixels per store, at (%3), 8(%3), 16(%3), 24(%3).
 * Overwrites mm0, mm2, mm3, mm4, mm5 and mm6.
 */
00321 #define MMX_UNPACK_32_BGRA " \n\
00322 pxor %%mm3, %%mm3 # zero mm3 \n\
00323 movq %%mm2, %%mm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
00324 punpcklbw %%mm0, %%mm4 # B3 G3 B2 G2 B1 G1 B0 G0 \n\
00325 punpcklbw %%mm1, %%mm3 # R3 00 R2 00 R1 00 R0 00 \n\
00326 movq %%mm3, %%mm5 # R3 00 R2 00 R1 00 R0 00 \n\
00327 punpcklwd %%mm4, %%mm3 # B1 G1 R1 00 B0 G0 R0 00 \n\
00328 movq %%mm3, (%3) # Store BGRA1 BGRA0 \n\
00329 punpckhwd %%mm4, %%mm5 # B3 G3 R3 00 B2 G2 R2 00 \n\
00330 movq %%mm5, 8(%3) # Store BGRA3 BGRA2 \n\
00331 pxor %%mm6, %%mm6 # zero mm6 \n\
00332 punpckhbw %%mm0, %%mm2 # B7 G7 B6 G6 B5 G5 B4 G4 \n\
00333 punpckhbw %%mm1, %%mm6 # R7 00 R6 00 R5 00 R4 00 \n\
00334 movq %%mm6, %%mm0 # R7 00 R6 00 R5 00 R4 00 \n\
00335 punpcklwd %%mm2, %%mm6 # B5 G5 R5 00 B4 G4 R4 00 \n\
00336 movq %%mm6, 16(%3) # Store BGRA5 BGRA4 \n\
00337 punpckhwd %%mm2, %%mm0 # B7 G7 R7 00 B6 G6 R6 00 \n\
00338 movq %%mm0, 24(%3) # Store BGRA7 BGRA6 \n\
00339 "
00340
/*
 * Instruction string: write 8 pixels as 32-bit "ABGR" values.
 *
 * Expects: mm0 = B7..B0, mm1 = R7..R0, mm2 = G7..G0 (bytes).
 * Same structure as MMX_UNPACK_32_ARGB with the roles of the blue (mm0) and
 * red (mm1) registers exchanged.
 * Stores 32 bytes, two pixels per store, at (%3), 8(%3), 16(%3), 24(%3).
 * Overwrites mm0, mm1, mm2, mm3, mm4, mm5 and mm6.
 */
00341 #define MMX_UNPACK_32_ABGR " \n\
00342 pxor %%mm3, %%mm3 # zero mm3 \n\
00343 movq %%mm1, %%mm4 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
00344 punpcklbw %%mm2, %%mm4 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
00345 movq %%mm0, %%mm5 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
00346 punpcklbw %%mm3, %%mm5 # 00 B3 00 B2 00 B1 00 B0 \n\
00347 movq %%mm4, %%mm6 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
00348 punpcklwd %%mm5, %%mm4 # 00 B1 G1 R1 00 B0 G0 R0 \n\
00349 movq %%mm4, (%3) # Store ABGR1 ABGR0 \n\
00350 punpckhwd %%mm5, %%mm6 # 00 B3 G3 R3 00 B2 G2 R2 \n\
00351 movq %%mm6, 8(%3) # Store ABGR3 ABGR2 \n\
00352 punpckhbw %%mm2, %%mm1 # G7 R7 G6 R6 G5 R5 G4 R4 \n\
00353 punpckhbw %%mm3, %%mm0 # 00 B7 00 B6 00 B5 00 B4 \n\
00354 movq %%mm1, %%mm2 # G7 R7 G6 R6 G5 R5 G4 R4 \n\
00355 punpcklwd %%mm0, %%mm1 # 00 B5 G5 R5 00 B4 G4 R4 \n\
00356 movq %%mm1, 16(%3) # Store ABGR5 ABGR4 \n\
00357 punpckhwd %%mm0, %%mm2 # B7 G7 R7 00 B6 G6 R6 00 \n\
00358 movq %%mm2, 24(%3) # Store ABGR7 ABGR6 \n\
00359 "
00360
00361 #elif defined(HAVE_MMX_INTRINSICS)
00362
00363
00364
00365 #include <mmintrin.h>
00366
/*
 * Intrinsics variant of MMX_CALL: opens a do/while(0) scope, declares eight
 * local __m64 variables mm0..mm7 and expands MMX_INSTRUCTIONS inside it.
 * The statement macros below read and write those locals, so they are only
 * usable inside this wrapper.
 */
00367 #define MMX_CALL(MMX_INSTRUCTIONS) \
00368 do { \
00369 __m64 mm0, mm1, mm2, mm3, \
00370 mm4, mm5, mm6, mm7; \
00371 MMX_INSTRUCTIONS \
00372 } while(0)
00373
/*
 * Intrinsics variant of MMX_END: expands to a call to _mm_empty().
 */
00374 #define MMX_END _mm_empty()
00375
/*
 * Intrinsics variant of MMX_INIT_16.
 *   mm0 <- 32 bits read from p_u, mm1 <- 32 bits read from p_v,
 *   mm4 <- 0, mm6 <- 64 bits read from p_y.
 * The pointers are dereferenced through int / uint64_t casts.
 */
00376 #define MMX_INIT_16 \
00377 mm0 = _mm_cvtsi32_si64(*(int*)p_u); \
00378 mm1 = _mm_cvtsi32_si64(*(int*)p_v); \
00379 mm4 = _mm_setzero_si64(); \
00380 mm6 = (__m64)*(uint64_t *)p_y;
00381
/*
 * Intrinsics variant of MMX_INIT_32.
 *   mm0 <- 32 bits read from p_u, mm1 <- 32 bits read from p_v,
 *   mm4 <- 0, mm6 <- 64 bits read from p_y.
 * Also writes a 16-bit zero through p_buffer.
 * NOTE(review): the inline-asm MMX_INIT_32 writes a 32-bit zero (movl) at the
 * same point; confirm the width difference is harmless.
 */
00382 #define MMX_INIT_32 \
00383 mm0 = _mm_cvtsi32_si64(*(int*)p_u); \
00384 *(uint16_t *)p_buffer = 0; \
00385 mm1 = _mm_cvtsi32_si64(*(int*)p_v); \
00386 mm4 = _mm_setzero_si64(); \
00387 mm6 = (__m64)*(uint64_t *)p_y;
00388
/*
 * Intrinsics variant of MMX_YUV_MUL (multiply stage).
 *
 * Expects: mm0 = Cb bytes, mm1 = Cr bytes, mm4 = 0, mm6 = 8 Y bytes.
 * Chroma: widen to 16 bits, subtract mmx_80w, shift left by 3, then take the
 * high half of the products with mmx_U_green / mmx_V_green (summed into mm2),
 * mmx_U_blue (mm0) and mmx_V_red (mm1).
 * Luma: subtract mmx_10w with unsigned saturation, split into even (mm6) and
 * odd (mm7) bytes, shift left by 3 and multiply by mmx_Y_coeff.
 * mm3 is used as scratch.
 */
00389 #define MMX_YUV_MUL \
00390 mm0 = _mm_unpacklo_pi8(mm0, mm4); \
00391 mm1 = _mm_unpacklo_pi8(mm1, mm4); \
00392 mm0 = _mm_subs_pi16(mm0, (__m64)mmx_80w); \
00393 mm1 = _mm_subs_pi16(mm1, (__m64)mmx_80w); \
00394 mm0 = _mm_slli_pi16(mm0, 3); \
00395 mm1 = _mm_slli_pi16(mm1, 3); \
00396 mm2 = mm0; \
00397 mm3 = mm1; \
00398 mm2 = _mm_mulhi_pi16(mm2, (__m64)mmx_U_green); \
00399 mm3 = _mm_mulhi_pi16(mm3, (__m64)mmx_V_green); \
00400 mm0 = _mm_mulhi_pi16(mm0, (__m64)mmx_U_blue); \
00401 mm1 = _mm_mulhi_pi16(mm1, (__m64)mmx_V_red); \
00402 mm2 = _mm_adds_pi16(mm2, mm3); \
00403 \
00404 mm6 = _mm_subs_pu8(mm6, (__m64)mmx_10w); \
00405 mm7 = mm6; \
00406 mm6 = _mm_and_si64(mm6, (__m64)mmx_00ffw); \
00407 mm7 = _mm_srli_pi16(mm7, 8); \
00408 mm6 = _mm_slli_pi16(mm6, 3); \
00409 mm7 = _mm_slli_pi16(mm7, 3); \
00410 mm6 = _mm_mulhi_pi16(mm6, (__m64)mmx_Y_coeff); \
00411 mm7 = _mm_mulhi_pi16(mm7, (__m64)mmx_Y_coeff);
00412
/*
 * Intrinsics variant of MMX_YUV_ADD (add stage).
 *
 * Expects the layout left by MMX_YUV_MUL: mm0/mm1/mm2 = blue/red/green chroma
 * terms, mm6/mm7 = even/odd luma.
 * Adds luma to each chroma term with signed saturation, packs to unsigned
 * bytes and interleaves even/odd results so that mm0, mm1 and mm2 hold the
 * blue, red and green bytes of 8 pixels.
 * mm3, mm4 and mm5 are used as scratch.
 */
00413 #define MMX_YUV_ADD \
00414 mm3 = mm0; \
00415 mm4 = mm1; \
00416 mm5 = mm2; \
00417 mm0 = _mm_adds_pi16(mm0, mm6); \
00418 mm3 = _mm_adds_pi16(mm3, mm7); \
00419 mm1 = _mm_adds_pi16(mm1, mm6); \
00420 mm4 = _mm_adds_pi16(mm4, mm7); \
00421 mm2 = _mm_adds_pi16(mm2, mm6); \
00422 mm5 = _mm_adds_pi16(mm5, mm7); \
00423 \
00424 mm0 = _mm_packs_pu16(mm0, mm0); \
00425 mm1 = _mm_packs_pu16(mm1, mm1); \
00426 mm2 = _mm_packs_pu16(mm2, mm2); \
00427 \
00428 mm3 = _mm_packs_pu16(mm3, mm3); \
00429 mm4 = _mm_packs_pu16(mm4, mm4); \
00430 mm5 = _mm_packs_pu16(mm5, mm5); \
00431 \
00432 mm0 = _mm_unpacklo_pi8(mm0, mm3); \
00433 mm1 = _mm_unpacklo_pi8(mm1, mm4); \
00434 mm2 = _mm_unpacklo_pi8(mm2, mm5);
00435
/*
 * Intrinsics variant of MMX_UNPACK_15: pack 8 pixels to 15-bit RGB.
 *
 * Expects: mm0 = blue, mm1 = red, mm2 = green bytes.
 * All components are masked with mmx_mask_f8. Two 64-bit stores are made, at
 * p_buffer and p_buffer + 4.
 * NOTE(review): the "+ 4" offset implies p_buffer points to 16-bit elements
 * (4 pixels per store) - confirm against the caller.
 * Also preloads mm6 from p_y + 8, mm0 from p_u + 4 and mm1 from p_v + 4 for a
 * following step. mm4 is zeroed; mm5 and mm7 are scratch.
 */
00436 #define MMX_UNPACK_15 \
00437 mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8); \
00438 mm0 = _mm_srli_pi16(mm0, 3); \
00439 mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_f8); \
00440 mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8); \
00441 mm1 = _mm_srli_pi16(mm1, 1); \
00442 mm4 = _mm_setzero_si64(); \
00443 mm5 = mm0; \
00444 mm7 = mm2; \
00445 \
00446 mm2 = _mm_unpacklo_pi8(mm2, mm4); \
00447 mm0 = _mm_unpacklo_pi8(mm0, mm1); \
00448 mm2 = _mm_slli_pi16(mm2, 2); \
00449 mm0 = _mm_or_si64(mm0, mm2); \
00450 mm6 = (__m64)*(uint64_t *)(p_y + 8); \
00451 *(uint64_t *)p_buffer = (uint64_t)mm0; \
00452 \
00453 mm7 = _mm_unpackhi_pi8(mm7, mm4); \
00454 mm5 = _mm_unpackhi_pi8(mm5, mm1); \
00455 mm7 = _mm_slli_pi16(mm7, 2); \
00456 mm0 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_u + 4)); \
00457 mm5 = _mm_or_si64(mm5, mm7); \
00458 mm1 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_v + 4)); \
00459 *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;
00460
/*
 * Intrinsics variant of MMX_UNPACK_16: pack 8 pixels to 16-bit RGB.
 *
 * Expects: mm0 = blue, mm1 = red, mm2 = green bytes.
 * Blue and red are masked with mmx_mask_f8, green with mmx_mask_fc. Two
 * 64-bit stores are made, at p_buffer and p_buffer + 4.
 * NOTE(review): the "+ 4" offset implies p_buffer points to 16-bit elements
 * (4 pixels per store) - confirm against the caller.
 * Also preloads mm6 from p_y + 8, mm0 from p_u + 4 and mm1 from p_v + 4 for a
 * following step. mm4 is zeroed; mm5 and mm7 are scratch.
 */
00461 #define MMX_UNPACK_16 \
00462 mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8); \
00463 mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_fc); \
00464 mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8); \
00465 mm0 = _mm_srli_pi16(mm0, 3); \
00466 mm4 = _mm_setzero_si64(); \
00467 mm5 = mm0; \
00468 mm7 = mm2; \
00469 \
00470 mm2 = _mm_unpacklo_pi8(mm2, mm4); \
00471 mm0 = _mm_unpacklo_pi8(mm0, mm1); \
00472 mm2 = _mm_slli_pi16(mm2, 3); \
00473 mm0 = _mm_or_si64(mm0, mm2); \
00474 mm6 = (__m64)*(uint64_t *)(p_y + 8); \
00475 *(uint64_t *)p_buffer = (uint64_t)mm0; \
00476 \
00477 mm7 = _mm_unpackhi_pi8(mm7, mm4); \
00478 mm5 = _mm_unpackhi_pi8(mm5, mm1); \
00479 mm7 = _mm_slli_pi16(mm7, 3); \
00480 mm0 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_u + 4)); \
00481 mm5 = _mm_or_si64(mm5, mm7); \
00482 mm1 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_v + 4)); \
00483 *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;
00484
/*
 * Intrinsics variant of MMX_UNPACK_32_ARGB.
 *
 * Expects: mm0 = blue, mm1 = red, mm2 = green bytes of 8 pixels.
 * Interleaves them with a zero byte (mm3) and performs four 64-bit stores at
 * p_buffer, p_buffer + 2, p_buffer + 4 and p_buffer + 6.
 * NOTE(review): the +2 stride implies p_buffer points to 32-bit elements
 * (two pixels per store) - confirm against the caller.
 */
00485 #define MMX_UNPACK_32_ARGB \
00486 mm3 = _mm_setzero_si64(); \
00487 mm4 = mm0; \
00488 mm4 = _mm_unpacklo_pi8(mm4, mm2); \
00489 mm5 = mm1; \
00490 mm5 = _mm_unpacklo_pi8(mm5, mm3); \
00491 mm6 = mm4; \
00492 mm4 = _mm_unpacklo_pi16(mm4, mm5); \
00493 *(uint64_t *)p_buffer = (uint64_t)mm4; \
00494 mm6 = _mm_unpackhi_pi16(mm6, mm5); \
00495 *(uint64_t *)(p_buffer + 2) = (uint64_t)mm6;\
00496 mm0 = _mm_unpackhi_pi8(mm0, mm2); \
00497 mm1 = _mm_unpackhi_pi8(mm1, mm3); \
00498 mm5 = mm0; \
00499 mm5 = _mm_unpacklo_pi16(mm5, mm1); \
00500 *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;\
00501 mm0 = _mm_unpackhi_pi16(mm0, mm1); \
00502 *(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
00503
/*
 * Intrinsics variant of MMX_UNPACK_32_RGBA.
 *
 * Expects: mm0 = blue, mm1 = red, mm2 = green bytes of 8 pixels.
 * Interleaves a zero byte with blue, and green with red, then performs four
 * 64-bit stores at p_buffer, p_buffer + 2, p_buffer + 4 and p_buffer + 6.
 * NOTE(review): the +2 stride implies p_buffer points to 32-bit elements
 * (two pixels per store) - confirm against the caller.
 */
00504 #define MMX_UNPACK_32_RGBA \
00505 mm3 = _mm_setzero_si64(); \
00506 mm4 = mm2; \
00507 mm4 = _mm_unpacklo_pi8(mm4, mm1); \
00508 mm3 = _mm_unpacklo_pi8(mm3, mm0); \
00509 mm5 = mm3; \
00510 mm3 = _mm_unpacklo_pi16(mm3, mm4); \
00511 *(uint64_t *)p_buffer = (uint64_t)mm3; \
00512 mm5 = _mm_unpackhi_pi16(mm5, mm4); \
00513 *(uint64_t *)(p_buffer + 2) = (uint64_t)mm5;\
00514 mm6 = _mm_setzero_si64(); \
00515 mm2 = _mm_unpackhi_pi8(mm2, mm1); \
00516 mm6 = _mm_unpackhi_pi8(mm6, mm0); \
00517 mm0 = mm6; \
00518 mm6 = _mm_unpacklo_pi16(mm6, mm2); \
00519 *(uint64_t *)(p_buffer + 4) = (uint64_t)mm6;\
00520 mm0 = _mm_unpackhi_pi16(mm0, mm2); \
00521 *(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
00522
/*
 * Intrinsics variant of MMX_UNPACK_32_BGRA.
 *
 * Expects: mm0 = blue, mm1 = red, mm2 = green bytes of 8 pixels.
 * Same structure as the intrinsics MMX_UNPACK_32_RGBA with the blue (mm0) and
 * red (mm1) inputs exchanged. Four 64-bit stores at p_buffer, p_buffer + 2,
 * p_buffer + 4 and p_buffer + 6.
 * NOTE(review): the +2 stride implies p_buffer points to 32-bit elements
 * (two pixels per store) - confirm against the caller.
 */
00523 #define MMX_UNPACK_32_BGRA \
00524 mm3 = _mm_setzero_si64(); \
00525 mm4 = mm2; \
00526 mm4 = _mm_unpacklo_pi8(mm4, mm0); \
00527 mm3 = _mm_unpacklo_pi8(mm3, mm1); \
00528 mm5 = mm3; \
00529 mm3 = _mm_unpacklo_pi16(mm3, mm4); \
00530 *(uint64_t *)p_buffer = (uint64_t)mm3; \
00531 mm5 = _mm_unpackhi_pi16(mm5, mm4); \
00532 *(uint64_t *)(p_buffer + 2) = (uint64_t)mm5;\
00533 mm6 = _mm_setzero_si64(); \
00534 mm2 = _mm_unpackhi_pi8(mm2, mm0); \
00535 mm6 = _mm_unpackhi_pi8(mm6, mm1); \
00536 mm0 = mm6; \
00537 mm6 = _mm_unpacklo_pi16(mm6, mm2); \
00538 *(uint64_t *)(p_buffer + 4) = (uint64_t)mm6;\
00539 mm0 = _mm_unpackhi_pi16(mm0, mm2); \
00540 *(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
00541
/*
 * Intrinsics variant of MMX_UNPACK_32_ABGR.
 *
 * Expects: mm0 = blue, mm1 = red, mm2 = green bytes of 8 pixels.
 * Same structure as the intrinsics MMX_UNPACK_32_ARGB with the blue (mm0) and
 * red (mm1) inputs exchanged. Four 64-bit stores at p_buffer, p_buffer + 2,
 * p_buffer + 4 and p_buffer + 6.
 * NOTE(review): the +2 stride implies p_buffer points to 32-bit elements
 * (two pixels per store) - confirm against the caller.
 */
00542 #define MMX_UNPACK_32_ABGR \
00543 mm3 = _mm_setzero_si64(); \
00544 mm4 = mm1; \
00545 mm4 = _mm_unpacklo_pi8(mm4, mm2); \
00546 mm5 = mm0; \
00547 mm5 = _mm_unpacklo_pi8(mm5, mm3); \
00548 mm6 = mm4; \
00549 mm4 = _mm_unpacklo_pi16(mm4, mm5); \
00550 *(uint64_t *)p_buffer = (uint64_t)mm4; \
00551 mm6 = _mm_unpackhi_pi16(mm6, mm5); \
00552 *(uint64_t *)(p_buffer + 2) = (uint64_t)mm6;\
00553 mm1 = _mm_unpackhi_pi8(mm1, mm2); \
00554 mm0 = _mm_unpackhi_pi8(mm0, mm3); \
00555 mm2 = mm1; \
00556 mm1 = _mm_unpacklo_pi16(mm1, mm0); \
00557 *(uint64_t *)(p_buffer + 4) = (uint64_t)mm1;\
00558 mm2 = _mm_unpackhi_pi16(mm2, mm0); \
00559 *(uint64_t *)(p_buffer + 6) = (uint64_t)mm2;
00560
00561 #endif
00562
00563 #elif defined( MODULE_NAME_IS_i420_rgb_sse2 )
00564
00565 #if defined(CAN_COMPILE_SSE2)
00566
00567
00568
/*
 * Emit one inline-asm statement made of the concatenated SSE2 instruction
 * strings passed as SSE2_INSTRUCTIONS.
 *
 * Operand numbering seen by the instruction strings:
 *   %0 = p_y, %1 = p_u, %2 = p_v, %3 = p_buffer (all in registers).
 * The caller must have those four pointers in scope.
 *
 * Clobbers:
 *   "eax"    - the strings build their constants with `movl $imm, %%eax`.
 *   "memory" - the unpack strings store through (%3), which is only an input
 *              operand, so the compiler must be told that memory is written.
 *
 * NOTE(review): the xmm registers touched by the instruction strings are not
 * listed as clobbers - confirm whether that is intentional.
 */
00569 #define SSE2_CALL(SSE2_INSTRUCTIONS) \
00570 do { \
00571 __asm__ __volatile__( \
00572 ".p2align 3 \n\t" \
00573 SSE2_INSTRUCTIONS \
00574 : \
00575 : "r" (p_y), "r" (p_u), \
00576 "r" (p_v), "r" (p_buffer) \
00577 : "eax", "memory" ); \
00578 } while(0)
00579
/*
 * Issues a single `sfence` instruction with a "memory" clobber.
 * Presumably meant to run after the SSE2_CALL sequences, whose aligned unpack
 * strings use non-temporal movntdq stores - confirm with callers.
 */
00580 #define SSE2_END __asm__ __volatile__ ( "sfence" ::: "memory" )
00581
/*
 * Instruction string: load the inputs for one 16-pixel step (aligned Y).
 *   xmm0 <- 8 Cb bytes from (%1), xmm1 <- 8 Cr bytes from (%2),
 *   xmm4 <- 0, xmm6 <- 16 Y bytes from (%0) using movdqa.
 */
00582 #define SSE2_INIT_16_ALIGNED " \n\
00583 movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
00584 movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
00585 pxor %%xmm4, %%xmm4 # zero mm4 \n\
00586 movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00587 "
00588
/*
 * Instruction string: load the inputs for one 16-pixel step (unaligned Y).
 *   xmm0 <- 8 Cb bytes from (%1), xmm1 <- 8 Cr bytes from (%2),
 *   xmm4 <- 0, xmm6 <- 16 Y bytes from (%0) using movdqu.
 * Additionally issues prefetchnta on the destination (%3).
 */
00589 #define SSE2_INIT_16_UNALIGNED " \n\
00590 movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
00591 movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
00592 pxor %%xmm4, %%xmm4 # zero mm4 \n\
00593 movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00594 prefetchnta (%3) # Tell CPU not to cache output RGB data \n\
00595 "
00596
/*
 * Instruction string: load the inputs for one 16-pixel step (32-bit output,
 * aligned Y). The instructions are the same as in SSE2_INIT_16_ALIGNED:
 *   xmm0 <- 8 Cb bytes from (%1), xmm1 <- 8 Cr bytes from (%2),
 *   xmm4 <- 0, xmm6 <- 16 Y bytes from (%0) using movdqa.
 */
00597 #define SSE2_INIT_32_ALIGNED " \n\
00598 movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
00599 movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
00600 pxor %%xmm4, %%xmm4 # zero mm4 \n\
00601 movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00602 "
00603
/*
 * Instruction string: load the inputs for one 16-pixel step (32-bit output,
 * unaligned Y). The instructions are the same as in SSE2_INIT_16_UNALIGNED:
 *   xmm0 <- 8 Cb bytes from (%1), xmm1 <- 8 Cr bytes from (%2),
 *   xmm4 <- 0, xmm6 <- 16 Y bytes from (%0) using movdqu,
 *   followed by prefetchnta on the destination (%3).
 */
00604 #define SSE2_INIT_32_UNALIGNED " \n\
00605 movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
00606 movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
00607 pxor %%xmm4, %%xmm4 # zero mm4 \n\
00608 movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00609 prefetchnta (%3) # Tell CPU not to cache output RGB data \n\
00610 "
00611
/*
 * Instruction string: SSE2 multiply stage of the YUV -> RGB conversion.
 *
 * Expects: xmm0 = 8 Cb bytes, xmm1 = 8 Cr bytes, xmm4 = 0, xmm6 = 16 Y bytes.
 * Unlike the MMX variant, constants are not memory operands: each one is
 * loaded as an immediate into %eax, moved to xmm5 and broadcast with
 * `pshufd $0` immediately before use. The immediates repeat the same 16-bit
 * (or 8-bit) patterns as the mmx_* constants.
 *
 * Leaves: xmm0 = blue chroma term, xmm1 = red chroma term,
 *         xmm2 = green chroma term (Cb part + Cr part),
 *         xmm6 = scaled even-indexed Y, xmm7 = scaled odd-indexed Y.
 * xmm3, xmm5 and %eax are used as scratch.
 */
00612 #define SSE2_YUV_MUL " \n\
00613 # convert the chroma part \n\
00614 punpcklbw %%xmm4, %%xmm0 # scatter 8 Cb 00 u3 00 u2 00 u1 00 u0 \n\
00615 punpcklbw %%xmm4, %%xmm1 # scatter 8 Cr 00 v3 00 v2 00 v1 00 v0 \n\
00616 movl $0x00800080, %%eax # \n\
00617 movd %%eax, %%xmm5 # \n\
00618 pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 0080 0080 ... 0080 0080 \n\
00619 psubsw %%xmm5, %%xmm0 # Cb -= 128 \n\
00620 psubsw %%xmm5, %%xmm1 # Cr -= 128 \n\
00621 psllw $3, %%xmm0 # Promote precision \n\
00622 psllw $3, %%xmm1 # Promote precision \n\
00623 movdqa %%xmm0, %%xmm2 # Copy 8 Cb 00 u3 00 u2 00 u1 00 u0 \n\
00624 movdqa %%xmm1, %%xmm3 # Copy 8 Cr 00 v3 00 v2 00 v1 00 v0 \n\
00625 movl $0xf37df37d, %%eax # \n\
00626 movd %%eax, %%xmm5 # \n\
00627 pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to f37d f37d ... f37d f37d \n\
00628 pmulhw %%xmm5, %%xmm2 # Mul Cb with green coeff -> Cb green \n\
00629 movl $0xe5fce5fc, %%eax # \n\
00630 movd %%eax, %%xmm5 # \n\
00631 pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to e5fc e5fc ... e5fc e5fc \n\
00632 pmulhw %%xmm5, %%xmm3 # Mul Cr with green coeff -> Cr green \n\
00633 movl $0x40934093, %%eax # \n\
00634 movd %%eax, %%xmm5 # \n\
00635 pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 4093 4093 ... 4093 4093 \n\
00636 pmulhw %%xmm5, %%xmm0 # Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 \n\
00637 movl $0x33123312, %%eax # \n\
00638 movd %%eax, %%xmm5 # \n\
00639 pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 3312 3312 ... 3312 3312 \n\
00640 pmulhw %%xmm5, %%xmm1 # Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 \n\
00641 paddsw %%xmm3, %%xmm2 # Cb green + Cr green -> Cgreen \n\
00642 \n\
00643 # convert the luma part \n\
00644 movl $0x10101010, %%eax # \n\
00645 movd %%eax, %%xmm5 # \n\
00646 pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 1010 1010 ... 1010 1010 \n\
00647 psubusb %%xmm5, %%xmm6 # Y -= 16 \n\
00648 movdqa %%xmm6, %%xmm7 # Copy 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00649 movl $0x00ff00ff, %%eax # \n\
00650 movd %%eax, %%xmm5 # \n\
00651 pshufd $0, %%xmm5, %%xmm5 # set xmm5 to 00ff 00ff ... 00ff 00ff \n\
00652 pand %%xmm5, %%xmm6 # get Y even 00 Y6 00 Y4 00 Y2 00 Y0 \n\
00653 psrlw $8, %%xmm7 # get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 \n\
00654 psllw $3, %%xmm6 # Promote precision \n\
00655 psllw $3, %%xmm7 # Promote precision \n\
00656 movl $0x253f253f, %%eax # \n\
00657 movd %%eax, %%xmm5 # \n\
00658 pshufd $0, %%xmm5, %%xmm5 # set xmm5 to 253f 253f ... 253f 253f \n\
00659 pmulhw %%xmm5, %%xmm6 # Mul 8 Y even 00 y6 00 y4 00 y2 00 y0 \n\
00660 pmulhw %%xmm5, %%xmm7 # Mul 8 Y odd 00 y7 00 y5 00 y3 00 y1 \n\
00661 "
00662
/*
 * Instruction string: SSE2 add stage of the YUV -> RGB conversion.
 *
 * Expects the layout left by SSE2_YUV_MUL (xmm0/xmm1/xmm2 = blue/red/green
 * chroma terms, xmm6/xmm7 = even/odd luma).
 * Adds luma to each chroma term with signed saturation, packs the words to
 * unsigned bytes and interleaves even and odd pixels.
 *
 * Leaves: xmm0 = blue bytes, xmm1 = red bytes, xmm2 = green bytes.
 * xmm3, xmm4 and xmm5 are used as scratch.
 */
00663 #define SSE2_YUV_ADD " \n\
00664 # Do horizontal and vertical scaling \n\
00665 movdqa %%xmm0, %%xmm3 # Copy Cblue \n\
00666 movdqa %%xmm1, %%xmm4 # Copy Cred \n\
00667 movdqa %%xmm2, %%xmm5 # Copy Cgreen \n\
00668 paddsw %%xmm6, %%xmm0 # Y even + Cblue 00 B6 00 B4 00 B2 00 B0 \n\
00669 paddsw %%xmm7, %%xmm3 # Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 \n\
00670 paddsw %%xmm6, %%xmm1 # Y even + Cred 00 R6 00 R4 00 R2 00 R0 \n\
00671 paddsw %%xmm7, %%xmm4 # Y odd + Cred 00 R7 00 R5 00 R3 00 R1 \n\
00672 paddsw %%xmm6, %%xmm2 # Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 \n\
00673 paddsw %%xmm7, %%xmm5 # Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 \n\
00674 \n\
00675 # Limit RGB even to 0..255 \n\
00676 packuswb %%xmm0, %%xmm0 # B6 B4 B2 B0 / B6 B4 B2 B0 \n\
00677 packuswb %%xmm1, %%xmm1 # R6 R4 R2 R0 / R6 R4 R2 R0 \n\
00678 packuswb %%xmm2, %%xmm2 # G6 G4 G2 G0 / G6 G4 G2 G0 \n\
00679 \n\
00680 # Limit RGB odd to 0..255 \n\
00681 packuswb %%xmm3, %%xmm3 # B7 B5 B3 B1 / B7 B5 B3 B1 \n\
00682 packuswb %%xmm4, %%xmm4 # R7 R5 R3 R1 / R7 R5 R3 R1 \n\
00683 packuswb %%xmm5, %%xmm5 # G7 G5 G3 G1 / G7 G5 G3 G1 \n\
00684 \n\
00685 # Interleave RGB even and odd \n\
00686 punpcklbw %%xmm3, %%xmm0 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
00687 punpcklbw %%xmm4, %%xmm1 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
00688 punpcklbw %%xmm5, %%xmm2 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
00689 "
00690
/*
 * Instruction string: pack 16 pixels into 15-bit RGB, aligned destination.
 *
 * Expects: xmm0 = blue, xmm1 = red, xmm2 = green bytes.
 * The 0xf8 byte mask is built in xmm5 via %eax and applied to all three
 * components.
 * Stores 32 bytes with movntdq: pixels 0-7 at (%3), pixels 8-15 at 16(%3).
 * xmm4 is zeroed; xmm5, xmm7 and %eax are used as scratch.
 */
00691 #define SSE2_UNPACK_15_ALIGNED " \n\
00692 # mask unneeded bits off \n\
00693 movl $0xf8f8f8f8, %%eax # \n\
00694 movd %%eax, %%xmm5 # \n\
00695 pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\
00696 pand %%xmm5, %%xmm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
00697 psrlw $3,%%xmm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
00698 pand %%xmm5, %%xmm2 # g7g6g5g4 g3______ g7g6g5g4 g3______ \n\
00699 pand %%xmm5, %%xmm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
00700 psrlw $1,%%xmm1 # __r7r6r5 r4r3____ __r7r6r5 r4r3____ \n\
00701 pxor %%xmm4, %%xmm4 # zero mm4 \n\
00702 movdqa %%xmm0, %%xmm5 # Copy B15-B0 \n\
00703 movdqa %%xmm2, %%xmm7 # Copy G15-G0 \n\
00704 \n\
00705 # convert rgb24 plane to rgb15 pack for pixel 0-7 \n\
00706 punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3______ \n\
00707 punpcklbw %%xmm1, %%xmm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
00708 psllw $2,%%xmm2 # ________ ____g7g6 g5g4g3__ ________ \n\
00709 por %%xmm2, %%xmm0 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
00710 movntdq %%xmm0, (%3) # store pixel 0-7 \n\
00711 \n\
00712 # convert rgb24 plane to rgb15 pack for pixel 8-15 \n\
00713 punpckhbw %%xmm4, %%xmm7 # ________ ________ g7g6g5g4 g3______ \n\
00714 punpckhbw %%xmm1, %%xmm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
00715 psllw $2,%%xmm7 # ________ ____g7g6 g5g4g3__ ________ \n\
00716 por %%xmm7, %%xmm5 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
00717 movntdq %%xmm5, 16(%3) # store pixel 4-7 \n\
00718 "
00719
/*
 * Instruction string: pack 16 pixels into 15-bit RGB, unaligned destination.
 *
 * Same computation as SSE2_UNPACK_15_ALIGNED; the only difference is that the
 * two 16-byte stores to (%3) and 16(%3) use movdqu instead of movntdq.
 * Expects: xmm0 = blue, xmm1 = red, xmm2 = green bytes.
 * xmm4 is zeroed; xmm5, xmm7 and %eax are used as scratch.
 */
00720 #define SSE2_UNPACK_15_UNALIGNED " \n\
00721 # mask unneeded bits off \n\
00722 movl $0xf8f8f8f8, %%eax # \n\
00723 movd %%eax, %%xmm5 # \n\
00724 pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\
00725 pand %%xmm5, %%xmm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
00726 psrlw $3,%%xmm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
00727 pand %%xmm5, %%xmm2 # g7g6g5g4 g3______ g7g6g5g4 g3______ \n\
00728 pand %%xmm5, %%xmm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
00729 psrlw $1,%%xmm1 # __r7r6r5 r4r3____ __r7r6r5 r4r3____ \n\
00730 pxor %%xmm4, %%xmm4 # zero mm4 \n\
00731 movdqa %%xmm0, %%xmm5 # Copy B15-B0 \n\
00732 movdqa %%xmm2, %%xmm7 # Copy G15-G0 \n\
00733 \n\
00734 # convert rgb24 plane to rgb15 pack for pixel 0-7 \n\
00735 punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3______ \n\
00736 punpcklbw %%xmm1, %%xmm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
00737 psllw $2,%%xmm2 # ________ ____g7g6 g5g4g3__ ________ \n\
00738 por %%xmm2, %%xmm0 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
00739 movdqu %%xmm0, (%3) # store pixel 0-7 \n\
00740 \n\
00741 # convert rgb24 plane to rgb15 pack for pixel 8-15 \n\
00742 punpckhbw %%xmm4, %%xmm7 # ________ ________ g7g6g5g4 g3______ \n\
00743 punpckhbw %%xmm1, %%xmm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
00744 psllw $2,%%xmm7 # ________ ____g7g6 g5g4g3__ ________ \n\
00745 por %%xmm7, %%xmm5 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
00746 movdqu %%xmm5, 16(%3) # store pixel 4-7 \n\
00747 "
00748
/*
 * Instruction string: pack 16 pixels into 16-bit RGB, aligned destination.
 *
 * Expects: xmm0 = blue, xmm1 = red, xmm2 = green bytes.
 * Blue and red are masked with 0xf8 per byte; green is masked with 0xfc per
 * byte (the immediate loaded is 0xfcfcfcfc, although the in-string comment on
 * that pshufd still says f8f8).
 * Stores 32 bytes with movntdq: pixels 0-7 at (%3), pixels 8-15 at 16(%3).
 * xmm4 is zeroed; xmm5, xmm7 and %eax are used as scratch.
 */
00749 #define SSE2_UNPACK_16_ALIGNED " \n\
00750 # mask unneeded bits off \n\
00751 movl $0xf8f8f8f8, %%eax # \n\
00752 movd %%eax, %%xmm5 # \n\
00753 pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\
00754 pand %%xmm5, %%xmm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
00755 pand %%xmm5, %%xmm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
00756 movl $0xfcfcfcfc, %%eax # \n\
00757 movd %%eax, %%xmm5 # \n\
00758 pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\
00759 pand %%xmm5, %%xmm2 # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____ \n\
00760 psrlw $3,%%xmm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
00761 pxor %%xmm4, %%xmm4 # zero mm4 \n\
00762 movdqa %%xmm0, %%xmm5 # Copy B15-B0 \n\
00763 movdqa %%xmm2, %%xmm7 # Copy G15-G0 \n\
00764 \n\
00765 # convert rgb24 plane to rgb16 pack for pixel 0-7 \n\
00766 punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3g2____ \n\
00767 punpcklbw %%xmm1, %%xmm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
00768 psllw $3,%%xmm2 # ________ __g7g6g5 g4g3g2__ ________ \n\
00769 por %%xmm2, %%xmm0 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
00770 movntdq %%xmm0, (%3) # store pixel 0-7 \n\
00771 \n\
00772 # convert rgb24 plane to rgb16 pack for pixel 8-15 \n\
00773 punpckhbw %%xmm4, %%xmm7 # ________ ________ g7g6g5g4 g3g2____ \n\
00774 punpckhbw %%xmm1, %%xmm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
00775 psllw $3,%%xmm7 # ________ __g7g6g5 g4g3g2__ ________ \n\
00776 por %%xmm7, %%xmm5 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
00777 movntdq %%xmm5, 16(%3) # store pixel 4-7 \n\
00778 "
00779
/*
 * Instruction string: pack 16 pixels into 16-bit RGB, unaligned destination.
 *
 * Same computation as SSE2_UNPACK_16_ALIGNED; the only difference is that the
 * two 16-byte stores to (%3) and 16(%3) use movdqu instead of movntdq.
 * Expects: xmm0 = blue, xmm1 = red, xmm2 = green bytes.
 * xmm4 is zeroed; xmm5, xmm7 and %eax are used as scratch.
 */
00780 #define SSE2_UNPACK_16_UNALIGNED " \n\
00781 # mask unneeded bits off \n\
00782 movl $0xf8f8f8f8, %%eax # \n\
00783 movd %%eax, %%xmm5 # \n\
00784 pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\
00785 pand %%xmm5, %%xmm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
00786 pand %%xmm5, %%xmm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
00787 movl $0xfcfcfcfc, %%eax # \n\
00788 movd %%eax, %%xmm5 # \n\
00789 pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\
00790 pand %%xmm5, %%xmm2 # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____ \n\
00791 psrlw $3,%%xmm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
00792 pxor %%xmm4, %%xmm4 # zero mm4 \n\
00793 movdqa %%xmm0, %%xmm5 # Copy B15-B0 \n\
00794 movdqa %%xmm2, %%xmm7 # Copy G15-G0 \n\
00795 \n\
00796 # convert rgb24 plane to rgb16 pack for pixel 0-7 \n\
00797 punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3g2____ \n\
00798 punpcklbw %%xmm1, %%xmm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
00799 psllw $3,%%xmm2 # ________ __g7g6g5 g4g3g2__ ________ \n\
00800 por %%xmm2, %%xmm0 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
00801 movdqu %%xmm0, (%3) # store pixel 0-7 \n\
00802 \n\
00803 # convert rgb24 plane to rgb16 pack for pixel 8-15 \n\
00804 punpckhbw %%xmm4, %%xmm7 # ________ ________ g7g6g5g4 g3g2____ \n\
00805 punpckhbw %%xmm1, %%xmm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
00806 psllw $3,%%xmm7 # ________ __g7g6g5 g4g3g2__ ________ \n\
00807 por %%xmm7, %%xmm5 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
00808 movdqu %%xmm5, 16(%3) # store pixel 4-7 \n\
00809 "
00810
/*
 * Instruction string: write 16 pixels as 32-bit "ARGB" values, aligned
 * destination.
 *
 * Expects: xmm0 = blue, xmm1 = red, xmm2 = green bytes.
 * Each pixel is built from the B, G and R bytes plus one zero byte (taken
 * from xmm3).
 * Stores 64 bytes with movntdq, four pixels per store, at (%3), 16(%3),
 * 32(%3) and 48(%3).
 * Overwrites xmm0, xmm1, xmm3, xmm4, xmm5 and xmm6.
 */
00811 #define SSE2_UNPACK_32_ARGB_ALIGNED " \n\
00812 pxor %%xmm3, %%xmm3 # zero xmm3 \n\
00813 movdqa %%xmm0, %%xmm4 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
00814 punpcklbw %%xmm2, %%xmm4 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
00815 movdqa %%xmm1, %%xmm5 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
00816 punpcklbw %%xmm3, %%xmm5 # 00 R3 00 R2 00 R1 00 R0 \n\
00817 movdqa %%xmm4, %%xmm6 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
00818 punpcklwd %%xmm5, %%xmm4 # 00 R1 B1 G1 00 R0 B0 G0 \n\
00819 movntdq %%xmm4, (%3) # Store ARGB3 ARGB2 ARGB1 ARGB0 \n\
00820 punpckhwd %%xmm5, %%xmm6 # 00 R3 B3 G3 00 R2 B2 G2 \n\
00821 movntdq %%xmm6, 16(%3) # Store ARGB7 ARGB6 ARGB5 ARGB4 \n\
00822 punpckhbw %%xmm2, %%xmm0 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
00823 punpckhbw %%xmm3, %%xmm1 # 00 R7 00 R6 00 R5 00 R4 \n\
00824 movdqa %%xmm0, %%xmm5 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
00825 punpcklwd %%xmm1, %%xmm5 # 00 R5 B5 G5 00 R4 B4 G4 \n\
00826 movntdq %%xmm5, 32(%3) # Store ARGB11 ARGB10 ARGB9 ARGB8 \n\
00827 punpckhwd %%xmm1, %%xmm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\
00828 movntdq %%xmm0, 48(%3) # Store ARGB15 ARGB14 ARGB13 ARGB12 \n\
00829 "
00830
/*
 * SSE2_UNPACK_32_ARGB_UNALIGNED (inline-asm text, one string literal).
 *
 * Same instruction sequence as the aligned ARGB variant, but the four 16-byte
 * stores at byte offsets 0, 16, 32 and 48 from operand %3 use movdqu, so the
 * destination needs no particular alignment.
 * Inputs: xmm0, xmm2, xmm1 (annotated B, G, R).  Memory byte order of each
 * pixel: B, G, R, 0.
 *
 * Registers written: xmm0, xmm1, xmm3, xmm4, xmm5, xmm6 (xmm2 is only read).
 *
 * NOTE(review): the punpcklwd/punpckhwd annotations list "B" before "G"; the
 * instructions themselves yield 00 R G B (high to low) per pixel.
 */
00831 #define SSE2_UNPACK_32_ARGB_UNALIGNED " \n\
00832 pxor %%xmm3, %%xmm3 # zero xmm3 \n\
00833 movdqa %%xmm0, %%xmm4 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
00834 punpcklbw %%xmm2, %%xmm4 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
00835 movdqa %%xmm1, %%xmm5 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
00836 punpcklbw %%xmm3, %%xmm5 # 00 R3 00 R2 00 R1 00 R0 \n\
00837 movdqa %%xmm4, %%xmm6 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
00838 punpcklwd %%xmm5, %%xmm4 # 00 R1 B1 G1 00 R0 B0 G0 \n\
00839 movdqu %%xmm4, (%3) # Store ARGB3 ARGB2 ARGB1 ARGB0 \n\
00840 punpckhwd %%xmm5, %%xmm6 # 00 R3 B3 G3 00 R2 B2 G2 \n\
00841 movdqu %%xmm6, 16(%3) # Store ARGB7 ARGB6 ARGB5 ARGB4 \n\
00842 punpckhbw %%xmm2, %%xmm0 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
00843 punpckhbw %%xmm3, %%xmm1 # 00 R7 00 R6 00 R5 00 R4 \n\
00844 movdqa %%xmm0, %%xmm5 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
00845 punpcklwd %%xmm1, %%xmm5 # 00 R5 B5 G5 00 R4 B4 G4 \n\
00846 movdqu %%xmm5, 32(%3) # Store ARGB11 ARGB10 ARGB9 ARGB8 \n\
00847 punpckhwd %%xmm1, %%xmm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\
00848 movdqu %%xmm0, 48(%3) # Store ARGB15 ARGB14 ARGB13 ARGB12 \n\
00849 "
00850
/*
 * SSE2_UNPACK_32_RGBA_ALIGNED (inline-asm text, one string literal).
 *
 * Interleaves the byte planes in xmm2, xmm1 and xmm0 (annotated G, R and B)
 * with zero bytes (xmm3 and xmm6 are cleared and used as the low half of each
 * byte pair) into sixteen 4-byte pixels, written as four movntdq
 * (non-temporal, 16-byte-aligned) stores at byte offsets 0, 16, 32 and 48 from
 * operand %3.  Memory byte order of each pixel: 0, B, G, R.
 *
 * Registers written: xmm0, xmm2, xmm3, xmm4, xmm5, xmm6 (xmm1 is only read).
 *
 * NOTE(review): some in-string annotations are inexact - the copy of xmm3 into
 * xmm5 is annotated with "R" although xmm3 holds the B/00 pairs at that point,
 * the first punpcklwd annotation swaps B0/G0, and the third store is labelled
 * "BGRA11 BGRA10 BGRA9 RGBA8".
 */
00851 #define SSE2_UNPACK_32_RGBA_ALIGNED " \n\
00852 pxor %%xmm3, %%xmm3 # zero mm3 \n\
00853 movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
00854 punpcklbw %%xmm1, %%xmm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\
00855 punpcklbw %%xmm0, %%xmm3 # B3 00 B2 00 B1 00 B0 00 \n\
00856 movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\
00857 punpcklwd %%xmm4, %%xmm3 # R1 G1 B1 00 R0 B0 G0 00 \n\
00858 movntdq %%xmm3, (%3) # Store RGBA3 RGBA2 RGBA1 RGBA0 \n\
00859 punpckhwd %%xmm4, %%xmm5 # R3 G3 B3 00 R2 G2 B2 00 \n\
00860 movntdq %%xmm5, 16(%3) # Store RGBA7 RGBA6 RGBA5 RGBA4 \n\
00861 pxor %%xmm6, %%xmm6 # zero mm6 \n\
00862 punpckhbw %%xmm1, %%xmm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\
00863 punpckhbw %%xmm0, %%xmm6 # B7 00 B6 00 B5 00 B4 00 \n\
00864 movdqa %%xmm6, %%xmm0 # B7 00 B6 00 B5 00 B4 00 \n\
00865 punpcklwd %%xmm2, %%xmm6 # R5 G5 B5 00 R4 G4 B4 00 \n\
00866 movntdq %%xmm6, 32(%3) # Store BGRA11 BGRA10 BGRA9 RGBA8 \n\
00867 punpckhwd %%xmm2, %%xmm0 # R7 G7 B7 00 R6 G6 B6 00 \n\
00868 movntdq %%xmm0, 48(%3) # Store RGBA15 RGBA14 RGBA13 RGBA12 \n\
00869 "
00870
/*
 * SSE2_UNPACK_32_RGBA_UNALIGNED (inline-asm text, one string literal).
 *
 * Same instruction sequence as the aligned RGBA variant, but the four 16-byte
 * stores at byte offsets 0, 16, 32 and 48 from operand %3 use movdqu, so the
 * destination needs no particular alignment.
 * Inputs: xmm2, xmm1, xmm0 (annotated G, R, B).  Memory byte order of each
 * pixel: 0, B, G, R.
 *
 * Registers written: xmm0, xmm2, xmm3, xmm4, xmm5, xmm6 (xmm1 is only read).
 *
 * NOTE(review): the copy of xmm3 into xmm5 is annotated with "R" although xmm3
 * holds the B/00 pairs at that point, and the first punpcklwd annotation swaps
 * B0/G0.
 */
00871 #define SSE2_UNPACK_32_RGBA_UNALIGNED " \n\
00872 pxor %%xmm3, %%xmm3 # zero mm3 \n\
00873 movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
00874 punpcklbw %%xmm1, %%xmm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\
00875 punpcklbw %%xmm0, %%xmm3 # B3 00 B2 00 B1 00 B0 00 \n\
00876 movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\
00877 punpcklwd %%xmm4, %%xmm3 # R1 G1 B1 00 R0 B0 G0 00 \n\
00878 movdqu %%xmm3, (%3) # Store RGBA3 RGBA2 RGBA1 RGBA0 \n\
00879 punpckhwd %%xmm4, %%xmm5 # R3 G3 B3 00 R2 G2 B2 00 \n\
00880 movdqu %%xmm5, 16(%3) # Store RGBA7 RGBA6 RGBA5 RGBA4 \n\
00881 pxor %%xmm6, %%xmm6 # zero mm6 \n\
00882 punpckhbw %%xmm1, %%xmm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\
00883 punpckhbw %%xmm0, %%xmm6 # B7 00 B6 00 B5 00 B4 00 \n\
00884 movdqa %%xmm6, %%xmm0 # B7 00 B6 00 B5 00 B4 00 \n\
00885 punpcklwd %%xmm2, %%xmm6 # R5 G5 B5 00 R4 G4 B4 00 \n\
00886 movdqu %%xmm6, 32(%3) # Store RGBA11 RGBA10 RGBA9 RGBA8 \n\
00887 punpckhwd %%xmm2, %%xmm0 # R7 G7 B7 00 R6 G6 B6 00 \n\
00888 movdqu %%xmm0, 48(%3) # Store RGBA15 RGBA14 RGBA13 RGBA12 \n\
00889 "
00890
/*
 * SSE2_UNPACK_32_BGRA_ALIGNED (inline-asm text, one string literal).
 *
 * Interleaves the byte planes in xmm2, xmm0 and xmm1 (annotated G, B and R)
 * with zero bytes (xmm3 and xmm6 are cleared and used as the low half of each
 * byte pair) into sixteen 4-byte pixels, written as four movntdq
 * (non-temporal, 16-byte-aligned) stores at byte offsets 0, 16, 32 and 48 from
 * operand %3.  Memory byte order of each pixel: 0, R, G, B.
 *
 * Registers written: xmm0, xmm2, xmm3, xmm4, xmm5, xmm6 (xmm1 is only read).
 * xmm0 is consumed by the last punpckhbw before being reused as scratch, so
 * the instruction order matters.
 */
00891 #define SSE2_UNPACK_32_BGRA_ALIGNED " \n\
00892 pxor %%xmm3, %%xmm3 # zero mm3 \n\
00893 movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
00894 punpcklbw %%xmm0, %%xmm4 # B3 G3 B2 G2 B1 G1 B0 G0 \n\
00895 punpcklbw %%xmm1, %%xmm3 # R3 00 R2 00 R1 00 R0 00 \n\
00896 movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\
00897 punpcklwd %%xmm4, %%xmm3 # B1 G1 R1 00 B0 G0 R0 00 \n\
00898 movntdq %%xmm3, (%3) # Store BGRA3 BGRA2 BGRA1 BGRA0 \n\
00899 punpckhwd %%xmm4, %%xmm5 # B3 G3 R3 00 B2 G2 R2 00 \n\
00900 movntdq %%xmm5, 16(%3) # Store BGRA7 BGRA6 BGRA5 BGRA4 \n\
00901 pxor %%xmm6, %%xmm6 # zero mm6 \n\
00902 punpckhbw %%xmm0, %%xmm2 # B7 G7 B6 G6 B5 G5 B4 G4 \n\
00903 punpckhbw %%xmm1, %%xmm6 # R7 00 R6 00 R5 00 R4 00 \n\
00904 movdqa %%xmm6, %%xmm0 # R7 00 R6 00 R5 00 R4 00 \n\
00905 punpcklwd %%xmm2, %%xmm6 # B5 G5 R5 00 B4 G4 R4 00 \n\
00906 movntdq %%xmm6, 32(%3) # Store BGRA11 BGRA10 BGRA9 BGRA8 \n\
00907 punpckhwd %%xmm2, %%xmm0 # B7 G7 R7 00 B6 G6 R6 00 \n\
00908 movntdq %%xmm0, 48(%3) # Store BGRA15 BGRA14 BGRA13 BGRA12 \n\
00909 "
00910
/*
 * SSE2_UNPACK_32_BGRA_UNALIGNED (inline-asm text, one string literal).
 *
 * Same instruction sequence as the aligned BGRA variant, but the four 16-byte
 * stores at byte offsets 0, 16, 32 and 48 from operand %3 use movdqu, so the
 * destination needs no particular alignment.
 * Inputs: xmm2, xmm0, xmm1 (annotated G, B, R).  Memory byte order of each
 * pixel: 0, R, G, B.
 *
 * Registers written: xmm0, xmm2, xmm3, xmm4, xmm5, xmm6 (xmm1 is only read).
 */
00911 #define SSE2_UNPACK_32_BGRA_UNALIGNED " \n\
00912 pxor %%xmm3, %%xmm3 # zero mm3 \n\
00913 movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
00914 punpcklbw %%xmm0, %%xmm4 # B3 G3 B2 G2 B1 G1 B0 G0 \n\
00915 punpcklbw %%xmm1, %%xmm3 # R3 00 R2 00 R1 00 R0 00 \n\
00916 movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\
00917 punpcklwd %%xmm4, %%xmm3 # B1 G1 R1 00 B0 G0 R0 00 \n\
00918 movdqu %%xmm3, (%3) # Store BGRA3 BGRA2 BGRA1 BGRA0 \n\
00919 punpckhwd %%xmm4, %%xmm5 # B3 G3 R3 00 B2 G2 R2 00 \n\
00920 movdqu %%xmm5, 16(%3) # Store BGRA7 BGRA6 BGRA5 BGRA4 \n\
00921 pxor %%xmm6, %%xmm6 # zero mm6 \n\
00922 punpckhbw %%xmm0, %%xmm2 # B7 G7 B6 G6 B5 G5 B4 G4 \n\
00923 punpckhbw %%xmm1, %%xmm6 # R7 00 R6 00 R5 00 R4 00 \n\
00924 movdqa %%xmm6, %%xmm0 # R7 00 R6 00 R5 00 R4 00 \n\
00925 punpcklwd %%xmm2, %%xmm6 # B5 G5 R5 00 B4 G4 R4 00 \n\
00926 movdqu %%xmm6, 32(%3) # Store BGRA11 BGRA10 BGRA9 BGRA8 \n\
00927 punpckhwd %%xmm2, %%xmm0 # B7 G7 R7 00 B6 G6 R6 00 \n\
00928 movdqu %%xmm0, 48(%3) # Store BGRA15 BGRA14 BGRA13 BGRA12 \n\
00929 "
00930
/*
 * SSE2_UNPACK_32_ABGR_ALIGNED (inline-asm text, one string literal).
 *
 * Interleaves the byte planes in xmm1, xmm2 and xmm0 (annotated R, G and B)
 * with a zero byte taken from xmm3 into sixteen 4-byte pixels, written as four
 * movntdq (non-temporal, 16-byte-aligned) stores at byte offsets 0, 16, 32 and
 * 48 from operand %3.  Memory byte order of each pixel: R, G, B, 0.
 *
 * Registers written: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6.  xmm2 is read
 * as the G plane first and only afterwards reused as scratch, so the
 * instruction order matters.
 *
 * NOTE(review): the annotation on the last punpckhwd ("B7 G7 R7 00 ...") does
 * not match the other three groups; the instruction yields 00 B G R like them.
 */
00931 #define SSE2_UNPACK_32_ABGR_ALIGNED " \n\
00932 pxor %%xmm3, %%xmm3 # zero mm3 \n\
00933 movdqa %%xmm1, %%xmm4 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
00934 punpcklbw %%xmm2, %%xmm4 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
00935 movdqa %%xmm0, %%xmm5 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
00936 punpcklbw %%xmm3, %%xmm5 # 00 B3 00 B2 00 B1 00 B0 \n\
00937 movdqa %%xmm4, %%xmm6 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
00938 punpcklwd %%xmm5, %%xmm4 # 00 B1 G1 R1 00 B0 G0 R0 \n\
00939 movntdq %%xmm4, (%3) # Store ABGR3 ABGR2 ABGR1 ABGR0 \n\
00940 punpckhwd %%xmm5, %%xmm6 # 00 B3 G3 R3 00 B2 G2 R2 \n\
00941 movntdq %%xmm6, 16(%3) # Store ABGR7 ABGR6 ABGR5 ABGR4 \n\
00942 punpckhbw %%xmm2, %%xmm1 # G7 R7 G6 R6 G5 R5 G4 R4 \n\
00943 punpckhbw %%xmm3, %%xmm0 # 00 B7 00 B6 00 B5 00 B4 \n\
00944 movdqa %%xmm1, %%xmm2 # G7 R7 G6 R6 G5 R5 G4 R4 \n\
00945 punpcklwd %%xmm0, %%xmm1 # 00 B5 G5 R5 00 B4 G4 R4 \n\
00946 movntdq %%xmm1, 32(%3) # Store ABGR11 ABGR10 ABGR9 ABGR8 \n\
00947 punpckhwd %%xmm0, %%xmm2 # B7 G7 R7 00 B6 G6 R6 00 \n\
00948 movntdq %%xmm2, 48(%3) # Store ABGR15 ABGR14 ABGR13 ABGR12 \n\
00949 "
00950
/*
 * SSE2_UNPACK_32_ABGR_UNALIGNED (inline-asm text, one string literal).
 *
 * Same instruction sequence as the aligned ABGR variant, but the four 16-byte
 * stores at byte offsets 0, 16, 32 and 48 from operand %3 use movdqu, so the
 * destination needs no particular alignment.
 * Inputs: xmm1, xmm2, xmm0 (annotated R, G, B).  Memory byte order of each
 * pixel: R, G, B, 0.
 *
 * Registers written: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6.
 *
 * NOTE(review): two annotations are stale - the movdqa copy of xmm1 into xmm2
 * is annotated "R7 00 ..." although xmm1 holds the G/R byte pairs there, and
 * the last punpckhwd is annotated "B7 G7 R7 00 ..." while the instruction
 * yields 00 B G R like the other three groups.
 */
00951 #define SSE2_UNPACK_32_ABGR_UNALIGNED " \n\
00952 pxor %%xmm3, %%xmm3 # zero mm3 \n\
00953 movdqa %%xmm1, %%xmm4 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
00954 punpcklbw %%xmm2, %%xmm4 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
00955 movdqa %%xmm0, %%xmm5 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
00956 punpcklbw %%xmm3, %%xmm5 # 00 B3 00 B2 00 B1 00 B0 \n\
00957 movdqa %%xmm4, %%xmm6 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
00958 punpcklwd %%xmm5, %%xmm4 # 00 B1 G1 R1 00 B0 G0 R0 \n\
00959 movdqu %%xmm4, (%3) # Store ABGR3 ABGR2 ABGR1 ABGR0 \n\
00960 punpckhwd %%xmm5, %%xmm6 # 00 B3 G3 R3 00 B2 G2 R2 \n\
00961 movdqu %%xmm6, 16(%3) # Store ABGR7 ABGR6 ABGR5 ABGR4 \n\
00962 punpckhbw %%xmm2, %%xmm1 # G7 R7 G6 R6 G5 R5 G4 R4 \n\
00963 punpckhbw %%xmm3, %%xmm0 # 00 B7 00 B6 00 B5 00 B4 \n\
00964 movdqa %%xmm1, %%xmm2 # R7 00 R6 00 R5 00 R4 00 \n\
00965 punpcklwd %%xmm0, %%xmm1 # 00 B5 G5 R5 00 B4 G4 R4 \n\
00966 movdqu %%xmm1, 32(%3) # Store ABGR11 ABGR10 ABGR9 ABGR8 \n\
00967 punpckhwd %%xmm0, %%xmm2 # B7 G7 R7 00 B6 G6 R6 00 \n\
00968 movdqu %%xmm2, 48(%3) # Store ABGR15 ABGR14 ABGR13 ABGR12 \n\
00969 "
00970
00971 #elif defined(HAVE_SSE2_INTRINSICS)
00972
00973
00974
00975 #include <emmintrin.h>
00976
/*
 * SSE2_CALL(SSE2_INSTRUCTIONS): open a scope that declares the eight __m128i
 * working variables xmm0..xmm7 and paste the given statement sequence into
 * it.  The SSE2_* fragment macros below are written against exactly these
 * variable names, so they are only usable inside SSE2_CALL().
 */
#define SSE2_CALL(SSE2_INSTRUCTIONS)    \
    do {                                \
        __m128i xmm0, xmm1;             \
        __m128i xmm2, xmm3;             \
        __m128i xmm4, xmm5;             \
        __m128i xmm6, xmm7;             \
        SSE2_INSTRUCTIONS               \
    } while (0)

/* SSE2_END: issue a store fence (the aligned fragments use streaming stores). */
#define SSE2_END _mm_sfence()
00985
/*
 * SSE2_INIT_16_ALIGNED: load one group of source samples.
 *   xmm4 <- all zeroes
 *   xmm6 <- 16 bytes from p_y (aligned load: p_y must be 16-byte aligned)
 *   xmm0 <- 8 bytes from p_u in the low half, upper half zero
 *   xmm1 <- 8 bytes from p_v in the low half, upper half zero
 * The four loads are independent of each other.
 */
#define SSE2_INIT_16_ALIGNED                        \
    xmm4 = _mm_setzero_si128();                     \
    xmm6 = _mm_load_si128((__m128i *)p_y);          \
    xmm0 = _mm_loadl_epi64((__m128i *)p_u);         \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);
00991
/*
 * SSE2_INIT_16_UNALIGNED: load one group of source samples without any
 * alignment requirement on p_y, then hint the destination into cache.
 *   xmm4 <- all zeroes
 *   xmm6 <- 16 bytes from p_y (unaligned load)
 *   xmm0 <- 8 bytes from p_u in the low half, upper half zero
 *   xmm1 <- 8 bytes from p_v in the low half, upper half zero
 * Finishes with a non-temporal prefetch hint on p_buffer.
 */
#define SSE2_INIT_16_UNALIGNED                      \
    xmm4 = _mm_setzero_si128();                     \
    xmm6 = _mm_loadu_si128((__m128i *)p_y);         \
    xmm0 = _mm_loadl_epi64((__m128i *)p_u);         \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);         \
    _mm_prefetch(p_buffer, _MM_HINT_NTA);
00998
/*
 * SSE2_INIT_32_ALIGNED: load one group of source samples (same statements as
 * the 16-bit aligned initialiser).
 *   xmm4 <- all zeroes
 *   xmm6 <- 16 bytes from p_y (aligned load: p_y must be 16-byte aligned)
 *   xmm0 <- 8 bytes from p_u in the low half, upper half zero
 *   xmm1 <- 8 bytes from p_v in the low half, upper half zero
 */
#define SSE2_INIT_32_ALIGNED                        \
    xmm4 = _mm_setzero_si128();                     \
    xmm6 = _mm_load_si128((__m128i *)p_y);          \
    xmm0 = _mm_loadl_epi64((__m128i *)p_u);         \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);
01004
/*
 * SSE2_INIT_32_UNALIGNED: load one group of source samples without any
 * alignment requirement on p_y (same statements as the 16-bit unaligned
 * initialiser), then hint the destination into cache.
 *   xmm4 <- all zeroes
 *   xmm6 <- 16 bytes from p_y (unaligned load)
 *   xmm0 <- 8 bytes from p_u in the low half, upper half zero
 *   xmm1 <- 8 bytes from p_v in the low half, upper half zero
 * Finishes with a non-temporal prefetch hint on p_buffer.
 */
#define SSE2_INIT_32_UNALIGNED                      \
    xmm4 = _mm_setzero_si128();                     \
    xmm6 = _mm_loadu_si128((__m128i *)p_y);         \
    xmm0 = _mm_loadl_epi64((__m128i *)p_u);         \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);         \
    _mm_prefetch(p_buffer, _MM_HINT_NTA);
01011
/*
 * SSE2_YUV_MUL: fixed-point multiply stage of the YUV -> RGB conversion.
 *
 * In : xmm0 = chroma bytes loaded from p_u, xmm1 = chroma bytes loaded from
 *      p_v, xmm4 = zero, xmm6 = 16 luma bytes (as left by SSE2_INIT_*).
 * Out: xmm0 = U * 0x4093            (blue chroma term,  16-bit lanes)
 *      xmm1 = V * 0x3312            (red chroma term,   16-bit lanes)
 *      xmm2 = U * 0xf37d + V * 0xe5fc  (green chroma term, saturating add)
 *      xmm3 = V * 0xe5fc            (left over from the green computation)
 *      xmm6 = luma term of the even-indexed Y bytes
 *      xmm7 = luma term of the odd-indexed Y bytes
 *      xmm5 = 0x253f in every 16-bit lane (the luma coefficient)
 * "x * c" above means _mm_mulhi_epi16 of the value pre-shifted left by 3.
 * Chroma is biased by -128 (0x0080 per lane), luma by -16 (unsigned
 * saturating subtract of 0x10 per byte).
 */
#define SSE2_YUV_MUL                                                        \
    /* widen chroma to 16 bits, remove the 128 bias, scale by 8 */          \
    xmm0 = _mm_slli_epi16(                                                  \
               _mm_subs_epi16(_mm_unpacklo_epi8(xmm0, xmm4),                \
                              _mm_set1_epi32(0x00800080UL)), 3);            \
    xmm1 = _mm_slli_epi16(                                                  \
               _mm_subs_epi16(_mm_unpacklo_epi8(xmm1, xmm4),                \
                              _mm_set1_epi32(0x00800080UL)), 3);            \
    /* chroma contributions: green first (needs unscaled copies) */         \
    xmm2 = _mm_mulhi_epi16(xmm0, _mm_set1_epi32(0xf37df37dUL));             \
    xmm3 = _mm_mulhi_epi16(xmm1, _mm_set1_epi32(0xe5fce5fcUL));             \
    xmm0 = _mm_mulhi_epi16(xmm0, _mm_set1_epi32(0x40934093UL));             \
    xmm1 = _mm_mulhi_epi16(xmm1, _mm_set1_epi32(0x33123312UL));             \
    xmm2 = _mm_adds_epi16(xmm2, xmm3);                                      \
                                                                            \
    /* luma: remove the 16 offset, split odd/even bytes, scale by 8 */      \
    xmm6 = _mm_subs_epu8(xmm6, _mm_set1_epi32(0x10101010UL));               \
    xmm7 = _mm_slli_epi16(_mm_srli_epi16(xmm6, 8), 3);                      \
    xmm6 = _mm_slli_epi16(                                                  \
               _mm_and_si128(xmm6, _mm_set1_epi32(0x00ff00ffUL)), 3);       \
    xmm5 = _mm_set1_epi32(0x253f253fUL);                                    \
    xmm6 = _mm_mulhi_epi16(xmm6, xmm5);                                     \
    xmm7 = _mm_mulhi_epi16(xmm7, xmm5);
01043
/*
 * SSE2_YUV_ADD: add the luma terms to the three chroma terms and pack the
 * result back to bytes.
 *
 * In : xmm0 / xmm1 / xmm2 = blue / red / green chroma terms (16-bit lanes),
 *      xmm6 = luma term of even pixels, xmm7 = luma term of odd pixels
 *      (as produced by SSE2_YUV_MUL).
 * Out: xmm0 / xmm1 / xmm2 = 16 blue / red / green bytes in pixel order
 *      (even and odd results re-interleaved);
 *      xmm3 / xmm4 / xmm5 = the packed odd-pixel bytes of each channel.
 * Additions saturate as signed 16-bit; packing saturates to 0..255.
 */
#define SSE2_YUV_ADD                                \
    /* blue */                                      \
    xmm3 = _mm_adds_epi16(xmm0, xmm7);              \
    xmm0 = _mm_adds_epi16(xmm0, xmm6);              \
    xmm3 = _mm_packus_epi16(xmm3, xmm3);            \
    xmm0 = _mm_packus_epi16(xmm0, xmm0);            \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm3);           \
    /* red */                                       \
    xmm4 = _mm_adds_epi16(xmm1, xmm7);              \
    xmm1 = _mm_adds_epi16(xmm1, xmm6);              \
    xmm4 = _mm_packus_epi16(xmm4, xmm4);            \
    xmm1 = _mm_packus_epi16(xmm1, xmm1);            \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm4);           \
    /* green */                                     \
    xmm5 = _mm_adds_epi16(xmm2, xmm7);              \
    xmm2 = _mm_adds_epi16(xmm2, xmm6);              \
    xmm5 = _mm_packus_epi16(xmm5, xmm5);            \
    xmm2 = _mm_packus_epi16(xmm2, xmm2);            \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
01066
/*
 * SSE2_UNPACK_15_ALIGNED: pack 16 pixels into 15-bit words (5 bits per
 * channel) and write them with two streaming stores.
 *
 * In : xmm0 / xmm1 / xmm2 = 16 blue / red / green bytes (as left by
 *      SSE2_YUV_ADD).
 * Each output word is ((R & 0xf8) << 7) | ((G & 0xf8) << 2) | (B >> 3).
 * Pixels 0-7 go to p_buffer, pixels 8-15 to p_buffer + 8; the "+ 8" is in
 * p_buffer elements, so this assumes p_buffer points to 16-bit elements and
 * is 16-byte aligned (required by _mm_stream_si128).
 */
#define SSE2_UNPACK_15_ALIGNED                                          \
    /* keep the top five bits of every channel */                       \
    xmm5 = _mm_set1_epi32(0xf8f8f8f8UL);                                \
    xmm0 = _mm_srli_epi16(_mm_and_si128(xmm0, xmm5), 3);                \
    xmm1 = _mm_srli_epi16(_mm_and_si128(xmm1, xmm5), 1);                \
    xmm2 = _mm_and_si128(xmm2, xmm5);                                   \
    xmm4 = _mm_setzero_si128();                                         \
                                                                        \
    /* upper eight pixels first, while xmm0/xmm2 are still intact */    \
    xmm5 = _mm_unpackhi_epi8(xmm0, xmm1);                               \
    xmm7 = _mm_slli_epi16(_mm_unpackhi_epi8(xmm2, xmm4), 2);            \
                                                                        \
    /* lower eight pixels */                                            \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                               \
    xmm2 = _mm_slli_epi16(_mm_unpacklo_epi8(xmm2, xmm4), 2);            \
                                                                        \
    xmm0 = _mm_or_si128(xmm0, xmm2);                                    \
    xmm5 = _mm_or_si128(xmm5, xmm7);                                    \
    _mm_stream_si128((__m128i*)p_buffer, xmm0);                         \
    _mm_stream_si128((__m128i*)(p_buffer+8), xmm5);
01089
/*
 * SSE2_UNPACK_15_UNALIGNED: pack 16 pixels into 15-bit words (5 bits per
 * channel) and write them with two unaligned stores.
 *
 * In : xmm0 / xmm1 / xmm2 = 16 blue / red / green bytes (as left by
 *      SSE2_YUV_ADD).
 * Each output word is ((R & 0xf8) << 7) | ((G & 0xf8) << 2) | (B >> 3).
 * Pixels 0-7 go to p_buffer, pixels 8-15 to p_buffer + 8 (offsets are in
 * p_buffer elements; this assumes p_buffer points to 16-bit elements, as in
 * the other 15/16-bit unpack macros).
 *
 * Fix: the second store used to target p_buffer+16, i.e. 16 pixels further
 * on instead of 8.  That left pixels 8-15 of the group unwritten and wrote
 * past the 16-pixel group; every sibling macro (15-bit aligned, 16-bit
 * aligned/unaligned) uses p_buffer+8.
 */
#define SSE2_UNPACK_15_UNALIGNED                    \
    /* keep the top five bits of every channel */   \
    xmm5 = _mm_set1_epi32(0xf8f8f8f8UL);            \
    xmm0 = _mm_and_si128(xmm0, xmm5);               \
    xmm0 = _mm_srli_epi16(xmm0, 3);                 \
    xmm2 = _mm_and_si128(xmm2, xmm5);               \
    xmm1 = _mm_and_si128(xmm1, xmm5);               \
    xmm1 = _mm_srli_epi16(xmm1, 1);                 \
    xmm4 = _mm_setzero_si128();                     \
    xmm5 = xmm0;                                    \
    xmm7 = xmm2;                                    \
                                                    \
    /* pixels 0-7 */                                \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm4);           \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);           \
    xmm2 = _mm_slli_epi16(xmm2, 2);                 \
    xmm0 = _mm_or_si128(xmm0, xmm2);                \
    _mm_storeu_si128((__m128i*)p_buffer, xmm0);     \
                                                    \
    /* pixels 8-15 */                               \
    xmm7 = _mm_unpackhi_epi8(xmm7, xmm4);           \
    xmm5 = _mm_unpackhi_epi8(xmm5, xmm1);           \
    xmm7 = _mm_slli_epi16(xmm7, 2);                 \
    xmm5 = _mm_or_si128(xmm5, xmm7);                \
    _mm_storeu_si128((__m128i*)(p_buffer+8), xmm5);
01112
/*
 * SSE2_UNPACK_16_ALIGNED: pack 16 pixels into 16-bit words (5-6-5 bits) and
 * write them with two streaming stores.
 *
 * In : xmm0 / xmm1 / xmm2 = 16 blue / red / green bytes (as left by
 *      SSE2_YUV_ADD).
 * Each output word is ((R & 0xf8) << 8) | ((G & 0xfc) << 3) | (B >> 3).
 * Pixels 0-7 go to p_buffer, pixels 8-15 to p_buffer + 8; the "+ 8" is in
 * p_buffer elements, so this assumes p_buffer points to 16-bit elements and
 * is 16-byte aligned (required by _mm_stream_si128).
 */
#define SSE2_UNPACK_16_ALIGNED                                          \
    /* five bits of blue/red, six bits of green */                      \
    xmm5 = _mm_set1_epi32(0xf8f8f8f8UL);                                \
    xmm1 = _mm_and_si128(xmm1, xmm5);                                   \
    xmm0 = _mm_srli_epi16(_mm_and_si128(xmm0, xmm5), 3);                \
    xmm2 = _mm_and_si128(xmm2, _mm_set1_epi32(0xfcfcfcfcUL));           \
    xmm4 = _mm_setzero_si128();                                         \
                                                                        \
    /* upper eight pixels first, while xmm0/xmm2 are still intact */    \
    xmm5 = _mm_unpackhi_epi8(xmm0, xmm1);                               \
    xmm7 = _mm_slli_epi16(_mm_unpackhi_epi8(xmm2, xmm4), 3);            \
                                                                        \
    /* lower eight pixels */                                            \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                               \
    xmm2 = _mm_slli_epi16(_mm_unpacklo_epi8(xmm2, xmm4), 3);            \
                                                                        \
    xmm0 = _mm_or_si128(xmm0, xmm2);                                    \
    xmm5 = _mm_or_si128(xmm5, xmm7);                                    \
    _mm_stream_si128((__m128i*)p_buffer, xmm0);                         \
    _mm_stream_si128((__m128i*)(p_buffer+8), xmm5);
01135
/*
 * SSE2_UNPACK_16_UNALIGNED: pack 16 pixels into 16-bit words (5-6-5 bits)
 * and write them with two unaligned stores.
 *
 * In : xmm0 / xmm1 / xmm2 = 16 blue / red / green bytes (as left by
 *      SSE2_YUV_ADD).
 * Each output word is ((R & 0xf8) << 8) | ((G & 0xfc) << 3) | (B >> 3).
 * Pixels 0-7 go to p_buffer, pixels 8-15 to p_buffer + 8; the "+ 8" is in
 * p_buffer elements, so this assumes p_buffer points to 16-bit elements.
 */
#define SSE2_UNPACK_16_UNALIGNED                                        \
    /* five bits of blue/red, six bits of green */                      \
    xmm5 = _mm_set1_epi32(0xf8f8f8f8UL);                                \
    xmm1 = _mm_and_si128(xmm1, xmm5);                                   \
    xmm0 = _mm_srli_epi16(_mm_and_si128(xmm0, xmm5), 3);                \
    xmm2 = _mm_and_si128(xmm2, _mm_set1_epi32(0xfcfcfcfcUL));           \
    xmm4 = _mm_setzero_si128();                                         \
                                                                        \
    /* upper eight pixels first, while xmm0/xmm2 are still intact */    \
    xmm5 = _mm_unpackhi_epi8(xmm0, xmm1);                               \
    xmm7 = _mm_slli_epi16(_mm_unpackhi_epi8(xmm2, xmm4), 3);            \
                                                                        \
    /* lower eight pixels */                                            \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                               \
    xmm2 = _mm_slli_epi16(_mm_unpacklo_epi8(xmm2, xmm4), 3);            \
                                                                        \
    xmm0 = _mm_or_si128(xmm0, xmm2);                                    \
    xmm5 = _mm_or_si128(xmm5, xmm7);                                    \
    _mm_storeu_si128((__m128i*)p_buffer, xmm0);                         \
    _mm_storeu_si128((__m128i*)(p_buffer+8), xmm5);
01158
/*
 * SSE2_UNPACK_32_ARGB_ALIGNED: interleave 16 blue (xmm0), green (xmm2) and
 * red (xmm1) bytes with a zero byte into 4-byte pixels and write them with
 * four streaming stores.  Memory byte order per pixel: B, G, R, 0.
 * Stores go to p_buffer, +4, +8 and +12 elements; this assumes p_buffer
 * points to 32-bit elements and is 16-byte aligned (required by
 * _mm_stream_si128).
 */
#define SSE2_UNPACK_32_ARGB_ALIGNED                         \
    xmm3 = _mm_setzero_si128();                             \
    /* pixels 0-7: (B,G) byte pairs and (R,0) byte pairs */ \
    xmm4 = _mm_unpacklo_epi8(xmm0, xmm2);                   \
    xmm5 = _mm_unpacklo_epi8(xmm1, xmm3);                   \
    xmm6 = _mm_unpackhi_epi16(xmm4, xmm5);                  \
    xmm4 = _mm_unpacklo_epi16(xmm4, xmm5);                  \
    _mm_stream_si128((__m128i*)(p_buffer), xmm4);           \
    _mm_stream_si128((__m128i*)(p_buffer+4), xmm6);         \
    /* pixels 8-15 */                                       \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm2);                   \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm3);                   \
    xmm5 = _mm_unpacklo_epi16(xmm0, xmm1);                  \
    xmm0 = _mm_unpackhi_epi16(xmm0, xmm1);                  \
    _mm_stream_si128((__m128i*)(p_buffer+8), xmm5);         \
    _mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
01177
/*
 * SSE2_UNPACK_32_ARGB_UNALIGNED: interleave 16 blue (xmm0), green (xmm2) and
 * red (xmm1) bytes with a zero byte into 4-byte pixels and write them with
 * four unaligned stores.  Memory byte order per pixel: B, G, R, 0.
 * Stores go to p_buffer, +4, +8 and +12 elements; this assumes p_buffer
 * points to 32-bit elements.
 */
#define SSE2_UNPACK_32_ARGB_UNALIGNED                       \
    xmm3 = _mm_setzero_si128();                             \
    /* pixels 0-7: (B,G) byte pairs and (R,0) byte pairs */ \
    xmm4 = _mm_unpacklo_epi8(xmm0, xmm2);                   \
    xmm5 = _mm_unpacklo_epi8(xmm1, xmm3);                   \
    xmm6 = _mm_unpackhi_epi16(xmm4, xmm5);                  \
    xmm4 = _mm_unpacklo_epi16(xmm4, xmm5);                  \
    _mm_storeu_si128((__m128i*)(p_buffer), xmm4);           \
    _mm_storeu_si128((__m128i*)(p_buffer+4), xmm6);         \
    /* pixels 8-15 */                                       \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm2);                   \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm3);                   \
    xmm5 = _mm_unpacklo_epi16(xmm0, xmm1);                  \
    xmm0 = _mm_unpackhi_epi16(xmm0, xmm1);                  \
    _mm_storeu_si128((__m128i*)(p_buffer+8), xmm5);         \
    _mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
01196
/*
 * SSE2_UNPACK_32_RGBA_ALIGNED: interleave a zero byte with 16 blue (xmm0),
 * green (xmm2) and red (xmm1) bytes into 4-byte pixels and write them with
 * four streaming stores.  Memory byte order per pixel: 0, B, G, R.
 * Stores go to p_buffer, +4, +8 and +12 elements; this assumes p_buffer
 * points to 32-bit elements and is 16-byte aligned (required by
 * _mm_stream_si128).
 */
#define SSE2_UNPACK_32_RGBA_ALIGNED                                 \
    /* pixels 0-7: (G,R) byte pairs and (0,B) byte pairs */         \
    xmm4 = _mm_unpacklo_epi8(xmm2, xmm1);                           \
    xmm3 = _mm_unpacklo_epi8(_mm_setzero_si128(), xmm0);            \
    xmm5 = _mm_unpackhi_epi16(xmm3, xmm4);                          \
    xmm3 = _mm_unpacklo_epi16(xmm3, xmm4);                          \
    _mm_stream_si128((__m128i*)(p_buffer), xmm3);                   \
    _mm_stream_si128((__m128i*)(p_buffer+4), xmm5);                 \
    /* pixels 8-15; xmm0 is consumed before it is reused */         \
    xmm2 = _mm_unpackhi_epi8(xmm2, xmm1);                           \
    xmm6 = _mm_unpackhi_epi8(_mm_setzero_si128(), xmm0);            \
    xmm0 = _mm_unpackhi_epi16(xmm6, xmm2);                          \
    xmm6 = _mm_unpacklo_epi16(xmm6, xmm2);                          \
    _mm_stream_si128((__m128i*)(p_buffer+8), xmm6);                 \
    _mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
01215
/*
 * SSE2_UNPACK_32_RGBA_UNALIGNED: interleave a zero byte with 16 blue (xmm0),
 * green (xmm2) and red (xmm1) bytes into 4-byte pixels and write them with
 * four unaligned stores.  Memory byte order per pixel: 0, B, G, R.
 * Stores go to p_buffer, +4, +8 and +12 elements; this assumes p_buffer
 * points to 32-bit elements.
 */
#define SSE2_UNPACK_32_RGBA_UNALIGNED                               \
    /* pixels 0-7: (G,R) byte pairs and (0,B) byte pairs */         \
    xmm4 = _mm_unpacklo_epi8(xmm2, xmm1);                           \
    xmm3 = _mm_unpacklo_epi8(_mm_setzero_si128(), xmm0);            \
    xmm5 = _mm_unpackhi_epi16(xmm3, xmm4);                          \
    xmm3 = _mm_unpacklo_epi16(xmm3, xmm4);                          \
    _mm_storeu_si128((__m128i*)(p_buffer), xmm3);                   \
    _mm_storeu_si128((__m128i*)(p_buffer+4), xmm5);                 \
    /* pixels 8-15; xmm0 is consumed before it is reused */         \
    xmm2 = _mm_unpackhi_epi8(xmm2, xmm1);                           \
    xmm6 = _mm_unpackhi_epi8(_mm_setzero_si128(), xmm0);            \
    xmm0 = _mm_unpackhi_epi16(xmm6, xmm2);                          \
    xmm6 = _mm_unpacklo_epi16(xmm6, xmm2);                          \
    _mm_storeu_si128((__m128i*)(p_buffer+8), xmm6);                 \
    _mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
01234
/*
 * SSE2_UNPACK_32_BGRA_ALIGNED: interleave a zero byte with 16 red (xmm1),
 * green (xmm2) and blue (xmm0) bytes into 4-byte pixels and write them with
 * four streaming stores.  Memory byte order per pixel: 0, R, G, B.
 * Stores go to p_buffer, +4, +8 and +12 elements; this assumes p_buffer
 * points to 32-bit elements and is 16-byte aligned (required by
 * _mm_stream_si128).
 */
#define SSE2_UNPACK_32_BGRA_ALIGNED                                 \
    /* pixels 0-7: (G,B) byte pairs and (0,R) byte pairs */         \
    xmm4 = _mm_unpacklo_epi8(xmm2, xmm0);                           \
    xmm3 = _mm_unpacklo_epi8(_mm_setzero_si128(), xmm1);            \
    xmm5 = _mm_unpackhi_epi16(xmm3, xmm4);                          \
    xmm3 = _mm_unpacklo_epi16(xmm3, xmm4);                          \
    _mm_stream_si128((__m128i*)(p_buffer), xmm3);                   \
    _mm_stream_si128((__m128i*)(p_buffer+4), xmm5);                 \
    /* pixels 8-15; xmm0 is consumed before it is reused */         \
    xmm2 = _mm_unpackhi_epi8(xmm2, xmm0);                           \
    xmm6 = _mm_unpackhi_epi8(_mm_setzero_si128(), xmm1);            \
    xmm0 = _mm_unpackhi_epi16(xmm6, xmm2);                          \
    xmm6 = _mm_unpacklo_epi16(xmm6, xmm2);                          \
    _mm_stream_si128((__m128i*)(p_buffer+8), xmm6);                 \
    _mm_stream_si128((__m128i*)(p_buffer+12), xmm0);
01253
/*
 * SSE2_UNPACK_32_BGRA_UNALIGNED: interleave a zero byte with 16 red (xmm1),
 * green (xmm2) and blue (xmm0) bytes into 4-byte pixels and write them with
 * four unaligned stores.  Memory byte order per pixel: 0, R, G, B.
 * Stores go to p_buffer, +4, +8 and +12 elements; this assumes p_buffer
 * points to 32-bit elements.
 */
#define SSE2_UNPACK_32_BGRA_UNALIGNED                               \
    /* pixels 0-7: (G,B) byte pairs and (0,R) byte pairs */         \
    xmm4 = _mm_unpacklo_epi8(xmm2, xmm0);                           \
    xmm3 = _mm_unpacklo_epi8(_mm_setzero_si128(), xmm1);            \
    xmm5 = _mm_unpackhi_epi16(xmm3, xmm4);                          \
    xmm3 = _mm_unpacklo_epi16(xmm3, xmm4);                          \
    _mm_storeu_si128((__m128i*)(p_buffer), xmm3);                   \
    _mm_storeu_si128((__m128i*)(p_buffer+4), xmm5);                 \
    /* pixels 8-15; xmm0 is consumed before it is reused */         \
    xmm2 = _mm_unpackhi_epi8(xmm2, xmm0);                           \
    xmm6 = _mm_unpackhi_epi8(_mm_setzero_si128(), xmm1);            \
    xmm0 = _mm_unpackhi_epi16(xmm6, xmm2);                          \
    xmm6 = _mm_unpacklo_epi16(xmm6, xmm2);                          \
    _mm_storeu_si128((__m128i*)(p_buffer+8), xmm6);                 \
    _mm_storeu_si128((__m128i*)(p_buffer+12), xmm0);
01272
/*
 * SSE2_UNPACK_32_ABGR_ALIGNED: interleave 16 red (xmm1), green (xmm2) and
 * blue (xmm0) bytes with a zero byte into 4-byte pixels and write them with
 * four streaming stores.  Memory byte order per pixel: R, G, B, 0.
 * Stores go to p_buffer, +4, +8 and +12 elements; this assumes p_buffer
 * points to 32-bit elements and is 16-byte aligned (required by
 * _mm_stream_si128).
 */
#define SSE2_UNPACK_32_ABGR_ALIGNED                         \
    xmm3 = _mm_setzero_si128();                             \
    /* pixels 0-7: (R,G) byte pairs and (B,0) byte pairs */ \
    xmm4 = _mm_unpacklo_epi8(xmm1, xmm2);                   \
    xmm5 = _mm_unpacklo_epi8(xmm0, xmm3);                   \
    xmm6 = _mm_unpackhi_epi16(xmm4, xmm5);                  \
    xmm4 = _mm_unpacklo_epi16(xmm4, xmm5);                  \
    _mm_stream_si128((__m128i*)(p_buffer), xmm4);           \
    _mm_stream_si128((__m128i*)(p_buffer+4), xmm6);         \
    /* pixels 8-15; xmm2 is consumed before it is reused */ \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm2);                   \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm3);                   \
    xmm2 = _mm_unpackhi_epi16(xmm1, xmm0);                  \
    xmm1 = _mm_unpacklo_epi16(xmm1, xmm0);                  \
    _mm_stream_si128((__m128i*)(p_buffer+8), xmm1);         \
    _mm_stream_si128((__m128i*)(p_buffer+12), xmm2);
01291
/*
 * SSE2_UNPACK_32_ABGR_UNALIGNED: interleave 16 red (xmm1), green (xmm2) and
 * blue (xmm0) bytes with a zero byte into 4-byte pixels and write them with
 * four unaligned stores.  Memory byte order per pixel: R, G, B, 0.
 * Stores go to p_buffer, +4, +8 and +12 elements; this assumes p_buffer
 * points to 32-bit elements.
 */
#define SSE2_UNPACK_32_ABGR_UNALIGNED                       \
    xmm3 = _mm_setzero_si128();                             \
    /* pixels 0-7: (R,G) byte pairs and (B,0) byte pairs */ \
    xmm4 = _mm_unpacklo_epi8(xmm1, xmm2);                   \
    xmm5 = _mm_unpacklo_epi8(xmm0, xmm3);                   \
    xmm6 = _mm_unpackhi_epi16(xmm4, xmm5);                  \
    xmm4 = _mm_unpacklo_epi16(xmm4, xmm5);                  \
    _mm_storeu_si128((__m128i*)(p_buffer), xmm4);           \
    _mm_storeu_si128((__m128i*)(p_buffer+4), xmm6);         \
    /* pixels 8-15; xmm2 is consumed before it is reused */ \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm2);                   \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm3);                   \
    xmm2 = _mm_unpackhi_epi16(xmm1, xmm0);                  \
    xmm1 = _mm_unpacklo_epi16(xmm1, xmm0);                  \
    _mm_storeu_si128((__m128i*)(p_buffer+8), xmm1);         \
    _mm_storeu_si128((__m128i*)(p_buffer+12), xmm2);
01310
01311 #endif
01312
01313 #endif