00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027 #ifdef MODULE_NAME_IS_i420_rgb_mmx
00028
00029
/*
 * USED_U64(foo): declares a file-local `static const uint64_t foo` whose
 * assembler-level symbol name is forced to the identifier itself through
 * __asm__(#foo). GCC >= 3.3 gets __attribute__((used)); every other compiler
 * version gets __attribute__((unused)) instead.
 * NOTE(review): presumably the attribute is there because the constants are
 * referenced by name from inline-asm text only -- confirm.
 */
00030 #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
00031 #define USED_U64(foo) \
00032 static const uint64_t foo __asm__ (#foo) __attribute__((used))
00033 #else
00034 #define USED_U64(foo) \
00035 static const uint64_t foo __asm__ (#foo) __attribute__((unused))
00036 #endif
/*
 * 64-bit constants used by the MMX asm fragments and MMX intrinsics below.
 * Chroma and luma inputs are shifted left by 3 before pmulhw (which keeps the
 * high 16 bits of each signed product), so a coefficient c acts as c / 8192:
 *   mmx_Y_coeff 0x253f ~  1.164    mmx_U_blue  0x4093 ~  2.018
 *   mmx_V_red   0x3312 ~  1.596    mmx_U_green 0xf37d ~ -0.391
 *   mmx_V_green 0xe5fc ~ -0.813
 * mmx_80w is 128 in every 16-bit word (subtracted from Cb/Cr), mmx_10w is 16
 * in every byte (subtracted from Y), mmx_00ffw keeps the low byte of each
 * word, and mmx_mask_f8 / mmx_mask_fc keep the top 5 / 6 bits of every byte.
 * The helper macro is undefined again once the constants are declared.
 */
00037 USED_U64(mmx_80w) = 0x0080008000800080ULL;
00038 USED_U64(mmx_10w) = 0x1010101010101010ULL;
00039 USED_U64(mmx_00ffw) = 0x00ff00ff00ff00ffULL;
00040 USED_U64(mmx_Y_coeff) = 0x253f253f253f253fULL;
00041
00042 USED_U64(mmx_U_green) = 0xf37df37df37df37dULL;
00043 USED_U64(mmx_U_blue) = 0x4093409340934093ULL;
00044 USED_U64(mmx_V_red) = 0x3312331233123312ULL;
00045 USED_U64(mmx_V_green) = 0xe5fce5fce5fce5fcULL;
00046
00047 USED_U64(mmx_mask_f8) = 0xf8f8f8f8f8f8f8f8ULL;
00048 USED_U64(mmx_mask_fc) = 0xfcfcfcfcfcfcfcfcULL;
00049 #undef USED_U64
00050
00051 #if defined(CAN_COMPILE_MMX)
00052
00053
00054
/*
 * MMX_CALL(MMX_INSTRUCTIONS): emits one volatile inline-asm statement built
 * from the given instruction-string fragments, preceded by an 8-byte
 * alignment directive.
 *
 * Operand numbering seen by the fragments:
 *   %0 = p_y, %1 = p_u, %2 = p_v, %3 = p_buffer
 * All four names must be in scope where the macro is expanded.
 *
 * The fragments load through %0..%2, store through %3 and overwrite
 * mm0-mm7. None of that is visible to the compiler through the operand
 * list, so the statement declares "memory" and every MMX register as
 * clobbered; without this the compiler is free to cache or reorder accesses
 * to the source and destination buffers around the asm.
 */
#define MMX_CALL(MMX_INSTRUCTIONS)                  \
    do {                                            \
        __asm__ __volatile__(                       \
            ".p2align 3 \n\t"                       \
            MMX_INSTRUCTIONS                        \
            :                                       \
            : "r" (p_y), "r" (p_u),                 \
              "r" (p_v), "r" (p_buffer)             \
            : "memory",                             \
              "mm0", "mm1", "mm2", "mm3",           \
              "mm4", "mm5", "mm6", "mm7" );         \
    } while(0)
00064
/* MMX_END: issues "emms" as a volatile asm statement. emms empties the MMX
 * state so that the x87 floating-point registers can be used again. */
00065 #define MMX_END __asm__ __volatile__ ( "emms" )
00066
00067
/* G: string spliced directly after a constant's symbol name inside the asm
 * fragments. On x86-64 position-independent builds it is "(%%rip)", which
 * makes the reference RIP-relative; in every other configuration it expands
 * to nothing and the symbol is referenced by its plain name. */
00068 #if defined(__x86_64__) && defined(__PIC__)
00069 # define G "(%%rip)"
00070 #else
00071 # define G
00072 #endif
00073
/* MMX_INIT_16: loads the inputs for one group of 8 pixels: 4 Cb bytes from
 * (%1) into mm0, 4 Cr bytes from (%2) into mm1, 8 luma bytes from (%0) into
 * mm6, and clears mm4 (used as the zero source by MMX_YUV_MUL). */
00074 #define MMX_INIT_16 " \n\
00075 movd (%1), %%mm0 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\
00076 movd (%2), %%mm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\
00077 pxor %%mm4, %%mm4 # zero mm4 \n\
00078 movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00079 "
00080
/* MMX_INIT_16_GRAY: luma-only initialisation; loads 8 Y bytes from (%0) into
 * mm6. The zero store to the destination is present only as an assembler
 * comment, so nothing is written to (%3) here. */
00081 #define MMX_INIT_16_GRAY " \n\
00082 movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00083 #movl $0, (%3) # cache preload for image \n\
00084 "
00085
/* MMX_INIT_32: same register set-up as MMX_INIT_16 (mm0 = 4 Cb, mm1 = 4 Cr,
 * mm4 = 0, mm6 = 8 Y) plus a 32-bit store of zero to (%3), which the asm
 * comment describes as a cache preload for the output image. */
00086 #define MMX_INIT_32 " \n\
00087 movd (%1), %%mm0 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\
00088 movl $0, (%3) # cache preload for image \n\
00089 movd (%2), %%mm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\
00090 pxor %%mm4, %%mm4 # zero mm4 \n\
00091 movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00092 "
00093
00094
00095
00096
00097
00098
00099
00100
00101
/*
 * MMX_YUV_MUL: fixed-point multiply stage.
 * Expects: mm0 = 4 Cb bytes, mm1 = 4 Cr bytes (low halves), mm4 = 0,
 *          mm6 = 8 Y bytes.
 * Produces (four signed 16-bit lanes each):
 *   mm0 = blue chroma term, mm1 = red chroma term, mm2 = green chroma term,
 *   mm6 = scaled luma of the even pixels, mm7 = scaled luma of the odd pixels.
 * mm3 is used as scratch. Chroma is biased by -128 and luma by -16 (with
 * unsigned saturation) before scaling; both are shifted left by 3 ahead of
 * pmulhw to keep extra precision.
 */
00102 #define MMX_YUV_MUL " \n\
00103 # convert the chroma part \n\
00104 punpcklbw %%mm4, %%mm0 # scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 \n\
00105 punpcklbw %%mm4, %%mm1 # scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 \n\
00106 psubsw mmx_80w"G", %%mm0 # Cb -= 128 \n\
00107 psubsw mmx_80w"G", %%mm1 # Cr -= 128 \n\
00108 psllw $3, %%mm0 # Promote precision \n\
00109 psllw $3, %%mm1 # Promote precision \n\
00110 movq %%mm0, %%mm2 # Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 \n\
00111 movq %%mm1, %%mm3 # Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 \n\
00112 pmulhw mmx_U_green"G", %%mm2 # Mul Cb with green coeff -> Cb green \n\
00113 pmulhw mmx_V_green"G", %%mm3 # Mul Cr with green coeff -> Cr green \n\
00114 pmulhw mmx_U_blue"G", %%mm0 # Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 \n\
00115 pmulhw mmx_V_red"G", %%mm1 # Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 \n\
00116 paddsw %%mm3, %%mm2 # Cb green + Cr green -> Cgreen \n\
00117 \n\
00118 # convert the luma part \n\
00119 psubusb mmx_10w"G", %%mm6 # Y -= 16 \n\
00120 movq %%mm6, %%mm7 # Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00121 pand mmx_00ffw"G", %%mm6 # get Y even 00 Y6 00 Y4 00 Y2 00 Y0 \n\
00122 psrlw $8, %%mm7 # get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 \n\
00123 psllw $3, %%mm6 # Promote precision \n\
00124 psllw $3, %%mm7 # Promote precision \n\
00125 pmulhw mmx_Y_coeff"G", %%mm6 # Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 \n\
00126 pmulhw mmx_Y_coeff"G", %%mm7 # Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 \n\
00127 "
00128
00129
00130
00131
00132
00133
00134
00135
00136
/*
 * MMX_YUV_ADD: combines the outputs of MMX_YUV_MUL into 8-bit channels.
 * Each chroma term (mm0 blue, mm1 red, mm2 green) is copied and added, with
 * signed saturation, to the even-pixel luma (mm6) and the odd-pixel luma
 * (mm7). packuswb clamps every sum to 0..255, then punpcklbw re-interleaves
 * even and odd pixels.
 * Result: mm0 = B7..B0, mm1 = R7..R0, mm2 = G7..G0 (one byte per pixel).
 * mm3, mm4 and mm5 are overwritten.
 */
00137 #define MMX_YUV_ADD " \n\
00138 # Do horizontal and vertical scaling \n\
00139 movq %%mm0, %%mm3 # Copy Cblue \n\
00140 movq %%mm1, %%mm4 # Copy Cred \n\
00141 movq %%mm2, %%mm5 # Copy Cgreen \n\
00142 paddsw %%mm6, %%mm0 # Y even + Cblue 00 B6 00 B4 00 B2 00 B0 \n\
00143 paddsw %%mm7, %%mm3 # Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 \n\
00144 paddsw %%mm6, %%mm1 # Y even + Cred 00 R6 00 R4 00 R2 00 R0 \n\
00145 paddsw %%mm7, %%mm4 # Y odd + Cred 00 R7 00 R5 00 R3 00 R1 \n\
00146 paddsw %%mm6, %%mm2 # Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 \n\
00147 paddsw %%mm7, %%mm5 # Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 \n\
00148 \n\
00149 # Limit RGB even to 0..255 \n\
00150 packuswb %%mm0, %%mm0 # B6 B4 B2 B0 / B6 B4 B2 B0 \n\
00151 packuswb %%mm1, %%mm1 # R6 R4 R2 R0 / R6 R4 R2 R0 \n\
00152 packuswb %%mm2, %%mm2 # G6 G4 G2 G0 / G6 G4 G2 G0 \n\
00153 \n\
00154 # Limit RGB odd to 0..255 \n\
00155 packuswb %%mm3, %%mm3 # B7 B5 B3 B1 / B7 B5 B3 B1 \n\
00156 packuswb %%mm4, %%mm4 # R7 R5 R3 R1 / R7 R5 R3 R1 \n\
00157 packuswb %%mm5, %%mm5 # G7 G5 G3 G1 / G7 G5 G3 G1 \n\
00158 \n\
00159 # Interleave RGB even and odd \n\
00160 punpcklbw %%mm3, %%mm0 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
00161 punpcklbw %%mm4, %%mm1 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
00162 punpcklbw %%mm5, %%mm2 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
00163 "
00164
00165
00166
00167
00168
/*
 * MMX_YUV_GRAY: luma-only conversion of the 8 Y bytes in mm6.
 * Applies the same luma steps as MMX_YUV_MUL (subtract 16 with unsigned
 * saturation, split into even/odd words, shift left by 3, multiply by
 * mmx_Y_coeff), then clamps to 0..255 with packuswb and re-interleaves.
 * Result: mm6 holds 8 gray bytes in pixel order; mm7 is overwritten.
 */
00169 #define MMX_YUV_GRAY " \n\
00170 # convert the luma part \n\
00171 psubusb mmx_10w"G", %%mm6 \n\
00172 movq %%mm6, %%mm7 \n\
00173 pand mmx_00ffw"G", %%mm6 \n\
00174 psrlw $8, %%mm7 \n\
00175 psllw $3, %%mm6 \n\
00176 psllw $3, %%mm7 \n\
00177 pmulhw mmx_Y_coeff"G", %%mm6 \n\
00178 pmulhw mmx_Y_coeff"G", %%mm7 \n\
00179 packuswb %%mm6, %%mm6 \n\
00180 packuswb %%mm7, %%mm7 \n\
00181 punpcklbw %%mm7, %%mm6 \n\
00182 "
00183
/*
 * MMX_UNPACK_16_GRAY: expands the 8 gray bytes in mm6 into eight 16-bit
 * pixels and stores them at (%3) and 8(%3).
 * Each 16-bit pixel is built from the same gray value: its top 5 bits
 * (mmx_mask_f8) fill bits 15-11 and bits 4-0, and its top 6 bits
 * (mmx_mask_fc) fill bits 10-5.
 * Side effect: mm6 is reloaded with the next 8 luma bytes from 8(%0).
 * mm0, mm2, mm3, mm5 and mm7 are overwritten.
 */
00184 #define MMX_UNPACK_16_GRAY " \n\
00185 movq %%mm6, %%mm5 \n\
00186 pand mmx_mask_f8"G", %%mm6 \n\
00187 pand mmx_mask_fc"G", %%mm5 \n\
00188 movq %%mm6, %%mm7 \n\
00189 psrlw $3, %%mm7 \n\
00190 pxor %%mm3, %%mm3 \n\
00191 movq %%mm7, %%mm2 \n\
00192 movq %%mm5, %%mm0 \n\
00193 punpcklbw %%mm3, %%mm5 \n\
00194 punpcklbw %%mm6, %%mm7 \n\
00195 psllw $3, %%mm5 \n\
00196 por %%mm5, %%mm7 \n\
00197 movq %%mm7, (%3) \n\
00198 punpckhbw %%mm3, %%mm0 \n\
00199 punpckhbw %%mm6, %%mm2 \n\
00200 psllw $3, %%mm0 \n\
00201 movq 8(%0), %%mm6 \n\
00202 por %%mm0, %%mm2 \n\
00203 movq %%mm2, 8(%3) \n\
00204 "
00205
00206
00207
00208
00209
00210
00211
00212
00213
/*
 * MMX_UNPACK_15: packs the byte planes mm0 = B, mm1 = R, mm2 = G (8 pixels)
 * into eight 15-bit pixels laid out as 0rrrrrgg gggbbbbb (5 bits per
 * channel) and stores pixels 0-3 at (%3) and pixels 4-7 at 8(%3).
 * Note that the second conversion section handles pixels 4-7, as its store
 * comment says, even though its heading repeats the "pixel 0-3" wording.
 * Side effects: reloads mm6 from 8(%0), mm0 from 4(%1) and mm1 from 4(%2)
 * (next luma and chroma samples); mm2, mm4, mm5 and mm7 are overwritten.
 */
00214 #define MMX_UNPACK_15 " \n\
00215 # mask unneeded bits off \n\
00216 pand mmx_mask_f8"G", %%mm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
00217 psrlw $3,%%mm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
00218 pand mmx_mask_f8"G", %%mm2 # g7g6g5g4 g3______ g7g6g5g4 g3______ \n\
00219 pand mmx_mask_f8"G", %%mm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
00220 psrlw $1,%%mm1 # __r7r6r5 r4r3____ __r7r6r5 r4r3____ \n\
00221 pxor %%mm4, %%mm4 # zero mm4 \n\
00222 movq %%mm0, %%mm5 # Copy B7-B0 \n\
00223 movq %%mm2, %%mm7 # Copy G7-G0 \n\
00224 \n\
00225 # convert rgb24 plane to rgb15 pack for pixel 0-3 \n\
00226 punpcklbw %%mm4, %%mm2 # ________ ________ g7g6g5g4 g3______ \n\
00227 punpcklbw %%mm1, %%mm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
00228 psllw $2,%%mm2 # ________ ____g7g6 g5g4g3__ ________ \n\
00229 por %%mm2, %%mm0 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
00230 movq 8(%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00231 movq %%mm0, (%3) # store pixel 0-3 \n\
00232 \n\
00233 # convert rgb24 plane to rgb16 pack for pixel 0-3 \n\
00234 punpckhbw %%mm4, %%mm7 # ________ ________ g7g6g5g4 g3______ \n\
00235 punpckhbw %%mm1, %%mm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
00236 psllw $2,%%mm7 # ________ ____g7g6 g5g4g3__ ________ \n\
00237 movd 4(%1), %%mm0 # Load 4 Cb __ __ __ __ u3 u2 u1 u0 \n\
00238 por %%mm7, %%mm5 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
00239 movd 4(%2), %%mm1 # Load 4 Cr __ __ __ __ v3 v2 v1 v0 \n\
00240 movq %%mm5, 8(%3) # store pixel 4-7 \n\
00241 "
00242
00243
00244
00245
00246
00247
00248
00249
/*
 * MMX_UNPACK_16: packs the byte planes mm0 = B, mm1 = R, mm2 = G (8 pixels)
 * into eight 16-bit pixels laid out as rrrrrggg gggbbbbb (5 bits red and
 * blue via mmx_mask_f8, 6 bits green via mmx_mask_fc) and stores pixels 0-3
 * at (%3) and pixels 4-7 at 8(%3).
 * Note that the second conversion section handles pixels 4-7, as its store
 * comment says, even though its heading repeats the "pixel 0-3" wording.
 * Side effects: reloads mm6 from 8(%0), mm0 from 4(%1) and mm1 from 4(%2)
 * (next luma and chroma samples); mm2, mm4, mm5 and mm7 are overwritten.
 */
00250 #define MMX_UNPACK_16 " \n\
00251 # mask unneeded bits off \n\
00252 pand mmx_mask_f8"G", %%mm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
00253 pand mmx_mask_fc"G", %%mm2 # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____ \n\
00254 pand mmx_mask_f8"G", %%mm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
00255 psrlw $3,%%mm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
00256 pxor %%mm4, %%mm4 # zero mm4 \n\
00257 movq %%mm0, %%mm5 # Copy B7-B0 \n\
00258 movq %%mm2, %%mm7 # Copy G7-G0 \n\
00259 \n\
00260 # convert rgb24 plane to rgb16 pack for pixel 0-3 \n\
00261 punpcklbw %%mm4, %%mm2 # ________ ________ g7g6g5g4 g3g2____ \n\
00262 punpcklbw %%mm1, %%mm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
00263 psllw $3,%%mm2 # ________ __g7g6g5 g4g3g2__ ________ \n\
00264 por %%mm2, %%mm0 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
00265 movq 8(%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00266 movq %%mm0, (%3) # store pixel 0-3 \n\
00267 \n\
00268 # convert rgb24 plane to rgb16 pack for pixel 0-3 \n\
00269 punpckhbw %%mm4, %%mm7 # ________ ________ g7g6g5g4 g3g2____ \n\
00270 punpckhbw %%mm1, %%mm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
00271 psllw $3,%%mm7 # ________ __g7g6g5 g4g3g2__ ________ \n\
00272 movd 4(%1), %%mm0 # Load 4 Cb __ __ __ __ u3 u2 u1 u0 \n\
00273 por %%mm7, %%mm5 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
00274 movd 4(%2), %%mm1 # Load 4 Cr __ __ __ __ v3 v2 v1 v0 \n\
00275 movq %%mm5, 8(%3) # store pixel 4-7 \n\
00276 "
00277
00278
00279
00280
00281
00282
/*
 * MMX_UNPACK_32_ARGB: interleaves the byte planes mm0 = B, mm2 = G, mm1 = R
 * with a zero byte into eight 32-bit pixels and stores 32 bytes at (%3).
 * Byte order in memory for each pixel: B, G, R, 00.
 * mm0, mm1 and mm3-mm6 are overwritten; mm2 is only read.
 */
00283 #define MMX_UNPACK_32_ARGB " \n\
00284 pxor %%mm3, %%mm3 # zero mm3 \n\
00285 movq %%mm0, %%mm4 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
00286 punpcklbw %%mm2, %%mm4 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
00287 movq %%mm1, %%mm5 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
00288 punpcklbw %%mm3, %%mm5 # 00 R3 00 R2 00 R1 00 R0 \n\
00289 movq %%mm4, %%mm6 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
00290 punpcklwd %%mm5, %%mm4 # 00 R1 B1 G1 00 R0 B0 G0 \n\
00291 movq %%mm4, (%3) # Store ARGB1 ARGB0 \n\
00292 punpckhwd %%mm5, %%mm6 # 00 R3 B3 G3 00 R2 B2 G2 \n\
00293 movq %%mm6, 8(%3) # Store ARGB3 ARGB2 \n\
00294 punpckhbw %%mm2, %%mm0 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
00295 punpckhbw %%mm3, %%mm1 # 00 R7 00 R6 00 R5 00 R4 \n\
00296 movq %%mm0, %%mm5 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
00297 punpcklwd %%mm1, %%mm5 # 00 R5 B5 G5 00 R4 B4 G4 \n\
00298 movq %%mm5, 16(%3) # Store ARGB5 ARGB4 \n\
00299 punpckhwd %%mm1, %%mm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\
00300 movq %%mm0, 24(%3) # Store ARGB7 ARGB6 \n\
00301 "
00302
/*
 * MMX_UNPACK_32_RGBA: interleaves the byte planes mm0 = B, mm2 = G, mm1 = R
 * with a zero byte into eight 32-bit pixels and stores 32 bytes at (%3).
 * Byte order in memory for each pixel: 00, B, G, R.
 * mm0, mm2 and mm3-mm6 are overwritten; mm1 is only read.
 */
00303 #define MMX_UNPACK_32_RGBA " \n\
00304 pxor %%mm3, %%mm3 # zero mm3 \n\
00305 movq %%mm2, %%mm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
00306 punpcklbw %%mm1, %%mm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\
00307 punpcklbw %%mm0, %%mm3 # B3 00 B2 00 B1 00 B0 00 \n\
00308 movq %%mm3, %%mm5 # R3 00 R2 00 R1 00 R0 00 \n\
00309 punpcklwd %%mm4, %%mm3 # R1 G1 B1 00 R0 G0 B0 00 \n\
00310 movq %%mm3, (%3) # Store RGBA1 RGBA0 \n\
00311 punpckhwd %%mm4, %%mm5 # R3 G3 B3 00 R2 G2 B2 00 \n\
00312 movq %%mm5, 8(%3) # Store RGBA3 RGBA2 \n\
00313 pxor %%mm6, %%mm6 # zero mm6 \n\
00314 punpckhbw %%mm1, %%mm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\
00315 punpckhbw %%mm0, %%mm6 # B7 00 B6 00 B5 00 B4 00 \n\
00316 movq %%mm6, %%mm0 # B7 00 B6 00 B5 00 B4 00 \n\
00317 punpcklwd %%mm2, %%mm6 # R5 G5 B5 00 R4 G4 B4 00 \n\
00318 movq %%mm6, 16(%3) # Store RGBA5 RGBA4 \n\
00319 punpckhwd %%mm2, %%mm0 # R7 G7 B7 00 R6 G6 B6 00 \n\
00320 movq %%mm0, 24(%3) # Store RGBA7 RGBA6 \n\
00321 "
00322
/*
 * MMX_UNPACK_32_BGRA: interleaves the byte planes mm0 = B, mm2 = G, mm1 = R
 * with a zero byte into eight 32-bit pixels and stores 32 bytes at (%3).
 * Byte order in memory for each pixel: 00, R, G, B.
 * mm0, mm2 and mm3-mm6 are overwritten; mm1 is only read.
 */
00323 #define MMX_UNPACK_32_BGRA " \n\
00324 pxor %%mm3, %%mm3 # zero mm3 \n\
00325 movq %%mm2, %%mm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
00326 punpcklbw %%mm0, %%mm4 # B3 G3 B2 G2 B1 G1 B0 G0 \n\
00327 punpcklbw %%mm1, %%mm3 # R3 00 R2 00 R1 00 R0 00 \n\
00328 movq %%mm3, %%mm5 # R3 00 R2 00 R1 00 R0 00 \n\
00329 punpcklwd %%mm4, %%mm3 # B1 G1 R1 00 B0 G0 R0 00 \n\
00330 movq %%mm3, (%3) # Store BGRA1 BGRA0 \n\
00331 punpckhwd %%mm4, %%mm5 # B3 G3 R3 00 B2 G2 R2 00 \n\
00332 movq %%mm5, 8(%3) # Store BGRA3 BGRA2 \n\
00333 pxor %%mm6, %%mm6 # zero mm6 \n\
00334 punpckhbw %%mm0, %%mm2 # B7 G7 B6 G6 B5 G5 B4 G4 \n\
00335 punpckhbw %%mm1, %%mm6 # R7 00 R6 00 R5 00 R4 00 \n\
00336 movq %%mm6, %%mm0 # R7 00 R6 00 R5 00 R4 00 \n\
00337 punpcklwd %%mm2, %%mm6 # B5 G5 R5 00 B4 G4 R4 00 \n\
00338 movq %%mm6, 16(%3) # Store BGRA5 BGRA4 \n\
00339 punpckhwd %%mm2, %%mm0 # B7 G7 R7 00 B6 G6 R6 00 \n\
00340 movq %%mm0, 24(%3) # Store BGRA7 BGRA6 \n\
00341 "
00342
/*
 * MMX_UNPACK_32_ABGR: interleaves the byte planes mm0 = B, mm2 = G, mm1 = R
 * with a zero byte into eight 32-bit pixels and stores 32 bytes at (%3).
 * Byte order in memory for each pixel: R, G, B, 00.
 * mm0-mm6 are all overwritten (mm2 is reused as scratch for pixels 4-7).
 */
00343 #define MMX_UNPACK_32_ABGR " \n\
00344 pxor %%mm3, %%mm3 # zero mm3 \n\
00345 movq %%mm1, %%mm4 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
00346 punpcklbw %%mm2, %%mm4 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
00347 movq %%mm0, %%mm5 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
00348 punpcklbw %%mm3, %%mm5 # 00 B3 00 B2 00 B1 00 B0 \n\
00349 movq %%mm4, %%mm6 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
00350 punpcklwd %%mm5, %%mm4 # 00 B1 G1 R1 00 B0 G0 R0 \n\
00351 movq %%mm4, (%3) # Store ABGR1 ABGR0 \n\
00352 punpckhwd %%mm5, %%mm6 # 00 B3 G3 R3 00 B2 G2 R2 \n\
00353 movq %%mm6, 8(%3) # Store ABGR3 ABGR2 \n\
00354 punpckhbw %%mm2, %%mm1 # G7 R7 G6 R6 G5 R5 G4 R4 \n\
00355 punpckhbw %%mm3, %%mm0 # 00 B7 00 B6 00 B5 00 B4 \n\
00356 movq %%mm1, %%mm2 # G7 R7 G6 R6 G5 R5 G4 R4 \n\
00357 punpcklwd %%mm0, %%mm1 # 00 B5 G5 R5 00 B4 G4 R4 \n\
00358 movq %%mm1, 16(%3) # Store ABGR5 ABGR4 \n\
00359 punpckhwd %%mm0, %%mm2 # B7 G7 R7 00 B6 G6 R6 00 \n\
00360 movq %%mm2, 24(%3) # Store ABGR7 ABGR6 \n\
00361 "
00362
00363 #elif defined(HAVE_MMX_INTRINSICS)
00364
00365
00366
00367 #include <mmintrin.h>
00368
/* MMX_CALL (intrinsics flavour): opens a do/while(0) scope, declares eight
 * __m64 locals named mm0..mm7 and expands the given statement macros, which
 * use those names in place of the MMX registers. The values do not survive
 * past the end of the expansion. */
00369 #define MMX_CALL(MMX_INSTRUCTIONS) \
00370 do { \
00371 __m64 mm0, mm1, mm2, mm3, \
00372 mm4, mm5, mm6, mm7; \
00373 MMX_INSTRUCTIONS \
00374 } while(0)
00375
/* MMX_END (intrinsics flavour): _mm_empty() empties the MMX state, the
 * intrinsic counterpart of the emms instruction. */
00376 #define MMX_END _mm_empty()
00377
/* MMX_INIT_16 (intrinsics flavour): mm0 = 32 bits read from p_u, mm1 = 32
 * bits read from p_v, mm4 = 0, mm6 = 64 bits read from p_y.
 * NOTE(review): the loads go through int* and uint64_t* casts of the plane
 * pointers; assumes the callers' pointer types and alignment make that
 * acceptable -- confirm. */
00378 #define MMX_INIT_16 \
00379 mm0 = _mm_cvtsi32_si64(*(int*)p_u); \
00380 mm1 = _mm_cvtsi32_si64(*(int*)p_v); \
00381 mm4 = _mm_setzero_si64(); \
00382 mm6 = (__m64)*(uint64_t *)p_y;
00383
/* MMX_INIT_32 (intrinsics flavour): same loads as the intrinsics MMX_INIT_16
 * plus a store of zero to the start of p_buffer.
 * NOTE(review): the store here is 16 bits wide (uint16_t), whereas the
 * inline-asm MMX_INIT_32 writes 32 bits with movl -- confirm whether the
 * difference is intentional. */
00384 #define MMX_INIT_32 \
00385 mm0 = _mm_cvtsi32_si64(*(int*)p_u); \
00386 *(uint16_t *)p_buffer = 0; \
00387 mm1 = _mm_cvtsi32_si64(*(int*)p_v); \
00388 mm4 = _mm_setzero_si64(); \
00389 mm6 = (__m64)*(uint64_t *)p_y;
00390
/*
 * MMX_YUV_MUL (intrinsics flavour): statement-for-instruction counterpart of
 * the inline-asm MMX_YUV_MUL.
 * Expects: mm0 = 4 Cb bytes, mm1 = 4 Cr bytes, mm4 = 0, mm6 = 8 Y bytes.
 * Produces (four signed 16-bit lanes each): mm0 = blue chroma term,
 * mm1 = red chroma term, mm2 = green chroma term, mm6 = scaled luma of the
 * even pixels, mm7 = scaled luma of the odd pixels. mm3 is scratch.
 */
00391 #define MMX_YUV_MUL \
00392 mm0 = _mm_unpacklo_pi8(mm0, mm4); \
00393 mm1 = _mm_unpacklo_pi8(mm1, mm4); \
00394 mm0 = _mm_subs_pi16(mm0, (__m64)mmx_80w); \
00395 mm1 = _mm_subs_pi16(mm1, (__m64)mmx_80w); \
00396 mm0 = _mm_slli_pi16(mm0, 3); \
00397 mm1 = _mm_slli_pi16(mm1, 3); \
00398 mm2 = mm0; \
00399 mm3 = mm1; \
00400 mm2 = _mm_mulhi_pi16(mm2, (__m64)mmx_U_green); \
00401 mm3 = _mm_mulhi_pi16(mm3, (__m64)mmx_V_green); \
00402 mm0 = _mm_mulhi_pi16(mm0, (__m64)mmx_U_blue); \
00403 mm1 = _mm_mulhi_pi16(mm1, (__m64)mmx_V_red); \
00404 mm2 = _mm_adds_pi16(mm2, mm3); \
00405 \
00406 mm6 = _mm_subs_pu8(mm6, (__m64)mmx_10w); \
00407 mm7 = mm6; \
00408 mm6 = _mm_and_si64(mm6, (__m64)mmx_00ffw); \
00409 mm7 = _mm_srli_pi16(mm7, 8); \
00410 mm6 = _mm_slli_pi16(mm6, 3); \
00411 mm7 = _mm_slli_pi16(mm7, 3); \
00412 mm6 = _mm_mulhi_pi16(mm6, (__m64)mmx_Y_coeff); \
00413 mm7 = _mm_mulhi_pi16(mm7, (__m64)mmx_Y_coeff);
00414
/*
 * MMX_YUV_ADD (intrinsics flavour): adds the even-pixel luma (mm6) and the
 * odd-pixel luma (mm7) to each chroma term with signed saturation, clamps
 * the sums to 0..255 with _mm_packs_pu16 and re-interleaves even and odd
 * pixels.
 * Result: mm0 = 8 blue bytes, mm1 = 8 red bytes, mm2 = 8 green bytes.
 * mm3, mm4 and mm5 are overwritten.
 */
00415 #define MMX_YUV_ADD \
00416 mm3 = mm0; \
00417 mm4 = mm1; \
00418 mm5 = mm2; \
00419 mm0 = _mm_adds_pi16(mm0, mm6); \
00420 mm3 = _mm_adds_pi16(mm3, mm7); \
00421 mm1 = _mm_adds_pi16(mm1, mm6); \
00422 mm4 = _mm_adds_pi16(mm4, mm7); \
00423 mm2 = _mm_adds_pi16(mm2, mm6); \
00424 mm5 = _mm_adds_pi16(mm5, mm7); \
00425 \
00426 mm0 = _mm_packs_pu16(mm0, mm0); \
00427 mm1 = _mm_packs_pu16(mm1, mm1); \
00428 mm2 = _mm_packs_pu16(mm2, mm2); \
00429 \
00430 mm3 = _mm_packs_pu16(mm3, mm3); \
00431 mm4 = _mm_packs_pu16(mm4, mm4); \
00432 mm5 = _mm_packs_pu16(mm5, mm5); \
00433 \
00434 mm0 = _mm_unpacklo_pi8(mm0, mm3); \
00435 mm1 = _mm_unpacklo_pi8(mm1, mm4); \
00436 mm2 = _mm_unpacklo_pi8(mm2, mm5);
00437
/*
 * MMX_UNPACK_15 (intrinsics flavour): packs mm0 = B, mm1 = R, mm2 = G into
 * eight 15-bit pixels (0rrrrrgg gggbbbbb) and writes them as two 64-bit
 * stores at p_buffer and p_buffer + 4.
 * Side effects: reloads mm6 from p_y + 8, mm0 from p_u + 4 and mm1 from
 * p_v + 4; mm2, mm4, mm5 and mm7 are overwritten.
 * NOTE(review): offsets are in elements of each pointer's own type;
 * "p_buffer + 4" equals the asm version's 8(%3) only if p_buffer points to
 * 16-bit pixels, and the plane offsets assume byte pointers -- confirm at the
 * call sites.
 */
00438 #define MMX_UNPACK_15 \
00439 mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8); \
00440 mm0 = _mm_srli_pi16(mm0, 3); \
00441 mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_f8); \
00442 mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8); \
00443 mm1 = _mm_srli_pi16(mm1, 1); \
00444 mm4 = _mm_setzero_si64(); \
00445 mm5 = mm0; \
00446 mm7 = mm2; \
00447 \
00448 mm2 = _mm_unpacklo_pi8(mm2, mm4); \
00449 mm0 = _mm_unpacklo_pi8(mm0, mm1); \
00450 mm2 = _mm_slli_pi16(mm2, 2); \
00451 mm0 = _mm_or_si64(mm0, mm2); \
00452 mm6 = (__m64)*(uint64_t *)(p_y + 8); \
00453 *(uint64_t *)p_buffer = (uint64_t)mm0; \
00454 \
00455 mm7 = _mm_unpackhi_pi8(mm7, mm4); \
00456 mm5 = _mm_unpackhi_pi8(mm5, mm1); \
00457 mm7 = _mm_slli_pi16(mm7, 2); \
00458 mm0 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_u + 4)); \
00459 mm5 = _mm_or_si64(mm5, mm7); \
00460 mm1 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_v + 4)); \
00461 *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;
00462
/*
 * MMX_UNPACK_16 (intrinsics flavour): packs mm0 = B, mm1 = R, mm2 = G into
 * eight 16-bit pixels (rrrrrggg gggbbbbb; green keeps 6 bits through
 * mmx_mask_fc) and writes them as two 64-bit stores at p_buffer and
 * p_buffer + 4.
 * Side effects: reloads mm6 from p_y + 8, mm0 from p_u + 4 and mm1 from
 * p_v + 4; mm2, mm4, mm5 and mm7 are overwritten.
 * NOTE(review): offsets are in elements of each pointer's own type;
 * "p_buffer + 4" equals the asm version's 8(%3) only if p_buffer points to
 * 16-bit pixels, and the plane offsets assume byte pointers -- confirm at the
 * call sites.
 */
00463 #define MMX_UNPACK_16 \
00464 mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8); \
00465 mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_fc); \
00466 mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8); \
00467 mm0 = _mm_srli_pi16(mm0, 3); \
00468 mm4 = _mm_setzero_si64(); \
00469 mm5 = mm0; \
00470 mm7 = mm2; \
00471 \
00472 mm2 = _mm_unpacklo_pi8(mm2, mm4); \
00473 mm0 = _mm_unpacklo_pi8(mm0, mm1); \
00474 mm2 = _mm_slli_pi16(mm2, 3); \
00475 mm0 = _mm_or_si64(mm0, mm2); \
00476 mm6 = (__m64)*(uint64_t *)(p_y + 8); \
00477 *(uint64_t *)p_buffer = (uint64_t)mm0; \
00478 \
00479 mm7 = _mm_unpackhi_pi8(mm7, mm4); \
00480 mm5 = _mm_unpackhi_pi8(mm5, mm1); \
00481 mm7 = _mm_slli_pi16(mm7, 3); \
00482 mm0 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_u + 4)); \
00483 mm5 = _mm_or_si64(mm5, mm7); \
00484 mm1 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_v + 4)); \
00485 *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;
00486
/*
 * MMX_UNPACK_32_ARGB (intrinsics flavour): interleaves mm0 = B, mm2 = G,
 * mm1 = R with a zero byte into eight 32-bit pixels and writes them as four
 * 64-bit stores at p_buffer + 0, 2, 4 and 6.
 * Byte order in memory for each pixel: B, G, R, 00.
 * mm0, mm1 and mm3-mm6 are overwritten.
 * NOTE(review): the element offsets match the asm version's byte offsets
 * 0/8/16/24 only if p_buffer points to 32-bit pixels -- confirm.
 */
00487 #define MMX_UNPACK_32_ARGB \
00488 mm3 = _mm_setzero_si64(); \
00489 mm4 = mm0; \
00490 mm4 = _mm_unpacklo_pi8(mm4, mm2); \
00491 mm5 = mm1; \
00492 mm5 = _mm_unpacklo_pi8(mm5, mm3); \
00493 mm6 = mm4; \
00494 mm4 = _mm_unpacklo_pi16(mm4, mm5); \
00495 *(uint64_t *)p_buffer = (uint64_t)mm4; \
00496 mm6 = _mm_unpackhi_pi16(mm6, mm5); \
00497 *(uint64_t *)(p_buffer + 2) = (uint64_t)mm6;\
00498 mm0 = _mm_unpackhi_pi8(mm0, mm2); \
00499 mm1 = _mm_unpackhi_pi8(mm1, mm3); \
00500 mm5 = mm0; \
00501 mm5 = _mm_unpacklo_pi16(mm5, mm1); \
00502 *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;\
00503 mm0 = _mm_unpackhi_pi16(mm0, mm1); \
00504 *(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
00505
/*
 * MMX_UNPACK_32_RGBA (intrinsics flavour): interleaves mm0 = B, mm2 = G,
 * mm1 = R with a zero byte into eight 32-bit pixels and writes them as four
 * 64-bit stores at p_buffer + 0, 2, 4 and 6.
 * Byte order in memory for each pixel: 00, B, G, R.
 * mm0, mm2 and mm3-mm6 are overwritten.
 * NOTE(review): the element offsets match the asm version's byte offsets
 * 0/8/16/24 only if p_buffer points to 32-bit pixels -- confirm.
 */
00506 #define MMX_UNPACK_32_RGBA \
00507 mm3 = _mm_setzero_si64(); \
00508 mm4 = mm2; \
00509 mm4 = _mm_unpacklo_pi8(mm4, mm1); \
00510 mm3 = _mm_unpacklo_pi8(mm3, mm0); \
00511 mm5 = mm3; \
00512 mm3 = _mm_unpacklo_pi16(mm3, mm4); \
00513 *(uint64_t *)p_buffer = (uint64_t)mm3; \
00514 mm5 = _mm_unpackhi_pi16(mm5, mm4); \
00515 *(uint64_t *)(p_buffer + 2) = (uint64_t)mm5;\
00516 mm6 = _mm_setzero_si64(); \
00517 mm2 = _mm_unpackhi_pi8(mm2, mm1); \
00518 mm6 = _mm_unpackhi_pi8(mm6, mm0); \
00519 mm0 = mm6; \
00520 mm6 = _mm_unpacklo_pi16(mm6, mm2); \
00521 *(uint64_t *)(p_buffer + 4) = (uint64_t)mm6;\
00522 mm0 = _mm_unpackhi_pi16(mm0, mm2); \
00523 *(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
00524
/*
 * MMX_UNPACK_32_BGRA (intrinsics flavour): interleaves mm0 = B, mm2 = G,
 * mm1 = R with a zero byte into eight 32-bit pixels and writes them as four
 * 64-bit stores at p_buffer + 0, 2, 4 and 6.
 * Byte order in memory for each pixel: 00, R, G, B.
 * mm0, mm2 and mm3-mm6 are overwritten.
 * NOTE(review): the element offsets match the asm version's byte offsets
 * 0/8/16/24 only if p_buffer points to 32-bit pixels -- confirm.
 */
00525 #define MMX_UNPACK_32_BGRA \
00526 mm3 = _mm_setzero_si64(); \
00527 mm4 = mm2; \
00528 mm4 = _mm_unpacklo_pi8(mm4, mm0); \
00529 mm3 = _mm_unpacklo_pi8(mm3, mm1); \
00530 mm5 = mm3; \
00531 mm3 = _mm_unpacklo_pi16(mm3, mm4); \
00532 *(uint64_t *)p_buffer = (uint64_t)mm3; \
00533 mm5 = _mm_unpackhi_pi16(mm5, mm4); \
00534 *(uint64_t *)(p_buffer + 2) = (uint64_t)mm5;\
00535 mm6 = _mm_setzero_si64(); \
00536 mm2 = _mm_unpackhi_pi8(mm2, mm0); \
00537 mm6 = _mm_unpackhi_pi8(mm6, mm1); \
00538 mm0 = mm6; \
00539 mm6 = _mm_unpacklo_pi16(mm6, mm2); \
00540 *(uint64_t *)(p_buffer + 4) = (uint64_t)mm6;\
00541 mm0 = _mm_unpackhi_pi16(mm0, mm2); \
00542 *(uint64_t *)(p_buffer + 6) = (uint64_t)mm0;
00543
/*
 * MMX_UNPACK_32_ABGR (intrinsics flavour): interleaves mm0 = B, mm2 = G,
 * mm1 = R with a zero byte into eight 32-bit pixels and writes them as four
 * 64-bit stores at p_buffer + 0, 2, 4 and 6.
 * Byte order in memory for each pixel: R, G, B, 00.
 * mm0-mm6 are all overwritten (mm2 is reused as scratch for pixels 4-7).
 * NOTE(review): the element offsets match the asm version's byte offsets
 * 0/8/16/24 only if p_buffer points to 32-bit pixels -- confirm.
 */
00544 #define MMX_UNPACK_32_ABGR \
00545 mm3 = _mm_setzero_si64(); \
00546 mm4 = mm1; \
00547 mm4 = _mm_unpacklo_pi8(mm4, mm2); \
00548 mm5 = mm0; \
00549 mm5 = _mm_unpacklo_pi8(mm5, mm3); \
00550 mm6 = mm4; \
00551 mm4 = _mm_unpacklo_pi16(mm4, mm5); \
00552 *(uint64_t *)p_buffer = (uint64_t)mm4; \
00553 mm6 = _mm_unpackhi_pi16(mm6, mm5); \
00554 *(uint64_t *)(p_buffer + 2) = (uint64_t)mm6;\
00555 mm1 = _mm_unpackhi_pi8(mm1, mm2); \
00556 mm0 = _mm_unpackhi_pi8(mm0, mm3); \
00557 mm2 = mm1; \
00558 mm1 = _mm_unpacklo_pi16(mm1, mm0); \
00559 *(uint64_t *)(p_buffer + 4) = (uint64_t)mm1;\
00560 mm2 = _mm_unpackhi_pi16(mm2, mm0); \
00561 *(uint64_t *)(p_buffer + 6) = (uint64_t)mm2;
00562
00563 #endif
00564
00565 #elif defined( MODULE_NAME_IS_i420_rgb_sse2 )
00566
00567 #if defined(CAN_COMPILE_SSE2)
00568
00569
00570
/*
 * SSE2_CALL(SSE2_INSTRUCTIONS): emits one volatile inline-asm statement
 * built from the given instruction-string fragments, preceded by an 8-byte
 * alignment directive.
 *
 * Operand numbering seen by the fragments:
 *   %0 = p_y, %1 = p_u, %2 = p_v, %3 = p_buffer
 * All four names must be in scope where the macro is expanded.
 *
 * The fragments use eax to build constants, load through %0..%2, store
 * through %3 and overwrite xmm0-xmm7. Besides "eax", the statement therefore
 * declares "memory" and every XMM register it touches as clobbered; without
 * this the compiler is free to keep live values in those registers or to
 * cache and reorder accesses to the buffers around the asm.
 */
#define SSE2_CALL(SSE2_INSTRUCTIONS)                \
    do {                                            \
        __asm__ __volatile__(                       \
            ".p2align 3 \n\t"                       \
            SSE2_INSTRUCTIONS                       \
            :                                       \
            : "r" (p_y), "r" (p_u),                 \
              "r" (p_v), "r" (p_buffer)             \
            : "eax", "memory",                      \
              "xmm0", "xmm1", "xmm2", "xmm3",       \
              "xmm4", "xmm5", "xmm6", "xmm7" );     \
    } while(0)
00581
/* SSE2_END: issues "sfence" as a volatile asm statement with a "memory"
 * clobber. sfence orders all earlier stores, including the non-temporal
 * movntdq stores issued by the ALIGNED unpack fragments, before later ones. */
00582 #define SSE2_END __asm__ __volatile__ ( "sfence" ::: "memory" )
00583
/* SSE2_INIT_16_ALIGNED: loads the inputs for one group of 16 pixels: 8 Cb
 * bytes from (%1) into xmm0, 8 Cr bytes from (%2) into xmm1, 16 luma bytes
 * from (%0) into xmm6 with movdqa (so %0 must be 16-byte aligned), and
 * clears xmm4. */
00584 #define SSE2_INIT_16_ALIGNED " \n\
00585 movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
00586 movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
00587 pxor %%xmm4, %%xmm4 # zero mm4 \n\
00588 movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00589 "
00590
/* SSE2_INIT_16_UNALIGNED: same register set-up as SSE2_INIT_16_ALIGNED but
 * the 16 luma bytes are loaded with movdqu (no alignment requirement), and a
 * prefetchnta hint is issued for the output address (%3). */
00591 #define SSE2_INIT_16_UNALIGNED " \n\
00592 movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
00593 movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
00594 pxor %%xmm4, %%xmm4 # zero mm4 \n\
00595 movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00596 prefetchnta (%3) # Tell CPU not to cache output RGB data \n\
00597 "
00598
/* SSE2_INIT_32_ALIGNED: instruction-for-instruction the same as
 * SSE2_INIT_16_ALIGNED: xmm0 = 8 Cb bytes, xmm1 = 8 Cr bytes, xmm4 = 0,
 * xmm6 = 16 luma bytes loaded with movdqa (%0 must be 16-byte aligned). */
00599 #define SSE2_INIT_32_ALIGNED " \n\
00600 movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
00601 movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
00602 pxor %%xmm4, %%xmm4 # zero mm4 \n\
00603 movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00604 "
00605
/* SSE2_INIT_32_UNALIGNED: instruction-for-instruction the same as
 * SSE2_INIT_16_UNALIGNED: xmm0 = 8 Cb bytes, xmm1 = 8 Cr bytes, xmm4 = 0,
 * xmm6 = 16 luma bytes loaded with movdqu, plus a prefetchnta hint for the
 * output address (%3). */
00606 #define SSE2_INIT_32_UNALIGNED " \n\
00607 movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\
00608 movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\
00609 pxor %%xmm4, %%xmm4 # zero mm4 \n\
00610 movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00611 prefetchnta (%3) # Tell CPU not to cache output RGB data \n\
00612 "
00613
/*
 * SSE2_YUV_MUL: fixed-point multiply stage for 16 pixels.
 * Expects: xmm0 = 8 Cb bytes, xmm1 = 8 Cr bytes (low halves), xmm4 = 0,
 *          xmm6 = 16 Y bytes.
 * Produces (eight signed 16-bit lanes each):
 *   xmm0 = blue chroma term, xmm1 = red chroma term, xmm2 = green chroma
 *   term, xmm6 = scaled luma of the even pixels, xmm7 = scaled luma of the
 *   odd pixels.
 * Every constant is materialised in xmm5 by loading a 32-bit immediate into
 * eax, moving it to xmm5 and broadcasting it with pshufd $0; the immediates
 * are the same bit patterns as the 64-bit mmx constants used by the MMX
 * path. eax, xmm3 and xmm5 are overwritten.
 */
00614 #define SSE2_YUV_MUL " \n\
00615 # convert the chroma part \n\
00616 punpcklbw %%xmm4, %%xmm0 # scatter 8 Cb 00 u3 00 u2 00 u1 00 u0 \n\
00617 punpcklbw %%xmm4, %%xmm1 # scatter 8 Cr 00 v3 00 v2 00 v1 00 v0 \n\
00618 movl $0x00800080, %%eax # \n\
00619 movd %%eax, %%xmm5 # \n\
00620 pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 0080 0080 ... 0080 0080 \n\
00621 psubsw %%xmm5, %%xmm0 # Cb -= 128 \n\
00622 psubsw %%xmm5, %%xmm1 # Cr -= 128 \n\
00623 psllw $3, %%xmm0 # Promote precision \n\
00624 psllw $3, %%xmm1 # Promote precision \n\
00625 movdqa %%xmm0, %%xmm2 # Copy 8 Cb 00 u3 00 u2 00 u1 00 u0 \n\
00626 movdqa %%xmm1, %%xmm3 # Copy 8 Cr 00 v3 00 v2 00 v1 00 v0 \n\
00627 movl $0xf37df37d, %%eax # \n\
00628 movd %%eax, %%xmm5 # \n\
00629 pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to f37d f37d ... f37d f37d \n\
00630 pmulhw %%xmm5, %%xmm2 # Mul Cb with green coeff -> Cb green \n\
00631 movl $0xe5fce5fc, %%eax # \n\
00632 movd %%eax, %%xmm5 # \n\
00633 pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to e5fc e5fc ... e5fc e5fc \n\
00634 pmulhw %%xmm5, %%xmm3 # Mul Cr with green coeff -> Cr green \n\
00635 movl $0x40934093, %%eax # \n\
00636 movd %%eax, %%xmm5 # \n\
00637 pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 4093 4093 ... 4093 4093 \n\
00638 pmulhw %%xmm5, %%xmm0 # Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 \n\
00639 movl $0x33123312, %%eax # \n\
00640 movd %%eax, %%xmm5 # \n\
00641 pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 3312 3312 ... 3312 3312 \n\
00642 pmulhw %%xmm5, %%xmm1 # Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 \n\
00643 paddsw %%xmm3, %%xmm2 # Cb green + Cr green -> Cgreen \n\
00644 \n\
00645 # convert the luma part \n\
00646 movl $0x10101010, %%eax # \n\
00647 movd %%eax, %%xmm5 # \n\
00648 pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 1010 1010 ... 1010 1010 \n\
00649 psubusb %%xmm5, %%xmm6 # Y -= 16 \n\
00650 movdqa %%xmm6, %%xmm7 # Copy 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
00651 movl $0x00ff00ff, %%eax # \n\
00652 movd %%eax, %%xmm5 # \n\
00653 pshufd $0, %%xmm5, %%xmm5 # set xmm5 to 00ff 00ff ... 00ff 00ff \n\
00654 pand %%xmm5, %%xmm6 # get Y even 00 Y6 00 Y4 00 Y2 00 Y0 \n\
00655 psrlw $8, %%xmm7 # get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 \n\
00656 psllw $3, %%xmm6 # Promote precision \n\
00657 psllw $3, %%xmm7 # Promote precision \n\
00658 movl $0x253f253f, %%eax # \n\
00659 movd %%eax, %%xmm5 # \n\
00660 pshufd $0, %%xmm5, %%xmm5 # set xmm5 to 253f 253f ... 253f 253f \n\
00661 pmulhw %%xmm5, %%xmm6 # Mul 8 Y even 00 y6 00 y4 00 y2 00 y0 \n\
00662 pmulhw %%xmm5, %%xmm7 # Mul 8 Y odd 00 y7 00 y5 00 y3 00 y1 \n\
00663 "
00664
/*
 * SSE2_YUV_ADD: combines the outputs of SSE2_YUV_MUL into 8-bit channels.
 * Each chroma term (xmm0 blue, xmm1 red, xmm2 green) is copied and added,
 * with signed saturation, to the even-pixel luma (xmm6) and the odd-pixel
 * luma (xmm7). packuswb clamps every sum to 0..255, then punpcklbw
 * re-interleaves even and odd pixels.
 * Result: xmm0 = 16 blue bytes, xmm1 = 16 red bytes, xmm2 = 16 green bytes
 * in pixel order (the lane labels in the asm comments show only the first
 * eight). xmm3, xmm4 and xmm5 are overwritten.
 */
00665 #define SSE2_YUV_ADD " \n\
00666 # Do horizontal and vertical scaling \n\
00667 movdqa %%xmm0, %%xmm3 # Copy Cblue \n\
00668 movdqa %%xmm1, %%xmm4 # Copy Cred \n\
00669 movdqa %%xmm2, %%xmm5 # Copy Cgreen \n\
00670 paddsw %%xmm6, %%xmm0 # Y even + Cblue 00 B6 00 B4 00 B2 00 B0 \n\
00671 paddsw %%xmm7, %%xmm3 # Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 \n\
00672 paddsw %%xmm6, %%xmm1 # Y even + Cred 00 R6 00 R4 00 R2 00 R0 \n\
00673 paddsw %%xmm7, %%xmm4 # Y odd + Cred 00 R7 00 R5 00 R3 00 R1 \n\
00674 paddsw %%xmm6, %%xmm2 # Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 \n\
00675 paddsw %%xmm7, %%xmm5 # Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 \n\
00676 \n\
00677 # Limit RGB even to 0..255 \n\
00678 packuswb %%xmm0, %%xmm0 # B6 B4 B2 B0 / B6 B4 B2 B0 \n\
00679 packuswb %%xmm1, %%xmm1 # R6 R4 R2 R0 / R6 R4 R2 R0 \n\
00680 packuswb %%xmm2, %%xmm2 # G6 G4 G2 G0 / G6 G4 G2 G0 \n\
00681 \n\
00682 # Limit RGB odd to 0..255 \n\
00683 packuswb %%xmm3, %%xmm3 # B7 B5 B3 B1 / B7 B5 B3 B1 \n\
00684 packuswb %%xmm4, %%xmm4 # R7 R5 R3 R1 / R7 R5 R3 R1 \n\
00685 packuswb %%xmm5, %%xmm5 # G7 G5 G3 G1 / G7 G5 G3 G1 \n\
00686 \n\
00687 # Interleave RGB even and odd \n\
00688 punpcklbw %%xmm3, %%xmm0 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
00689 punpcklbw %%xmm4, %%xmm1 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
00690 punpcklbw %%xmm5, %%xmm2 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
00691 "
00692
/*
 * SSE2_UNPACK_15_ALIGNED: packs the byte planes xmm0 = B, xmm1 = R,
 * xmm2 = G (16 pixels) into sixteen 15-bit pixels laid out as
 * 0rrrrrgg gggbbbbb and writes them with non-temporal movntdq stores:
 * pixels 0-7 at (%3) and pixels 8-15 at 16(%3) (the comment on the last
 * store still says "pixel 4-7"). movntdq requires %3 to be 16-byte aligned.
 * The f8 mask is built in xmm5 through eax; the register cleared by pxor is
 * xmm4. eax, xmm0, xmm1, xmm2, xmm4, xmm5 and xmm7 are overwritten.
 */
00693 #define SSE2_UNPACK_15_ALIGNED " \n\
00694 # mask unneeded bits off \n\
00695 movl $0xf8f8f8f8, %%eax # \n\
00696 movd %%eax, %%xmm5 # \n\
00697 pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\
00698 pand %%xmm5, %%xmm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
00699 psrlw $3,%%xmm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
00700 pand %%xmm5, %%xmm2 # g7g6g5g4 g3______ g7g6g5g4 g3______ \n\
00701 pand %%xmm5, %%xmm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
00702 psrlw $1,%%xmm1 # __r7r6r5 r4r3____ __r7r6r5 r4r3____ \n\
00703 pxor %%xmm4, %%xmm4 # zero mm4 \n\
00704 movdqa %%xmm0, %%xmm5 # Copy B15-B0 \n\
00705 movdqa %%xmm2, %%xmm7 # Copy G15-G0 \n\
00706 \n\
00707 # convert rgb24 plane to rgb15 pack for pixel 0-7 \n\
00708 punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3______ \n\
00709 punpcklbw %%xmm1, %%xmm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
00710 psllw $2,%%xmm2 # ________ ____g7g6 g5g4g3__ ________ \n\
00711 por %%xmm2, %%xmm0 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
00712 movntdq %%xmm0, (%3) # store pixel 0-7 \n\
00713 \n\
00714 # convert rgb24 plane to rgb15 pack for pixel 8-15 \n\
00715 punpckhbw %%xmm4, %%xmm7 # ________ ________ g7g6g5g4 g3______ \n\
00716 punpckhbw %%xmm1, %%xmm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
00717 psllw $2,%%xmm7 # ________ ____g7g6 g5g4g3__ ________ \n\
00718 por %%xmm7, %%xmm5 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
00719 movntdq %%xmm5, 16(%3) # store pixel 4-7 \n\
00720 "
00721
/*
 * SSE2_UNPACK_15_UNALIGNED: same packing as SSE2_UNPACK_15_ALIGNED (sixteen
 * 15-bit pixels, 0rrrrrgg gggbbbbb, from xmm0 = B, xmm1 = R, xmm2 = G) but
 * the results are written with movdqu, which has no alignment requirement:
 * pixels 0-7 at (%3) and pixels 8-15 at 16(%3) (the comment on the last
 * store still says "pixel 4-7").
 * eax, xmm0, xmm1, xmm2, xmm4, xmm5 and xmm7 are overwritten.
 */
00722 #define SSE2_UNPACK_15_UNALIGNED " \n\
00723 # mask unneeded bits off \n\
00724 movl $0xf8f8f8f8, %%eax # \n\
00725 movd %%eax, %%xmm5 # \n\
00726 pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\
00727 pand %%xmm5, %%xmm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
00728 psrlw $3,%%xmm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
00729 pand %%xmm5, %%xmm2 # g7g6g5g4 g3______ g7g6g5g4 g3______ \n\
00730 pand %%xmm5, %%xmm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
00731 psrlw $1,%%xmm1 # __r7r6r5 r4r3____ __r7r6r5 r4r3____ \n\
00732 pxor %%xmm4, %%xmm4 # zero mm4 \n\
00733 movdqa %%xmm0, %%xmm5 # Copy B15-B0 \n\
00734 movdqa %%xmm2, %%xmm7 # Copy G15-G0 \n\
00735 \n\
00736 # convert rgb24 plane to rgb15 pack for pixel 0-7 \n\
00737 punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3______ \n\
00738 punpcklbw %%xmm1, %%xmm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
00739 psllw $2,%%xmm2 # ________ ____g7g6 g5g4g3__ ________ \n\
00740 por %%xmm2, %%xmm0 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
00741 movdqu %%xmm0, (%3) # store pixel 0-7 \n\
00742 \n\
00743 # convert rgb24 plane to rgb15 pack for pixel 8-15 \n\
00744 punpckhbw %%xmm4, %%xmm7 # ________ ________ g7g6g5g4 g3______ \n\
00745 punpckhbw %%xmm1, %%xmm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
00746 psllw $2,%%xmm7 # ________ ____g7g6 g5g4g3__ ________ \n\
00747 por %%xmm7, %%xmm5 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\
00748 movdqu %%xmm5, 16(%3) # store pixel 4-7 \n\
00749 "
00750
/*
 * SSE2_UNPACK_16_ALIGNED: packs the byte planes xmm0 = B, xmm1 = R,
 * xmm2 = G (16 pixels) into sixteen 16-bit pixels laid out as
 * rrrrrggg gggbbbbb (5 bits red and blue, 6 bits green) and writes them with
 * non-temporal movntdq stores: pixels 0-7 at (%3), pixels 8-15 at 16(%3).
 * movntdq requires %3 to be 16-byte aligned.
 * Both masks are built in xmm5 through eax. eax, xmm0, xmm1, xmm2, xmm4,
 * xmm5 and xmm7 are overwritten.
 *
 * Only the assembler comments differ from the previous text: the fc mask,
 * the zeroed register and the second store are now described correctly.
 * The instruction sequence is unchanged.
 */
#define SSE2_UNPACK_16_ALIGNED " \n\
    # mask unneeded bits off \n\
    movl      $0xf8f8f8f8, %%eax  # \n\
    movd      %%eax, %%xmm5       # \n\
    pshufd    $0, %%xmm5, %%xmm5  # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\
    pand      %%xmm5, %%xmm0      # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
    pand      %%xmm5, %%xmm1      # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
    movl      $0xfcfcfcfc, %%eax  # \n\
    movd      %%eax, %%xmm5       # \n\
    pshufd    $0, %%xmm5, %%xmm5  # set xmm5 to fcfc fcfc ... fcfc fcfc \n\
    pand      %%xmm5, %%xmm2      # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____ \n\
    psrlw     $3,%%xmm0           # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
    pxor      %%xmm4, %%xmm4      # zero xmm4 \n\
    movdqa    %%xmm0, %%xmm5      # Copy B15-B0 \n\
    movdqa    %%xmm2, %%xmm7      # Copy G15-G0 \n\
    \n\
    # convert rgb24 plane to rgb16 pack for pixel 0-7 \n\
    punpcklbw %%xmm4, %%xmm2      # ________ ________ g7g6g5g4 g3g2____ \n\
    punpcklbw %%xmm1, %%xmm0      # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
    psllw     $3,%%xmm2           # ________ __g7g6g5 g4g3g2__ ________ \n\
    por       %%xmm2, %%xmm0      # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
    movntdq   %%xmm0, (%3)        # store pixel 0-7 \n\
    \n\
    # convert rgb24 plane to rgb16 pack for pixel 8-15 \n\
    punpckhbw %%xmm4, %%xmm7      # ________ ________ g7g6g5g4 g3g2____ \n\
    punpckhbw %%xmm1, %%xmm5      # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
    psllw     $3,%%xmm7           # ________ __g7g6g5 g4g3g2__ ________ \n\
    por       %%xmm7, %%xmm5      # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
    movntdq   %%xmm5, 16(%3)      # store pixel 8-15 \n\
"
00781
/*
 * SSE2_UNPACK_16_UNALIGNED: packs the byte planes xmm0 = B, xmm1 = R,
 * xmm2 = G (16 pixels) into sixteen 16-bit pixels laid out as
 * rrrrrggg gggbbbbb (5 bits red and blue, 6 bits green) and writes them with
 * movdqu, which has no alignment requirement: pixels 0-7 at (%3), pixels
 * 8-15 at 16(%3).
 * Both masks are built in xmm5 through eax. eax, xmm0, xmm1, xmm2, xmm4,
 * xmm5 and xmm7 are overwritten.
 *
 * Only the assembler comments differ from the previous text: the fc mask,
 * the zeroed register and the second store are now described correctly.
 * The instruction sequence is unchanged.
 */
#define SSE2_UNPACK_16_UNALIGNED " \n\
    # mask unneeded bits off \n\
    movl      $0xf8f8f8f8, %%eax  # \n\
    movd      %%eax, %%xmm5       # \n\
    pshufd    $0, %%xmm5, %%xmm5  # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\
    pand      %%xmm5, %%xmm0      # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\
    pand      %%xmm5, %%xmm1      # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\
    movl      $0xfcfcfcfc, %%eax  # \n\
    movd      %%eax, %%xmm5       # \n\
    pshufd    $0, %%xmm5, %%xmm5  # set xmm5 to fcfc fcfc ... fcfc fcfc \n\
    pand      %%xmm5, %%xmm2      # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____ \n\
    psrlw     $3,%%xmm0           # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\
    pxor      %%xmm4, %%xmm4      # zero xmm4 \n\
    movdqa    %%xmm0, %%xmm5      # Copy B15-B0 \n\
    movdqa    %%xmm2, %%xmm7      # Copy G15-G0 \n\
    \n\
    # convert rgb24 plane to rgb16 pack for pixel 0-7 \n\
    punpcklbw %%xmm4, %%xmm2      # ________ ________ g7g6g5g4 g3g2____ \n\
    punpcklbw %%xmm1, %%xmm0      # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
    psllw     $3,%%xmm2           # ________ __g7g6g5 g4g3g2__ ________ \n\
    por       %%xmm2, %%xmm0      # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
    movdqu    %%xmm0, (%3)        # store pixel 0-7 \n\
    \n\
    # convert rgb24 plane to rgb16 pack for pixel 8-15 \n\
    punpckhbw %%xmm4, %%xmm7      # ________ ________ g7g6g5g4 g3g2____ \n\
    punpckhbw %%xmm1, %%xmm5      # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\
    psllw     $3,%%xmm7           # ________ __g7g6g5 g4g3g2__ ________ \n\
    por       %%xmm7, %%xmm5      # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\
    movdqu    %%xmm5, 16(%3)      # store pixel 8-15 \n\
"
00812
/*
 * SSE2_UNPACK_32_ARGB_ALIGNED: interleaves the byte planes xmm0 = B,
 * xmm2 = G, xmm1 = R with a zero byte into sixteen 32-bit pixels and writes
 * 64 bytes starting at (%3) with four non-temporal movntdq stores (which
 * require %3 to be 16-byte aligned).
 * Byte order in memory for each pixel: B, G, R, 00.
 * The lane labels in the asm comments show only the first eight pixels.
 * xmm0, xmm1 and xmm3-xmm6 are overwritten; xmm2 is only read.
 */
00813 #define SSE2_UNPACK_32_ARGB_ALIGNED " \n\
00814 pxor %%xmm3, %%xmm3 # zero xmm3 \n\
00815 movdqa %%xmm0, %%xmm4 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
00816 punpcklbw %%xmm2, %%xmm4 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
00817 movdqa %%xmm1, %%xmm5 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
00818 punpcklbw %%xmm3, %%xmm5 # 00 R3 00 R2 00 R1 00 R0 \n\
00819 movdqa %%xmm4, %%xmm6 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
00820 punpcklwd %%xmm5, %%xmm4 # 00 R1 B1 G1 00 R0 B0 G0 \n\
00821 movntdq %%xmm4, (%3) # Store ARGB3 ARGB2 ARGB1 ARGB0 \n\
00822 punpckhwd %%xmm5, %%xmm6 # 00 R3 B3 G3 00 R2 B2 G2 \n\
00823 movntdq %%xmm6, 16(%3) # Store ARGB7 ARGB6 ARGB5 ARGB4 \n\
00824 punpckhbw %%xmm2, %%xmm0 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
00825 punpckhbw %%xmm3, %%xmm1 # 00 R7 00 R6 00 R5 00 R4 \n\
00826 movdqa %%xmm0, %%xmm5 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
00827 punpcklwd %%xmm1, %%xmm5 # 00 R5 B5 G5 00 R4 B4 G4 \n\
00828 movntdq %%xmm5, 32(%3) # Store ARGB11 ARGB10 ARGB9 ARGB8 \n\
00829 punpckhwd %%xmm1, %%xmm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\
00830 movntdq %%xmm0, 48(%3) # Store ARGB15 ARGB14 ARGB13 ARGB12 \n\
00831 "
00832
/*
 * SSE2_UNPACK_32_ARGB_UNALIGNED: inline-asm fragment that interleaves three
 * 8-bit colour planes into 16 32-bit pixels and writes 64 bytes to the
 * address in operand %3 with movdqu, so the destination need not be 16-byte
 * aligned.
 *
 * Inputs (per the lane annotations): xmm0 = blue, xmm1 = red, xmm2 = green.
 * Each 32-bit lane is 00 R G B from most to least significant byte.
 * Modified: xmm0, xmm1, xmm3 (zeroed), xmm4, xmm5, xmm6.
 * Annotations list lanes most-significant first.
 */
00833 #define SSE2_UNPACK_32_ARGB_UNALIGNED " \n\
00834 pxor %%xmm3, %%xmm3 # zero xmm3 \n\
00835 movdqa %%xmm0, %%xmm4 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
00836 punpcklbw %%xmm2, %%xmm4 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
00837 movdqa %%xmm1, %%xmm5 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
00838 punpcklbw %%xmm3, %%xmm5 # 00 R3 00 R2 00 R1 00 R0 \n\
00839 movdqa %%xmm4, %%xmm6 # G3 B3 G2 B2 G1 B1 G0 B0 \n\
00840 punpcklwd %%xmm5, %%xmm4 # 00 R1 G1 B1 00 R0 G0 B0 \n\
00841 movdqu %%xmm4, (%3) # Store ARGB3 ARGB2 ARGB1 ARGB0 \n\
00842 punpckhwd %%xmm5, %%xmm6 # 00 R3 G3 B3 00 R2 G2 B2 \n\
00843 movdqu %%xmm6, 16(%3) # Store ARGB7 ARGB6 ARGB5 ARGB4 \n\
00844 punpckhbw %%xmm2, %%xmm0 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
00845 punpckhbw %%xmm3, %%xmm1 # 00 R7 00 R6 00 R5 00 R4 \n\
00846 movdqa %%xmm0, %%xmm5 # G7 B7 G6 B6 G5 B5 G4 B4 \n\
00847 punpcklwd %%xmm1, %%xmm5 # 00 R5 G5 B5 00 R4 G4 B4 \n\
00848 movdqu %%xmm5, 32(%3) # Store ARGB11 ARGB10 ARGB9 ARGB8 \n\
00849 punpckhwd %%xmm1, %%xmm0 # 00 R7 G7 B7 00 R6 G6 B6 \n\
00850 movdqu %%xmm0, 48(%3) # Store ARGB15 ARGB14 ARGB13 ARGB12 \n\
00851 "
00852
/*
 * SSE2_UNPACK_32_RGBA_ALIGNED: inline-asm fragment that interleaves three
 * 8-bit colour planes into 16 32-bit pixels and writes 64 bytes to the
 * address in operand %3 with non-temporal stores (movntdq), which require a
 * 16-byte aligned destination.
 *
 * Inputs (per the lane annotations): xmm0 = blue, xmm1 = red, xmm2 = green.
 * Each 32-bit lane is R G B 00 from most to least significant byte.
 * Modified: xmm0, xmm2, xmm3, xmm4, xmm5, xmm6.
 * Annotations list lanes most-significant first.
 */
00853 #define SSE2_UNPACK_32_RGBA_ALIGNED " \n\
00854 pxor %%xmm3, %%xmm3 # zero xmm3 \n\
00855 movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
00856 punpcklbw %%xmm1, %%xmm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\
00857 punpcklbw %%xmm0, %%xmm3 # B3 00 B2 00 B1 00 B0 00 \n\
00858 movdqa %%xmm3, %%xmm5 # B3 00 B2 00 B1 00 B0 00 \n\
00859 punpcklwd %%xmm4, %%xmm3 # R1 G1 B1 00 R0 G0 B0 00 \n\
00860 movntdq %%xmm3, (%3) # Store RGBA3 RGBA2 RGBA1 RGBA0 \n\
00861 punpckhwd %%xmm4, %%xmm5 # R3 G3 B3 00 R2 G2 B2 00 \n\
00862 movntdq %%xmm5, 16(%3) # Store RGBA7 RGBA6 RGBA5 RGBA4 \n\
00863 pxor %%xmm6, %%xmm6 # zero xmm6 \n\
00864 punpckhbw %%xmm1, %%xmm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\
00865 punpckhbw %%xmm0, %%xmm6 # B7 00 B6 00 B5 00 B4 00 \n\
00866 movdqa %%xmm6, %%xmm0 # B7 00 B6 00 B5 00 B4 00 \n\
00867 punpcklwd %%xmm2, %%xmm6 # R5 G5 B5 00 R4 G4 B4 00 \n\
00868 movntdq %%xmm6, 32(%3) # Store RGBA11 RGBA10 RGBA9 RGBA8 \n\
00869 punpckhwd %%xmm2, %%xmm0 # R7 G7 B7 00 R6 G6 B6 00 \n\
00870 movntdq %%xmm0, 48(%3) # Store RGBA15 RGBA14 RGBA13 RGBA12 \n\
00871 "
00872
/*
 * SSE2_UNPACK_32_RGBA_UNALIGNED: inline-asm fragment that interleaves three
 * 8-bit colour planes into 16 32-bit pixels and writes 64 bytes to the
 * address in operand %3 with movdqu, so the destination need not be 16-byte
 * aligned.
 *
 * Inputs (per the lane annotations): xmm0 = blue, xmm1 = red, xmm2 = green.
 * Each 32-bit lane is R G B 00 from most to least significant byte.
 * Modified: xmm0, xmm2, xmm3, xmm4, xmm5, xmm6.
 * Annotations list lanes most-significant first.
 */
00873 #define SSE2_UNPACK_32_RGBA_UNALIGNED " \n\
00874 pxor %%xmm3, %%xmm3 # zero xmm3 \n\
00875 movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
00876 punpcklbw %%xmm1, %%xmm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\
00877 punpcklbw %%xmm0, %%xmm3 # B3 00 B2 00 B1 00 B0 00 \n\
00878 movdqa %%xmm3, %%xmm5 # B3 00 B2 00 B1 00 B0 00 \n\
00879 punpcklwd %%xmm4, %%xmm3 # R1 G1 B1 00 R0 G0 B0 00 \n\
00880 movdqu %%xmm3, (%3) # Store RGBA3 RGBA2 RGBA1 RGBA0 \n\
00881 punpckhwd %%xmm4, %%xmm5 # R3 G3 B3 00 R2 G2 B2 00 \n\
00882 movdqu %%xmm5, 16(%3) # Store RGBA7 RGBA6 RGBA5 RGBA4 \n\
00883 pxor %%xmm6, %%xmm6 # zero xmm6 \n\
00884 punpckhbw %%xmm1, %%xmm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\
00885 punpckhbw %%xmm0, %%xmm6 # B7 00 B6 00 B5 00 B4 00 \n\
00886 movdqa %%xmm6, %%xmm0 # B7 00 B6 00 B5 00 B4 00 \n\
00887 punpcklwd %%xmm2, %%xmm6 # R5 G5 B5 00 R4 G4 B4 00 \n\
00888 movdqu %%xmm6, 32(%3) # Store RGBA11 RGBA10 RGBA9 RGBA8 \n\
00889 punpckhwd %%xmm2, %%xmm0 # R7 G7 B7 00 R6 G6 B6 00 \n\
00890 movdqu %%xmm0, 48(%3) # Store RGBA15 RGBA14 RGBA13 RGBA12 \n\
00891 "
00892
/*
 * SSE2_UNPACK_32_BGRA_ALIGNED: inline-asm fragment that interleaves three
 * 8-bit colour planes into 16 32-bit pixels and writes 64 bytes to the
 * address in operand %3 with non-temporal stores (movntdq), which require a
 * 16-byte aligned destination.
 *
 * Inputs (per the lane annotations): xmm0 = blue, xmm1 = red, xmm2 = green.
 * Each 32-bit lane is B G R 00 from most to least significant byte.
 * Modified: xmm0, xmm2, xmm3, xmm4, xmm5, xmm6.
 * Annotations list lanes most-significant first; the "zero mm3" / "zero mm6"
 * notes refer to xmm3 / xmm6, the registers the pxor instructions clear.
 */
00893 #define SSE2_UNPACK_32_BGRA_ALIGNED " \n\
00894 pxor %%xmm3, %%xmm3 # zero mm3 \n\
00895 movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
00896 punpcklbw %%xmm0, %%xmm4 # B3 G3 B2 G2 B1 G1 B0 G0 \n\
00897 punpcklbw %%xmm1, %%xmm3 # R3 00 R2 00 R1 00 R0 00 \n\
00898 movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\
00899 punpcklwd %%xmm4, %%xmm3 # B1 G1 R1 00 B0 G0 R0 00 \n\
00900 movntdq %%xmm3, (%3) # Store BGRA3 BGRA2 BGRA1 BGRA0 \n\
00901 punpckhwd %%xmm4, %%xmm5 # B3 G3 R3 00 B2 G2 R2 00 \n\
00902 movntdq %%xmm5, 16(%3) # Store BGRA7 BGRA6 BGRA5 BGRA4 \n\
00903 pxor %%xmm6, %%xmm6 # zero mm6 \n\
00904 punpckhbw %%xmm0, %%xmm2 # B7 G7 B6 G6 B5 G5 B4 G4 \n\
00905 punpckhbw %%xmm1, %%xmm6 # R7 00 R6 00 R5 00 R4 00 \n\
00906 movdqa %%xmm6, %%xmm0 # R7 00 R6 00 R5 00 R4 00 \n\
00907 punpcklwd %%xmm2, %%xmm6 # B5 G5 R5 00 B4 G4 R4 00 \n\
00908 movntdq %%xmm6, 32(%3) # Store BGRA11 BGRA10 BGRA9 BGRA8 \n\
00909 punpckhwd %%xmm2, %%xmm0 # B7 G7 R7 00 B6 G6 R6 00 \n\
00910 movntdq %%xmm0, 48(%3) # Store BGRA15 BGRA14 BGRA13 BGRA12 \n\
00911 "
00912
/*
 * SSE2_UNPACK_32_BGRA_UNALIGNED: inline-asm fragment that interleaves three
 * 8-bit colour planes into 16 32-bit pixels and writes 64 bytes to the
 * address in operand %3 with movdqu, so the destination need not be 16-byte
 * aligned.
 *
 * Inputs (per the lane annotations): xmm0 = blue, xmm1 = red, xmm2 = green.
 * Each 32-bit lane is B G R 00 from most to least significant byte.
 * Modified: xmm0, xmm2, xmm3, xmm4, xmm5, xmm6.
 * Annotations list lanes most-significant first; the "zero mm3" / "zero mm6"
 * notes refer to xmm3 / xmm6, the registers the pxor instructions clear.
 */
00913 #define SSE2_UNPACK_32_BGRA_UNALIGNED " \n\
00914 pxor %%xmm3, %%xmm3 # zero mm3 \n\
00915 movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\
00916 punpcklbw %%xmm0, %%xmm4 # B3 G3 B2 G2 B1 G1 B0 G0 \n\
00917 punpcklbw %%xmm1, %%xmm3 # R3 00 R2 00 R1 00 R0 00 \n\
00918 movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\
00919 punpcklwd %%xmm4, %%xmm3 # B1 G1 R1 00 B0 G0 R0 00 \n\
00920 movdqu %%xmm3, (%3) # Store BGRA3 BGRA2 BGRA1 BGRA0 \n\
00921 punpckhwd %%xmm4, %%xmm5 # B3 G3 R3 00 B2 G2 R2 00 \n\
00922 movdqu %%xmm5, 16(%3) # Store BGRA7 BGRA6 BGRA5 BGRA4 \n\
00923 pxor %%xmm6, %%xmm6 # zero mm6 \n\
00924 punpckhbw %%xmm0, %%xmm2 # B7 G7 B6 G6 B5 G5 B4 G4 \n\
00925 punpckhbw %%xmm1, %%xmm6 # R7 00 R6 00 R5 00 R4 00 \n\
00926 movdqa %%xmm6, %%xmm0 # R7 00 R6 00 R5 00 R4 00 \n\
00927 punpcklwd %%xmm2, %%xmm6 # B5 G5 R5 00 B4 G4 R4 00 \n\
00928 movdqu %%xmm6, 32(%3) # Store BGRA11 BGRA10 BGRA9 BGRA8 \n\
00929 punpckhwd %%xmm2, %%xmm0 # B7 G7 R7 00 B6 G6 R6 00 \n\
00930 movdqu %%xmm0, 48(%3) # Store BGRA15 BGRA14 BGRA13 BGRA12 \n\
00931 "
00932
/*
 * SSE2_UNPACK_32_ABGR_ALIGNED: inline-asm fragment that interleaves three
 * 8-bit colour planes into 16 32-bit pixels and writes 64 bytes to the
 * address in operand %3 with non-temporal stores (movntdq), which require a
 * 16-byte aligned destination.
 *
 * Inputs (per the lane annotations): xmm0 = blue, xmm1 = red, xmm2 = green.
 * Each 32-bit lane is 00 B G R from most to least significant byte.
 * Modified: xmm0, xmm1, xmm2, xmm3 (zeroed), xmm4, xmm5, xmm6.
 * Annotations list lanes most-significant first.
 */
00933 #define SSE2_UNPACK_32_ABGR_ALIGNED " \n\
00934 pxor %%xmm3, %%xmm3 # zero xmm3 \n\
00935 movdqa %%xmm1, %%xmm4 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
00936 punpcklbw %%xmm2, %%xmm4 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
00937 movdqa %%xmm0, %%xmm5 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
00938 punpcklbw %%xmm3, %%xmm5 # 00 B3 00 B2 00 B1 00 B0 \n\
00939 movdqa %%xmm4, %%xmm6 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
00940 punpcklwd %%xmm5, %%xmm4 # 00 B1 G1 R1 00 B0 G0 R0 \n\
00941 movntdq %%xmm4, (%3) # Store ABGR3 ABGR2 ABGR1 ABGR0 \n\
00942 punpckhwd %%xmm5, %%xmm6 # 00 B3 G3 R3 00 B2 G2 R2 \n\
00943 movntdq %%xmm6, 16(%3) # Store ABGR7 ABGR6 ABGR5 ABGR4 \n\
00944 punpckhbw %%xmm2, %%xmm1 # G7 R7 G6 R6 G5 R5 G4 R4 \n\
00945 punpckhbw %%xmm3, %%xmm0 # 00 B7 00 B6 00 B5 00 B4 \n\
00946 movdqa %%xmm1, %%xmm2 # G7 R7 G6 R6 G5 R5 G4 R4 \n\
00947 punpcklwd %%xmm0, %%xmm1 # 00 B5 G5 R5 00 B4 G4 R4 \n\
00948 movntdq %%xmm1, 32(%3) # Store ABGR11 ABGR10 ABGR9 ABGR8 \n\
00949 punpckhwd %%xmm0, %%xmm2 # 00 B7 G7 R7 00 B6 G6 R6 \n\
00950 movntdq %%xmm2, 48(%3) # Store ABGR15 ABGR14 ABGR13 ABGR12 \n\
00951 "
00952
/*
 * SSE2_UNPACK_32_ABGR_UNALIGNED: inline-asm fragment that interleaves three
 * 8-bit colour planes into 16 32-bit pixels and writes 64 bytes to the
 * address in operand %3 with movdqu, so the destination need not be 16-byte
 * aligned.
 *
 * Inputs (per the lane annotations): xmm0 = blue, xmm1 = red, xmm2 = green.
 * Each 32-bit lane is 00 B G R from most to least significant byte.
 * Modified: xmm0, xmm1, xmm2, xmm3 (zeroed), xmm4, xmm5, xmm6.
 * Annotations list lanes most-significant first.
 */
00953 #define SSE2_UNPACK_32_ABGR_UNALIGNED " \n\
00954 pxor %%xmm3, %%xmm3 # zero xmm3 \n\
00955 movdqa %%xmm1, %%xmm4 # R7 R6 R5 R4 R3 R2 R1 R0 \n\
00956 punpcklbw %%xmm2, %%xmm4 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
00957 movdqa %%xmm0, %%xmm5 # B7 B6 B5 B4 B3 B2 B1 B0 \n\
00958 punpcklbw %%xmm3, %%xmm5 # 00 B3 00 B2 00 B1 00 B0 \n\
00959 movdqa %%xmm4, %%xmm6 # G3 R3 G2 R2 G1 R1 G0 R0 \n\
00960 punpcklwd %%xmm5, %%xmm4 # 00 B1 G1 R1 00 B0 G0 R0 \n\
00961 movdqu %%xmm4, (%3) # Store ABGR3 ABGR2 ABGR1 ABGR0 \n\
00962 punpckhwd %%xmm5, %%xmm6 # 00 B3 G3 R3 00 B2 G2 R2 \n\
00963 movdqu %%xmm6, 16(%3) # Store ABGR7 ABGR6 ABGR5 ABGR4 \n\
00964 punpckhbw %%xmm2, %%xmm1 # G7 R7 G6 R6 G5 R5 G4 R4 \n\
00965 punpckhbw %%xmm3, %%xmm0 # 00 B7 00 B6 00 B5 00 B4 \n\
00966 movdqa %%xmm1, %%xmm2 # G7 R7 G6 R6 G5 R5 G4 R4 \n\
00967 punpcklwd %%xmm0, %%xmm1 # 00 B5 G5 R5 00 B4 G4 R4 \n\
00968 movdqu %%xmm1, 32(%3) # Store ABGR11 ABGR10 ABGR9 ABGR8 \n\
00969 punpckhwd %%xmm0, %%xmm2 # 00 B7 G7 R7 00 B6 G6 R6 \n\
00970 movdqu %%xmm2, 48(%3) # Store ABGR15 ABGR14 ABGR13 ABGR12 \n\
00971 "
00972
00973 #elif defined(HAVE_SSE2_INTRINSICS)
00974
00975
00976
00977 #include <emmintrin.h>
00978
/*
 * SSE2_CALL (intrinsics variant): expand SSE2_INSTRUCTIONS inside a
 * do { } while(0) scope that declares eight __m128i locals named
 * xmm0..xmm7. The SSE2_* statement macros below read and write these
 * locals by name, so they are only usable as the argument of this macro.
 * The locals are declared without initialisers; the instruction sequence
 * is responsible for writing each one before reading it.
 */
00979 #define SSE2_CALL(SSE2_INSTRUCTIONS) \
00980 do { \
00981 __m128i xmm0, xmm1, xmm2, xmm3, \
00982 xmm4, xmm5, xmm6, xmm7; \
00983 SSE2_INSTRUCTIONS \
00984 } while(0)
00985
/*
 * SSE2_END (intrinsics variant): issue a store fence (_mm_sfence).
 * NOTE(review): presumably meant to follow a run of conversions so that
 * the non-temporal stores made by the *_ALIGNED unpack macros
 * (_mm_stream_si128) are ordered before later stores -- confirm at the
 * call sites. No trailing semicolon: the caller supplies it.
 */
00986 #define SSE2_END _mm_sfence()
00987
/*
 * SSE2_INIT_16_ALIGNED: load one batch of input samples.
 *   xmm0 <- 8 bytes read from p_u (low 64 bits, upper half zeroed)
 *   xmm1 <- 8 bytes read from p_v (low 64 bits, upper half zeroed)
 *   xmm4 <- all zeroes (used by SSE2_YUV_MUL as the zero-extension source)
 *   xmm6 <- 16 bytes read from p_y with an aligned load
 * _mm_load_si128 requires p_y to be 16-byte aligned.
 * Uses the xmm locals declared by SSE2_CALL and the caller's p_y/p_u/p_v.
 */
00988 #define SSE2_INIT_16_ALIGNED \
00989 xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
00990 xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
00991 xmm4 = _mm_setzero_si128(); \
00992 xmm6 = _mm_load_si128((__m128i *)p_y);
00993
/*
 * SSE2_INIT_16_UNALIGNED: load one batch of input samples without any
 * alignment requirement on p_y.
 *   xmm0 <- 8 bytes read from p_u (low 64 bits, upper half zeroed)
 *   xmm1 <- 8 bytes read from p_v (low 64 bits, upper half zeroed)
 *   xmm4 <- all zeroes (used by SSE2_YUV_MUL as the zero-extension source)
 *   xmm6 <- 16 bytes read from p_y with an unaligned load
 * Also issues a non-temporal prefetch hint for the memory at p_buffer.
 * Uses the xmm locals declared by SSE2_CALL and the caller's pointers.
 */
00994 #define SSE2_INIT_16_UNALIGNED \
00995 xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
00996 xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
00997 xmm4 = _mm_setzero_si128(); \
00998 xmm6 = _mm_loadu_si128((__m128i *)p_y); \
00999 _mm_prefetch(p_buffer, _MM_HINT_NTA);
01000
/*
 * SSE2_INIT_32_ALIGNED: load one batch of input samples. In this
 * intrinsics variant the statements are identical to SSE2_INIT_16_ALIGNED.
 *   xmm0 <- 8 bytes read from p_u (low 64 bits, upper half zeroed)
 *   xmm1 <- 8 bytes read from p_v (low 64 bits, upper half zeroed)
 *   xmm4 <- all zeroes (used by SSE2_YUV_MUL as the zero-extension source)
 *   xmm6 <- 16 bytes read from p_y with an aligned load
 * _mm_load_si128 requires p_y to be 16-byte aligned.
 */
01001 #define SSE2_INIT_32_ALIGNED \
01002 xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
01003 xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
01004 xmm4 = _mm_setzero_si128(); \
01005 xmm6 = _mm_load_si128((__m128i *)p_y);
01006
/*
 * SSE2_INIT_32_UNALIGNED: load one batch of input samples without any
 * alignment requirement on p_y. In this intrinsics variant the statements
 * are identical to SSE2_INIT_16_UNALIGNED.
 *   xmm0 <- 8 bytes read from p_u (low 64 bits, upper half zeroed)
 *   xmm1 <- 8 bytes read from p_v (low 64 bits, upper half zeroed)
 *   xmm4 <- all zeroes (used by SSE2_YUV_MUL as the zero-extension source)
 *   xmm6 <- 16 bytes read from p_y with an unaligned load
 * Also issues a non-temporal prefetch hint for the memory at p_buffer.
 */
01007 #define SSE2_INIT_32_UNALIGNED \
01008 xmm0 = _mm_loadl_epi64((__m128i *)p_u); \
01009 xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
01010 xmm4 = _mm_setzero_si128(); \
01011 xmm6 = _mm_loadu_si128((__m128i *)p_y); \
01012 _mm_prefetch(p_buffer, _MM_HINT_NTA);
01013
/*
 * SSE2_YUV_MUL: scale the loaded samples by the fixed-point conversion
 * coefficients. Expects xmm0 = U bytes, xmm1 = V bytes, xmm4 = zero and
 * xmm6 = 16 Y bytes, as left by the SSE2_INIT_* macros.
 *
 * Chroma:
 *   - interleave U and V with xmm4 to widen each byte to a 16-bit lane;
 *   - subtract 0x0080 from every lane (signed saturating), shift left by 3;
 *   - xmm2 = mulhi(U, 0xf37d) + mulhi(V, 0xe5fc)  (saturating add);
 *   - xmm0 = mulhi(U, 0x4093);  xmm1 = mulhi(V, 0x3312).
 *   These 16-bit constants are the same values as mmx_U_green, mmx_V_green,
 *   mmx_U_blue and mmx_V_red defined at the top of the file.
 *
 * Luma:
 *   - subtract 0x10 from every Y byte with unsigned saturation;
 *   - xmm6 = even-indexed Y bytes (mask 0x00ff), xmm7 = odd-indexed Y bytes
 *     (each 16-bit lane shifted right by 8);
 *   - shift both left by 3 and take mulhi with 0x253f (same value as
 *     mmx_Y_coeff).
 *
 * On exit: xmm0, xmm1, xmm2 hold the chroma terms; xmm6 and xmm7 hold the
 * scaled luma for even and odd samples; xmm3 and xmm5 are scratch.
 * The constants are written with a UL suffix and passed to
 * _mm_set1_epi32, which takes an int.
 */
01014 #define SSE2_YUV_MUL \
01015 xmm0 = _mm_unpacklo_epi8(xmm0, xmm4); \
01016 xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); \
01017 xmm5 = _mm_set1_epi32(0x00800080UL); \
01018 xmm0 = _mm_subs_epi16(xmm0, xmm5); \
01019 xmm1 = _mm_subs_epi16(xmm1, xmm5); \
01020 xmm0 = _mm_slli_epi16(xmm0, 3); \
01021 xmm1 = _mm_slli_epi16(xmm1, 3); \
01022 xmm2 = xmm0; \
01023 xmm3 = xmm1; \
01024 xmm5 = _mm_set1_epi32(0xf37df37dUL); \
01025 xmm2 = _mm_mulhi_epi16(xmm2, xmm5); \
01026 xmm5 = _mm_set1_epi32(0xe5fce5fcUL); \
01027 xmm3 = _mm_mulhi_epi16(xmm3, xmm5); \
01028 xmm5 = _mm_set1_epi32(0x40934093UL); \
01029 xmm0 = _mm_mulhi_epi16(xmm0, xmm5); \
01030 xmm5 = _mm_set1_epi32(0x33123312UL); \
01031 xmm1 = _mm_mulhi_epi16(xmm1, xmm5); \
01032 xmm2 = _mm_adds_epi16(xmm2, xmm3); \
01033 \
01034 xmm5 = _mm_set1_epi32(0x10101010UL); \
01035 xmm6 = _mm_subs_epu8(xmm6, xmm5); \
01036 xmm7 = xmm6; \
01037 xmm5 = _mm_set1_epi32(0x00ff00ffUL); \
01038 xmm6 = _mm_and_si128(xmm6, xmm5); \
01039 xmm7 = _mm_srli_epi16(xmm7, 8); \
01040 xmm6 = _mm_slli_epi16(xmm6, 3); \
01041 xmm7 = _mm_slli_epi16(xmm7, 3); \
01042 xmm5 = _mm_set1_epi32(0x253f253fUL); \
01043 xmm6 = _mm_mulhi_epi16(xmm6, xmm5); \
01044 xmm7 = _mm_mulhi_epi16(xmm7, xmm5);
01045
/*
 * SSE2_YUV_ADD: combine the chroma and luma terms produced by SSE2_YUV_MUL
 * into three planes of 16 unsigned bytes.
 *
 *   - xmm3/xmm4/xmm5 receive copies of the chroma terms xmm0/xmm1/xmm2;
 *   - the originals get the even-sample luma (xmm6) added, the copies get
 *     the odd-sample luma (xmm7) added, all with signed saturation;
 *   - each 16-bit result is packed to a byte with unsigned saturation
 *     (the register is packed with itself, so both halves hold the same
 *     eight bytes);
 *   - _mm_unpacklo_epi8 interleaves the even-sample and odd-sample bytes,
 *     restoring sample order.
 *
 * On exit xmm0, xmm1 and xmm2 each hold 16 colour bytes. xmm0 derives from
 * the U * 0x4093 term and xmm1 from the V * 0x3312 term; the unpack macros
 * annotate them as blue, red and green respectively.
 * xmm3, xmm4 and xmm5 are overwritten; xmm6 and xmm7 are read only.
 */
01046 #define SSE2_YUV_ADD \
01047 xmm3 = xmm0; \
01048 xmm4 = xmm1; \
01049 xmm5 = xmm2; \
01050 xmm0 = _mm_adds_epi16(xmm0, xmm6); \
01051 xmm3 = _mm_adds_epi16(xmm3, xmm7); \
01052 xmm1 = _mm_adds_epi16(xmm1, xmm6); \
01053 xmm4 = _mm_adds_epi16(xmm4, xmm7); \
01054 xmm2 = _mm_adds_epi16(xmm2, xmm6); \
01055 xmm5 = _mm_adds_epi16(xmm5, xmm7); \
01056 \
01057 xmm0 = _mm_packus_epi16(xmm0, xmm0); \
01058 xmm1 = _mm_packus_epi16(xmm1, xmm1); \
01059 xmm2 = _mm_packus_epi16(xmm2, xmm2); \
01060 \
01061 xmm3 = _mm_packus_epi16(xmm3, xmm3); \
01062 xmm4 = _mm_packus_epi16(xmm4, xmm4); \
01063 xmm5 = _mm_packus_epi16(xmm5, xmm5); \
01064 \
01065 xmm0 = _mm_unpacklo_epi8(xmm0, xmm3); \
01066 xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); \
01067 xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
01068
01069 #define SSE2_UNPACK_15_ALIGNED \
01070 xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \
01071 xmm0 = _mm_and_si128(xmm0, xmm5); \
01072 xmm0 = _mm_srli_epi16(xmm0, 3); \
01073 xmm2 = _mm_and_si128(xmm2, xmm5); \
01074 xmm1 = _mm_and_si128(xmm1, xmm5); \
01075 xmm1 = _mm_srli_epi16(xmm1, 1); \
01076 xmm4 = _mm_setzero_si128(); \
01077 xmm5 = xmm0; \
01078 xmm7 = xmm2; \
01079 \
01080 xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \
01081 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
01082 xmm2 = _mm_slli_epi16(xmm2, 2); \
01083 xmm0 = _mm_or_si128(xmm0, xmm2); \
01084 _mm_stream_si128((__m128i*)p_buffer, xmm0); \
01085 \