00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
/*
 * Private filter state: tuning values, scratch memory and the two line
 * kernels that filter_plane() dispatches through.
 */
struct vf_priv_s {
    int thresh;       /* handed unchanged to filter_line() by filter_plane() */
    int radius;       /* not read in this part of the file; presumably the blur radius -- TODO confirm */
    uint16_t *buf;    /* scratch area; filter_plane() places its dc row and blur rows inside it */

    /* Produces `width` output pixels in dst from src, the dc row and a dither row. */
    void (*filter_line)(uint8_t *dst, uint8_t *src, uint16_t *dc,
                        int width, int thresh, const uint16_t *dithers);

    /* Accumulates 2x2 source sums into buf (on top of buf1) and writes the change to dc. */
    void (*blur_line)(uint16_t *dc, uint16_t *buf, uint16_t *buf1,
                      uint8_t *src, int sstride, int width);
};
00039
/* Eight 16-bit lanes holding 127; loaded as the 127 constant by the SIMD filter_line kernels. */
static const uint16_t __attribute__((aligned(16))) pw_7f[8] = {
    0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f
};

/* Eight 16-bit lanes holding 255; used as a low-byte mask by the SSE2 blur kernel. */
static const uint16_t __attribute__((aligned(16))) pw_ff[8] = {
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};

/*
 * 8x8 dither matrix. It contains every even value from 0 to 126 exactly
 * once and each row sums to 504. filter_plane() selects row (y & 7);
 * filter_line_c() selects column (x & 7). Each row is 16 bytes, so with the
 * 16-byte alignment of the table every row is itself 16-byte aligned.
 */
static const uint16_t __attribute__((aligned(16))) dither[8][8] = {
    {   0,  96,  24, 120,   6, 102,  30, 126 },
    {  64,  32,  88,  56,  70,  38,  94,  62 },
    {  16, 112,   8, 104,  22, 118,  14, 110 },
    {  80,  48,  72,  40,  86,  54,  78,  46 },
    {   4, 100,  28, 124,   2,  98,  26, 122 },
    {  68,  36,  92,  60,  66,  34,  90,  58 },
    {  20, 116,  12, 108,  18, 114,  10, 106 },
    {  84,  52,  76,  44,  82,  50,  74,  42 },
};
00052
/*
 * Portable reference kernel: filter one line of `width` pixels.
 *
 * Each source pixel is scaled by 128 (<<7) and compared with the current dc
 * entry. The difference is weighted by max(0, 127 - (|diff|*thresh >> 16))
 * squared, so the pull towards dc fades out as the difference grows. The
 * per-column dither value (dithers[x & 7]) is added before scaling back down
 * (>>7) and clamping to 0..255.
 *
 * dc advances by one entry each time the pixel index becomes odd, i.e.
 * pixel 0 reads dc[0], pixels 1-2 read dc[1], pixels 3-4 read dc[2], ...
 */
static void filter_line_c(uint8_t *dst, uint8_t *src, uint16_t *dc,
                          int width, int thresh, const uint16_t *dithers)
{
    int i = 0;

    while (i < width) {
        const int base = src[i] << 7;
        const int diff = dc[0] - base;
        int weight = 127 - (abs(diff) * thresh >> 16);
        int out;

        if (weight < 0)
            weight = 0;

        out = (base + (weight * weight * diff >> 14) + dithers[i & 7]) >> 7;

        /* saturate to the 8-bit range */
        if (out < 0)
            out = 0;
        else if (out > 255)
            out = 255;
        dst[i] = out;

        i++;
        dc += i & 1;
    }
}
00067
/*
 * Portable reference kernel for one blur row.
 *
 * For each of `width` output positions, add the 2x2 block of source pixels
 * (two adjacent pixels on the row at src and the two below them, sstride
 * bytes further) to buf1[i]. The result replaces buf[i]; the difference
 * between the new and the previous buf[i] is written to dc[i]. Stores are
 * truncated to 16 bits.
 */
static void blur_line_c(uint16_t *dc, uint16_t *buf, uint16_t *buf1,
                        uint8_t *src, int sstride, int width)
{
    int i;

    for (i = 0; i < width; i++) {
        const uint8_t *px = src + 2*i;
        int sum = buf1[i] + px[0] + px[1] + px[sstride] + px[sstride + 1];
        int prev = buf[i];

        buf[i] = sum;
        dc[i]  = sum - prev;
    }
}
00079
#if HAVE_MMX2
/*
 * MMX2 line kernel with the same parameters as filter_line_c().
 *
 * Pixels are processed four at a time. A trailing partial group
 * (width & 3 pixels) is delegated to filter_line_c() first. Each group of
 * four pixels consumes two dc entries (each entry is duplicated for two
 * adjacent pixels) and always uses the first four dither values.
 *
 * The asm loop is bottom-tested, so it must not be entered when no full
 * group of four remains: it would then still process four pixels at offset
 * 0, overwriting what the C tail just produced and writing four bytes
 * regardless of the real width. The early return prevents that.
 */
static void filter_line_mmx2(uint8_t *dst, uint8_t *src, uint16_t *dc,
                             int width, int thresh, const uint16_t *dithers)
{
    intptr_t x;
    if (width&3) {
        x = width&~3;
        filter_line_c(dst+x, src+x, dc+x/2, width-x, thresh, dithers);
        width = x;
    }
    if (width <= 0)
        return; /* nothing left for the SIMD loop */
    /* x runs from -width up to 0; all pointers are pre-biased by +width */
    x = -width;
    __asm__ volatile(
        "movd %4, %%mm5 \n"                 /* thresh */
        "pxor %%mm7, %%mm7 \n"              /* mm7 = 0 */
        "pshufw $0, %%mm5, %%mm5 \n"        /* broadcast low word of thresh to 4 lanes */
        "movq %6, %%mm6 \n"                 /* mm6 = 127 in each lane */
        "movq %5, %%mm4 \n"                 /* mm4 = dithers[0..3] */
        "1: \n"
        "movd (%2,%0), %%mm0 \n"            /* 4 source pixels */
        "movd (%3,%0), %%mm1 \n"            /* 2 dc entries */
        "punpcklbw %%mm7, %%mm0 \n"         /* pixels -> 16-bit lanes */
        "punpcklwd %%mm1, %%mm1 \n"         /* duplicate each dc entry */
        "psllw $7, %%mm0 \n"                /* pix = src << 7 */
        "pxor %%mm2, %%mm2 \n"
        "psubw %%mm0, %%mm1 \n"             /* delta = dc - pix */
        "psubw %%mm1, %%mm2 \n"             /* -delta */
        "pmaxsw %%mm1, %%mm2 \n"            /* |delta| */
        "pmulhuw %%mm5, %%mm2 \n"           /* m = |delta| * thresh >> 16 */
        "psubw %%mm6, %%mm2 \n"             /* m - 127 */
        "pminsw %%mm7, %%mm2 \n"            /* min(m - 127, 0) = -max(0, 127 - m) */
        "pmullw %%mm2, %%mm2 \n"            /* squared (sign drops out) */
        "paddw %%mm4, %%mm0 \n"             /* pix += dither */
        "pmulhw %%mm2, %%mm1 \n"            /* m*m*delta >> 16 */
        "psllw $2, %%mm1 \n"                /* rescale to the >> 14 range */
        "paddw %%mm1, %%mm0 \n"
        "psraw $7, %%mm0 \n"                /* back to pixel scale */
        "packuswb %%mm0, %%mm0 \n"          /* saturate to 0..255 */
        "movd %%mm0, (%1,%0) \n"            /* store 4 pixels */
        "add $4, %0 \n"
        "jl 1b \n"
        "emms \n"
        :"+r"(x)
        :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
         "rm"(thresh), "m"(*dithers), "m"(*pw_7f)
        :"memory"
    );
}
#endif
00128
#if HAVE_SSSE3
/*
 * SSSE3 line kernel with the same parameters as filter_line_c().
 *
 * Pixels are processed eight at a time. A trailing partial group
 * (width & 7 pixels) is delegated to filter_line_c() first. Each group of
 * eight pixels consumes four dc entries (each entry is duplicated for two
 * adjacent pixels). The eight dither values are loaded with movdqa, so
 * `dithers` must be 16-byte aligned.
 *
 * The asm loop is bottom-tested, so it must not be entered when no full
 * group of eight remains: it would then still process eight pixels at
 * offset 0, overwriting what the C tail just produced and writing eight
 * bytes regardless of the real width. The early return prevents that.
 */
static void filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc,
                              int width, int thresh, const uint16_t *dithers)
{
    intptr_t x;
    if (width&7) {
        /* handle the leftover pixels with the C kernel */
        x = width&~7;
        filter_line_c(dst+x, src+x, dc+x/2, width-x, thresh, dithers);
        width = x;
    }
    if (width <= 0)
        return; /* nothing left for the SIMD loop */
    /* x runs from -width up to 0; all pointers are pre-biased by +width */
    x = -width;
    __asm__ volatile(
        "movd %4, %%xmm5 \n"                /* thresh */
        "pxor %%xmm7, %%xmm7 \n"            /* xmm7 = 0 */
        "pshuflw $0,%%xmm5, %%xmm5 \n"      /* broadcast within the low 4 lanes */
        "movdqa %6, %%xmm6 \n"              /* xmm6 = 127 in each lane */
        "punpcklqdq %%xmm5, %%xmm5 \n"      /* ... and into the high 4 lanes */
        "movdqa %5, %%xmm4 \n"              /* xmm4 = dithers[0..7] */
        "1: \n"
        "movq (%2,%0), %%xmm0 \n"           /* 8 source pixels */
        "movq (%3,%0), %%xmm1 \n"           /* 4 dc entries */
        "punpcklbw %%xmm7, %%xmm0 \n"       /* pixels -> 16-bit lanes */
        "punpcklwd %%xmm1, %%xmm1 \n"       /* duplicate each dc entry */
        "psllw $7, %%xmm0 \n"               /* pix = src << 7 */
        "psubw %%xmm0, %%xmm1 \n"           /* delta = dc - pix */
        "pabsw %%xmm1, %%xmm2 \n"           /* |delta| */
        "pmulhuw %%xmm5, %%xmm2 \n"         /* m = |delta| * thresh >> 16 */
        "psubw %%xmm6, %%xmm2 \n"           /* m - 127 */
        "pminsw %%xmm7, %%xmm2 \n"          /* min(m - 127, 0) = -max(0, 127 - m) */
        "pmullw %%xmm2, %%xmm2 \n"          /* squared (sign drops out) */
        "psllw $1, %%xmm2 \n"               /* pre-scale for the >> 15 of pmulhrsw */
        "paddw %%xmm4, %%xmm0 \n"           /* pix += dither */
        "pmulhrsw %%xmm2, %%xmm1 \n"        /* rounded m*m*delta >> 14 overall */
        "paddw %%xmm1, %%xmm0 \n"
        "psraw $7, %%xmm0 \n"               /* back to pixel scale */
        "packuswb %%xmm0, %%xmm0 \n"        /* saturate to 0..255 */
        "movq %%xmm0, (%1,%0) \n"           /* store 8 pixels */
        "add $8, %0 \n"
        "jl 1b \n"
        :"+&r"(x)
        :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
         "rm"(thresh), "m"(*dithers), "m"(*pw_7f)
        :"memory"
    );
}
#endif // HAVE_SSSE3
00176
#if HAVE_SSE2 && HAVE_6REGS
/*
 * BLURV(load): body of the SSE2 blur kernel; `load` is the mnemonic used to
 * fetch the two source rows ("movdqa" or "movdqu").
 *
 * Expects dc, buf, buf1, src, sstride and width in the enclosing scope.
 * Each iteration reads 16 bytes from two source rows, splits them into even
 * and odd bytes (mask with pw_ff / shift right by 8) and adds the four
 * parts, giving eight 2x2 sums. buf1 is added, the old buf value is loaded,
 * the new value is stored to buf and (new - old) is stored to dc.
 *
 * buf, buf1 and dc are accessed with instructions that require 16-byte
 * alignment. Work is done in groups of eight outputs, so the loop may read
 * and write beyond `width` entries, up to the next multiple of eight.
 *
 * Wrapped in do { } while (0) so that an invocation followed by ';' is a
 * single statement.
 */
#define BLURV(load)\
    do {\
        intptr_t x = -2*width;\
        __asm__ volatile(\
            "movdqa %6, %%xmm7 \n"\
            "1: \n"\
            load" (%4,%0), %%xmm0 \n"\
            load" (%5,%0), %%xmm1 \n"\
            "movdqa %%xmm0, %%xmm2 \n"\
            "movdqa %%xmm1, %%xmm3 \n"\
            "psrlw $8, %%xmm0 \n"\
            "psrlw $8, %%xmm1 \n"\
            "pand %%xmm7, %%xmm2 \n"\
            "pand %%xmm7, %%xmm3 \n"\
            "paddw %%xmm1, %%xmm0 \n"\
            "paddw %%xmm3, %%xmm2 \n"\
            "paddw %%xmm2, %%xmm0 \n"\
            "paddw (%2,%0), %%xmm0 \n"\
            "movdqa (%1,%0), %%xmm1 \n"\
            "movdqa %%xmm0, (%1,%0) \n"\
            "psubw %%xmm1, %%xmm0 \n"\
            "movdqa %%xmm0, (%3,%0) \n"\
            "add $16, %0 \n"\
            "jl 1b \n"\
            :"+&r"(x)\
            :"r"(buf+width),\
             "r"(buf1+width),\
             "r"(dc+width),\
             "r"(src+width*2),\
             "r"(src+width*2+sstride),\
             "m"(*pw_ff)\
            :"memory"\
        );\
    } while (0)

/*
 * SSE2 blur kernel with the same parameters as blur_line_c().
 *
 * Uses aligned source loads when both src and sstride are multiples of 16,
 * unaligned loads otherwise. The asm loop is bottom-tested and would run
 * one full 16-byte iteration even for a non-positive width, so that case
 * returns early.
 */
static void blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1,
                           uint8_t *src, int sstride, int width)
{
    if (width <= 0)
        return;
    if (((intptr_t)src|sstride)&15) {
        BLURV("movdqu");
    } else {
        BLURV("movdqa");
    }
}
#endif // HAVE_6REGS && HAVE_SSE2
00221
00222 static void filter_plane(struct vf_priv_s *ctx, uint8_t *dst, uint8_t *src,
00223 int width, int height, int dstride, int sstride, int r)
00224 {
00225 int bstride = ((width+15)&~15)/2;
00226 int y;
00227 uint32_t dc_factor = (1<<21)/(r*r);
00228 uint16_t *dc = ctx->buf+16;
00229 uint16_t *buf = ctx->buf+bstride+32;
00230 int thresh = ctx->thresh;
00231
00232 memset(dc, 0, (bstride+16)*sizeof(*buf));
00233 for (y=0; y<r; y++)
00234 ctx->blur_line(dc, buf+y*bstride, buf+(y-1)*bstride, src+2*y*sstride, sstride, width/2);
00235 for (;;) {
00236 if (y < height-r) {
00237 int mod = ((y+r)/2)%r;
00238 uint16_t *buf0 = buf+mod*bstride;
00239 uint16_t *buf1 = buf+(mod?mod-1:r-1)*bstride;
00240 int x, v;
00241 ctx->blur_line(dc, buf0, buf1, src+(y+r)*sstride, sstride, width/2);
00242 for (x=v=0; x<r; x++)
00243 v += dc[x];
00244 for (; x<width/2; x++) {
00245 v += dc[x] - dc[x-r];
00246 dc[x-r] = v * dc_factor >> 16;
00247 }
00248 for (; x<(width+r+1)/2; x++)
00249 dc[x-r] = v * dc_factor >> 16;
00250 for (x=-r/2; x<0; x++)
00251 dc[x] = dc[0];
00252 }
00253 if (y == r) {
00254 for (y=0; y<r; y++)
00255 ctx->filter_line(dst+y*dstride, src+y*sstride, dc-r/2, width, thresh, dither[y&7]);
00256 }
00257 ctx->filter_line(dst+y*dstride, src+y*sstride, dc-r/2, width, thresh, dither[y&7]);
00258 if (++y >= height) break;
00259 ctx->filter_line(dst+y*dstride, src+y*sstride, dc-r/2, width, thresh, dither[y&7]);
00260 if (++y >= height) break;
00261 }
00262 }
00263