diff --git a/postproc/swscale.c b/postproc/swscale.c
index c5a5b0d5979c1f8811c19f475c286c64f1fea427..d86edf19c9b55a5a5cff1d0724d6d4b0025e9961 100644
--- a/postproc/swscale.c
+++ b/postproc/swscale.c
@@ -34,6 +34,7 @@
 #include "swscale.h"
 #include "../cpudetect.h"
 #include "../libvo/img_format.h"
+#include "rgb2rgb.h"
 #undef MOVNTQ
 #undef PAVGB
 
@@ -69,6 +70,7 @@
 			|| (x)==IMGFMT_Y800)
 #define isSupportedOut(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 \
 			|| (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15)
+#define isBGR(x)       ((x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15)
 
 #define RGB2YUV_SHIFT 16
 #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
@@ -92,7 +94,6 @@ Special versions: fast Y 1:1 scaling (no interpolation in y direction)
 
 TODO
 more intelligent missalignment avoidance for the horizontal scaler
-change the distance of the u & v buffer
 write special vertical cubic upscale version
 Optimize C code (yv12 / minmax)
 add support for packed pixel yuv input & output
@@ -100,6 +101,7 @@ add support for Y8 output
 optimize bgr24 & bgr32
 add BGR4 output support
 write special BGR->BGR scaler
+deglobalize yuv2rgb*.c
 */
 
 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
@@ -1107,12 +1109,22 @@ cpuCaps= gCpuCaps;
 #endif //!RUNTIME_CPUDETECT
 }
 
+/* Wrapper for yuv2rgb(): converts one unscaled planar YUV slice (src[]) to packed BGR in dst[0] */
+static void planarYuvToBgr(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
+             int srcSliceH, uint8_t* dst[], int dstStride[]){
+	/* convert only srcSliceH rows, stored from dst row srcSliceY; assumes src[] points at the slice's first row - TODO confirm */
+	if(c->srcFormat==IMGFMT_YV12)
+		yuv2rgb( dst[0]+srcSliceY*dstStride[0],src[0],src[1],src[2],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
+	else /* I420 & IYUV: chroma planes are passed in swapped order */
+		yuv2rgb( dst[0]+srcSliceY*dstStride[0],src[0],src[2],src[1],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
+}
 
 SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
                          SwsFilter *srcFilter, SwsFilter *dstFilter){
 
 	SwsContext *c;
 	int i;
+	int usesFilter;
 	SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
 
 #ifdef ARCH_X86
@@ -1162,6 +1174,33 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 	c->dstFormat= dstFormat;
 	c->srcFormat= srcFormat;
 
+	usesFilter=0;
+	if(dstFilter->lumV!=NULL && dstFilter->lumV->length>1) usesFilter=1;
+	if(dstFilter->lumH!=NULL && dstFilter->lumH->length>1) usesFilter=1;
+	if(dstFilter->chrV!=NULL && dstFilter->chrV->length>1) usesFilter=1;
+	if(dstFilter->chrH!=NULL && dstFilter->chrH->length>1) usesFilter=1;
+	if(srcFilter->lumV!=NULL && srcFilter->lumV->length>1) usesFilter=1;
+	if(srcFilter->lumH!=NULL && srcFilter->lumH->length>1) usesFilter=1;
+	if(srcFilter->chrV!=NULL && srcFilter->chrV->length>1) usesFilter=1;
+	if(srcFilter->chrH!=NULL && srcFilter->chrH->length>1) usesFilter=1;
+	
+	/* special Cases */
+	if(srcW==dstW && srcH==dstH && !usesFilter)
+	{
+		/* yuv2bgr */
+		if(isPlanarYUV(srcFormat) && isBGR(dstFormat))
+		{
+			// FIXME: multiple yuv2rgb converters won't work this way because yuv2rgb*.c is full of globals & statics
+			yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_BGR);
+			c->swScale= planarYuvToBgr;
+			
+			if(flags&SWS_PRINT_INFO)
+				printf("SwScaler: using unscaled %s -> %s special converter\n", 
+					vo_format_name(srcFormat), vo_format_name(dstFormat));
+			return c;
+		}
+	}
+
 	if(cpuCaps.hasMMX2)
 	{
 		c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
@@ -1403,7 +1442,8 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 		printf("SwScaler:Chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
 			c->chrSrcW, c->chrSrcH, c->chrDstW, c->chrDstH, c->chrXInc, c->chrYInc);
 	}
-	
+
+	c->swScale= swScale;
 	return c;
 }
 
diff --git a/postproc/swscale.h b/postproc/swscale.h
index 1440745078870cc0ff21cfcf5d1d001f7b62fb7c..03b63a6501862854c7ec7669f21f4649dc71ad1a 100644
--- a/postproc/swscale.h
+++ b/postproc/swscale.h
@@ -37,7 +37,7 @@
 #define SWS_MAX_REDUCE_CUTOFF 0.002
 
 /* this struct should be aligned on at least 32-byte boundary */
-typedef struct{
+typedef struct SwsContext{
 	int srcW, srcH, dstW, dstH;
 	int chrSrcW, chrSrcH, chrDstW, chrDstH;
 	int lumXInc, chrXInc;
@@ -78,6 +78,9 @@ typedef struct{
 	int chrBufIndex;
 	int dstY;
 	int flags;
+
+	void (*swScale)(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
+             int srcSliceH, uint8_t* dst[], int dstStride[]);
 } SwsContext;
 //FIXME check init (where 0)
 
@@ -116,9 +119,6 @@ SwsContext *getSwsContextFromCmdLine(int srcW, int srcH, int srcFormat, int dstW
 SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
 			 SwsFilter *srcFilter, SwsFilter *dstFilter);
 
-extern void (*swScale)(SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
-             int srcSliceH, uint8_t* dst[], int dstStride[]);
-
 SwsVector *getGaussianVec(double variance, double quality);
 SwsVector *getConstVec(double c, int length);
 SwsVector *getIdentityVec(void);
diff --git a/postproc/swscale_template.c b/postproc/swscale_template.c
index ba8a8c1fe127aed492498fbbc2771a2624f355d1..b56d41498482e911e775e0cf22b06ecb529e763b 100644
--- a/postproc/swscale_template.c
+++ b/postproc/swscale_template.c
@@ -2192,7 +2192,7 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar
 	uint8_t *src[3];
 	uint8_t *dst[3];
 	
-	if((c->srcFormat == IMGFMT_IYUV) || (c->srcFormat == IMGFMT_I420)){
+	if(c->srcFormat == IMGFMT_I420){
 		src[0]= srcParam[0];
 		src[1]= srcParam[2];
 		src[2]= srcParam[1];
@@ -2225,7 +2225,7 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar
 		srcStride[2]= 0;
 	}
 
-	if((c->dstFormat == IMGFMT_IYUV) || (c->dstFormat == IMGFMT_I420)){
+	if(c->dstFormat == IMGFMT_I420){
 		dst[0]= dstParam[0];
 		dst[1]= dstParam[2];
 		dst[2]= dstParam[1];
@@ -2235,7 +2235,9 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar
 		dst[1]= dstParam[1];
 		dst[2]= dstParam[2];
 	}
-	
+
+//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
+//dstStride[0],dstStride[1],dstStride[2]);
 
 	if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
 	{