Skip to content
Snippets Groups Projects
rgb2rgb_template.c 98.3 KiB
Newer Older
  • Learn to ignore specific revisions
  •  * software RGB to RGB converter
     * pluralized by software PAL8 to RGB converter
     *              software YUV to YUV converter
     *              software YUV to RGB converter
     * Written by Nick Kurshev.
     * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
     * lot of big-endian byte order fixes by Alex Beregszaszi
    
     * This file is part of FFmpeg.
     *
     * FFmpeg is free software; you can redistribute it and/or modify
    
     * it under the terms of the GNU General Public License as published by
     * the Free Software Foundation; either version 2 of the License, or
     * (at your option) any later version.
     *
    
     * FFmpeg is distributed in the hope that it will be useful,
    
     * but WITHOUT ANY WARRANTY; without even the implied warranty of
     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     * GNU General Public License for more details.
     *
     * You should have received a copy of the GNU General Public License
    
     * along with FFmpeg; if not, write to the Free Software
    
     * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
    
     * The C code (not assembly, MMX, ...) of this file can be used
    
     * under the LGPL license.
    
    Arpi's avatar
    Arpi committed
    #include <stddef.h>
    
    
    #undef PREFETCH
    #undef MOVNTQ
    #undef EMMS
    #undef SFENCE
    #undef MMREG_SIZE
    #undef PREFETCHW
    #undef PAVGB
    
    
    #define MMREG_SIZE 16
    #else
    #define MMREG_SIZE 8
    #endif
    
    
    #define PREFETCH  "prefetch"
    #define PREFETCHW "prefetchw"
    
    #define PREFETCH "prefetchnta"
    #define PREFETCHW "prefetcht0"
    
    #else
    
    #define PREFETCH  " # nop"
    #define PREFETCHW " # nop"
    
    Diego Biurrun's avatar
    Diego Biurrun committed
    /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
    
    #define EMMS     "femms"
    #else
    #define EMMS     "emms"
    
    Nick Kurshev's avatar
    Nick Kurshev committed
    
    
    #define MOVNTQ "movntq"
    #define SFENCE "sfence"
    #else
    #define MOVNTQ "movq"
    
    static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
    
    Nick Kurshev's avatar
    Nick Kurshev committed
    {
    
        uint8_t *dest = dst;
        const uint8_t *s = src;
        const uint8_t *end;
    
            const uint8_t *mm_end;
        #endif
        end = s + src_size;
    
            __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    
            __asm__ volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
    
                __asm__ volatile(
    
                PREFETCH"    32%1           \n\t"
                "movd          %1, %%mm0    \n\t"
                "punpckldq    3%1, %%mm0    \n\t"
                "movd         6%1, %%mm1    \n\t"
                "punpckldq    9%1, %%mm1    \n\t"
                "movd        12%1, %%mm2    \n\t"
                "punpckldq   15%1, %%mm2    \n\t"
                "movd        18%1, %%mm3    \n\t"
                "punpckldq   21%1, %%mm3    \n\t"
                "pand       %%mm7, %%mm0    \n\t"
                "pand       %%mm7, %%mm1    \n\t"
                "pand       %%mm7, %%mm2    \n\t"
                "pand       %%mm7, %%mm3    \n\t"
                MOVNTQ"     %%mm0,   %0     \n\t"
                MOVNTQ"     %%mm1,  8%0     \n\t"
                MOVNTQ"     %%mm2, 16%0     \n\t"
                MOVNTQ"     %%mm3, 24%0"
                :"=m"(*dest)
                :"m"(*s)
                :"memory");
                dest += 32;
                s += 24;
            }
    
            __asm__ volatile(SFENCE:::"memory");
            __asm__ volatile(EMMS:::"memory");
    
        #endif
        while (s < end)
        {
        #ifdef WORDS_BIGENDIAN
            /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
            *dest++ = 0;
            *dest++ = s[2];
            *dest++ = s[1];
            *dest++ = s[0];
            s+=3;
        #else
            *dest++ = *s++;
            *dest++ = *s++;
            *dest++ = *s++;
            *dest++ = 0;
        #endif
        }
    
    /*
     * Drop the filler byte of packed 32-bit pixels, producing 24-bit pixels.
     * SIMD path repacks 8 source pixels (32 bytes) into 24 output bytes using
     * the mask24* constants; scalar tail handles the remainder.
     * NOTE(review): opening brace / #if guards and the scalar while-loop header
     * are missing from this capture -- confirm against the upstream file.
     */
    static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)

        uint8_t *dest = dst;
        const uint8_t *s = src;
        const uint8_t *end;

        __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");

            /* merge each pixel's low 3 bytes (shift out the filler), then
               shuffle four 24-bit groups into three output qwords */
            __asm__ volatile(

            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq         8%1, %%mm1    \n\t"
            "movq        16%1, %%mm4    \n\t"
            "movq        24%1, %%mm5    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm1, %%mm3    \n\t"
            "movq       %%mm4, %%mm6    \n\t"
            "movq       %%mm5, %%mm7    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "psrlq         $8, %%mm3    \n\t"
            "psrlq         $8, %%mm6    \n\t"
            "psrlq         $8, %%mm7    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm1    \n\t"
            "pand          %2, %%mm4    \n\t"
            "pand          %2, %%mm5    \n\t"
            "pand          %3, %%mm2    \n\t"
            "pand          %3, %%mm3    \n\t"
            "pand          %3, %%mm6    \n\t"
            "pand          %3, %%mm7    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm3, %%mm1    \n\t"
            "por        %%mm6, %%mm4    \n\t"
            "por        %%mm7, %%mm5    \n\t"

            "movq       %%mm1, %%mm2    \n\t"
            "movq       %%mm4, %%mm3    \n\t"
            "psllq        $48, %%mm2    \n\t"
            "psllq        $32, %%mm3    \n\t"
            "pand          %4, %%mm2    \n\t"
            "pand          %5, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psrlq        $16, %%mm1    \n\t"
            "psrlq        $32, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm3, %%mm1    \n\t"
            "pand          %6, %%mm5    \n\t"
            "por        %%mm5, %%mm4    \n\t"

            MOVNTQ"     %%mm0,   %0     \n\t"
            MOVNTQ"     %%mm1,  8%0     \n\t"
            MOVNTQ"     %%mm4, 16%0"
            :"=m"(*dest)
            :"m"(*s),"m"(mask24l),
             "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
            dest += 24;
            s += 32;
        }

        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");

    Alex Beregszaszi's avatar
    Alex Beregszaszi committed
    #ifdef WORDS_BIGENDIAN

            /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
            s++;
            dest[2] = *s++;
            dest[1] = *s++;
            dest[0] = *s++;
            dest += 3;

    Alex Beregszaszi's avatar
    Alex Beregszaszi committed
    #else

            *dest++ = *s++;
            *dest++ = *s++;
            *dest++ = *s++;
            s++;

    Alex Beregszaszi's avatar
    Alex Beregszaszi committed
    #endif
    
     original by Strepto/Astral
     ported to gcc & bugfixed: A'rpi
    
    Nick Kurshev's avatar
    Nick Kurshev committed
     MMX2, 3DNOW optimization by Nick Kurshev
    
     32-bit C version, and and&add trick by Michael Niedermayer
    
    Benoit Fouet's avatar
    Benoit Fouet committed
    /*
     * Convert RGB555 to RGB565 in place order: (x & 0x7FFF) + (x & 0x7FE0)
     * doubles the red+green field, shifting them up one bit while keeping the
     * 5-bit blue field ("and&add trick" credited above to Michael Niedermayer).
     * NOTE(review): the SIMD-path #if guards are missing from this capture.
     */
    static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)

        register const uint8_t* s=src;
        register uint8_t* d=dst;
        register const uint8_t *end;
        const uint8_t *mm_end;
        end = s + src_size;

        __asm__ volatile(PREFETCH"    %0"::"m"(*s));
        __asm__ volatile("movq        %0, %%mm4"::"m"(mask15s));

            /* 8 pixels/iteration: mm4 = mask15s selects the bits to double */
            __asm__ volatile(

            PREFETCH"  32%1         \n\t"
            "movq        %1, %%mm0  \n\t"
            "movq       8%1, %%mm2  \n\t"
            "movq     %%mm0, %%mm1  \n\t"
            "movq     %%mm2, %%mm3  \n\t"
            "pand     %%mm4, %%mm0  \n\t"
            "pand     %%mm4, %%mm2  \n\t"
            "paddw    %%mm1, %%mm0  \n\t"
            "paddw    %%mm3, %%mm2  \n\t"
            MOVNTQ"   %%mm0,  %0    \n\t"
            MOVNTQ"   %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
            );
            d+=16;
            s+=16;
        }

        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");

    #endif

        /* scalar path: two pixels at a time while at least 4 bytes remain */
        mm_end = end - 3;

    Arpi's avatar
    Arpi committed
        {

            register unsigned x= *((const uint32_t *)s);

        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;

    Arpi's avatar
    Arpi committed
        }

    Arpi's avatar
    Arpi committed
        {

            register unsigned short x= *((const uint16_t *)s);

        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);

    Arpi's avatar
    Arpi committed
        }
    
    Benoit Fouet's avatar
    Benoit Fouet committed
    /*
     * Convert RGB565 to RGB555: ((x>>1) & 0x7FE07FE0) | (x & 0x001F001F)
     * shifts red+green down one bit (dropping the extra green LSB) and keeps
     * the 5-bit blue field unchanged.
     * NOTE(review): SIMD-path #if guards and loop headers are missing from
     * this capture -- confirm against the upstream file.
     */
    static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)

        register const uint8_t* s=src;
        register uint8_t* d=dst;
        register const uint8_t *end;
        const uint8_t *mm_end;
        end = s + src_size;

        __asm__ volatile(PREFETCH"    %0"::"m"(*s));
        __asm__ volatile("movq        %0, %%mm7"::"m"(mask15rg));
        __asm__ volatile("movq        %0, %%mm6"::"m"(mask15b));

            /* mm7 = mask15rg selects shifted red+green, mm6 = mask15b blue */
            __asm__ volatile(

            PREFETCH"  32%1         \n\t"
            "movq        %1, %%mm0  \n\t"
            "movq       8%1, %%mm2  \n\t"
            "movq     %%mm0, %%mm1  \n\t"
            "movq     %%mm2, %%mm3  \n\t"
            "psrlq       $1, %%mm0  \n\t"
            "psrlq       $1, %%mm2  \n\t"
            "pand     %%mm7, %%mm0  \n\t"
            "pand     %%mm7, %%mm2  \n\t"
            "pand     %%mm6, %%mm1  \n\t"
            "pand     %%mm6, %%mm3  \n\t"
            "por      %%mm1, %%mm0  \n\t"
            "por      %%mm3, %%mm2  \n\t"
            MOVNTQ"   %%mm0,  %0    \n\t"
            MOVNTQ"   %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
            );
            d+=16;
            s+=16;
        }

        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");

            register uint32_t x= *((const uint32_t*)s);

        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;

            register uint16_t x= *((const uint16_t*)s);

        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
        s+=2;
        d+=2;
    
    /*
     * Convert packed 32-bit RGB to RGB565.  Two SIMD variants are provided:
     * the "#if 1" path uses pmaddwd with mul3216 to combine the scaled fields,
     * the "#else" path uses shift+mask per channel.  Scalar tail at the end.
     * NOTE(review): surrounding #if guards / loop headers are missing from
     * this capture -- confirm against the upstream file.
     */
    static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)

        const uint8_t *s = src;
        const uint8_t *end;

    Arpi's avatar
    Arpi committed
    #endif

        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;

    #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)

        __asm__ volatile(

        "movq           %3, %%mm5   \n\t"
        "movq           %4, %%mm6   \n\t"
        "movq           %5, %%mm7   \n\t"
        "jmp 2f                     \n\t"
        ASMALIGN(4)
        "1:                         \n\t"
        PREFETCH"   32(%1)          \n\t"
        "movd         (%1), %%mm0   \n\t"
        "movd        4(%1), %%mm3   \n\t"
        "punpckldq   8(%1), %%mm0   \n\t"
        "punpckldq  12(%1), %%mm3   \n\t"
        "movq        %%mm0, %%mm1   \n\t"
        "movq        %%mm3, %%mm4   \n\t"
        "pand        %%mm6, %%mm0   \n\t"
        "pand        %%mm6, %%mm3   \n\t"
        "pmaddwd     %%mm7, %%mm0   \n\t"
        "pmaddwd     %%mm7, %%mm3   \n\t"
        "pand        %%mm5, %%mm1   \n\t"
        "pand        %%mm5, %%mm4   \n\t"
        "por         %%mm1, %%mm0   \n\t"
        "por         %%mm4, %%mm3   \n\t"
        "psrld          $5, %%mm0   \n\t"
        "pslld         $11, %%mm3   \n\t"
        "por         %%mm3, %%mm0   \n\t"
        MOVNTQ"      %%mm0, (%0)    \n\t"
        "add           $16,  %1     \n\t"
        "add            $8,  %0     \n\t"
        "2:                         \n\t"
        "cmp            %2,  %1     \n\t"
        " jb            1b          \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
        );

    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #else

        __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
        __asm__ volatile(

            "movq    %0, %%mm7    \n\t"
            "movq    %1, %%mm6    \n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        while (s < mm_end)
        {

            /* 4 pixels/iteration: per-channel shift+mask, then merge */
            __asm__ volatile(

            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "psrlq         $8, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
            d += 4;
            s += 16;
        }
    #endif

        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");

            register int rgb = *(const uint32_t*)s; s += 4;

        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
    
    /*
     * Convert packed 32-bit RGB to BGR565 (channel order swapped relative to
     * rgb32to16: here the low source byte is shifted up into the red field).
     * NOTE(review): opening brace / #if guards are missing from this capture.
     */
    static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)

        const uint8_t *s = src;
        const uint8_t *end;

        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;

        __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
        __asm__ volatile(

            "movq          %0, %%mm7    \n\t"
            "movq          %1, %%mm6    \n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        mm_end = end - 15;
        while (s < mm_end)
        {

            /* 4 pixels/iteration: shift+mask each channel, merge to 565 */
            __asm__ volatile(

            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psllq         $8, %%mm0    \n\t"
            "psllq         $8, %%mm3    \n\t"
            "pand       %%mm7, %%mm0    \n\t"
            "pand       %%mm7, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq        $19, %%mm2    \n\t"
            "psrlq        $19, %%mm5    \n\t"
            "pand          %2, %%mm2    \n\t"
            "pand          %2, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
            d += 4;
            s += 16;
        }

        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");

            register int rgb = *(const uint32_t*)s; s += 4;

        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
    
    /*
     * Convert packed 32-bit RGB to RGB555.  Mirrors rgb32to16 but with 5-bit
     * green: pmaddwd/mul3215 variant under "#if 1", shift+mask variant under
     * "#else", scalar tail at the end.
     * NOTE(review): surrounding #if guards / loop headers are missing from
     * this capture -- confirm against the upstream file.
     */
    static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)

        const uint8_t *s = src;
        const uint8_t *end;

    Arpi's avatar
    Arpi committed
    #endif

        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;

    #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)

        __asm__ volatile(

        "movq           %3, %%mm5   \n\t"
        "movq           %4, %%mm6   \n\t"
        "movq           %5, %%mm7   \n\t"
        "jmp            2f          \n\t"
        ASMALIGN(4)
        "1:                         \n\t"
        PREFETCH"   32(%1)          \n\t"
        "movd         (%1), %%mm0   \n\t"
        "movd        4(%1), %%mm3   \n\t"
        "punpckldq   8(%1), %%mm0   \n\t"
        "punpckldq  12(%1), %%mm3   \n\t"
        "movq        %%mm0, %%mm1   \n\t"
        "movq        %%mm3, %%mm4   \n\t"
        "pand        %%mm6, %%mm0   \n\t"
        "pand        %%mm6, %%mm3   \n\t"
        "pmaddwd     %%mm7, %%mm0   \n\t"
        "pmaddwd     %%mm7, %%mm3   \n\t"
        "pand        %%mm5, %%mm1   \n\t"
        "pand        %%mm5, %%mm4   \n\t"
        "por         %%mm1, %%mm0   \n\t"
        "por         %%mm4, %%mm3   \n\t"
        "psrld          $6, %%mm0   \n\t"
        "pslld         $10, %%mm3   \n\t"
        "por         %%mm3, %%mm0   \n\t"
        MOVNTQ"      %%mm0, (%0)    \n\t"
        "add           $16,  %1     \n\t"
        "add            $8,  %0     \n\t"
        "2:                         \n\t"
        "cmp            %2,  %1     \n\t"
        " jb            1b          \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
        );

    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #else

        __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
        __asm__ volatile(

            "movq          %0, %%mm7    \n\t"
            "movq          %1, %%mm6    \n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        while (s < mm_end)
        {

            /* 4 pixels/iteration: per-channel shift+mask, merge to 555 */
            __asm__ volatile(

            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $6, %%mm1    \n\t"
            "psrlq         $6, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $9, %%mm2    \n\t"
            "psrlq         $9, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
            d += 4;
            s += 16;
        }
    #endif

        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");

            register int rgb = *(const uint32_t*)s; s += 4;

        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
    
    /*
     * Convert packed 32-bit RGB to BGR555 (swapped channel order relative to
     * rgb32to15: low source byte is shifted up into the red field).
     * NOTE(review): opening brace / #if guards are missing from this capture.
     */
    static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)

        const uint8_t *s = src;
        const uint8_t *end;

        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;

        __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
        __asm__ volatile(

            "movq          %0, %%mm7    \n\t"
            "movq          %1, %%mm6    \n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        mm_end = end - 15;
        while (s < mm_end)
        {

            /* 4 pixels/iteration: shift+mask each channel, merge to 555 */
            __asm__ volatile(

            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psllq         $7, %%mm0    \n\t"
            "psllq         $7, %%mm3    \n\t"
            "pand       %%mm7, %%mm0    \n\t"
            "pand       %%mm7, %%mm3    \n\t"
            "psrlq         $6, %%mm1    \n\t"
            "psrlq         $6, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq        $19, %%mm2    \n\t"
            "psrlq        $19, %%mm5    \n\t"
            "pand          %2, %%mm2    \n\t"
            "pand          %2, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
            d += 4;
            s += 16;
        }

        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");

            register int rgb = *(const uint32_t*)s; s += 4;

        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
    
    /*
     * Convert packed 24-bit pixels to 565 with swapped channel order (the
     * scalar tail reads b,g,r per pixel).  SIMD path converts 4 pixels
     * (12 source bytes) per iteration.
     * NOTE(review): opening brace / #if guards are missing from this capture.
     */
    static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)

        const uint8_t *s = src;
        const uint8_t *end;

    Arpi's avatar
    Arpi committed
    #endif

        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;

        __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
        __asm__ volatile(

            "movq         %0, %%mm7     \n\t"
            "movq         %1, %%mm6     \n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        mm_end = end - 11;
        while (s < mm_end)
        {

            /* loads at 3-byte strides pack 24-bit pixels into dword lanes */
            __asm__ volatile(

            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         3%1, %%mm3    \n\t"
            "punpckldq    6%1, %%mm0    \n\t"
            "punpckldq    9%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "psrlq         $8, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
            d += 4;
            s += 12;
        }

        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");

    #endif
        while (s < end)
        {
            const int b = *s++;
            const int g = *s++;
            const int r = *s++;
            *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
    
    /*
     * Convert packed 24-bit RGB to RGB565 (scalar tail reads r,g,b per pixel).
     * SIMD path converts 4 pixels (12 source bytes) per iteration.
     * NOTE(review): opening brace / #if guards are missing from this capture.
     */
    static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)

        const uint8_t *s = src;
        const uint8_t *end;

        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;

        __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
        __asm__ volatile(

            "movq         %0, %%mm7     \n\t"
            "movq         %1, %%mm6     \n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        mm_end = end - 15;
        while (s < mm_end)
        {

            /* loads at 3-byte strides pack 24-bit pixels into dword lanes */
            __asm__ volatile(

            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         3%1, %%mm3    \n\t"
            "punpckldq    6%1, %%mm0    \n\t"
            "punpckldq    9%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psllq         $8, %%mm0    \n\t"
            "psllq         $8, %%mm3    \n\t"
            "pand       %%mm7, %%mm0    \n\t"
            "pand       %%mm7, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq        $19, %%mm2    \n\t"
            "psrlq        $19, %%mm5    \n\t"
            "pand          %2, %%mm2    \n\t"
            "pand          %2, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
            d += 4;
            s += 12;
        }

        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");

    #endif
        while (s < end)
        {
            const int r = *s++;
            const int g = *s++;
            const int b = *s++;
            *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
    
    /*
     * Convert packed 24-bit pixels to 555 with swapped channel order (the
     * scalar tail reads b,g,r per pixel).  SIMD path converts 4 pixels
     * (12 source bytes) per iteration.
     * NOTE(review): opening brace / #if guards are missing from this capture.
     */
    static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)

        const uint8_t *s = src;
        const uint8_t *end;

    Arpi's avatar
    Arpi committed
    #endif

        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;

        __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
        __asm__ volatile(

            "movq          %0, %%mm7    \n\t"
            "movq          %1, %%mm6    \n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        mm_end = end - 11;
        while (s < mm_end)
        {

            /* loads at 3-byte strides pack 24-bit pixels into dword lanes */
            __asm__ volatile(

            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         3%1, %%mm3    \n\t"
            "punpckldq    6%1, %%mm0    \n\t"
            "punpckldq    9%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $6, %%mm1    \n\t"
            "psrlq         $6, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $9, %%mm2    \n\t"
            "psrlq         $9, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
            d += 4;
            s += 12;
        }

        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");

    #endif
        while (s < end)
        {
            const int b = *s++;
            const int g = *s++;
            const int r = *s++;
            *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }

    Arpi's avatar
    Arpi committed
    }
    
    
    /*
     * Convert packed 24-bit RGB to RGB555 (scalar tail reads r,g,b per pixel).
     * SIMD path converts 4 pixels (12 source bytes) per iteration.
     * NOTE(review): opening brace / #if guards and the closing brace are
     * missing from this capture -- confirm against the upstream file.
     */
    static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)

        const uint8_t *s = src;
        const uint8_t *end;

        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;

        __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
        __asm__ volatile(

            "movq         %0, %%mm7     \n\t"
            "movq         %1, %%mm6     \n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        mm_end = end - 15;
        while (s < mm_end)
        {

            /* loads at 3-byte strides pack 24-bit pixels into dword lanes */
            __asm__ volatile(

            PREFETCH"   32%1            \n\t"
            "movd         %1, %%mm0     \n\t"
            "movd        3%1, %%mm3     \n\t"
            "punpckldq   6%1, %%mm0     \n\t"
            "punpckldq   9%1, %%mm3     \n\t"
            "movq      %%mm0, %%mm1     \n\t"
            "movq      %%mm0, %%mm2     \n\t"
            "movq      %%mm3, %%mm4     \n\t"
            "movq      %%mm3, %%mm5     \n\t"
            "psllq        $7, %%mm0     \n\t"
            "psllq        $7, %%mm3     \n\t"
            "pand      %%mm7, %%mm0     \n\t"
            "pand      %%mm7, %%mm3     \n\t"
            "psrlq        $6, %%mm1     \n\t"
            "psrlq        $6, %%mm4     \n\t"
            "pand      %%mm6, %%mm1     \n\t"
            "pand      %%mm6, %%mm4     \n\t"
            "psrlq       $19, %%mm2     \n\t"
            "psrlq       $19, %%mm5     \n\t"
            "pand         %2, %%mm2     \n\t"
            "pand         %2, %%mm5     \n\t"
            "por       %%mm1, %%mm0     \n\t"
            "por       %%mm4, %%mm3     \n\t"
            "por       %%mm2, %%mm0     \n\t"
            "por       %%mm5, %%mm3     \n\t"
            "psllq       $16, %%mm3     \n\t"
            "por       %%mm3, %%mm0     \n\t"
            MOVNTQ"    %%mm0, %0        \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
            d += 4;
            s += 12;
        }

        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");

    #endif
        while (s < end)
        {
            const int r = *s++;
            const int g = *s++;
            const int b = *s++;
            *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }
    
    Arpi's avatar
    Arpi committed
    /*
      I use a less accurate approximation here by simply left-shifting the
      input value and filling the low order bits with zeroes. This method
      improves PNG compression but this scheme cannot reproduce white exactly,
      since it does not generate an all-ones maximum value; the net effect is
      to darken the image slightly.

      The better method should be "left bit replication":

       4 3 2 1 0
       ---------
       1 1 0 1 1

       7 6 5 4 3  2 1 0
       ----------------
       1 1 0 1 1  1 1 0
       |=======|  |===|
           |      leftmost bits repeated to fill open bits
           |
       original bits
    */
    
    static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
    
    Arpi's avatar
    Arpi committed
    {
    
    Arpi's avatar
    Arpi committed
    #endif
    
    Baptiste Coudurier's avatar
    Baptiste Coudurier committed
        uint8_t *d = dst;
    
        const uint16_t *s = (const uint16_t*)src;
    
        __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    
            __asm__ volatile(
    
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq          %1, %%mm1    \n\t"
            "movq          %1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $2, %%mm1    \n\t"
            "psrlq         $7, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"
    
            "movq       %%mm0, %%mm6    \n\t"
            "movq       %%mm3, %%mm7    \n\t"
    
            "movq         8%1, %%mm0    \n\t"
            "movq         8%1, %%mm1    \n\t"
            "movq         8%1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $2, %%mm1    \n\t"
            "psrlq         $7, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"
    
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
    
            /* borrowed 32 to 24 */
    
            __asm__ volatile(