Skip to content
Snippets Groups Projects
rgb2rgb_template.c 98.3 KiB
Newer Older
  • Learn to ignore specific revisions
  •  * software RGB to RGB converter
     * pluralized by software PAL8 to RGB converter
     *              software YUV to YUV converter
     *              software YUV to RGB converter
     * Written by Nick Kurshev.
     * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
     * lot of big-endian byte order fixes by Alex Beregszaszi
    
     * This file is part of FFmpeg.
     *
     * FFmpeg is free software; you can redistribute it and/or modify
    
     * it under the terms of the GNU General Public License as published by
     * the Free Software Foundation; either version 2 of the License, or
     * (at your option) any later version.
     *
    
     * FFmpeg is distributed in the hope that it will be useful,
    
     * but WITHOUT ANY WARRANTY; without even the implied warranty of
     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     * GNU General Public License for more details.
     *
     * You should have received a copy of the GNU General Public License
    
     * along with FFmpeg; if not, write to the Free Software
    
     * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
    
     * The C code (not assembly, MMX, ...) of this file can be used
    
     * under the LGPL license.
    
    Arpi's avatar
    Arpi committed
    #include <stddef.h>
    
    
    #undef PREFETCH
    #undef MOVNTQ
    #undef EMMS
    #undef SFENCE
    #undef MMREG_SIZE
    #undef PREFETCHW
    #undef PAVGB
    
    
    #define MMREG_SIZE 16
    #else
    #define MMREG_SIZE 8
    #endif
    
    
    #define PREFETCH  "prefetch"
    #define PREFETCHW "prefetchw"
    
    #define PREFETCH "prefetchnta"
    #define PREFETCHW "prefetcht0"
    
    #else
    
    #define PREFETCH  " # nop"
    #define PREFETCHW " # nop"
    
    Diego Biurrun's avatar
    Diego Biurrun committed
    /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
    
    #define EMMS     "femms"
    #else
    #define EMMS     "emms"
    
    Nick Kurshev's avatar
    Nick Kurshev committed
    
    
    #define MOVNTQ "movntq"
    #define SFENCE "sfence"
    #else
    #define MOVNTQ "movq"
    
    static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
    
    Nick Kurshev's avatar
    Nick Kurshev committed
    {
    
        uint8_t *dest = dst;
        const uint8_t *s = src;
        const uint8_t *end;
    
            const uint8_t *mm_end;
        #endif
        end = s + src_size;
    
            __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    
            __asm__ volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
    
                __asm__ volatile(
    
                PREFETCH"    32%1           \n\t"
                "movd          %1, %%mm0    \n\t"
                "punpckldq    3%1, %%mm0    \n\t"
                "movd         6%1, %%mm1    \n\t"
                "punpckldq    9%1, %%mm1    \n\t"
                "movd        12%1, %%mm2    \n\t"
                "punpckldq   15%1, %%mm2    \n\t"
                "movd        18%1, %%mm3    \n\t"
                "punpckldq   21%1, %%mm3    \n\t"
                "pand       %%mm7, %%mm0    \n\t"
                "pand       %%mm7, %%mm1    \n\t"
                "pand       %%mm7, %%mm2    \n\t"
                "pand       %%mm7, %%mm3    \n\t"
                MOVNTQ"     %%mm0,   %0     \n\t"
                MOVNTQ"     %%mm1,  8%0     \n\t"
                MOVNTQ"     %%mm2, 16%0     \n\t"
                MOVNTQ"     %%mm3, 24%0"
                :"=m"(*dest)
                :"m"(*s)
                :"memory");
                dest += 32;
                s += 24;
            }
    
            __asm__ volatile(SFENCE:::"memory");
            __asm__ volatile(EMMS:::"memory");
    
        #endif
        while (s < end)
        {
        #ifdef WORDS_BIGENDIAN
            /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
            *dest++ = 0;
            *dest++ = s[2];
            *dest++ = s[1];
            *dest++ = s[0];
            s+=3;
        #else
            *dest++ = *s++;
            *dest++ = *s++;
            *dest++ = *s++;
            *dest++ = 0;
        #endif
        }
    
    /*
     * Drop the filler byte of packed 32-bit pixels, producing 24-bit pixels.
     * SIMD path repacks 8 source pixels (32 bytes) into 24 output bytes using
     * the mask24* constants; scalar tail handles the remainder.
     * NOTE(review): opening brace / #if guards and the scalar while-loop header
     * are missing from this capture -- confirm against the upstream file.
     */
    static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)

        uint8_t *dest = dst;
        const uint8_t *s = src;
        const uint8_t *end;

        __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");

            /* merge each pixel's low 3 bytes (shift out the filler), then
               shuffle four 24-bit groups into three output qwords */
            __asm__ volatile(

            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq         8%1, %%mm1    \n\t"
            "movq        16%1, %%mm4    \n\t"
            "movq        24%1, %%mm5    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm1, %%mm3    \n\t"
            "movq       %%mm4, %%mm6    \n\t"
            "movq       %%mm5, %%mm7    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "psrlq         $8, %%mm3    \n\t"
            "psrlq         $8, %%mm6    \n\t"
            "psrlq         $8, %%mm7    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm1    \n\t"
            "pand          %2, %%mm4    \n\t"
            "pand          %2, %%mm5    \n\t"
            "pand          %3, %%mm2    \n\t"
            "pand          %3, %%mm3    \n\t"
            "pand          %3, %%mm6    \n\t"
            "pand          %3, %%mm7    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm3, %%mm1    \n\t"
            "por        %%mm6, %%mm4    \n\t"
            "por        %%mm7, %%mm5    \n\t"

            "movq       %%mm1, %%mm2    \n\t"
            "movq       %%mm4, %%mm3    \n\t"
            "psllq        $48, %%mm2    \n\t"
            "psllq        $32, %%mm3    \n\t"
            "pand          %4, %%mm2    \n\t"
            "pand          %5, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psrlq        $16, %%mm1    \n\t"
            "psrlq        $32, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm3, %%mm1    \n\t"
            "pand          %6, %%mm5    \n\t"
            "por        %%mm5, %%mm4    \n\t"

            MOVNTQ"     %%mm0,   %0     \n\t"
            MOVNTQ"     %%mm1,  8%0     \n\t"
            MOVNTQ"     %%mm4, 16%0"
            :"=m"(*dest)
            :"m"(*s),"m"(mask24l),
             "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
            dest += 24;
            s += 32;
        }

        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");

    Alex Beregszaszi's avatar
    Alex Beregszaszi committed
    #ifdef WORDS_BIGENDIAN

            /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
            s++;
            dest[2] = *s++;
            dest[1] = *s++;
            dest[0] = *s++;
            dest += 3;

    Alex Beregszaszi's avatar
    Alex Beregszaszi committed
    #else

            *dest++ = *s++;
            *dest++ = *s++;
            *dest++ = *s++;
            s++;

    Alex Beregszaszi's avatar
    Alex Beregszaszi committed
    #endif
    
     original by Strepto/Astral
     ported to gcc & bugfixed: A'rpi
    
    Nick Kurshev's avatar
    Nick Kurshev committed
     MMX2, 3DNOW optimization by Nick Kurshev
    
     32-bit C version, and and&add trick by Michael Niedermayer
    
    Benoit Fouet's avatar
    Benoit Fouet committed
    /*
     * Convert RGB555 to RGB565 in place order: (x & 0x7FFF) + (x & 0x7FE0)
     * doubles the red+green field, shifting them up one bit while keeping the
     * 5-bit blue field ("and&add trick" credited above to Michael Niedermayer).
     * NOTE(review): the SIMD-path #if guards are missing from this capture.
     */
    static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)

        register const uint8_t* s=src;
        register uint8_t* d=dst;
        register const uint8_t *end;
        const uint8_t *mm_end;
        end = s + src_size;

        __asm__ volatile(PREFETCH"    %0"::"m"(*s));
        __asm__ volatile("movq        %0, %%mm4"::"m"(mask15s));

            /* 8 pixels/iteration: mm4 = mask15s selects the bits to double */
            __asm__ volatile(

            PREFETCH"  32%1         \n\t"
            "movq        %1, %%mm0  \n\t"
            "movq       8%1, %%mm2  \n\t"
            "movq     %%mm0, %%mm1  \n\t"
            "movq     %%mm2, %%mm3  \n\t"
            "pand     %%mm4, %%mm0  \n\t"
            "pand     %%mm4, %%mm2  \n\t"
            "paddw    %%mm1, %%mm0  \n\t"
            "paddw    %%mm3, %%mm2  \n\t"
            MOVNTQ"   %%mm0,  %0    \n\t"
            MOVNTQ"   %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
            );
            d+=16;
            s+=16;
        }

        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");

    #endif

        /* scalar path: two pixels at a time while at least 4 bytes remain */
        mm_end = end - 3;

    Arpi's avatar
    Arpi committed
        {

            register unsigned x= *((const uint32_t *)s);

        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;

    Arpi's avatar
    Arpi committed
        }

    Arpi's avatar
    Arpi committed
        {

            register unsigned short x= *((const uint16_t *)s);

        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);

    Arpi's avatar
    Arpi committed
        }
    
    Benoit Fouet's avatar
    Benoit Fouet committed
    /*
     * Convert RGB565 to RGB555: ((x>>1) & 0x7FE07FE0) | (x & 0x001F001F)
     * shifts red+green down one bit (dropping the extra green LSB) and keeps
     * the 5-bit blue field unchanged.
     * NOTE(review): SIMD-path #if guards and loop headers are missing from
     * this capture -- confirm against the upstream file.
     */
    static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)

        register const uint8_t* s=src;
        register uint8_t* d=dst;
        register const uint8_t *end;
        const uint8_t *mm_end;
        end = s + src_size;

        __asm__ volatile(PREFETCH"    %0"::"m"(*s));
        __asm__ volatile("movq        %0, %%mm7"::"m"(mask15rg));
        __asm__ volatile("movq        %0, %%mm6"::"m"(mask15b));

            /* mm7 = mask15rg selects shifted red+green, mm6 = mask15b blue */
            __asm__ volatile(

            PREFETCH"  32%1         \n\t"
            "movq        %1, %%mm0  \n\t"
            "movq       8%1, %%mm2  \n\t"
            "movq     %%mm0, %%mm1  \n\t"
            "movq     %%mm2, %%mm3  \n\t"
            "psrlq       $1, %%mm0  \n\t"
            "psrlq       $1, %%mm2  \n\t"
            "pand     %%mm7, %%mm0  \n\t"
            "pand     %%mm7, %%mm2  \n\t"
            "pand     %%mm6, %%mm1  \n\t"
            "pand     %%mm6, %%mm3  \n\t"
            "por      %%mm1, %%mm0  \n\t"
            "por      %%mm3, %%mm2  \n\t"
            MOVNTQ"   %%mm0,  %0    \n\t"
            MOVNTQ"   %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
            );
            d+=16;
            s+=16;
        }

        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");

            register uint32_t x= *((const uint32_t*)s);

        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;

            register uint16_t x= *((const uint16_t*)s);

        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
        s+=2;
        d+=2;
    
    /*
     * Convert packed 32-bit RGB to RGB565.  Two SIMD variants are provided:
     * the "#if 1" path uses pmaddwd with mul3216 to combine the scaled fields,
     * the "#else" path uses shift+mask per channel.  Scalar tail at the end.
     * NOTE(review): surrounding #if guards / loop headers are missing from
     * this capture -- confirm against the upstream file.
     */
    static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)

        const uint8_t *s = src;
        const uint8_t *end;

    Arpi's avatar
    Arpi committed
    #endif

        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;

    #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)

        __asm__ volatile(

        "movq           %3, %%mm5   \n\t"
        "movq           %4, %%mm6   \n\t"
        "movq           %5, %%mm7   \n\t"
        "jmp 2f                     \n\t"
        ASMALIGN(4)
        "1:                         \n\t"
        PREFETCH"   32(%1)          \n\t"
        "movd         (%1), %%mm0   \n\t"
        "movd        4(%1), %%mm3   \n\t"
        "punpckldq   8(%1), %%mm0   \n\t"
        "punpckldq  12(%1), %%mm3   \n\t"
        "movq        %%mm0, %%mm1   \n\t"
        "movq        %%mm3, %%mm4   \n\t"
        "pand        %%mm6, %%mm0   \n\t"
        "pand        %%mm6, %%mm3   \n\t"
        "pmaddwd     %%mm7, %%mm0   \n\t"
        "pmaddwd     %%mm7, %%mm3   \n\t"
        "pand        %%mm5, %%mm1   \n\t"
        "pand        %%mm5, %%mm4   \n\t"
        "por         %%mm1, %%mm0   \n\t"
        "por         %%mm4, %%mm3   \n\t"
        "psrld          $5, %%mm0   \n\t"
        "pslld         $11, %%mm3   \n\t"
        "por         %%mm3, %%mm0   \n\t"
        MOVNTQ"      %%mm0, (%0)    \n\t"
        "add           $16,  %1     \n\t"
        "add            $8,  %0     \n\t"
        "2:                         \n\t"
        "cmp            %2,  %1     \n\t"
        " jb            1b          \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
        );

    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #else

        __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
        __asm__ volatile(

            "movq    %0, %%mm7    \n\t"
            "movq    %1, %%mm6    \n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        while (s < mm_end)
        {

            /* 4 pixels/iteration: per-channel shift+mask, then merge */
            __asm__ volatile(

            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "psrlq         $8, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
            d += 4;
            s += 16;
        }
    #endif

        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");

            register int rgb = *(const uint32_t*)s; s += 4;

        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
    
    /*
     * Convert packed 32-bit RGB to BGR565 (channel order swapped relative to
     * rgb32to16: here the low source byte is shifted up into the red field).
     * NOTE(review): opening brace / #if guards are missing from this capture.
     */
    static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)

        const uint8_t *s = src;
        const uint8_t *end;

        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;

        __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
        __asm__ volatile(

            "movq          %0, %%mm7    \n\t"
            "movq          %1, %%mm6    \n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        mm_end = end - 15;
        while (s < mm_end)
        {

            /* 4 pixels/iteration: shift+mask each channel, merge to 565 */
            __asm__ volatile(

            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psllq         $8, %%mm0    \n\t"
            "psllq         $8, %%mm3    \n\t"
            "pand       %%mm7, %%mm0    \n\t"
            "pand       %%mm7, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq        $19, %%mm2    \n\t"
            "psrlq        $19, %%mm5    \n\t"
            "pand          %2, %%mm2    \n\t"
            "pand          %2, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
            d += 4;
            s += 16;
        }

        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");

            register int rgb = *(const uint32_t*)s; s += 4;

        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
    
    /*
     * Convert packed 32-bit RGB to RGB555.  Mirrors rgb32to16 but with 5-bit
     * green: pmaddwd/mul3215 variant under "#if 1", shift+mask variant under
     * "#else", scalar tail at the end.
     * NOTE(review): surrounding #if guards / loop headers are missing from
     * this capture -- confirm against the upstream file.
     */
    static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)

        const uint8_t *s = src;
        const uint8_t *end;

    Arpi's avatar
    Arpi committed
    #endif

        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;

    #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)

        __asm__ volatile(

        "movq           %3, %%mm5   \n\t"
        "movq           %4, %%mm6   \n\t"
        "movq           %5, %%mm7   \n\t"
        "jmp            2f          \n\t"
        ASMALIGN(4)
        "1:                         \n\t"
        PREFETCH"   32(%1)          \n\t"
        "movd         (%1), %%mm0   \n\t"
        "movd        4(%1), %%mm3   \n\t"
        "punpckldq   8(%1), %%mm0   \n\t"
        "punpckldq  12(%1), %%mm3   \n\t"
        "movq        %%mm0, %%mm1   \n\t"
        "movq        %%mm3, %%mm4   \n\t"
        "pand        %%mm6, %%mm0   \n\t"
        "pand        %%mm6, %%mm3   \n\t"
        "pmaddwd     %%mm7, %%mm0   \n\t"
        "pmaddwd     %%mm7, %%mm3   \n\t"
        "pand        %%mm5, %%mm1   \n\t"
        "pand        %%mm5, %%mm4   \n\t"
        "por         %%mm1, %%mm0   \n\t"
        "por         %%mm4, %%mm3   \n\t"
        "psrld          $6, %%mm0   \n\t"
        "pslld         $10, %%mm3   \n\t"
        "por         %%mm3, %%mm0   \n\t"
        MOVNTQ"      %%mm0, (%0)    \n\t"
        "add           $16,  %1     \n\t"
        "add            $8,  %0     \n\t"
        "2:                         \n\t"
        "cmp            %2,  %1     \n\t"
        " jb            1b          \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
        );

    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #else

        __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
        __asm__ volatile(

            "movq          %0, %%mm7    \n\t"
            "movq          %1, %%mm6    \n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        while (s < mm_end)
        {

            /* 4 pixels/iteration: per-channel shift+mask, merge to 555 */
            __asm__ volatile(

            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $6, %%mm1    \n\t"
            "psrlq         $6, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $9, %%mm2    \n\t"
            "psrlq         $9, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
            d += 4;
            s += 16;
        }
    #endif

        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");

            register int rgb = *(const uint32_t*)s; s += 4;

        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
    
    /*
     * Convert packed 32-bit RGB to BGR555 (swapped channel order relative to
     * rgb32to15: low source byte is shifted up into the red field).
     * NOTE(review): opening brace / #if guards are missing from this capture.
     */
    static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)

        const uint8_t *s = src;
        const uint8_t *end;

        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;

        __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
        __asm__ volatile(

            "movq          %0, %%mm7    \n\t"
            "movq          %1, %%mm6    \n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        mm_end = end - 15;
        while (s < mm_end)
        {

            /* 4 pixels/iteration: shift+mask each channel, merge to 555 */
            __asm__ volatile(

            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psllq         $7, %%mm0    \n\t"
            "psllq         $7, %%mm3    \n\t"
            "pand       %%mm7, %%mm0    \n\t"
            "pand       %%mm7, %%mm3    \n\t"
            "psrlq         $6, %%mm1    \n\t"
            "psrlq         $6, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq        $19, %%mm2    \n\t"
            "psrlq        $19, %%mm5    \n\t"
            "pand          %2, %%mm2    \n\t"
            "pand          %2, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
            d += 4;
            s += 16;
        }

        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");

            register int rgb = *(const uint32_t*)s; s += 4;

        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
    
    /*
     * Convert packed 24-bit pixels to 565 with swapped channel order (the
     * scalar tail reads b,g,r per pixel).  SIMD path converts 4 pixels
     * (12 source bytes) per iteration.
     * NOTE(review): opening brace / #if guards are missing from this capture.
     */
    static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)

        const uint8_t *s = src;
        const uint8_t *end;

    Arpi's avatar
    Arpi committed
    #endif

        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;

        __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
        __asm__ volatile(

            "movq         %0, %%mm7     \n\t"
            "movq         %1, %%mm6     \n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        mm_end = end - 11;
        while (s < mm_end)
        {

            /* loads at 3-byte strides pack 24-bit pixels into dword lanes */
            __asm__ volatile(

            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         3%1, %%mm3    \n\t"
            "punpckldq    6%1, %%mm0    \n\t"
            "punpckldq    9%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "psrlq         $8, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
            d += 4;
            s += 12;
        }

        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");

    #endif
        while (s < end)
        {
            const int b = *s++;
            const int g = *s++;
            const int r = *s++;
            *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
    
    /*
     * Convert packed 24-bit RGB to RGB565 (scalar tail reads r,g,b per pixel).
     * SIMD path converts 4 pixels (12 source bytes) per iteration.
     * NOTE(review): opening brace / #if guards are missing from this capture.
     */
    static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)

        const uint8_t *s = src;
        const uint8_t *end;

        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;

        __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
        __asm__ volatile(

            "movq         %0, %%mm7     \n\t"
            "movq         %1, %%mm6     \n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        mm_end = end - 15;
        while (s < mm_end)
        {

            /* loads at 3-byte strides pack 24-bit pixels into dword lanes */
            __asm__ volatile(

            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         3%1, %%mm3    \n\t"
            "punpckldq    6%1, %%mm0    \n\t"
            "punpckldq    9%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psllq         $8, %%mm0    \n\t"
            "psllq         $8, %%mm3    \n\t"
            "pand       %%mm7, %%mm0    \n\t"
            "pand       %%mm7, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq        $19, %%mm2    \n\t"
            "psrlq        $19, %%mm5    \n\t"
            "pand          %2, %%mm2    \n\t"
            "pand          %2, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
            d += 4;
            s += 12;
        }

        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");

    #endif
        while (s < end)
        {
            const int r = *s++;
            const int g = *s++;
            const int b = *s++;
            *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
    
    /*
     * Convert packed 24-bit pixels to 555 with swapped channel order (the
     * scalar tail reads b,g,r per pixel).  SIMD path converts 4 pixels
     * (12 source bytes) per iteration.
     * NOTE(review): opening brace / #if guards are missing from this capture.
     */
    static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)

        const uint8_t *s = src;
        const uint8_t *end;

    Arpi's avatar
    Arpi committed
    #endif

        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;

        __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
        __asm__ volatile(

            "movq          %0, %%mm7    \n\t"
            "movq          %1, %%mm6    \n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        mm_end = end - 11;
        while (s < mm_end)
        {

            /* loads at 3-byte strides pack 24-bit pixels into dword lanes */
            __asm__ volatile(

            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         3%1, %%mm3    \n\t"
            "punpckldq    6%1, %%mm0    \n\t"
            "punpckldq    9%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $6, %%mm1    \n\t"
            "psrlq         $6, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $9, %%mm2    \n\t"
            "psrlq         $9, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
            d += 4;
            s += 12;
        }

        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");

    #endif
        while (s < end)
        {
            const int b = *s++;
            const int g = *s++;
            const int r = *s++;
            *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }

    Arpi's avatar
    Arpi committed
    }
    
    
    /*
     * Convert packed 24-bit RGB to RGB555 (scalar tail reads r,g,b per pixel).
     * SIMD path converts 4 pixels (12 source bytes) per iteration.
     * NOTE(review): opening brace / #if guards and the closing brace are
     * missing from this capture -- confirm against the upstream file.
     */
    static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)

        const uint8_t *s = src;
        const uint8_t *end;

        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;

        __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
        __asm__ volatile(

            "movq         %0, %%mm7     \n\t"
            "movq         %1, %%mm6     \n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        mm_end = end - 15;
        while (s < mm_end)
        {

            /* loads at 3-byte strides pack 24-bit pixels into dword lanes */
            __asm__ volatile(

            PREFETCH"   32%1            \n\t"
            "movd         %1, %%mm0     \n\t"
            "movd        3%1, %%mm3     \n\t"
            "punpckldq   6%1, %%mm0     \n\t"
            "punpckldq   9%1, %%mm3     \n\t"
            "movq      %%mm0, %%mm1     \n\t"
            "movq      %%mm0, %%mm2     \n\t"
            "movq      %%mm3, %%mm4     \n\t"
            "movq      %%mm3, %%mm5     \n\t"
            "psllq        $7, %%mm0     \n\t"
            "psllq        $7, %%mm3     \n\t"
            "pand      %%mm7, %%mm0     \n\t"
            "pand      %%mm7, %%mm3     \n\t"
            "psrlq        $6, %%mm1     \n\t"
            "psrlq        $6, %%mm4     \n\t"
            "pand      %%mm6, %%mm1     \n\t"
            "pand      %%mm6, %%mm4     \n\t"
            "psrlq       $19, %%mm2     \n\t"
            "psrlq       $19, %%mm5     \n\t"
            "pand         %2, %%mm2     \n\t"
            "pand         %2, %%mm5     \n\t"
            "por       %%mm1, %%mm0     \n\t"
            "por       %%mm4, %%mm3     \n\t"
            "por       %%mm2, %%mm0     \n\t"
            "por       %%mm5, %%mm3     \n\t"
            "psllq       $16, %%mm3     \n\t"
            "por       %%mm3, %%mm0     \n\t"
            MOVNTQ"    %%mm0, %0        \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
            d += 4;
            s += 12;
        }

        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");

    #endif
        while (s < end)
        {
            const int r = *s++;
            const int g = *s++;
            const int b = *s++;
            *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }
    
    Arpi's avatar
    Arpi committed
    /*
      I use a less accurate approximation here by simply left-shifting the
      input value and filling the low order bits with zeroes. This method
      improves PNG compression but this scheme cannot reproduce white exactly,
      since it does not generate an all-ones maximum value; the net effect is
      to darken the image slightly.

      The better method should be "left bit replication":

       4 3 2 1 0
       ---------
       1 1 0 1 1

       7 6 5 4 3  2 1 0
       ----------------
       1 1 0 1 1  1 1 0
       |=======|  |===|
           |      leftmost bits repeated to fill open bits
           |
       original bits
    */
    
    static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
    
    Arpi's avatar
    Arpi committed
    {
    
    Arpi's avatar
    Arpi committed
    #endif
    
    Baptiste Coudurier's avatar
    Baptiste Coudurier committed
        uint8_t *d = dst;
    
        const uint16_t *s = (const uint16_t*)src;
    
        __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    
            __asm__ volatile(
    
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq          %1, %%mm1    \n\t"
            "movq          %1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $2, %%mm1    \n\t"
            "psrlq         $7, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"
    
            "movq       %%mm0, %%mm6    \n\t"
            "movq       %%mm3, %%mm7    \n\t"
    
            "movq         8%1, %%mm0    \n\t"
            "movq         8%1, %%mm1    \n\t"
            "movq         8%1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $2, %%mm1    \n\t"
            "psrlq         $7, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"
    
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
    
            /* borrowed 32 to 24 */
    
            __asm__ volatile(