// -*- c++ -*-
/////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2001 Tom Barry.  All rights reserved.
/////////////////////////////////////////////////////////////////////////////
//
//  This file is subject to the terms of the GNU General Public License as
//  published by the Free Software Foundation.  A copy of this license is
//  included with this software distribution in the file COPYING.  If you
//  do not have a copy, you may obtain a copy by writing to the Free
//  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
//
//  This software is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details
/////////////////////////////////////////////////////////////////////////////

// This might need to be tuned for different processors.
// Prefetching seems to get an extra ~30% on my 133MHz bus SDR Athlon

// Prefetch distance in bytes ahead of the current quadword.  It is a string
// literal (not a number) because it is pasted directly into the inline-asm
// template as the displacement of the prefetch instruction.  The includer
// may define its own value before including this file.
#ifndef PREFETCH_STRIDE
#define PREFETCH_STRIDE "128"
#endif

// Processor specific averaging:
// Set destMM to average of destMM and sourceMM
// Note that this is a somewhat unconventional averaging function: It rounds toward
// the first operand if it is (even and larger) or (odd and smaller).  This is faster
// and just as effective here as "round toward even."
// Explanation of the MMX version: 1 is added to the source pixel if it is odd (and != 255)
// Then half the (adjusted) source pixel (rounding down -- which is effectively the same as
// rounding the unadjusted pixel up unless source == 255) is added to half the destination
// pixel (also rounding down). This gives the same result as the much faster and less 
// complicated versions for other processors
// Yes, shiftMask and noLowBitsMask could be the same, but this is a little easier to
// follow.

// tempMM is clobbered by the plain MMX version (the SSE and 3DNow! versions do not touch it).

// AVERAGE(destMM, sourceMM, tempMM, shiftMask, noLowBitsMask)
//
// Expands to a run of asm-template string literals that replace destMM with
// the per-byte average of destMM and sourceMM.  All arguments are string
// literals naming asm operands (registers or memory operands); they are
// concatenated into the surrounding asm template.  Any previous definition
// is dropped first so that this file can be included more than once with
// different IS_SSE / IS_3DNOW settings.
#undef AVERAGE
#if defined(IS_SSE)
// SSE: clear the low bit of every destMM byte, then use pavgb, which computes
// the rounded-up byte average (a + b + 1) >> 1.
// tempMM and shiftMask are accepted but unused.
#define AVERAGE(destMM, sourceMM, tempMM, shiftMask, noLowBitsMask) \
    "pand  "noLowBitsMask", "destMM"\n\t"                           \
    "pavgb "sourceMM",      "destMM"\n\t"
#elif defined(IS_3DNOW)
// 3DNow!: same scheme as the SSE variant, using pavgusb as the rounded-up
// unsigned byte average.  tempMM and shiftMask are accepted but unused.
#define AVERAGE(destMM, sourceMM, tempMM, shiftMask, noLowBitsMask) \
    "pand    "noLowBitsMask", "destMM"\n\t"                         \
    "pavgusb "sourceMM",      "destMM"\n\t"
#else
// Plain MMX (no byte-average instruction available):
//   tempMM  = sourceMM & ~noLowBitsMask        -> the low bit of each source byte
//   tempMM += sourceMM (saturating)            -> source, bumped by 1 where it was odd
//   tempMM  = (tempMM & shiftMask) >> 1        -> half of the adjusted source
//   destMM  = (destMM & shiftMask) >> 1        -> half of the destination, rounded down
//   destMM += tempMM (saturating)
// The shifts are 16-bit word shifts; shiftMask is applied first so that no bit
// of the upper byte of a word is shifted into the lower byte.
// tempMM is overwritten.
#define AVERAGE(destMM, sourceMM, tempMM, shiftMask, noLowBitsMask) \
    "movq    "noLowBitsMask", "tempMM"\n\t"                         \
    "pandn   "sourceMM",      "tempMM"\n\t"                         \
    "paddusb "sourceMM",      "tempMM"\n\t"                         \
    "pand    "shiftMask",     "tempMM"\n\t"                         \
    "psrlw   $1,              "tempMM"\n\t"                         \
    "pand    "shiftMask",     "destMM"\n\t"                         \
    "psrlw   $1,              "destMM"\n\t"                         \
    "paddusb "tempMM",        "destMM"\n\t"
#endif // processor specific averaging routine

// Hidden in the preprocessor stuff below is the actual routine

// Horizontal sharpening (edge enhancement) pass, run in place on the buffer
// returned by ctx->out->buffer().
//
// The same body is compiled under several names; the name is selected by the
// IS_SSE / IS_3DNOW / USE_PREFETCH macros set by the including file.
//
// Each line is processed one quadword (8 bytes) at a time.  Only the low byte
// of every 16-bit word takes part in the arithmetic (qwYMask); the high byte
// of every word is written back unchanged.
// NOTE(review): this looks like packed YUY2 with luma in the even bytes --
// confirm against the callers.
//
// For each processed byte "cur":
//     surround = AVERAGE(byte two positions to the left,
//                        byte two positions to the right)
//     up       = (sat(cur - surround) * Sharpness + 128) >> 8
//     down     = (sat(surround - cur) * Sharpness + 128) >> 8
//     result   = sat(sat(cur + up) - down)
// where sat() is unsigned saturation.  Sharpness is defined outside this
// block and is replicated into four 16-bit lanes.
// NOTE(review): the replication assumes Sharpness is non-negative and fits
// in 16 bits -- confirm at its definition.
//
// The first and the last quadword of every line are only read (as neighbours
// of the adjacent quadwords) and are never written.
//
// Lines shorter than three quadwords (24 bytes) are left untouched.
#if defined( USE_PREFETCH )
#if defined(IS_SSE)
void filterSSE_P(KdetvImageFilterContext* ctx)
#else // IS_3DNOW
void filter3DNOW_P(KdetvImageFilterContext* ctx)
#endif
#else // no prefetching
#if defined(IS_SSE)
void filterSSE(KdetvImageFilterContext* ctx)
#elif defined(IS_3DNOW)
void filter3DNOW(KdetvImageFilterContext* ctx)
#else
void filterMMX(KdetvImageFilterContext* ctx)
#endif
#endif // main procedure name
{
    unsigned char* Pixels    = ctx->out->buffer();
    int64_t qwYMask          = 0x00ff00ff00ff00ffull;   // low byte of every word
    int64_t qwShiftMask      = 0xfefffefffefffeffull;   // used by the MMX AVERAGE
    int64_t qwNoLowBitsMask  = 0xfefefefefefefefeull;   // clears bit 0 of every byte
    int64_t qwRounding       = 0x0080008000800080ull;   // +128 before the >> 8
    // Number of quadwords written per line: all but the first and the last.
    long    Cycles           = ctx->out->bytesPerLine() / 8 - 2;
    int64_t qwSharpness      = Sharpness;
    int     fieldHeight      = ctx->out->size().height();
    // Distance in bytes from the start of one line to the start of the next.
    unsigned int outputpitch = ctx->out->bytesPerLine() + ctx->out->stride();

    // The asm loop below is bottom-tested (dec / jne) and has no zero-trip
    // check: with Cycles <= 0 the counter would be decremented past zero and
    // the loop would keep storing far beyond the buffer until it wraps.
    // Nothing can be sharpened on such a short line anyway, so bail out
    // before any MMX instruction has been executed.
    if (Cycles <= 0) {
        return;
    }

    // Replicate the sharpness factor into all four 16-bit lanes for pmullw.
    qwSharpness |= (qwSharpness << 48) | (qwSharpness << 32) | (qwSharpness << 16);

    for (int y=0; y<fieldHeight; y++) {
        __asm__ __volatile__
            (
             // XAX = running pointer into the line, XCX = quadwords left to write
             MOVX"    %[Pixels],      %%"XAX"\n\t"
             MOVX"    %[Cycles],      %%"XCX"\n\t"

             // Prime the sliding window: mm1 = first qword, mm2 = second qword.
             // XAX is left pointing at the second qword, the first one written.
             "movq    (%%"XAX"),      %%mm1\n\t"
             ADDX"    $8,             %%"XAX"\n\t"
             "movq    (%%"XAX"),      %%mm2\n\t"
             "movq    %[qwYMask],     %%mm5\n\t"
             "movq    %[qwSharpness], %%mm4\n\t"

             // Per iteration: mm0 = previous qword, mm1 = current, mm2 = next.
             "1:\n\t"
             "movq    %%mm1,          %%mm0\n\t"
             "movq    %%mm2,          %%mm1\n\t"
             "movq    8(%%"XAX"),     %%mm2\n\t"

             // do edge enhancement. 
             "movq    %%mm1,          %%mm7\n\t"  // work copy of curr pixel val
             "psrlq   $48,            %%mm0\n\t"  // right justify 1 pixel from qword to left
             "psllq   $16,            %%mm7\n\t"  // left justify 3 pixels
             "por     %%mm7,          %%mm0\n\t"  // and combine

             "movq    %%mm2,          %%mm6\n\t"  // copy of right qword pixel val
             "movq    %%mm1,          %%mm7\n\t"  // another copy of L2N current
             "psllq   $48,            %%mm6\n\t"  // left just 1 pixel from qword to right
             "psrlq   $16,            %%mm7\n\t"  // right just 3 pixels
             "por     %%mm7,          %%mm6\n\t"  // combine

             AVERAGE("%%mm0", "%%mm6", "%%mm7", "%[qwShiftMask]", "%[qwNoLowBitsMask]") // avg of forward and prev by 1 pixel

             // we handle the possible plus and minus sharpness adjustments separately
             "movq    %%mm1,          %%mm7\n\t"  // another copy of L2N
             "psubusb %%mm0,          %%mm7\n\t"  // curr - surround
             "psubusb %%mm1,          %%mm0\n\t"  // surround - curr
             "pand    %%mm5,          %%mm7\n\t"  // YMask
             "pand    %%mm5,          %%mm0\n\t"  // YMask
             "pmullw  %%mm4,          %%mm7\n\t"  // mult by sharpness factor
             "pmullw  %%mm4,          %%mm0\n\t"  // mult by sharpness factor
             "paddusw %[qwRounding],  %%mm7\n\t"  // correct for rounding
             "paddusw %[qwRounding],  %%mm0\n\t"  // correct for rounding

#if defined( USE_PREFETCH )
    #if defined(IS_SSE)
             "prefetchnta "PREFETCH_STRIDE"(%%"XAX")\n\t"
    #elif defined(IS_3DNOW)
             "prefetchw "PREFETCH_STRIDE"(%%"XAX")\n\t"
    #endif
#endif // prefetching
             "psrlw   $8,             %%mm7\n\t"  // now have diff*EdgeEnhAmt/256 ratio
             "psrlw   $8,             %%mm0\n\t"  // now have diff*EdgeEnhAmt/256 ratio

             "paddusb %%mm1,          %%mm7\n\t"  // edge enhancement up
             "psubusb %%mm0,          %%mm7\n\t"  // edge enhancement down, mm7 now our sharpened value

             // Store the result over the current qword and advance.
             "movq    %%mm7,          (%%"XAX")\n\t"
             ADDX"    $8,             %%"XAX"\n\t"
             DECX"    %%"XCX"\n\t"
             "jne     1b\n\t"

             : /* no outputs */

             : [Pixels]          "g"(Pixels),
               [Cycles]          "g"(Cycles),
               [qwYMask]         "m"(qwYMask),
               [qwShiftMask]     "m"(qwShiftMask),
               [qwNoLowBitsMask] "m"(qwNoLowBitsMask),
               [qwRounding]      "m"(qwRounding),
               [qwSharpness]     "m"(qwSharpness)

             : XAX, XCX,
#ifdef ARCH_386
               "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)",
#endif
               "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
               "memory", "cc"
             );

        Pixels += outputpitch;
    }

    // Leave MMX state so that x87 floating point code can run again.
    // NOTE(review): emms is only issued when ARCH_386 is defined; MMX
    // registers alias the x87 stack on x86-64 as well -- confirm that other
    // architectures are handled elsewhere.
#ifdef ARCH_386
    __asm__ __volatile__ ("emms\n\t");
#endif
}
