mirror of
https://github.com/libjpeg-turbo/libjpeg-turbo
synced 2025-03-28 21:13:18 +00:00
C/SSE2 optimization of encode_mcu_AC_refine()
This commit adds C and SSE2 optimizations for the encode_mcu_AC_refine() function used in progressive Huffman encoding. The image used for testing can be retrieved from this page: https://blog.cloudflare.com/doubling-the-speed-of-jpegtran All timings done on `Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz` clang version is `Apple LLVM version 9.0.0 (clang-900.0.39.2)` gcc-5 version is `gcc-5 (Homebrew GCC 5.5.0) 5.5.0` gcc-7 version is `gcc-7 (Homebrew GCC 7.2.0) 7.2.0` Here are the results in comparison to libjpeg-turbo@3c54642 using `time ./jpegtran -outfile /dev/null -progressive -optimise -copy none print_poster_0025.jpg` C clang x86_64: +7% gcc-5 x86_64: +30% gcc-7 x86_64: +33% clang i386: +0% gcc-5 i386: +24% gcc-7 i386: +23% SSE2 clang x86_64: +42% gcc-5 x86_64: +53% gcc-7 x86_64: +64% clang i386: +35% gcc-5 i386: +46% gcc-7 i386: +49% Discussion in libjpeg-turbo/libjpeg-turbo#46
This commit is contained in:
parent
81baa1b501
commit
16bd984557
@ -341,10 +341,19 @@ set(EFFECTIVE_LD_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS_${CMA
|
||||
message(STATUS "Linker flags = ${EFFECTIVE_LD_FLAGS}")
|
||||
|
||||
include(CheckCSourceCompiles)
|
||||
include(CheckIncludeFiles)
|
||||
include(CheckTypeSize)
|
||||
|
||||
check_type_size("size_t" SIZE_T)
|
||||
check_type_size("unsigned long" UNSIGNED_LONG)
|
||||
|
||||
if(SIZE_T EQUAL UNSIGNED_LONG)
|
||||
check_c_source_compiles("int main(int argc, char **argv) { unsigned long a = argc; return __builtin_ctzl(a); }"
|
||||
HAVE_BUILTIN_CTZL)
|
||||
endif()
|
||||
|
||||
if(UNIX)
|
||||
# Check for headers
|
||||
include(CheckIncludeFiles)
|
||||
check_include_files(locale.h HAVE_LOCALE_H)
|
||||
check_include_files(stddef.h HAVE_STDDEF_H)
|
||||
check_include_files(stdlib.h HAVE_STDLIB_H)
|
||||
@ -359,10 +368,8 @@ if(UNIX)
|
||||
endif()
|
||||
|
||||
# Check for types
|
||||
include(CheckTypeSize)
|
||||
check_type_size("unsigned char" UNSIGNED_CHAR)
|
||||
check_type_size("unsigned short" UNSIGNED_SHORT)
|
||||
check_type_size("size_t" SIZE_T)
|
||||
|
||||
# Check for compiler features
|
||||
check_c_source_compiles("int main(void) { typedef struct undefined_structure *undef_struct_ptr; }"
|
||||
@ -408,6 +415,7 @@ if(UNIX)
|
||||
endif()
|
||||
|
||||
if(MSVC)
|
||||
check_include_files("intrin.h" HAVE_INTRIN_H)
|
||||
set(INLINE_OPTIONS "__inline;inline")
|
||||
else()
|
||||
set(INLINE_OPTIONS "__inline__;inline")
|
||||
|
@ -13,9 +13,19 @@
|
||||
/* Version number of package */
|
||||
#define VERSION "@VERSION@"
|
||||
|
||||
#ifndef _WIN32
|
||||
|
||||
/* The size of `size_t', as computed by sizeof. */
|
||||
#define SIZEOF_SIZE_T @SIZE_T@
|
||||
|
||||
/* Define if your compiler has __builtin_ctzl() and sizeof(unsigned long) == sizeof(size_t). */
|
||||
#cmakedefine HAVE_BUILTIN_CTZL
|
||||
|
||||
/* Define to 1 if you have the <intrin.h> header file. */
|
||||
#cmakedefine HAVE_INTRIN_H
|
||||
|
||||
#if defined(_MSC_VER) && defined(HAVE_INTRIN_H)
|
||||
#if (SIZEOF_SIZE_T == 8)
|
||||
#define HAVE_BITSCANFORWARD64
|
||||
#elif (SIZEOF_SIZE_T == 4)
|
||||
#define HAVE_BITSCANFORWARD
|
||||
#endif
|
||||
#endif
|
||||
|
290
jcphuff.c
290
jcphuff.c
@ -5,6 +5,7 @@
|
||||
* Copyright (C) 1995-1997, Thomas G. Lane.
|
||||
* libjpeg-turbo Modifications:
|
||||
* Copyright (C) 2011, 2015, 2018, D. R. Commander.
|
||||
* Copyright (C) 2016, 2018, Matthieu Darbois.
|
||||
* For conditions of distribution and use, see the accompanying README.ijg
|
||||
* file.
|
||||
*
|
||||
@ -18,9 +19,22 @@
|
||||
#define JPEG_INTERNALS
|
||||
#include "jinclude.h"
|
||||
#include "jpeglib.h"
|
||||
#include "jchuff.h" /* Declarations shared with jchuff.c */
|
||||
#include "jsimd.h"
|
||||
#include "jconfigint.h"
|
||||
#include <limits.h>
|
||||
|
||||
#ifdef HAVE_INTRIN_H
|
||||
#include <intrin.h>
|
||||
#ifdef _MSC_VER
|
||||
#ifdef HAVE_BITSCANFORWARD64
|
||||
#pragma intrinsic(_BitScanForward64)
|
||||
#endif
|
||||
#ifdef HAVE_BITSCANFORWARD
|
||||
#pragma intrinsic(_BitScanForward)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef C_PROGRESSIVE_SUPPORTED
|
||||
|
||||
/*
|
||||
@ -59,6 +73,11 @@
|
||||
typedef struct {
|
||||
struct jpeg_entropy_encoder pub; /* public fields */
|
||||
|
||||
/* Pointer to routine to prepare data for encode_mcu_AC_refine() */
|
||||
int (*AC_refine_prepare) (const JCOEF *block,
|
||||
const int *jpeg_natural_order_start, int Sl,
|
||||
int Al, JCOEF *absvalues, size_t *bits);
|
||||
|
||||
/* Mode flag: TRUE for optimization, FALSE for actual data output */
|
||||
boolean gather_statistics;
|
||||
|
||||
@ -120,6 +139,8 @@ typedef phuff_entropy_encoder *phuff_entropy_ptr;
|
||||
#define IRIGHT_SHIFT(x, shft) ((x) >> (shft))
|
||||
#endif
|
||||
|
||||
/* Round v up to the next multiple of p.  p must be a power of two.  Both
 * arguments are fully parenthesized so that compound expressions (shifts,
 * conditionals, bitwise operators) expand as the caller intended.
 */
#define PAD(v, p)  (((v) + (p) - 1) & (~((p) - 1)))
|
||||
|
||||
/* Forward declarations */
|
||||
METHODDEF(boolean) encode_mcu_DC_first(j_compress_ptr cinfo,
|
||||
JBLOCKROW *MCU_data);
|
||||
@ -127,12 +148,41 @@ METHODDEF(boolean) encode_mcu_AC_first(j_compress_ptr cinfo,
|
||||
JBLOCKROW *MCU_data);
|
||||
METHODDEF(boolean) encode_mcu_DC_refine(j_compress_ptr cinfo,
|
||||
JBLOCKROW *MCU_data);
|
||||
METHODDEF(int) encode_mcu_AC_refine_prepare
|
||||
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
|
||||
JCOEF *absvalues, size_t *bits);
|
||||
METHODDEF(boolean) encode_mcu_AC_refine(j_compress_ptr cinfo,
|
||||
JBLOCKROW *MCU_data);
|
||||
METHODDEF(void) finish_pass_phuff(j_compress_ptr cinfo);
|
||||
METHODDEF(void) finish_pass_gather_phuff(j_compress_ptr cinfo);
|
||||
|
||||
|
||||
/*
 * Count the trailing zero bits of *x and shift them out.
 *
 * On return, *x has been shifted right by the returned count, so its least
 * significant bit is set.
 *
 * *x must be nonzero on entry: the compiler builtin and the bit-scan
 * intrinsics leave the result undefined for a zero input, and the portable
 * fallback loop would never terminate.
 */
INLINE
METHODDEF(int)
count_zeroes(size_t *x)
{
#if defined(HAVE_BUILTIN_CTZL)
  int result;

  result = __builtin_ctzl(*x);
  *x >>= result;
  return result;
#elif defined(HAVE_BITSCANFORWARD64)
  /* The intrinsic writes the bit index through an unsigned long pointer, so
   * the temporary must have exactly that type rather than int.
   */
  unsigned long result;

  _BitScanForward64(&result, *x);
  *x >>= result;
  return (int)result;
#elif defined(HAVE_BITSCANFORWARD)
  unsigned long result;

  _BitScanForward(&result, (unsigned long)*x);
  *x >>= result;
  return (int)result;
#else
  int result = 0;

  while ((*x & 1) == 0) {
    ++result;
    *x >>= 1;
  }
  return result;
#endif
}
|
||||
|
||||
|
||||
/*
|
||||
* Initialize for a Huffman-compressed scan using progressive JPEG.
|
||||
*/
|
||||
@ -163,6 +213,10 @@ start_pass_phuff(j_compress_ptr cinfo, boolean gather_statistics)
|
||||
entropy->pub.encode_mcu = encode_mcu_DC_refine;
|
||||
else {
|
||||
entropy->pub.encode_mcu = encode_mcu_AC_refine;
|
||||
if (jsimd_can_encode_mcu_AC_refine_prepare())
|
||||
entropy->AC_refine_prepare = jsimd_encode_mcu_AC_refine_prepare;
|
||||
else
|
||||
entropy->AC_refine_prepare = encode_mcu_AC_refine_prepare;
|
||||
/* AC refinement needs a correction bit buffer */
|
||||
if (entropy->bit_buffer == NULL)
|
||||
entropy->bit_buffer = (char *)
|
||||
@ -637,23 +691,149 @@ encode_mcu_DC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Data preparation for encode_mcu_AC_refine().
|
||||
*/
|
||||
|
||||
/*
 * Pre-pass over Sl coefficients, used by encode_mcu_AC_refine_prepare().
 *
 * The expansion relies on these names being in scope at the point of use:
 * block, jpeg_natural_order_start, Al, absvalues, the int temporaries k, temp
 * and temp2, the size_t accumulators zerobits and signbits, and the int EOB.
 *
 * For every k in [0, Sl):
 *   - absvalues[k] receives the point-transformed absolute value,
 *   - bit k of zerobits is set if that value is nonzero,
 *   - bit k of signbits is set if that value is nonzero and the coefficient
 *     is not negative,
 *   - EOB becomes k + koffset if that value is exactly 1.
 *
 * Sl must not exceed the number of bits in size_t, since bit k of the
 * accumulators is addressed directly.  Both macro parameters are
 * parenthesized so that arbitrary expressions can be passed.
 */
#define COMPUTE_ABSVALUES_AC_REFINE(Sl, koffset) { \
  for (k = 0; k < (Sl); k++) { \
    temp = block[jpeg_natural_order_start[k]]; \
    /* We must apply the point transform by Al.  For AC coefficients this \
     * is an integer division with rounding towards 0.  To do this portably \
     * in C, we shift after obtaining the absolute value. \
     */ \
    /* temp2 is 0 for non-negative input and -1 for negative input \
     * (this presumes the right shift of a negative int is arithmetic). \
     */ \
    temp2 = temp >> (CHAR_BIT * sizeof(int) - 1); \
    temp ^= temp2; \
    temp -= temp2;              /* temp is abs value of input */ \
    temp >>= Al;                /* apply the point transform */ \
    if (temp != 0) { \
      zerobits |= ((size_t)1U) << k; \
      /* temp2 + 1 is 1 for non-negative input, 0 for negative input */ \
      signbits |= ((size_t)(temp2 + 1)) << k; \
    } \
    absvalues[k] = (JCOEF)temp; /* save abs value for main pass */ \
    if (temp == 1) \
      EOB = k + (koffset);      /* EOB = index of last newly-nonzero coef */ \
  } \
}
|
||||
|
||||
METHODDEF(int)
|
||||
encode_mcu_AC_refine_prepare(const JCOEF *block,
|
||||
const int *jpeg_natural_order_start, int Sl,
|
||||
int Al, JCOEF *absvalues, size_t *bits)
|
||||
{
|
||||
register int k, temp, temp2;
|
||||
int EOB = 0;
|
||||
size_t zerobits = 0U, signbits = 0U;
|
||||
int Sl0 = Sl;
|
||||
|
||||
#if SIZEOF_SIZE_T == 4
|
||||
if (Sl0 > 32)
|
||||
Sl0 = 32;
|
||||
#endif
|
||||
|
||||
COMPUTE_ABSVALUES_AC_REFINE(Sl0, 0);
|
||||
|
||||
bits[0] = zerobits;
|
||||
#if SIZEOF_SIZE_T == 8
|
||||
bits[1] = signbits;
|
||||
#else
|
||||
bits[2] = signbits;
|
||||
|
||||
zerobits = 0U;
|
||||
signbits = 0U;
|
||||
|
||||
if (Sl > 32) {
|
||||
Sl -= 32;
|
||||
jpeg_natural_order_start += 32;
|
||||
absvalues += 32;
|
||||
|
||||
COMPUTE_ABSVALUES_AC_REFINE(Sl, 32);
|
||||
}
|
||||
|
||||
bits[1] = zerobits;
|
||||
bits[3] = signbits;
|
||||
#endif
|
||||
|
||||
return EOB;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* MCU encoding for AC successive approximation refinement scan.
|
||||
*/
|
||||
|
||||
/*
 * Main coding loop of encode_mcu_AC_refine() for one size_t-wide group of
 * coefficients.
 *
 * The expansion relies on these names being in scope: entropy, zerobits,
 * signbits, cabsvalue, EOBPTR, r, temp, BR and BR_buffer.  Each iteration
 * skips to the next set bit of zerobits, advancing the zero run r and the
 * cursor cabsvalue by the same distance, then codes that coefficient.
 *
 * label is pasted verbatim just ahead of the ZRL loop.  Pass nothing, or a
 * label including its colon, so that the caller can jump into the loop body
 * after doing the skip step itself.
 */
#define ENCODE_COEFS_AC_REFINE(label) { \
  while (zerobits) { \
    int idx = count_zeroes(&zerobits); \
    \
    signbits >>= idx; \
    cabsvalue += idx; \
    r += idx; \
label \
    /* Emit any required ZRLs, but not if they can be folded into EOB */ \
    while (r > 15 && (cabsvalue <= EOBPTR)) { \
      /* emit any pending EOBRUN and the BE correction bits */ \
      emit_eobrun(entropy); \
      /* Emit ZRL */ \
      emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); \
      r -= 16; \
      /* Emit buffered correction bits that must be associated with ZRL */ \
      emit_buffered_bits(entropy, BR_buffer, BR); \
      BR_buffer = entropy->bit_buffer;  /* BE bits are gone now */ \
      BR = 0; \
    } \
    \
    temp = *cabsvalue++; \
    \
    /* If the coef was previously nonzero, it only needs a correction bit. \
     * NOTE: a straight translation of the spec's figure G.7 would suggest \
     * that we also need to test r > 15.  But if r > 15, we can only get here \
     * if k > EOB, which implies that this coefficient is not 1. \
     */ \
    if (temp > 1) { \
      /* The correction bit is the next bit of the absolute value. */ \
      BR_buffer[BR++] = (char)(temp & 1); \
    } else { \
      /* Emit any pending EOBRUN and the BE correction bits */ \
      emit_eobrun(entropy); \
      \
      /* Count/emit Huffman symbol for run length / number of bits */ \
      emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1); \
      \
      /* Emit output bit for newly-nonzero coef: 0 if negative, else 1 */ \
      temp = signbits & 1; \
      emit_bits(entropy, (unsigned int)temp, 1); \
      \
      /* Emit buffered correction bits that must be associated with this \
       * code \
       */ \
      emit_buffered_bits(entropy, BR_buffer, BR); \
      BR_buffer = entropy->bit_buffer;  /* BE bits are gone now */ \
      BR = 0; \
      r = 0;                            /* reset zero run length */ \
    } \
    \
    /* Drop the mask bits of the coefficient just handled */ \
    signbits >>= 1; \
    zerobits >>= 1; \
  } \
}
|
||||
|
||||
METHODDEF(boolean)
|
||||
encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
|
||||
{
|
||||
phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
|
||||
register int temp, temp3;
|
||||
register int r, k;
|
||||
int EOB;
|
||||
register int temp, r;
|
||||
char *BR_buffer;
|
||||
unsigned int BR;
|
||||
int Se = cinfo->Se;
|
||||
int Sl = cinfo->Se - cinfo->Ss + 1;
|
||||
int Al = cinfo->Al;
|
||||
JBLOCKROW block;
|
||||
int absvalues[DCTSIZE2];
|
||||
JCOEF absvalues_unaligned[DCTSIZE2 + 15];
|
||||
JCOEF *absvalues;
|
||||
const JCOEF *cabsvalue, *EOBPTR;
|
||||
size_t zerobits, signbits;
|
||||
size_t bits[16 / SIZEOF_SIZE_T];
|
||||
|
||||
entropy->next_output_byte = cinfo->dest->next_output_byte;
|
||||
entropy->free_in_buffer = cinfo->dest->free_in_buffer;
|
||||
@ -663,27 +843,17 @@ encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
|
||||
if (entropy->restarts_to_go == 0)
|
||||
emit_restart(entropy, entropy->next_restart_num);
|
||||
|
||||
/* Encode the MCU data block */
|
||||
block = MCU_data[0];
|
||||
#ifdef WITH_SIMD
|
||||
cabsvalue = absvalues = (JCOEF *)PAD((size_t)absvalues_unaligned, 16);
|
||||
#else
|
||||
/* Not using SIMD, so alignment is not needed */
|
||||
cabsvalue = absvalues = absvalues_unaligned;
|
||||
#endif
|
||||
|
||||
/* It is convenient to make a pre-pass to determine the transformed
|
||||
* coefficients' absolute values and the EOB position.
|
||||
*/
|
||||
EOB = 0;
|
||||
for (k = cinfo->Ss; k <= Se; k++) {
|
||||
temp = (*block)[jpeg_natural_order[k]];
|
||||
/* We must apply the point transform by Al. For AC coefficients this
|
||||
* is an integer division with rounding towards 0. To do this portably
|
||||
* in C, we shift after obtaining the absolute value.
|
||||
*/
|
||||
temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
|
||||
temp ^= temp3;
|
||||
temp -= temp3; /* temp is abs value of input */
|
||||
temp >>= Al; /* apply the point transform */
|
||||
absvalues[k] = temp; /* save abs value for main pass */
|
||||
if (temp == 1)
|
||||
EOB = k; /* EOB = index of last newly-nonzero coef */
|
||||
}
|
||||
/* Prepare data */
|
||||
EOBPTR = absvalues +
|
||||
entropy->AC_refine_prepare(MCU_data[0][0], jpeg_natural_order + cinfo->Ss,
|
||||
Sl, Al, absvalues, bits);
|
||||
|
||||
/* Encode the AC coefficients per section G.1.2.3, fig. G.7 */
|
||||
|
||||
@ -691,53 +861,33 @@ encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
|
||||
BR = 0; /* BR = count of buffered bits added now */
|
||||
BR_buffer = entropy->bit_buffer + entropy->BE; /* Append bits to buffer */
|
||||
|
||||
for (k = cinfo->Ss; k <= Se; k++) {
|
||||
if ((temp = absvalues[k]) == 0) {
|
||||
r++;
|
||||
continue;
|
||||
}
|
||||
zerobits = bits[0];
|
||||
#if SIZEOF_SIZE_T == 8
|
||||
signbits = bits[1];
|
||||
#else
|
||||
signbits = bits[2];
|
||||
#endif
|
||||
ENCODE_COEFS_AC_REFINE();
|
||||
|
||||
/* Emit any required ZRLs, but not if they can be folded into EOB */
|
||||
while (r > 15 && k <= EOB) {
|
||||
/* emit any pending EOBRUN and the BE correction bits */
|
||||
emit_eobrun(entropy);
|
||||
/* Emit ZRL */
|
||||
emit_symbol(entropy, entropy->ac_tbl_no, 0xF0);
|
||||
r -= 16;
|
||||
/* Emit buffered correction bits that must be associated with ZRL */
|
||||
emit_buffered_bits(entropy, BR_buffer, BR);
|
||||
BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
|
||||
BR = 0;
|
||||
}
|
||||
#if SIZEOF_SIZE_T == 4
|
||||
zerobits = bits[1];
|
||||
signbits = bits[3];
|
||||
|
||||
/* If the coef was previously nonzero, it only needs a correction bit.
|
||||
* NOTE: a straight translation of the spec's figure G.7 would suggest
|
||||
* that we also need to test r > 15. But if r > 15, we can only get here
|
||||
* if k > EOB, which implies that this coefficient is not 1.
|
||||
*/
|
||||
if (temp > 1) {
|
||||
/* The correction bit is the next bit of the absolute value. */
|
||||
BR_buffer[BR++] = (char)(temp & 1);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Emit any pending EOBRUN and the BE correction bits */
|
||||
emit_eobrun(entropy);
|
||||
|
||||
/* Count/emit Huffman symbol for run length / number of bits */
|
||||
emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1);
|
||||
|
||||
/* Emit output bit for newly-nonzero coef */
|
||||
temp = ((*block)[jpeg_natural_order[k]] < 0) ? 0 : 1;
|
||||
emit_bits(entropy, (unsigned int)temp, 1);
|
||||
|
||||
/* Emit buffered correction bits that must be associated with this code */
|
||||
emit_buffered_bits(entropy, BR_buffer, BR);
|
||||
BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
|
||||
BR = 0;
|
||||
r = 0; /* reset zero run length */
|
||||
if (zerobits) {
|
||||
int diff = ((absvalues + DCTSIZE2 / 2) - cabsvalue);
|
||||
int idx = count_zeroes(&zerobits);
|
||||
signbits >>= idx;
|
||||
idx += diff;
|
||||
r += idx;
|
||||
cabsvalue += idx;
|
||||
goto first_iter_ac_refine;
|
||||
}
|
||||
|
||||
ENCODE_COEFS_AC_REFINE(first_iter_ac_refine:);
|
||||
#endif
|
||||
|
||||
r |= (int)((absvalues + Sl) - cabsvalue);
|
||||
|
||||
if (r > 0 || BR > 0) { /* If there are trailing zeroes, */
|
||||
entropy->EOBRUN++; /* count an EOB */
|
||||
entropy->BE += BR; /* concat my correction bits to older ones */
|
||||
|
8
jsimd.h
8
jsimd.h
@ -3,7 +3,7 @@
|
||||
*
|
||||
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
* Copyright (C) 2011, 2014, D. R. Commander.
|
||||
* Copyright (C) 2015, Matthieu Darbois.
|
||||
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
|
||||
*
|
||||
* Based on the x86 SIMD extension for IJG JPEG library,
|
||||
* Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@ -103,3 +103,9 @@ EXTERN(JOCTET *) jsimd_huff_encode_one_block(void *state, JOCTET *buffer,
|
||||
JCOEFPTR block, int last_dc_val,
|
||||
c_derived_tbl *dctbl,
|
||||
c_derived_tbl *actbl);
|
||||
|
||||
EXTERN(int) jsimd_can_encode_mcu_AC_refine_prepare(void);
|
||||
|
||||
EXTERN(int) jsimd_encode_mcu_AC_refine_prepare
|
||||
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
|
||||
JCOEF *absvalues, size_t *bits);
|
||||
|
16
jsimd_none.c
16
jsimd_none.c
@ -3,7 +3,7 @@
|
||||
*
|
||||
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
* Copyright (C) 2009-2011, 2014, D. R. Commander.
|
||||
* Copyright (C) 2015, Matthieu Darbois.
|
||||
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
|
||||
*
|
||||
* Based on the x86 SIMD extension for IJG JPEG library,
|
||||
* Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@ -389,3 +389,17 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
 * Capability query for the SIMD AC-refinement pre-pass.  This file
 * (jsimd_none.c) supplies no such routine, so the answer is always "no" (0).
 */
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
  return 0;
}
|
||||
|
||||
/*
 * Placeholder that satisfies the linker.  It ignores every argument, writes
 * nothing to absvalues or bits, and returns 0.  Callers are expected to
 * consult jsimd_can_encode_mcu_AC_refine_prepare() first, which reports the
 * routine as unavailable in this file.
 */
GLOBAL(int)
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
                                   const int *jpeg_natural_order_start, int Sl,
                                   int Al, JCOEF *absvalues, size_t *bits)
{
  return 0;
}
|
||||
|
@ -108,10 +108,11 @@ endif()
|
||||
if(CPU_TYPE STREQUAL "x86_64")
|
||||
set(SIMD_SOURCES x86_64/jsimdcpu.asm x86_64/jfdctflt-sse.asm
|
||||
x86_64/jccolor-sse2.asm x86_64/jcgray-sse2.asm x86_64/jchuff-sse2.asm
|
||||
x86_64/jcsample-sse2.asm x86_64/jdcolor-sse2.asm x86_64/jdmerge-sse2.asm
|
||||
x86_64/jdsample-sse2.asm x86_64/jfdctfst-sse2.asm x86_64/jfdctint-sse2.asm
|
||||
x86_64/jidctflt-sse2.asm x86_64/jidctfst-sse2.asm x86_64/jidctint-sse2.asm
|
||||
x86_64/jidctred-sse2.asm x86_64/jquantf-sse2.asm x86_64/jquanti-sse2.asm
|
||||
x86_64/jcphuff-sse2.asm x86_64/jcsample-sse2.asm x86_64/jdcolor-sse2.asm
|
||||
x86_64/jdmerge-sse2.asm x86_64/jdsample-sse2.asm x86_64/jfdctfst-sse2.asm
|
||||
x86_64/jfdctint-sse2.asm x86_64/jidctflt-sse2.asm x86_64/jidctfst-sse2.asm
|
||||
x86_64/jidctint-sse2.asm x86_64/jidctred-sse2.asm x86_64/jquantf-sse2.asm
|
||||
x86_64/jquanti-sse2.asm
|
||||
x86_64/jccolor-avx2.asm x86_64/jcgray-avx2.asm x86_64/jcsample-avx2.asm
|
||||
x86_64/jdcolor-avx2.asm x86_64/jdmerge-avx2.asm x86_64/jdsample-avx2.asm
|
||||
x86_64/jfdctint-avx2.asm x86_64/jidctint-avx2.asm x86_64/jquanti-avx2.asm)
|
||||
@ -124,10 +125,11 @@ else()
|
||||
i386/jidctint-mmx.asm i386/jidctred-mmx.asm i386/jquant-mmx.asm
|
||||
i386/jfdctflt-sse.asm i386/jidctflt-sse.asm i386/jquant-sse.asm
|
||||
i386/jccolor-sse2.asm i386/jcgray-sse2.asm i386/jchuff-sse2.asm
|
||||
i386/jcsample-sse2.asm i386/jdcolor-sse2.asm i386/jdmerge-sse2.asm
|
||||
i386/jdsample-sse2.asm i386/jfdctfst-sse2.asm i386/jfdctint-sse2.asm
|
||||
i386/jidctflt-sse2.asm i386/jidctfst-sse2.asm i386/jidctint-sse2.asm
|
||||
i386/jidctred-sse2.asm i386/jquantf-sse2.asm i386/jquanti-sse2.asm
|
||||
i386/jcphuff-sse2.asm i386/jcsample-sse2.asm i386/jdcolor-sse2.asm
|
||||
i386/jdmerge-sse2.asm i386/jdsample-sse2.asm i386/jfdctfst-sse2.asm
|
||||
i386/jfdctint-sse2.asm i386/jidctflt-sse2.asm i386/jidctfst-sse2.asm
|
||||
i386/jidctint-sse2.asm i386/jidctred-sse2.asm i386/jquantf-sse2.asm
|
||||
i386/jquanti-sse2.asm
|
||||
i386/jccolor-avx2.asm i386/jcgray-avx2.asm i386/jcsample-avx2.asm
|
||||
i386/jdcolor-avx2.asm i386/jdmerge-avx2.asm i386/jdsample-avx2.asm
|
||||
i386/jfdctint-avx2.asm i386/jidctint-avx2.asm i386/jquanti-avx2.asm)
|
||||
|
@ -4,7 +4,7 @@
|
||||
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
* Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
|
||||
* Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander.
|
||||
* Copyright (C) 2015-2016, Matthieu Darbois.
|
||||
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
|
||||
*
|
||||
* Based on the x86 SIMD extension for IJG JPEG library,
|
||||
* Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@ -691,3 +691,17 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
|
||||
return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
|
||||
dctbl, actbl);
|
||||
}
|
||||
|
||||
/*
 * Capability query for the SIMD AC-refinement pre-pass.  Nothing in this
 * file implements that routine, so it is always reported as unavailable (0).
 */
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
  return 0;
}
|
||||
|
||||
/*
 * Placeholder for the SIMD AC-refinement pre-pass.  It ignores every
 * argument, writes nothing to absvalues or bits, and returns 0.  The
 * matching capability query in this file returns 0, so callers that check it
 * first should never reach this function.
 */
GLOBAL(int)
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
                                   const int *jpeg_natural_order_start, int Sl,
                                   int Al, JCOEF *absvalues, size_t *bits)
{
  return 0;
}
|
||||
|
@ -4,7 +4,7 @@
|
||||
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
* Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
|
||||
* Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander.
|
||||
* Copyright (C) 2015-2016, Matthieu Darbois.
|
||||
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
|
||||
*
|
||||
* Based on the x86 SIMD extension for IJG JPEG library,
|
||||
* Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@ -769,3 +769,17 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
|
||||
return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
|
||||
last_dc_val, dctbl, actbl);
|
||||
}
|
||||
|
||||
/*
 * Capability query for the SIMD AC-refinement pre-pass.  Nothing in this
 * file implements that routine, so it is always reported as unavailable (0).
 */
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
  return 0;
}
|
||||
|
||||
/*
 * Placeholder for the SIMD AC-refinement pre-pass.  It ignores every
 * argument, writes nothing to absvalues or bits, and returns 0.  The
 * matching capability query in this file returns 0, so callers that check it
 * first should never reach this function.
 */
GLOBAL(int)
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
                                   const int *jpeg_natural_order_start, int Sl,
                                   int Al, JCOEF *absvalues, size_t *bits)
{
  return 0;
}
|
||||
|
486
simd/i386/jcphuff-sse2.asm
Normal file
486
simd/i386/jcphuff-sse2.asm
Normal file
@ -0,0 +1,486 @@
|
||||
;
|
||||
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2)
|
||||
;
|
||||
; Copyright (C) 2016, 2018, Matthieu Darbois
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains an SSE2 implementation of data preparation for progressive
|
||||
; Huffman encoding. See jcphuff.c for more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Macros to load data for jsimd_encode_mcu_AC_refine_prepare_sse2()
|
||||
|
||||
; LOAD16: gather 16 coefficients into X0 (entries 0-7) and X1 (entries 8-15).
; Entry i of the table at LUT is an index into the 16-bit words at BLOCK.
; N0 and N1 are cleared first.  T0 and T1 are used as scratch.
; The %rep loop emits the same eight load groups, in the same order, as a
; hand-unrolled sequence would.
%macro LOAD16 0
    pxor        N0, N0
    pxor        N1, N1

%assign LOAD_IDX 0
%rep 8
    mov         T0, INT [LUT + (LOAD_IDX)*SIZEOF_INT]
    mov         T1, INT [LUT + (LOAD_IDX + 8)*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], LOAD_IDX
    pinsrw      X1, word [BLOCK + T1 * 2], LOAD_IDX
%assign LOAD_IDX LOAD_IDX + 1
%endrep
%endmacro
|
||||
|
||||
; LOAD15: gather 8 coefficients into X0 and between 1 and 7 into X1.
; X1 is cleared first, so the words that are not loaded stay zero.
; LENEND holds the number of coefficients wanted in X1.  Word 0 of X1 is
; always loaded; word n (n = 1..6) is loaded only while LENEND > n.
; N0 and N1 are cleared.  T0 and T1 are used as scratch.
%macro LOAD15 0
    pxor        N0, N0
    pxor        N1, N1
    pxor        X1, X1

    ; entry 0 of X0 and entry 0 of X1 are loaded as a pair
    mov         T0, INT [LUT +  0*SIZEOF_INT]
    mov         T1, INT [LUT +  8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    ; remaining seven entries of X0, unconditionally
%assign LOAD_IDX 1
%rep 7
    mov         T0, INT [LUT + (LOAD_IDX)*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], LOAD_IDX
%assign LOAD_IDX LOAD_IDX + 1
%endrep

    ; entries 1..6 of X1, stopping as soon as LENEND is exhausted
%assign LOAD_IDX 1
%rep 6
    cmp         LENEND, (LOAD_IDX + 1)
    jl          %%.ELOAD15
    mov         T1, INT [LUT + (LOAD_IDX + 8)*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], LOAD_IDX
%assign LOAD_IDX LOAD_IDX + 1
%endrep
%%.ELOAD15:
%endmacro
|
||||
|
||||
; LOAD8: gather 8 coefficients into X0.  Entry i of the table at LUT is an
; index into the 16-bit words at BLOCK.  N0 is cleared first.  T0 is used as
; scratch.
%macro LOAD8 0
    pxor        N0, N0

%assign LOAD_IDX 0
%rep 8
    mov         T0, INT [LUT + (LOAD_IDX)*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], LOAD_IDX
%assign LOAD_IDX LOAD_IDX + 1
%endrep
%endmacro
|
||||
|
||||
; LOAD7: gather between 1 and 7 coefficients into X0.
; X0 is cleared first, so the words that are not loaded stay zero.
; LENEND holds the number of coefficients wanted.  Word 0 is always loaded;
; word n (n = 1..6) is loaded only while LENEND > n.
; N0 is cleared.  T1 is used as scratch.
%macro LOAD7 0
    pxor        N0, N0
    pxor        X0, X0

    mov         T1, INT [LUT +  0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 0

%assign LOAD_IDX 1
%rep 6
    cmp         LENEND, (LOAD_IDX + 1)
    jl          %%.ELOAD7
    mov         T1, INT [LUT + (LOAD_IDX)*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], LOAD_IDX
%assign LOAD_IDX LOAD_IDX + 1
%endrep
%%.ELOAD7:
%endmacro
|
||||
|
||||
; REDUCE0: build a 64-bit "nonzero" mask from the 64 words at VALUES and
; store it as two 32-bit halves at the address held in ZEROBITS.
; Each group of 16 words is compared against zero, packed to bytes and
; turned into a 16-bit mask.  The combined masks are inverted at the end so
; that a set bit marks a nonzero word.
; xmm7 is compared in place against the last eight words, so it must hold
; zeros on entry (ZERO is defined as xmm7 in this file) and is clobbered.
; Clobbers xmm0-xmm7, eax, ecx, edx, esi and edi.
%macro REDUCE0 0
    ; words 0-15 -> eax
    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
    pcmpeqw     xmm0, ZERO
    pcmpeqw     xmm1, ZERO
    packsswb    xmm0, xmm1
    pmovmskb    eax, xmm0

    ; words 16-31 -> ecx
    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
    pcmpeqw     xmm2, ZERO
    pcmpeqw     xmm3, ZERO
    packsswb    xmm2, xmm3
    pmovmskb    ecx, xmm2

    ; words 32-47 -> edx
    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
    pcmpeqw     xmm4, ZERO
    pcmpeqw     xmm5, ZERO
    packsswb    xmm4, xmm5
    pmovmskb    edx, xmm4

    ; words 48-63 -> esi (the zero register itself absorbs the last compare)
    movdqa      xmm6, XMMWORD [VALUES + (48*2)]
    pcmpeqw     xmm6, ZERO
    pcmpeqw     xmm7, XMMWORD [VALUES + (56*2)]
    packsswb    xmm6, xmm7
    pmovmskb    esi, xmm6

    ; merge the four 16-bit masks into two 32-bit halves
    shl         ecx, 16
    or          eax, ecx
    shl         esi, 16
    or          edx, esi

    ; the compares flagged zero words; invert to flag nonzero words
    not         eax
    not         edx

    ; VALUES (edi) is no longer needed, so reuse it for the output pointer
    mov         edi, ZEROBITS

    mov         INT [edi], eax
    mov         INT [edi+SIZEOF_INT], edx
%endmacro
|
||||
|
||||
;
|
||||
; Prepare data for jsimd_encode_mcu_AC_refine().
|
||||
;
|
||||
; GLOBAL(int)
|
||||
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
|
||||
; const int *jpeg_natural_order_start,
|
||||
; int Sl, int Al, JCOEF *absvalues,
|
||||
; size_t *bits)
|
||||
;
|
||||
; eax + 8 = const JCOEF *block
|
||||
; eax + 12 = const int *jpeg_natural_order_start
|
||||
; eax + 16 = int Sl
|
||||
; eax + 20 = int Al
|
||||
; eax + 24 = JCOEF *values
|
||||
; eax + 28 = size_t *bits
|
||||
|
||||
%define ZERO xmm7
|
||||
%define ONE xmm5
|
||||
%define X0 xmm0
|
||||
%define X1 xmm1
|
||||
%define N0 xmm2
|
||||
%define N1 xmm3
|
||||
%define AL xmm4
|
||||
%define K eax
|
||||
%define LENEND eax
|
||||
%define LUT ebx
|
||||
%define T0 ecx
|
||||
%define T0w cx
|
||||
%define T1 edx
|
||||
%define BLOCK esi
|
||||
%define VALUES edi
|
||||
%define KK ebp
|
||||
|
||||
%define ZEROBITS INT [esp + 5 * 4]
|
||||
%define EOB INT [esp + 5 * 4 + 4]
|
||||
%define LEN INT [esp + 5 * 4 + 8]
|
||||
|
||||
; jsimd_encode_mcu_AC_refine_prepare_sse2() -- 32-bit SSE2 implementation.
;
; For the first Sl coefficients of the block, taken in the order given by
; jpeg_natural_order_start, this routine:
;   * stores (|coef| >> Al) into absvalues[], zero-padding the buffer up to
;     DCTSIZE2 entries;
;   * stores one inverted 16-bit sign mask per group of 16 coefficients at
;     bits + 2 * SIZEOF_INT (a set bit means "not negative"); both sign words
;     are preset to all ones, so untouched groups read as non-negative;
;   * lets REDUCE0 derive the remaining bitmap from absvalues[];
;   * returns in eax the index (relative to the scan start) of the last
;     coefficient whose shifted absolute value equals 1, or 0 if there is none.
;
; The coefficients are handled in groups of 16, followed by one tail group of
; 15..9, 8 or 7..1 coefficients.  When Sl is a multiple of 16 there is no tail
; at all: the tail code must then be skipped, otherwise an 8-wide group would
; be processed for a coefficient that lies outside the scan, VALUES would end
; up 8 entries too far, the zero padding would run past the 64th entry and
; REDUCE0 would read a shifted window of the buffer.

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
    push        ebp
    mov         eax, esp                     ; eax = original ebp
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax
    mov         ebp, esp                     ; ebp = aligned ebp
    sub         esp, 16                      ; room for ZEROBITS / EOB / LEN
    push        ebx
    push        ecx
;   push        edx                          ; need not be preserved
    push        esi
    push        edi
    push        ebp                          ; ebp is reused as KK below

    pcmpeqw     ONE, ONE
    psrlw       ONE, 15                      ; every word lane = 1
    mov         BLOCK, INT [eax + 8]
    mov         LUT, INT [eax + 12]
    mov         VALUES, INT [eax + 24]
    movd        AL, INT [eax + 20]
    mov         T0, INT [eax + 28]           ; T0 = bits
    mov         K, INT [eax + 16]            ; K = Sl (eax no longer addresses the args)
    mov         INT [T0 + 2 * SIZEOF_INT], -1  ; preset both sign words:
    mov         INT [T0 + 3 * SIZEOF_INT], -1  ; "not negative" everywhere
    mov         ZEROBITS, T0
    mov         LEN, K
    pxor        ZERO, ZERO
    and         K, -16
    mov         EOB, 0
    xor         KK, KK
    shr         K, 4                         ; K = number of full groups of 16
    jz          .ELOOPR16

.BLOOPR16:
    LOAD16
    pcmpgtw     N0, X0                       ; N = 0xFFFF where the coefficient < 0
    pcmpgtw     N1, X1
    paddw       X0, N0                       ; (x + N) ^ N = |x|
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL                       ; |x| >> Al
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE                      ; lanes whose shifted value == 1
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0, N0                       ; 16 sign bits of this group
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0                       ; 16 "== 1" bits of this group
    bsr         T1, T1                       ; index of the highest set bit
    jz          .CONTINUER16                 ; no lane == 1: keep the old EOB
    lea         T1, [T1+KK*8]
    mov         EOB, T1                      ; EOB = k + idx
.CONTINUER16:
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    add         KK, 2
    dec         K
    jnz         .BLOOPR16

.ELOOPR16:
    mov         LENEND, LEN

    test        LENEND, 8
    jz          .TRYR7
    test        LENEND, 7
    jz          .TRYR8

    ; Tail of 9..15 coefficients.
    and         LENEND, 7
    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0, N0                       ; sign bits of the tail group
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0                       ; "== 1" bits of the tail group
    bsr         T1, T1
    jz          .CONTINUER15
    lea         T1, [T1+KK*8]
    mov         EOB, T1                      ; EOB = k + idx
.CONTINUER15:
    add         VALUES, 16*2
    jmp         .PADDINGR

.TRYR8:
    ; Tail of exactly 8 coefficients.
    LOAD8

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO                     ; upper 8 bytes = 0
    packsswb    X0, ZERO
    pmovmskb    T0, N0
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0
    bsr         T1, T1
    jz          .CONTINUER8
    lea         T1, [T1+KK*8]
    mov         EOB, T1                      ; EOB = k + idx
.CONTINUER8:
    add         VALUES, 8*2
    jmp         .PADDINGR

.TRYR7:
    and         LENEND, 7
    jz          .PADDINGR                    ; Sl is a multiple of 16: no tail
    ; Tail of 1..7 coefficients.
    LOAD7

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO
    packsswb    X0, ZERO
    pmovmskb    T0, N0
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0
    bsr         T1, T1
    jz          .CONTINUER7
    lea         T1, [T1+KK*8]
    mov         EOB, T1                      ; EOB = k + idx
.CONTINUER7:
    add         VALUES, 8*2

.PADDINGR:
    ; K = (number of 8-entry rows already written) - 8, i.e. minus the number
    ; of rows that still have to be cleared.
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDINGR
    align       16
.ZEROLOOPR:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOPR
.EPADDINGR:
    sub         VALUES, DCTSIZE2*2           ; rewind to the start of absvalues

    REDUCE0

    mov         eax, EOB                     ; return value

    pop         ebp
    pop         edi
    pop         esi
;   pop         edx                          ; need not be preserved
    pop         ecx
    pop         ebx
    mov         esp, ebp                     ; esp <- aligned ebp
    pop         esp                          ; esp <- original ebp
    pop         ebp
    ret
|
||||
|
||||
; Release every alias defined for the 32-bit
; jsimd_encode_mcu_AC_refine_prepare_sse2(): the list mirrors the %define
; block of that function one-to-one (T0w and ZEROBITS included; there is no
; SIGN alias in the 32-bit version).
%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LENEND
%undef LUT
%undef T0
%undef T0w
%undef T1
%undef BLOCK
%undef VALUES
%undef KK
%undef ZEROBITS
%undef EOB
%undef LEN
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
@ -3,7 +3,7 @@
|
||||
*
|
||||
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
* Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander.
|
||||
* Copyright (C) 2015, Matthieu Darbois.
|
||||
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
|
||||
*
|
||||
* Based on the x86 SIMD extension for IJG JPEG library,
|
||||
* Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@ -21,6 +21,7 @@
|
||||
#include "../../jdct.h"
|
||||
#include "../../jsimddct.h"
|
||||
#include "../jsimd.h"
|
||||
#include "jconfigint.h"
|
||||
|
||||
/*
|
||||
* In the PIC cases, we have no guarantee that constants will keep
|
||||
@ -1197,3 +1198,35 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
|
||||
return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
|
||||
dctbl, actbl);
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_refine_prepare(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
if (SIZEOF_SIZE_T != 4)
|
||||
return 0;
|
||||
if (!(simd_support & JSIMD_SSE2))
|
||||
return 0;
|
||||
#if defined(HAVE_BUILTIN_CTZL)
|
||||
return 1;
|
||||
#elif defined(HAVE_BITSCANFORWARD)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
|
||||
const int *jpeg_natural_order_start, int Sl,
|
||||
int Al, JCOEF *absvalues, size_t *bits)
|
||||
{
|
||||
return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
|
||||
jpeg_natural_order_start,
|
||||
Sl, Al, absvalues, bits);
|
||||
}
|
||||
|
@ -5,7 +5,7 @@
|
||||
* Copyright (C) 2011, 2014-2016, 2018, D. R. Commander.
|
||||
* Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
|
||||
* Copyright (C) 2014, Linaro Limited.
|
||||
* Copyright (C) 2015-2016, Matthieu Darbois.
|
||||
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
|
||||
* Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
|
||||
*
|
||||
* Based on the x86 SIMD extension for IJG JPEG library,
|
||||
@ -1072,3 +1072,8 @@ EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon
|
||||
EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl
|
||||
(void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
|
||||
c_derived_tbl *dctbl, c_derived_tbl *actbl);
|
||||
|
||||
/* Progressive Huffman encoding: SSE2 data preparation for AC refinement
 * scans.  The routine is implemented in assembly (jcphuff-sse2.asm) and
 * returns an int that the jsimd_encode_mcu_AC_refine_prepare() wrappers pass
 * straight back to their caller. */
|
||||
EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2
|
||||
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
|
||||
JCOEF *absvalues, size_t *bits);
|
||||
|
@ -4,7 +4,7 @@
|
||||
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
* Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
|
||||
* Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
|
||||
* Copyright (C) 2015, Matthieu Darbois.
|
||||
* Copyright (C) 2015, 2018, Matthieu Darbois.
|
||||
* Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
|
||||
*
|
||||
* Based on the x86 SIMD extension for IJG JPEG library,
|
||||
@ -581,3 +581,17 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
 * Stub: this file provides no accelerated AC-refinement preparation, so the
 * capability query always answers 0.
 */
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_refine_prepare(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Stub: ignores every argument, writes nothing and returns 0.
 * NOTE(review): presumably never reached, because
 * jsimd_can_encode_mcu_AC_refine_prepare() in this file returns 0 -- confirm
 * against the caller.
 */
GLOBAL(int)
|
||||
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
|
||||
const int *jpeg_natural_order_start, int Sl,
|
||||
int Al, JCOEF *absvalues, size_t *bits)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
@ -4,7 +4,7 @@
|
||||
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
* Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
|
||||
* Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
|
||||
* Copyright (C) 2015, Matthieu Darbois.
|
||||
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
|
||||
*
|
||||
* Based on the x86 SIMD extension for IJG JPEG library,
|
||||
* Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@ -1086,3 +1086,17 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
 * Stub: this file provides no accelerated AC-refinement preparation, so the
 * capability query always answers 0.
 */
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_refine_prepare(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Stub: ignores every argument, writes nothing and returns 0.
 * NOTE(review): presumably never reached, because
 * jsimd_can_encode_mcu_AC_refine_prepare() in this file returns 0 -- confirm
 * against the caller.
 */
GLOBAL(int)
|
||||
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
|
||||
const int *jpeg_natural_order_start, int Sl,
|
||||
int Al, JCOEF *absvalues, size_t *bits)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
@ -3,7 +3,7 @@
|
||||
*
|
||||
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
* Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander.
|
||||
* Copyright (C) 2015, Matthieu Darbois.
|
||||
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
|
||||
*
|
||||
* Based on the x86 SIMD extension for IJG JPEG library,
|
||||
* Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@ -843,3 +843,17 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
 * Stub: this file provides no accelerated AC-refinement preparation, so the
 * capability query always answers 0.
 */
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_refine_prepare(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Stub: ignores every argument, writes nothing and returns 0.
 * NOTE(review): presumably never reached, because
 * jsimd_can_encode_mcu_AC_refine_prepare() in this file returns 0 -- confirm
 * against the caller.
 */
GLOBAL(int)
|
||||
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
|
||||
const int *jpeg_natural_order_start, int Sl,
|
||||
int Al, JCOEF *absvalues, size_t *bits)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
474
simd/x86_64/jcphuff-sse2.asm
Normal file
474
simd/x86_64/jcphuff-sse2.asm
Normal file
@ -0,0 +1,474 @@
|
||||
;
|
||||
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
|
||||
; (64-bit SSE2)
|
||||
;
|
||||
; Copyright (C) 2016, 2018, Matthieu Darbois
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains an SSE2 implementation of data preparation for progressive
|
||||
; Huffman encoding. See jcphuff.c for more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Macros to load data for jsimd_encode_mcu_AC_refine_prepare_sse2()
|
||||
|
||||
; LOAD16: gather 16 coefficients into X0 (lanes 0-7 <- LUT[0..7]) and
; X1 (lanes 0-7 <- LUT[8..15]).  Each LUT entry is an index into BLOCK.
; N0 and N1 are cleared so that the caller can build sign masks in them.
; Clobbers T0 and T1.
%macro LOAD16 0
    pxor        N0, N0
    pxor        N1, N1

%assign LOAD_LANE 0
%rep 8
    mov         T0d, INT [LUT + (LOAD_LANE + 0)*SIZEOF_INT]
    mov         T1d, INT [LUT + (LOAD_LANE + 8)*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], LOAD_LANE
    pinsrw      X1, word [BLOCK + T1 * 2], LOAD_LANE
%assign LOAD_LANE LOAD_LANE + 1
%endrep
%undef LOAD_LANE
%endmacro
|
||||
|
||||
; LOAD15: gather a tail of 9..15 coefficients.  X0 receives LUT[0..7]
; unconditionally; X1 is cleared, receives LUT[8] unconditionally and then
; LUT[9..14] for as long as LENEND (compared against 2..7) says more
; coefficients remain.  N0 and N1 are cleared.  Clobbers T0 and T1.
%macro LOAD15 0
    pxor        N0, N0
    pxor        N1, N1
    pxor        X1, X1

    mov         T0d, INT [LUT + 0*SIZEOF_INT]
    mov         T1d, INT [LUT + 8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    ; Remaining seven lanes of X0.
%assign LOAD_LANE 1
%rep 7
    mov         T0d, INT [LUT + LOAD_LANE*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], LOAD_LANE
%assign LOAD_LANE LOAD_LANE + 1
%endrep

    ; Lanes 1..6 of X1, each guarded by the residual length.
%assign LOAD_LANE 1
%rep 6
    cmp         LENEND, LOAD_LANE + 1
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + (LOAD_LANE + 8)*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], LOAD_LANE
%assign LOAD_LANE LOAD_LANE + 1
%endrep
%%.ELOAD15:
%undef LOAD_LANE
%endmacro
|
||||
|
||||
; LOAD8: gather exactly 8 coefficients (LUT[0..7]) into the lanes of X0 and
; clear N0.  Clobbers T0.
%macro LOAD8 0
    pxor        N0, N0

%assign LOAD_LANE 0
%rep 8
    mov         T0d, INT [LUT + LOAD_LANE*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], LOAD_LANE
%assign LOAD_LANE LOAD_LANE + 1
%endrep
%undef LOAD_LANE
%endmacro
|
||||
|
||||
; LOAD7: gather a tail of up to 7 coefficients into X0.  X0 and N0 are
; cleared first; lane 0 is loaded unconditionally from LUT[0], lanes 1..6 are
; loaded from LUT[1..6] for as long as LENEND (compared against 2..7) says
; more coefficients remain.  Clobbers T1.
%macro LOAD7 0
    pxor        N0, N0
    pxor        X0, X0

    mov         T1d, INT [LUT + 0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 0

%assign LOAD_LANE 1
%rep 6
    cmp         LENEND, LOAD_LANE + 1
    jl          %%.ELOAD7
    mov         T1d, INT [LUT + LOAD_LANE*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], LOAD_LANE
%assign LOAD_LANE LOAD_LANE + 1
%endrep
%%.ELOAD7:
%undef LOAD_LANE
%endmacro
|
||||
|
||||
; REDUCE0: build a 64-bit bitmap with one bit per entry of the VALUES buffer
; (bit i set <=> entry i is non-zero) and store it at [r15].
; Overwrites xmm0-xmm7 and rax/rcx/rdx/rsi.  ZERO must still hold all zeroes
; and must not be one of xmm0-xmm7; r15 is used directly, not via an alias.
%macro REDUCE0 0
|
||||
; Load all 64 16-bit entries (movdqa: VALUES has to be 16-byte aligned).
movdqa xmm0, XMMWORD [VALUES + ( 0*2)]
|
||||
movdqa xmm1, XMMWORD [VALUES + ( 8*2)]
|
||||
movdqa xmm2, XMMWORD [VALUES + (16*2)]
|
||||
movdqa xmm3, XMMWORD [VALUES + (24*2)]
|
||||
movdqa xmm4, XMMWORD [VALUES + (32*2)]
|
||||
movdqa xmm5, XMMWORD [VALUES + (40*2)]
|
||||
movdqa xmm6, XMMWORD [VALUES + (48*2)]
|
||||
movdqa xmm7, XMMWORD [VALUES + (56*2)]
|
||||
|
||||
; Each lane becomes 0xFFFF where the entry is zero, 0 otherwise.
pcmpeqw xmm0, ZERO
|
||||
pcmpeqw xmm1, ZERO
|
||||
pcmpeqw xmm2, ZERO
|
||||
pcmpeqw xmm3, ZERO
|
||||
pcmpeqw xmm4, ZERO
|
||||
pcmpeqw xmm5, ZERO
|
||||
pcmpeqw xmm6, ZERO
|
||||
pcmpeqw xmm7, ZERO
|
||||
|
||||
; Narrow the word masks to byte masks, 16 entries per register.
packsswb xmm0, xmm1
|
||||
packsswb xmm2, xmm3
|
||||
packsswb xmm4, xmm5
|
||||
packsswb xmm6, xmm7
|
||||
|
||||
; One bit per entry: 16 "is zero" bits in each general-purpose register.
pmovmskb eax, xmm0
|
||||
pmovmskb ecx, xmm2
|
||||
pmovmskb edx, xmm4
|
||||
pmovmskb esi, xmm6
|
||||
|
||||
; Move each 16-bit chunk to its position in the 64-bit word.
shl rcx, 16
|
||||
shl rdx, 32
|
||||
shl rsi, 48
|
||||
|
||||
or rax, rcx
|
||||
or rdx, rsi
|
||||
or rax, rdx
|
||||
|
||||
; Invert: from "is zero" to "is non-zero".
not rax
|
||||
|
||||
mov MMWORD [r15], rax
|
||||
%endmacro
|
||||
|
||||
;
|
||||
; Prepare data for jsimd_encode_mcu_AC_refine().
|
||||
;
|
||||
; GLOBAL(int)
|
||||
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
|
||||
; const int *jpeg_natural_order_start,
|
||||
; int Sl, int Al, JCOEF *absvalues,
|
||||
; size_t *bits)
|
||||
;
|
||||
; r10 = const JCOEF *block
|
||||
; r11 = const int *jpeg_natural_order_start
|
||||
; r12 = int Sl
|
||||
; r13 = int Al
|
||||
; r14 = JCOEF *absvalues
|
||||
; r15 = size_t *bits
|
||||
|
||||
; Register aliases used by the 64-bit
; jsimd_encode_mcu_AC_refine_prepare_sse2() that follows.
; ZERO: all-zero vector; the function saves and restores xmm9 around its use.
%define ZERO xmm9
|
||||
; ONE: every 16-bit lane holds 1 (built with pcmpeqw + psrlw 15).
%define ONE xmm5
|
||||
; X0/X1: the two 8-coefficient working vectors of the current group.
%define X0 xmm0
|
||||
%define X1 xmm1
|
||||
; N0/N1: per-lane sign masks of X0/X1 (cleared by the LOAD* macros, then
; filled by pcmpgtw).
%define N0 xmm2
|
||||
%define N1 xmm3
|
||||
; AL: shift count, copied from the Al argument (r13d) with movd.
%define AL xmm4
|
||||
; K: loop counter (16-coefficient groups, later the zero-padding counter).
%define K eax
|
||||
; KK: coefficient index of the current group (+16 per iteration).
%define KK r9d
|
||||
; EOB: running return value.
%define EOB r8d
|
||||
; SIGN: 64-bit accumulator of sign masks; new bits enter at the top and are
; shifted down as further groups are processed.
%define SIGN rdi
|
||||
; LUT: cursor into jpeg_natural_order_start (advanced 16 ints per group).
%define LUT r11
|
||||
; T0/T1 (and their 32-bit halves T0d/T1d): scratch registers.
%define T0 rcx
|
||||
%define T0d ecx
|
||||
|
||||
%define T1 rdx
|
||||
%define T1d edx
|
||||
|
||||
; BLOCK: the coefficient block argument.
%define BLOCK r10
|
||||
; VALUES: write cursor into the absvalues output buffer.
%define VALUES r14
|
||||
; LEN: the Sl argument.
%define LEN r12d
|
||||
; LENEND: Sl & 7.  Reuses r13d, so Al must be copied to AL before it is set.
%define LENEND r13d
|
||||
|
||||
; jsimd_encode_mcu_AC_refine_prepare_sse2() -- 64-bit SSE2 implementation.
;
; For the first Sl coefficients of the block, taken in the order given by
; jpeg_natural_order_start, this routine:
;   * stores (|coef| >> Al) into absvalues[], zero-padding the buffer up to
;     DCTSIZE2 entries;
;   * stores at bits[0] (via REDUCE0) a 64-bit map with bit i set when
;     absvalues[i] is non-zero;
;   * stores at bits[1] the inverted sign map (bit i clear when coefficient i
;     is negative, set otherwise, padding included);
;   * returns in eax the index (relative to the scan start) of the last
;     coefficient whose shifted absolute value equals 1, or 0 if there is none.
;
; The coefficients are handled in groups of 16, followed by one tail group of
; 15..9, 8 or 7..1 coefficients.  When Sl is a multiple of 16 there is no tail
; at all: the tail code must then be skipped, otherwise an 8-wide group would
; be processed for a coefficient that lies outside the scan, SIGN would be
; shifted 8 bits too far, VALUES would end up 8 entries too far, the zero
; padding would run past the 64th entry and REDUCE0 would read a shifted
; window of the buffer.

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [rbp - 16]              ; 16-byte slot for the saved ZERO
    collect_args 6

    movdqa      XMMWORD [rbp - 16], ZERO     ; preserve the caller's xmm9

    xor         SIGN, SIGN
    xor         EOB, EOB
    xor         KK, KK
    movd        AL, r13d                     ; Al, before r13d becomes LENEND
    pxor        ZERO, ZERO
    pcmpeqw     ONE, ONE
    psrlw       ONE, 15                      ; every word lane = 1
    mov         K, LEN
    mov         LENEND, LEN
    and         K, -16
    and         LENEND, 7
    shr         K, 4                         ; K = number of full groups of 16
    jz          .ELOOPR16

.BLOOPR16:
    LOAD16
    pcmpgtw     N0, X0                       ; N = 0xFFFF where the coefficient < 0
    pcmpgtw     N1, X1
    paddw       X0, N0                       ; (x + N) ^ N = |x|
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL                       ; |x| >> Al
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE                      ; lanes whose shifted value == 1
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0d, N0                      ; 16 sign bits of this group
    pmovmskb    T1d, X0                      ; 16 "== 1" bits of this group
    shr         SIGN, 16                     ; make room for 16 new sign bits
    shl         T0, 48
    or          SIGN, T0
    bsr         T1d, T1d                     ; index of the highest set bit
    jz          .CONTINUER16                 ; no lane == 1: keep the old EOB
    mov         EOB, KK
    add         EOB, T1d                     ; EOB = k + idx
.CONTINUER16:
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    add         KK, 16
    dec         K
    jnz         .BLOOPR16

.ELOOPR16:
    test        LEN, 8
    jz          .TRYR7
    test        LEN, 7
    jz          .TRYR8

    ; Tail of 9..15 coefficients.
    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0d, N0                      ; sign bits of the tail group
    pmovmskb    T1d, X0                      ; "== 1" bits of the tail group
    shr         SIGN, 16                     ; make room for 16 new sign bits
    shl         T0, 48
    or          SIGN, T0
    bsr         T1d, T1d
    jz          .CONTINUER15
    mov         EOB, KK
    add         EOB, T1d                     ; EOB = k + idx
.CONTINUER15:
    add         VALUES, 16*2
    jmp         .PADDINGR

.TRYR8:
    ; Tail of exactly 8 coefficients.
    LOAD8

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO                     ; upper 8 bytes = 0
    packsswb    X0, ZERO
    pmovmskb    T0d, N0
    pmovmskb    T1d, X0
    shr         SIGN, 8                      ; make room for 8 new sign bits
    shl         T0, 56
    or          SIGN, T0
    bsr         T1d, T1d
    jz          .CONTINUER8
    mov         EOB, KK
    add         EOB, T1d                     ; EOB = k + idx
.CONTINUER8:
    add         VALUES, 8*2
    jmp         .PADDINGR

.TRYR7:
    test        LEN, 7
    jz          .PADDINGR                    ; Sl is a multiple of 16: no tail
    ; Tail of 1..7 coefficients.
    LOAD7

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO
    packsswb    X0, ZERO
    pmovmskb    T0d, N0
    pmovmskb    T1d, X0
    shr         SIGN, 8                      ; make room for 8 new sign bits
    shl         T0, 56
    or          SIGN, T0
    bsr         T1d, T1d
    jz          .CONTINUER7
    mov         EOB, KK
    add         EOB, T1d                     ; EOB = k + idx
.CONTINUER7:
    add         VALUES, 8*2

.PADDINGR:
    ; K = (number of 8-entry rows already written) - 8, i.e. minus the number
    ; of rows that still have to be cleared.
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDINGR
    align       16
.ZEROLOOPR:
    movdqa      XMMWORD [VALUES + 0], ZERO
    shr         SIGN, 8                      ; keep SIGN aligned with VALUES
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOPR
.EPADDINGR:
    not         SIGN                         ; set bit = "not negative"
    sub         VALUES, DCTSIZE2*2           ; rewind to the start of absvalues
    mov         MMWORD [r15+SIZEOF_MMWORD], SIGN

    REDUCE0

    mov         eax, EOB                     ; return value
    movdqa      ZERO, XMMWORD [rbp - 16]     ; restore the caller's xmm9
    uncollect_args 6
    mov         rsp, rbp                     ; rsp <- aligned rbp
    pop         rsp                          ; rsp <- original rbp
    pop         rbp
    ret
|
||||
|
||||
; Release every alias defined for the 64-bit
; jsimd_encode_mcu_AC_refine_prepare_sse2() so that the names cannot leak
; into code assembled after this point.
%undef ZERO
|
||||
%undef ONE
|
||||
%undef X0
|
||||
%undef X1
|
||||
%undef N0
|
||||
%undef N1
|
||||
%undef AL
|
||||
%undef K
|
||||
%undef KK
|
||||
%undef EOB
|
||||
%undef SIGN
|
||||
%undef LUT
|
||||
%undef T0
|
||||
%undef T0d
|
||||
%undef T1
|
||||
%undef T1d
|
||||
%undef BLOCK
|
||||
%undef VALUES
|
||||
%undef LEN
|
||||
%undef LENEND
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
@ -3,7 +3,7 @@
|
||||
*
|
||||
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
* Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
|
||||
* Copyright (C) 2015, Matthieu Darbois.
|
||||
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
|
||||
*
|
||||
* Based on the x86 SIMD extension for IJG JPEG library,
|
||||
* Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@ -21,6 +21,7 @@
|
||||
#include "../../jdct.h"
|
||||
#include "../../jsimddct.h"
|
||||
#include "../jsimd.h"
|
||||
#include "jconfigint.h"
|
||||
|
||||
/*
|
||||
* In the PIC cases, we have no guarantee that constants will keep
|
||||
@ -1020,3 +1021,35 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
|
||||
return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
|
||||
dctbl, actbl);
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_encode_mcu_AC_refine_prepare(void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
if (SIZEOF_SIZE_T != 8)
|
||||
return 0;
|
||||
if (!(simd_support & JSIMD_SSE2))
|
||||
return 0;
|
||||
#if defined(HAVE_BUILTIN_CTZL)
|
||||
return 1;
|
||||
#elif defined(HAVE_BITSCANFORWARD64)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
|
||||
const int *jpeg_natural_order_start, int Sl,
|
||||
int Al, JCOEF *absvalues, size_t *bits)
|
||||
{
|
||||
return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
|
||||
jpeg_natural_order_start,
|
||||
Sl, Al, absvalues, bits);
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user