C/SSE2 optimization of encode_mcu_AC_refine()

This commit adds C and SSE2 optimizations for the encode_mcu_AC_refine()
function used in progressive Huffman encoding.

The image used for testing can be retrieved from this page:
https://blog.cloudflare.com/doubling-the-speed-of-jpegtran

All timings done on `Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz`
clang version is `Apple LLVM version 9.0.0 (clang-900.0.39.2)`
gcc-5 version is `gcc-5 (Homebrew GCC 5.5.0) 5.5.0`
gcc-7 version is `gcc-7 (Homebrew GCC 7.2.0) 7.2.0`

Here are the results in comparison to libjpeg-turbo@3c54642 using
`time ./jpegtran -outfile /dev/null -progressive -optimise -copy none print_poster_0025.jpg`

C
clang x86_64: +7%
gcc-5 x86_64: +30%
gcc-7 x86_64: +33%
clang i386: +0%
gcc-5 i386: +24%
gcc-7 i386: +23%

SSE2
clang x86_64: +42%
gcc-5 x86_64: +53%
gcc-7 x86_64: +64%
clang i386: +35%
gcc-5 i386: +46%
gcc-7 i386: +49%

Discussion in libjpeg-turbo/libjpeg-turbo#46
This commit is contained in:
mayeut 2018-03-02 22:33:19 +01:00 committed by DRC
parent 81baa1b501
commit 16bd984557
16 changed files with 1384 additions and 93 deletions

View File

@ -341,10 +341,19 @@ set(EFFECTIVE_LD_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS_${CMA
message(STATUS "Linker flags = ${EFFECTIVE_LD_FLAGS}")
include(CheckCSourceCompiles)
include(CheckIncludeFiles)
include(CheckTypeSize)
check_type_size("size_t" SIZE_T)
check_type_size("unsigned long" UNSIGNED_LONG)
if(SIZE_T EQUAL UNSIGNED_LONG)
check_c_source_compiles("int main(int argc, char **argv) { unsigned long a = argc; return __builtin_ctzl(a); }"
HAVE_BUILTIN_CTZL)
endif()
if(UNIX)
# Check for headers
include(CheckIncludeFiles)
check_include_files(locale.h HAVE_LOCALE_H)
check_include_files(stddef.h HAVE_STDDEF_H)
check_include_files(stdlib.h HAVE_STDLIB_H)
@ -359,10 +368,8 @@ if(UNIX)
endif()
# Check for types
include(CheckTypeSize)
check_type_size("unsigned char" UNSIGNED_CHAR)
check_type_size("unsigned short" UNSIGNED_SHORT)
check_type_size("size_t" SIZE_T)
# Check for compiler features
check_c_source_compiles("int main(void) { typedef struct undefined_structure *undef_struct_ptr; }"
@ -408,6 +415,7 @@ if(UNIX)
endif()
if(MSVC)
check_include_files("intrin.h" HAVE_INTRIN_H)
set(INLINE_OPTIONS "__inline;inline")
else()
set(INLINE_OPTIONS "__inline__;inline")

View File

@ -13,9 +13,19 @@
/* Version number of package */
#define VERSION "@VERSION@"
#ifndef _WIN32
/* The size of `size_t', as computed by sizeof. */
#define SIZEOF_SIZE_T @SIZE_T@
/* Define if your compiler has __builtin_ctzl() and sizeof(unsigned long) == sizeof(size_t). */
#cmakedefine HAVE_BUILTIN_CTZL
/* Define to 1 if you have the <intrin.h> header file. */
#cmakedefine HAVE_INTRIN_H
#if defined(_MSC_VER) && defined(HAVE_INTRIN_H)
#if (SIZEOF_SIZE_T == 8)
#define HAVE_BITSCANFORWARD64
#elif (SIZEOF_SIZE_T == 4)
#define HAVE_BITSCANFORWARD
#endif
#endif

290
jcphuff.c
View File

@ -5,6 +5,7 @@
* Copyright (C) 1995-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2011, 2015, 2018, D. R. Commander.
* Copyright (C) 2016, 2018, Matthieu Darbois.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@ -18,9 +19,22 @@
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jchuff.h" /* Declarations shared with jchuff.c */
#include "jsimd.h"
#include "jconfigint.h"
#include <limits.h>
#ifdef HAVE_INTRIN_H
#include <intrin.h>
#ifdef _MSC_VER
#ifdef HAVE_BITSCANFORWARD64
#pragma intrinsic(_BitScanForward64)
#endif
#ifdef HAVE_BITSCANFORWARD
#pragma intrinsic(_BitScanForward)
#endif
#endif
#endif
#ifdef C_PROGRESSIVE_SUPPORTED
/*
@ -59,6 +73,11 @@
typedef struct {
struct jpeg_entropy_encoder pub; /* public fields */
/* Pointer to routine to prepare data for encode_mcu_AC_refine() */
int (*AC_refine_prepare) (const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *absvalues, size_t *bits);
/* Mode flag: TRUE for optimization, FALSE for actual data output */
boolean gather_statistics;
@ -120,6 +139,8 @@ typedef phuff_entropy_encoder *phuff_entropy_ptr;
#define IRIGHT_SHIFT(x, shft) ((x) >> (shft))
#endif
#define PAD(v, p) ((v + (p) - 1) & (~((p) - 1)))
/* Forward declarations */
METHODDEF(boolean) encode_mcu_DC_first(j_compress_ptr cinfo,
JBLOCKROW *MCU_data);
@ -127,12 +148,41 @@ METHODDEF(boolean) encode_mcu_AC_first(j_compress_ptr cinfo,
JBLOCKROW *MCU_data);
METHODDEF(boolean) encode_mcu_DC_refine(j_compress_ptr cinfo,
JBLOCKROW *MCU_data);
METHODDEF(int) encode_mcu_AC_refine_prepare
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
JCOEF *absvalues, size_t *bits);
METHODDEF(boolean) encode_mcu_AC_refine(j_compress_ptr cinfo,
JBLOCKROW *MCU_data);
METHODDEF(void) finish_pass_phuff(j_compress_ptr cinfo);
METHODDEF(void) finish_pass_gather_phuff(j_compress_ptr cinfo);
/* Count bit loop zeroes */
INLINE
METHODDEF(int)
count_zeroes(size_t *x)
{
int result;
#if defined(HAVE_BUILTIN_CTZL)
result = __builtin_ctzl(*x);
*x >>= result;
#elif defined(HAVE_BITSCANFORWARD64)
_BitScanForward64(&result, *x);
*x >>= result;
#elif defined(HAVE_BITSCANFORWARD)
_BitScanForward(&result, *x);
*x >>= result;
#else
result = 0;
while ((*x & 1) == 0) {
++result;
*x >>= 1;
}
#endif
return result;
}
/*
* Initialize for a Huffman-compressed scan using progressive JPEG.
*/
@ -163,6 +213,10 @@ start_pass_phuff(j_compress_ptr cinfo, boolean gather_statistics)
entropy->pub.encode_mcu = encode_mcu_DC_refine;
else {
entropy->pub.encode_mcu = encode_mcu_AC_refine;
if (jsimd_can_encode_mcu_AC_refine_prepare())
entropy->AC_refine_prepare = jsimd_encode_mcu_AC_refine_prepare;
else
entropy->AC_refine_prepare = encode_mcu_AC_refine_prepare;
/* AC refinement needs a correction bit buffer */
if (entropy->bit_buffer == NULL)
entropy->bit_buffer = (char *)
@ -637,23 +691,149 @@ encode_mcu_DC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
}
/*
* Data preparation for encode_mcu_AC_refine().
*/
#define COMPUTE_ABSVALUES_AC_REFINE(Sl, koffset) { \
/* It is convenient to make a pre-pass to determine the transformed \
* coefficients' absolute values and the EOB position. \
*/ \
for (k = 0; k < Sl; k++) { \
temp = block[jpeg_natural_order_start[k]]; \
/* We must apply the point transform by Al. For AC coefficients this \
* is an integer division with rounding towards 0. To do this portably \
* in C, we shift after obtaining the absolute value. \
*/ \
temp2 = temp >> (CHAR_BIT * sizeof(int) - 1); \
temp ^= temp2; \
temp -= temp2; /* temp is abs value of input */ \
temp >>= Al; /* apply the point transform */ \
if (temp != 0) { \
zerobits |= ((size_t)1U) << k; \
signbits |= ((size_t)(temp2 + 1)) << k; \
} \
absvalues[k] = (JCOEF)temp; /* save abs value for main pass */ \
if (temp == 1) \
EOB = k + koffset; /* EOB = index of last newly-nonzero coef */ \
} \
}
METHODDEF(int)
encode_mcu_AC_refine_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *absvalues, size_t *bits)
{
register int k, temp, temp2;
int EOB = 0;
size_t zerobits = 0U, signbits = 0U;
int Sl0 = Sl;
#if SIZEOF_SIZE_T == 4
if (Sl0 > 32)
Sl0 = 32;
#endif
COMPUTE_ABSVALUES_AC_REFINE(Sl0, 0);
bits[0] = zerobits;
#if SIZEOF_SIZE_T == 8
bits[1] = signbits;
#else
bits[2] = signbits;
zerobits = 0U;
signbits = 0U;
if (Sl > 32) {
Sl -= 32;
jpeg_natural_order_start += 32;
absvalues += 32;
COMPUTE_ABSVALUES_AC_REFINE(Sl, 32);
}
bits[1] = zerobits;
bits[3] = signbits;
#endif
return EOB;
}
/*
* MCU encoding for AC successive approximation refinement scan.
*/
#define ENCODE_COEFS_AC_REFINE(label) { \
while (zerobits) { \
int idx = count_zeroes(&zerobits); \
r += idx; \
cabsvalue += idx; \
signbits >>= idx; \
label \
/* Emit any required ZRLs, but not if they can be folded into EOB */ \
while (r > 15 && (cabsvalue <= EOBPTR)) { \
/* emit any pending EOBRUN and the BE correction bits */ \
emit_eobrun(entropy); \
/* Emit ZRL */ \
emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); \
r -= 16; \
/* Emit buffered correction bits that must be associated with ZRL */ \
emit_buffered_bits(entropy, BR_buffer, BR); \
BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ \
BR = 0; \
} \
\
temp = *cabsvalue++; \
\
/* If the coef was previously nonzero, it only needs a correction bit. \
* NOTE: a straight translation of the spec's figure G.7 would suggest \
* that we also need to test r > 15. But if r > 15, we can only get here \
* if k > EOB, which implies that this coefficient is not 1. \
*/ \
if (temp > 1) { \
/* The correction bit is the next bit of the absolute value. */ \
BR_buffer[BR++] = (char)(temp & 1); \
signbits >>= 1; \
zerobits >>= 1; \
continue; \
} \
\
/* Emit any pending EOBRUN and the BE correction bits */ \
emit_eobrun(entropy); \
\
/* Count/emit Huffman symbol for run length / number of bits */ \
emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1); \
\
/* Emit output bit for newly-nonzero coef */ \
temp = signbits & 1; /* ((*block)[jpeg_natural_order_start[k]] < 0) ? 0 : 1 */ \
emit_bits(entropy, (unsigned int)temp, 1); \
\
/* Emit buffered correction bits that must be associated with this code */ \
emit_buffered_bits(entropy, BR_buffer, BR); \
BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ \
BR = 0; \
r = 0; /* reset zero run length */ \
signbits >>= 1; \
zerobits >>= 1; \
} \
}
METHODDEF(boolean)
encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
register int temp, temp3;
register int r, k;
int EOB;
register int temp, r;
char *BR_buffer;
unsigned int BR;
int Se = cinfo->Se;
int Sl = cinfo->Se - cinfo->Ss + 1;
int Al = cinfo->Al;
JBLOCKROW block;
int absvalues[DCTSIZE2];
JCOEF absvalues_unaligned[DCTSIZE2 + 15];
JCOEF *absvalues;
const JCOEF *cabsvalue, *EOBPTR;
size_t zerobits, signbits;
size_t bits[16 / SIZEOF_SIZE_T];
entropy->next_output_byte = cinfo->dest->next_output_byte;
entropy->free_in_buffer = cinfo->dest->free_in_buffer;
@ -663,27 +843,17 @@ encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
if (entropy->restarts_to_go == 0)
emit_restart(entropy, entropy->next_restart_num);
/* Encode the MCU data block */
block = MCU_data[0];
#ifdef WITH_SIMD
cabsvalue = absvalues = (JCOEF *)PAD((size_t)absvalues_unaligned, 16);
#else
/* Not using SIMD, so alignment is not needed */
cabsvalue = absvalues = absvalues_unaligned;
#endif
/* It is convenient to make a pre-pass to determine the transformed
* coefficients' absolute values and the EOB position.
*/
EOB = 0;
for (k = cinfo->Ss; k <= Se; k++) {
temp = (*block)[jpeg_natural_order[k]];
/* We must apply the point transform by Al. For AC coefficients this
* is an integer division with rounding towards 0. To do this portably
* in C, we shift after obtaining the absolute value.
*/
temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
temp ^= temp3;
temp -= temp3; /* temp is abs value of input */
temp >>= Al; /* apply the point transform */
absvalues[k] = temp; /* save abs value for main pass */
if (temp == 1)
EOB = k; /* EOB = index of last newly-nonzero coef */
}
/* Prepare data */
EOBPTR = absvalues +
entropy->AC_refine_prepare(MCU_data[0][0], jpeg_natural_order + cinfo->Ss,
Sl, Al, absvalues, bits);
/* Encode the AC coefficients per section G.1.2.3, fig. G.7 */
@ -691,53 +861,33 @@ encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
BR = 0; /* BR = count of buffered bits added now */
BR_buffer = entropy->bit_buffer + entropy->BE; /* Append bits to buffer */
for (k = cinfo->Ss; k <= Se; k++) {
if ((temp = absvalues[k]) == 0) {
r++;
continue;
}
zerobits = bits[0];
#if SIZEOF_SIZE_T == 8
signbits = bits[1];
#else
signbits = bits[2];
#endif
ENCODE_COEFS_AC_REFINE();
/* Emit any required ZRLs, but not if they can be folded into EOB */
while (r > 15 && k <= EOB) {
/* emit any pending EOBRUN and the BE correction bits */
emit_eobrun(entropy);
/* Emit ZRL */
emit_symbol(entropy, entropy->ac_tbl_no, 0xF0);
r -= 16;
/* Emit buffered correction bits that must be associated with ZRL */
emit_buffered_bits(entropy, BR_buffer, BR);
BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
BR = 0;
}
#if SIZEOF_SIZE_T == 4
zerobits = bits[1];
signbits = bits[3];
/* If the coef was previously nonzero, it only needs a correction bit.
* NOTE: a straight translation of the spec's figure G.7 would suggest
* that we also need to test r > 15. But if r > 15, we can only get here
* if k > EOB, which implies that this coefficient is not 1.
*/
if (temp > 1) {
/* The correction bit is the next bit of the absolute value. */
BR_buffer[BR++] = (char)(temp & 1);
continue;
}
/* Emit any pending EOBRUN and the BE correction bits */
emit_eobrun(entropy);
/* Count/emit Huffman symbol for run length / number of bits */
emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1);
/* Emit output bit for newly-nonzero coef */
temp = ((*block)[jpeg_natural_order[k]] < 0) ? 0 : 1;
emit_bits(entropy, (unsigned int)temp, 1);
/* Emit buffered correction bits that must be associated with this code */
emit_buffered_bits(entropy, BR_buffer, BR);
BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
BR = 0;
r = 0; /* reset zero run length */
if (zerobits) {
int diff = ((absvalues + DCTSIZE2 / 2) - cabsvalue);
int idx = count_zeroes(&zerobits);
signbits >>= idx;
idx += diff;
r += idx;
cabsvalue += idx;
goto first_iter_ac_refine;
}
ENCODE_COEFS_AC_REFINE(first_iter_ac_refine:);
#endif
r |= (int)((absvalues + Sl) - cabsvalue);
if (r > 0 || BR > 0) { /* If there are trailing zeroes, */
entropy->EOBRUN++; /* count an EOB */
entropy->BE += BR; /* concat my correction bits to older ones */

View File

@ -3,7 +3,7 @@
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2011, 2014, D. R. Commander.
* Copyright (C) 2015, Matthieu Darbois.
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@ -103,3 +103,9 @@ EXTERN(JOCTET *) jsimd_huff_encode_one_block(void *state, JOCTET *buffer,
JCOEFPTR block, int last_dc_val,
c_derived_tbl *dctbl,
c_derived_tbl *actbl);
EXTERN(int) jsimd_can_encode_mcu_AC_refine_prepare(void);
EXTERN(int) jsimd_encode_mcu_AC_refine_prepare
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
JCOEF *absvalues, size_t *bits);

View File

@ -3,7 +3,7 @@
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2009-2011, 2014, D. R. Commander.
* Copyright (C) 2015, Matthieu Darbois.
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@ -389,3 +389,17 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
{
return NULL;
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
return 0;
}
GLOBAL(int)
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *absvalues, size_t *bits)
{
return 0;
}

View File

@ -108,10 +108,11 @@ endif()
if(CPU_TYPE STREQUAL "x86_64")
set(SIMD_SOURCES x86_64/jsimdcpu.asm x86_64/jfdctflt-sse.asm
x86_64/jccolor-sse2.asm x86_64/jcgray-sse2.asm x86_64/jchuff-sse2.asm
x86_64/jcsample-sse2.asm x86_64/jdcolor-sse2.asm x86_64/jdmerge-sse2.asm
x86_64/jdsample-sse2.asm x86_64/jfdctfst-sse2.asm x86_64/jfdctint-sse2.asm
x86_64/jidctflt-sse2.asm x86_64/jidctfst-sse2.asm x86_64/jidctint-sse2.asm
x86_64/jidctred-sse2.asm x86_64/jquantf-sse2.asm x86_64/jquanti-sse2.asm
x86_64/jcphuff-sse2.asm x86_64/jcsample-sse2.asm x86_64/jdcolor-sse2.asm
x86_64/jdmerge-sse2.asm x86_64/jdsample-sse2.asm x86_64/jfdctfst-sse2.asm
x86_64/jfdctint-sse2.asm x86_64/jidctflt-sse2.asm x86_64/jidctfst-sse2.asm
x86_64/jidctint-sse2.asm x86_64/jidctred-sse2.asm x86_64/jquantf-sse2.asm
x86_64/jquanti-sse2.asm
x86_64/jccolor-avx2.asm x86_64/jcgray-avx2.asm x86_64/jcsample-avx2.asm
x86_64/jdcolor-avx2.asm x86_64/jdmerge-avx2.asm x86_64/jdsample-avx2.asm
x86_64/jfdctint-avx2.asm x86_64/jidctint-avx2.asm x86_64/jquanti-avx2.asm)
@ -124,10 +125,11 @@ else()
i386/jidctint-mmx.asm i386/jidctred-mmx.asm i386/jquant-mmx.asm
i386/jfdctflt-sse.asm i386/jidctflt-sse.asm i386/jquant-sse.asm
i386/jccolor-sse2.asm i386/jcgray-sse2.asm i386/jchuff-sse2.asm
i386/jcsample-sse2.asm i386/jdcolor-sse2.asm i386/jdmerge-sse2.asm
i386/jdsample-sse2.asm i386/jfdctfst-sse2.asm i386/jfdctint-sse2.asm
i386/jidctflt-sse2.asm i386/jidctfst-sse2.asm i386/jidctint-sse2.asm
i386/jidctred-sse2.asm i386/jquantf-sse2.asm i386/jquanti-sse2.asm
i386/jcphuff-sse2.asm i386/jcsample-sse2.asm i386/jdcolor-sse2.asm
i386/jdmerge-sse2.asm i386/jdsample-sse2.asm i386/jfdctfst-sse2.asm
i386/jfdctint-sse2.asm i386/jidctflt-sse2.asm i386/jidctfst-sse2.asm
i386/jidctint-sse2.asm i386/jidctred-sse2.asm i386/jquantf-sse2.asm
i386/jquanti-sse2.asm
i386/jccolor-avx2.asm i386/jcgray-avx2.asm i386/jcsample-avx2.asm
i386/jdcolor-avx2.asm i386/jdmerge-avx2.asm i386/jdsample-avx2.asm
i386/jfdctint-avx2.asm i386/jidctint-avx2.asm i386/jquanti-avx2.asm)

View File

@ -4,7 +4,7 @@
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
* Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander.
* Copyright (C) 2015-2016, Matthieu Darbois.
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@ -691,3 +691,17 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
dctbl, actbl);
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
return 0;
}
GLOBAL(int)
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *absvalues, size_t *bits)
{
return 0;
}

View File

@ -4,7 +4,7 @@
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
* Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander.
* Copyright (C) 2015-2016, Matthieu Darbois.
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@ -769,3 +769,17 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
last_dc_val, dctbl, actbl);
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
return 0;
}
GLOBAL(int)
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *absvalues, size_t *bits)
{
return 0;
}

486
simd/i386/jcphuff-sse2.asm Normal file
View File

@ -0,0 +1,486 @@
;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding. See jcphuff.c for more details.
;
; [TAB8]
%include "jsimdext.inc"
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_refine_prepare_sse2()
%macro LOAD16 0
pxor N0, N0
pxor N1, N1
mov T0, INT [LUT + 0*SIZEOF_INT]
mov T1, INT [LUT + 8*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 0
pinsrw X1, word [BLOCK + T1 * 2], 0
mov T0, INT [LUT + 1*SIZEOF_INT]
mov T1, INT [LUT + 9*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 1
pinsrw X1, word [BLOCK + T1 * 2], 1
mov T0, INT [LUT + 2*SIZEOF_INT]
mov T1, INT [LUT + 10*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 2
pinsrw X1, word [BLOCK + T1 * 2], 2
mov T0, INT [LUT + 3*SIZEOF_INT]
mov T1, INT [LUT + 11*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 3
pinsrw X1, word [BLOCK + T1 * 2], 3
mov T0, INT [LUT + 4*SIZEOF_INT]
mov T1, INT [LUT + 12*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 4
pinsrw X1, word [BLOCK + T1 * 2], 4
mov T0, INT [LUT + 5*SIZEOF_INT]
mov T1, INT [LUT + 13*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 5
pinsrw X1, word [BLOCK + T1 * 2], 5
mov T0, INT [LUT + 6*SIZEOF_INT]
mov T1, INT [LUT + 14*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 6
pinsrw X1, word [BLOCK + T1 * 2], 6
mov T0, INT [LUT + 7*SIZEOF_INT]
mov T1, INT [LUT + 15*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 7
pinsrw X1, word [BLOCK + T1 * 2], 7
%endmacro
%macro LOAD15 0
pxor N0, N0
pxor N1, N1
pxor X1, X1
mov T0, INT [LUT + 0*SIZEOF_INT]
mov T1, INT [LUT + 8*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 0
pinsrw X1, word [BLOCK + T1 * 2], 0
mov T0, INT [LUT + 1*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 1
mov T0, INT [LUT + 2*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 2
mov T0, INT [LUT + 3*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 3
mov T0, INT [LUT + 4*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 4
mov T0, INT [LUT + 5*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 5
mov T0, INT [LUT + 6*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 6
mov T0, INT [LUT + 7*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 7
cmp LENEND, 2
jl %%.ELOAD15
mov T1, INT [LUT + 9*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 1
cmp LENEND, 3
jl %%.ELOAD15
mov T1, INT [LUT + 10*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 2
cmp LENEND, 4
jl %%.ELOAD15
mov T1, INT [LUT + 11*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 3
cmp LENEND, 5
jl %%.ELOAD15
mov T1, INT [LUT + 12*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 4
cmp LENEND, 6
jl %%.ELOAD15
mov T1, INT [LUT + 13*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 5
cmp LENEND, 7
jl %%.ELOAD15
mov T1, INT [LUT + 14*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 6
%%.ELOAD15:
%endmacro
%macro LOAD8 0
pxor N0, N0
mov T0, INT [LUT + 0*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 0
mov T0, INT [LUT + 1*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 1
mov T0, INT [LUT + 2*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 2
mov T0, INT [LUT + 3*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 3
mov T0, INT [LUT + 4*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 4
mov T0, INT [LUT + 5*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 5
mov T0, INT [LUT + 6*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 6
mov T0, INT [LUT + 7*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 7
%endmacro
%macro LOAD7 0
pxor N0, N0
pxor X0, X0
mov T1, INT [LUT + 0*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 0
cmp LENEND, 2
jl %%.ELOAD7
mov T1, INT [LUT + 1*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 1
cmp LENEND, 3
jl %%.ELOAD7
mov T1, INT [LUT + 2*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 2
cmp LENEND, 4
jl %%.ELOAD7
mov T1, INT [LUT + 3*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 3
cmp LENEND, 5
jl %%.ELOAD7
mov T1, INT [LUT + 4*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 4
cmp LENEND, 6
jl %%.ELOAD7
mov T1, INT [LUT + 5*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 5
cmp LENEND, 7
jl %%.ELOAD7
mov T1, INT [LUT + 6*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 6
%%.ELOAD7:
%endmacro
%macro REDUCE0 0
movdqa xmm0, XMMWORD [VALUES + ( 0*2)]
movdqa xmm1, XMMWORD [VALUES + ( 8*2)]
movdqa xmm2, XMMWORD [VALUES + (16*2)]
movdqa xmm3, XMMWORD [VALUES + (24*2)]
movdqa xmm4, XMMWORD [VALUES + (32*2)]
movdqa xmm5, XMMWORD [VALUES + (40*2)]
movdqa xmm6, XMMWORD [VALUES + (48*2)]
pcmpeqw xmm0, ZERO
pcmpeqw xmm1, ZERO
pcmpeqw xmm2, ZERO
pcmpeqw xmm3, ZERO
pcmpeqw xmm4, ZERO
pcmpeqw xmm5, ZERO
pcmpeqw xmm6, ZERO
pcmpeqw xmm7, XMMWORD [VALUES + (56*2)]
packsswb xmm0, xmm1
packsswb xmm2, xmm3
packsswb xmm4, xmm5
packsswb xmm6, xmm7
pmovmskb eax, xmm0
pmovmskb ecx, xmm2
pmovmskb edx, xmm4
pmovmskb esi, xmm6
shl ecx, 16
shl esi, 16
or eax, ecx
or edx, esi
not eax
not edx
mov edi, ZEROBITS
mov INT [edi], eax
mov INT [edi+SIZEOF_INT], edx
%endmacro
;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
; const int *jpeg_natural_order_start,
; int Sl, int Al, JCOEF *absvalues,
; size_t *bits)
;
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *values
; eax + 28 = size_t *bits
%define ZERO xmm7
%define ONE xmm5
%define X0 xmm0
%define X1 xmm1
%define N0 xmm2
%define N1 xmm3
%define AL xmm4
%define K eax
%define LENEND eax
%define LUT ebx
%define T0 ecx
%define T0w cx
%define T1 edx
%define BLOCK esi
%define VALUES edi
%define KK ebp
%define ZEROBITS INT [esp + 5 * 4]
%define EOB INT [esp + 5 * 4 + 4]
%define LEN INT [esp + 5 * 4 + 8]
align 32
GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
push ebp
mov eax, esp ; eax = original ebp
sub esp, byte 4
and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [esp], eax
mov ebp, esp ; ebp = aligned ebp
sub esp, 16
push ebx
push ecx
; push edx ; need not be preserved
push esi
push edi
push ebp
pcmpeqw ONE, ONE
psrlw ONE, 15
mov BLOCK, INT [eax + 8]
mov LUT, INT [eax + 12]
mov VALUES, INT [eax + 24]
movd AL, INT [eax + 20]
mov T0, INT [eax + 28]
mov K, INT [eax + 16]
mov INT [T0 + 2 * SIZEOF_INT], -1
mov INT [T0 + 3 * SIZEOF_INT], -1
mov ZEROBITS, T0
mov LEN, K
pxor ZERO, ZERO
and K, -16
mov EOB, 0
xor KK, KK
shr K, 4
jz .ELOOPR16
.BLOOPR16:
LOAD16
pcmpgtw N0, X0
pcmpgtw N1, X1
paddw X0, N0
paddw X1, N1
pxor X0, N0
pxor X1, N1
psrlw X0, AL
psrlw X1, AL
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (8) * 2], X1
pcmpeqw X0, ONE
pcmpeqw X1, ONE
packsswb N0, N1
packsswb X0, X1
pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
mov T1, ZEROBITS
not T0
mov word [T1 + 2 * SIZEOF_INT + KK], T0w
pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
jz .CONTINUER16 ; if (idx) {
lea T1, [T1+KK*8]
mov EOB, T1 ; EOB = k + idx;
.CONTINUER16:
add VALUES, 16*2
add LUT, 16*SIZEOF_INT
add KK, 2
dec K
jnz .BLOOPR16
.ELOOPR16:
mov LENEND, LEN
test LENEND, 8
jz .TRYR7
test LENEND, 7
jz .TRYR8
and LENEND, 7
LOAD15
pcmpgtw N0, X0
pcmpgtw N1, X1
paddw X0, N0
paddw X1, N1
pxor X0, N0
pxor X1, N1
psrlw X0, AL
psrlw X1, AL
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (8) * 2], X1
pcmpeqw X0, ONE
pcmpeqw X1, ONE
packsswb N0, N1
packsswb X0, X1
pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
mov T1, ZEROBITS
not T0
mov word [T1 + 2 * SIZEOF_INT + KK], T0w
pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
jz .CONTINUER15 ; if (idx) {
lea T1, [T1+KK*8]
mov EOB, T1 ; EOB = k + idx;
.CONTINUER15:
add VALUES, 16*2
jmp .PADDINGR
.TRYR8:
LOAD8
pcmpgtw N0, X0
paddw X0, N0
pxor X0, N0
psrlw X0, AL
movdqa XMMWORD [VALUES + (0) * 2], X0
pcmpeqw X0, ONE
packsswb N0, ZERO
packsswb X0, ZERO
pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
mov T1, ZEROBITS
not T0
mov word [T1 + 2 * SIZEOF_INT + KK], T0w
pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
jz .CONTINUER8 ; if (idx) {
lea T1, [T1+KK*8]
mov EOB, T1 ; EOB = k + idx;
.CONTINUER8:
add VALUES, 8*2
jmp .PADDINGR
.TRYR7:
and LENEND, 7
LOAD7
pcmpgtw N0, X0
paddw X0, N0
pxor X0, N0
psrlw X0, AL
movdqa XMMWORD [VALUES + (0) * 2], X0
pcmpeqw X0, ONE
packsswb N0, ZERO
packsswb X0, ZERO
pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
mov T1, ZEROBITS
not T0
mov word [T1 + 2 * SIZEOF_INT + KK], T0w
pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
jz .CONTINUER7 ; if (idx) {
lea T1, [T1+KK*8]
mov EOB, T1 ; EOB = k + idx;
.CONTINUER7:
add VALUES, 8*2
.PADDINGR:
mov K, LEN
add K, 7
and K, -8
shr K, 3
sub K, DCTSIZE2/8
jz .EPADDINGR
align 16
.ZEROLOOPR:
movdqa XMMWORD [VALUES + 0], ZERO
add VALUES, 8*2
inc K
jnz .ZEROLOOPR
.EPADDINGR:
sub VALUES, DCTSIZE2*2
REDUCE0
mov eax, EOB
pop ebp
pop edi
pop esi
; pop edx ; need not be preserved
pop ecx
pop ebx
mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp
pop ebp
ret
%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef SIGN
%undef LUT
%undef T0
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

View File

@ -3,7 +3,7 @@
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander.
* Copyright (C) 2015, Matthieu Darbois.
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@ -21,6 +21,7 @@
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "jconfigint.h"
/*
* In the PIC cases, we have no guarantee that constants will keep
@ -1197,3 +1198,35 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
dctbl, actbl);
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
init_simd();
if (DCTSIZE != 8)
return 0;
if (sizeof(JCOEF) != 2)
return 0;
if (SIZEOF_SIZE_T != 4)
return 0;
if (!(simd_support & JSIMD_SSE2))
return 0;
#if defined(HAVE_BUILTIN_CTZL)
return 1;
#elif defined(HAVE_BITSCANFORWARD)
return 1;
#else
return 0;
#endif
}
GLOBAL(int)
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *absvalues, size_t *bits)
{
return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
jpeg_natural_order_start,
Sl, Al, absvalues, bits);
}

View File

@ -5,7 +5,7 @@
* Copyright (C) 2011, 2014-2016, 2018, D. R. Commander.
* Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
* Copyright (C) 2014, Linaro Limited.
* Copyright (C) 2015-2016, Matthieu Darbois.
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
* Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
*
* Based on the x86 SIMD extension for IJG JPEG library,
@ -1072,3 +1072,8 @@ EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon
EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl
(void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
c_derived_tbl *dctbl, c_derived_tbl *actbl);
/* Progressive Huffman encoding */
EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
JCOEF *absvalues, size_t *bits);

View File

@ -4,7 +4,7 @@
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
* Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
* Copyright (C) 2015, Matthieu Darbois.
* Copyright (C) 2015, 2018, Matthieu Darbois.
* Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
*
* Based on the x86 SIMD extension for IJG JPEG library,
@ -581,3 +581,17 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
{
return NULL;
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
return 0;
}
GLOBAL(int)
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *absvalues, size_t *bits)
{
return 0;
}

View File

@ -4,7 +4,7 @@
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
* Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
* Copyright (C) 2015, Matthieu Darbois.
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@ -1086,3 +1086,17 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
{
return NULL;
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
return 0;
}
GLOBAL(int)
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *absvalues, size_t *bits)
{
return 0;
}

View File

@ -3,7 +3,7 @@
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander.
* Copyright (C) 2015, Matthieu Darbois.
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@ -843,3 +843,17 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
{
return NULL;
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
return 0;
}
GLOBAL(int)
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *absvalues, size_t *bits)
{
return 0;
}

View File

@ -0,0 +1,474 @@
;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
; (64-bit SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding. See jcphuff.c for more details.
;
; [TAB8]
%include "jsimdext.inc"
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 64
; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_refine_prepare_sse2()
%macro LOAD16 0
pxor N0, N0
pxor N1, N1
mov T0d, INT [LUT + 0*SIZEOF_INT]
mov T1d, INT [LUT + 8*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 0
pinsrw X1, word [BLOCK + T1 * 2], 0
mov T0d, INT [LUT + 1*SIZEOF_INT]
mov T1d, INT [LUT + 9*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 1
pinsrw X1, word [BLOCK + T1 * 2], 1
mov T0d, INT [LUT + 2*SIZEOF_INT]
mov T1d, INT [LUT + 10*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 2
pinsrw X1, word [BLOCK + T1 * 2], 2
mov T0d, INT [LUT + 3*SIZEOF_INT]
mov T1d, INT [LUT + 11*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 3
pinsrw X1, word [BLOCK + T1 * 2], 3
mov T0d, INT [LUT + 4*SIZEOF_INT]
mov T1d, INT [LUT + 12*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 4
pinsrw X1, word [BLOCK + T1 * 2], 4
mov T0d, INT [LUT + 5*SIZEOF_INT]
mov T1d, INT [LUT + 13*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 5
pinsrw X1, word [BLOCK + T1 * 2], 5
mov T0d, INT [LUT + 6*SIZEOF_INT]
mov T1d, INT [LUT + 14*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 6
pinsrw X1, word [BLOCK + T1 * 2], 6
mov T0d, INT [LUT + 7*SIZEOF_INT]
mov T1d, INT [LUT + 15*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 7
pinsrw X1, word [BLOCK + T1 * 2], 7
%endmacro
%macro LOAD15 0
pxor N0, N0
pxor N1, N1
pxor X1, X1
mov T0d, INT [LUT + 0*SIZEOF_INT]
mov T1d, INT [LUT + 8*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 0
pinsrw X1, word [BLOCK + T1 * 2], 0
mov T0d, INT [LUT + 1*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 1
mov T0d, INT [LUT + 2*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 2
mov T0d, INT [LUT + 3*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 3
mov T0d, INT [LUT + 4*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 4
mov T0d, INT [LUT + 5*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 5
mov T0d, INT [LUT + 6*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 6
mov T0d, INT [LUT + 7*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 7
cmp LENEND, 2
jl %%.ELOAD15
mov T1d, INT [LUT + 9*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 1
cmp LENEND, 3
jl %%.ELOAD15
mov T1d, INT [LUT + 10*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 2
cmp LENEND, 4
jl %%.ELOAD15
mov T1d, INT [LUT + 11*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 3
cmp LENEND, 5
jl %%.ELOAD15
mov T1d, INT [LUT + 12*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 4
cmp LENEND, 6
jl %%.ELOAD15
mov T1d, INT [LUT + 13*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 5
cmp LENEND, 7
jl %%.ELOAD15
mov T1d, INT [LUT + 14*SIZEOF_INT]
pinsrw X1, word [BLOCK + T1 * 2], 6
%%.ELOAD15:
%endmacro
%macro LOAD8 0
pxor N0, N0
mov T0d, INT [LUT + 0*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 0
mov T0d, INT [LUT + 1*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 1
mov T0d, INT [LUT + 2*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 2
mov T0d, INT [LUT + 3*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 3
mov T0d, INT [LUT + 4*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 4
mov T0d, INT [LUT + 5*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 5
mov T0d, INT [LUT + 6*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 6
mov T0d, INT [LUT + 7*SIZEOF_INT]
pinsrw X0, word [BLOCK + T0 * 2], 7
%endmacro
%macro LOAD7 0
pxor N0, N0
pxor X0, X0
mov T1d, INT [LUT + 0*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 0
cmp LENEND, 2
jl %%.ELOAD7
mov T1d, INT [LUT + 1*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 1
cmp LENEND, 3
jl %%.ELOAD7
mov T1d, INT [LUT + 2*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 2
cmp LENEND, 4
jl %%.ELOAD7
mov T1d, INT [LUT + 3*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 3
cmp LENEND, 5
jl %%.ELOAD7
mov T1d, INT [LUT + 4*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 4
cmp LENEND, 6
jl %%.ELOAD7
mov T1d, INT [LUT + 5*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 5
cmp LENEND, 7
jl %%.ELOAD7
mov T1d, INT [LUT + 6*SIZEOF_INT]
pinsrw X0, word [BLOCK + T1 * 2], 6
%%.ELOAD7:
%endmacro
%macro REDUCE0 0
movdqa xmm0, XMMWORD [VALUES + ( 0*2)]
movdqa xmm1, XMMWORD [VALUES + ( 8*2)]
movdqa xmm2, XMMWORD [VALUES + (16*2)]
movdqa xmm3, XMMWORD [VALUES + (24*2)]
movdqa xmm4, XMMWORD [VALUES + (32*2)]
movdqa xmm5, XMMWORD [VALUES + (40*2)]
movdqa xmm6, XMMWORD [VALUES + (48*2)]
movdqa xmm7, XMMWORD [VALUES + (56*2)]
pcmpeqw xmm0, ZERO
pcmpeqw xmm1, ZERO
pcmpeqw xmm2, ZERO
pcmpeqw xmm3, ZERO
pcmpeqw xmm4, ZERO
pcmpeqw xmm5, ZERO
pcmpeqw xmm6, ZERO
pcmpeqw xmm7, ZERO
packsswb xmm0, xmm1
packsswb xmm2, xmm3
packsswb xmm4, xmm5
packsswb xmm6, xmm7
pmovmskb eax, xmm0
pmovmskb ecx, xmm2
pmovmskb edx, xmm4
pmovmskb esi, xmm6
shl rcx, 16
shl rdx, 32
shl rsi, 48
or rax, rcx
or rdx, rsi
or rax, rdx
not rax
mov MMWORD [r15], rax
%endmacro
;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
; const int *jpeg_natural_order_start,
; int Sl, int Al, JCOEF *absvalues,
; size_t *bits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *values
; r15 = size_t *bits
%define ZERO xmm9
%define ONE xmm5
%define X0 xmm0
%define X1 xmm1
%define N0 xmm2
%define N1 xmm3
%define AL xmm4
%define K eax
%define KK r9d
%define EOB r8d
%define SIGN rdi
%define LUT r11
%define T0 rcx
%define T0d ecx
%define T1 rdx
%define T1d edx
%define BLOCK r10
%define VALUES r14
%define LEN r12d
%define LENEND r13d
align 32
GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [rbp - 16]
collect_args 6
movdqa XMMWORD [rbp - 16], ZERO
xor SIGN, SIGN
xor EOB, EOB
xor KK, KK
movd AL, r13d
pxor ZERO, ZERO
pcmpeqw ONE, ONE
psrlw ONE, 15
mov K, LEN
mov LENEND, LEN
and K, -16
and LENEND, 7
shr K, 4
jz .ELOOPR16
.BLOOPR16:
LOAD16
pcmpgtw N0, X0
pcmpgtw N1, X1
paddw X0, N0
paddw X1, N1
pxor X0, N0
pxor X1, N1
psrlw X0, AL
psrlw X1, AL
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (8) * 2], X1
pcmpeqw X0, ONE
pcmpeqw X1, ONE
packsswb N0, N1
packsswb X0, X1
pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
shr SIGN, 16 ; make room for sizebits
shl T0, 48
or SIGN, T0
bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
jz .CONTINUER16 ; if (idx) {
mov EOB, KK
add EOB, T1d ; EOB = k + idx;
.CONTINUER16:
add VALUES, 16*2
add LUT, 16*SIZEOF_INT
add KK, 16
dec K
jnz .BLOOPR16
.ELOOPR16:
test LEN, 8
jz .TRYR7
test LEN, 7
jz .TRYR8
LOAD15
pcmpgtw N0, X0
pcmpgtw N1, X1
paddw X0, N0
paddw X1, N1
pxor X0, N0
pxor X1, N1
psrlw X0, AL
psrlw X1, AL
movdqa XMMWORD [VALUES + (0) * 2], X0
movdqa XMMWORD [VALUES + (8) * 2], X1
pcmpeqw X0, ONE
pcmpeqw X1, ONE
packsswb N0, N1
packsswb X0, X1
pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
shr SIGN, 16 ; make room for sizebits
shl T0, 48
or SIGN, T0
bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
jz .CONTINUER15 ; if (idx) {
mov EOB, KK
add EOB, T1d ; EOB = k + idx;
.CONTINUER15:
add VALUES, 16*2
jmp .PADDINGR
.TRYR8:
LOAD8
pcmpgtw N0, X0
paddw X0, N0
pxor X0, N0
psrlw X0, AL
movdqa XMMWORD [VALUES + (0) * 2], X0
pcmpeqw X0, ONE
packsswb N0, ZERO
packsswb X0, ZERO
pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
shr SIGN, 8 ; make room for sizebits
shl T0, 56
or SIGN, T0
bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
jz .CONTINUER8 ; if (idx) {
mov EOB, KK
add EOB, T1d ; EOB = k + idx;
.CONTINUER8:
add VALUES, 8*2
jmp .PADDINGR
.TRYR7:
LOAD7
pcmpgtw N0, X0
paddw X0, N0
pxor X0, N0
psrlw X0, AL
movdqa XMMWORD [VALUES + (0) * 2], X0
pcmpeqw X0, ONE
packsswb N0, ZERO
packsswb X0, ZERO
pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
shr SIGN, 8 ; make room for sizebits
shl T0, 56
or SIGN, T0
bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
jz .CONTINUER7 ; if (idx) {
mov EOB, KK
add EOB, T1d ; EOB = k + idx;
.CONTINUER7:
add VALUES, 8*2
.PADDINGR:
mov K, LEN
add K, 7
and K, -8
shr K, 3
sub K, DCTSIZE2/8
jz .EPADDINGR
align 16
.ZEROLOOPR:
movdqa XMMWORD [VALUES + 0], ZERO
shr SIGN, 8
add VALUES, 8*2
inc K
jnz .ZEROLOOPR
.EPADDINGR:
not SIGN
sub VALUES, DCTSIZE2*2
mov MMWORD [r15+SIZEOF_MMWORD], SIGN
REDUCE0
mov eax, EOB
movdqa ZERO, XMMWORD [rbp - 16]
uncollect_args 6
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
ret
%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef SIGN
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

View File

@ -3,7 +3,7 @@
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
* Copyright (C) 2015, Matthieu Darbois.
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@ -21,6 +21,7 @@
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "jconfigint.h"
/*
* In the PIC cases, we have no guarantee that constants will keep
@ -1020,3 +1021,35 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
dctbl, actbl);
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
init_simd();
if (DCTSIZE != 8)
return 0;
if (sizeof(JCOEF) != 2)
return 0;
if (SIZEOF_SIZE_T != 8)
return 0;
if (!(simd_support & JSIMD_SSE2))
return 0;
#if defined(HAVE_BUILTIN_CTZL)
return 1;
#elif defined(HAVE_BITSCANFORWARD64)
return 1;
#else
return 0;
#endif
}
GLOBAL(int)
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *absvalues, size_t *bits)
{
return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
jpeg_natural_order_start,
Sl, Al, absvalues, bits);
}