mirror of
https://github.com/team-charls/charls
synced 2025-03-28 21:03:13 +00:00

By design the benchmark project is not built, as it relies on Google Benchmark, which is retrieved using vcpkg. Vcpkg is now part of Visual Studio 2022, so building with VS 2022 works. However, one of the build steps of the CI pipeline builds CharLS with VS 2019 to ensure that VS 2019 can still be used. Enabling the benchmark project in the solution file for x86 and x64 would break VS 2019. ARM64 builds are only supported in VS 2022, so enabling that configuration doesn't break VS 2019.
620 lines
16 KiB
C++
620 lines
16 KiB
C++
// Copyright (c) Team CharLS.
|
|
// SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
#include <benchmark/benchmark.h>

#include "../src/jpegls_preset_coding_parameters.hpp"

#include <cstdint>
#include <memory>
#include <vector>

#if !defined(_M_ARM64)
#include <emmintrin.h> // SSE2 intrinsics used by the has_ff_byte_simd helpers
#endif
|
|
|
|
#pragma warning(disable : 26409) // Avoid calling new explicitly (triggered by BENCHMARK macro)
|
|
|
|
|
|
// Reference gradient quantizer for the lossless case (near_lossless is fixed to 0).
// Maps the gradient `di` onto one of nine regions, -4..4, by comparing it against the
// three thresholds of `preset`. The comparisons are evaluated strictly in the order below.
static int8_t quantize_gradient_org(const charls::jpegls_pc_parameters& preset, const int32_t di) noexcept
{
    constexpr int32_t near_lossless{};

    const auto t1{preset.threshold1};
    const auto t2{preset.threshold2};
    const auto t3{preset.threshold3};

    // Negative gradients, largest magnitude first.
    if (di <= -t3)
    {
        return -4;
    }
    if (di <= -t2)
    {
        return -3;
    }
    if (di <= -t1)
    {
        return -2;
    }
    if (di < -near_lossless)
    {
        return -1;
    }

    // The region around zero.
    if (di <= near_lossless)
    {
        return 0;
    }

    // Positive gradients, smallest magnitude first.
    if (di < t1)
    {
        return 1;
    }
    if (di < t2)
    {
        return 2;
    }
    if (di < t3)
    {
        return 3;
    }

    return 4;
}
|
|
|
|
static std::vector<int8_t> create_quantize_lut_lossless(const int32_t bit_count)
|
|
{
|
|
const charls::jpegls_pc_parameters preset{charls::compute_default((1 << static_cast<uint32_t>(bit_count)) - 1, 0)};
|
|
const int32_t range{preset.maximum_sample_value + 1};
|
|
|
|
std::vector<int8_t> lut(static_cast<size_t>(range) * 2);
|
|
for (size_t i{}; i != lut.size(); ++i)
|
|
{
|
|
lut[i] = quantize_gradient_org(preset, static_cast<int32_t>(i) - range);
|
|
}
|
|
|
|
return lut;
|
|
}
|
|
|
|
|
|
// Quantization lookup table for 8-bit samples, built once during static initialization.
const std::vector<int8_t> quantization_lut_lossless_8 = create_quantize_lut_lossless(8);
|
|
|
|
template<typename Traits>
|
|
struct scan_decoder
|
|
{
|
|
int32_t t1_{};
|
|
int32_t t2_{};
|
|
int32_t t3_{};
|
|
Traits traits_;
|
|
|
|
explicit scan_decoder(Traits traits, const int32_t bit_count) noexcept : traits_{std::move(traits)}
|
|
{
|
|
const charls::jpegls_pc_parameters preset{charls::compute_default((1 << static_cast<uint32_t>(bit_count)) - 1, 0)};
|
|
|
|
t1_ = preset.threshold1;
|
|
t2_ = preset.threshold2;
|
|
t3_ = preset.threshold3;
|
|
}
|
|
|
|
[[nodiscard]]
|
|
int8_t quantize_gradient_org(const int32_t di) const noexcept
|
|
{
|
|
if (di <= -t3_)
|
|
return -4;
|
|
if (di <= -t2_)
|
|
return -3;
|
|
if (di <= -t1_)
|
|
return -2;
|
|
if (di < -traits_.near_lossless)
|
|
return -1;
|
|
if (di <= traits_.near_lossless)
|
|
return 0;
|
|
if (di < t1_)
|
|
return 1;
|
|
if (di < t2_)
|
|
return 2;
|
|
if (di < t3_)
|
|
return 3;
|
|
|
|
return 4;
|
|
}
|
|
};
|
|
|
|
// Traits for lossless coding: the allowed near-lossless error is zero.
struct lossless_traits final
{
    static constexpr int32_t near_lossless{0};
};
|
|
|
|
|
|
// Keeps the predictor functions out of line so the benchmarks measure a real call.
// __declspec(noinline) is MSVC syntax; GCC and Clang need the GNU attribute spelling.
#ifndef CHARLS_BENCHMARK_NO_INLINE
#if defined(_MSC_VER)
#define CHARLS_BENCHMARK_NO_INLINE __declspec(noinline)
#elif defined(__GNUC__) || defined(__clang__)
#define CHARLS_BENCHMARK_NO_INLINE __attribute__((noinline))
#else
#define CHARLS_BENCHMARK_NO_INLINE
#endif
#endif

// Straightforward predictor: returns the larger of ra/rb when rc is below both,
// the smaller of ra/rb when rc is above both, and ra + rb - rc otherwise.
static CHARLS_BENCHMARK_NO_INLINE int32_t
get_predicted_value_default(const int32_t ra, const int32_t rb, const int32_t rc) noexcept
{
    if (ra < rb)
    {
        if (rc < ra)
            return rb;

        if (rc > rb)
            return ra;
    }
    else
    {
        if (rc < rb)
            return ra;

        if (rc > ra)
            return rb;
    }

    return ra + rb - rc;
}


constexpr size_t int32_t_bit_count = sizeof(int32_t) * 8;


// Returns 0 for non-negative values and -1 (all bits set) for negative values.
// Relies on >> of a negative int32_t being an arithmetic shift (implementation-defined before C++20).
static constexpr int32_t bit_wise_sign(const int32_t i) noexcept
{
    return i >> (int32_t_bit_count - 1);
}


// Same result as get_predicted_value_default, computed with a sign mask so that only
// two comparisons are needed instead of a nested if/else tree.
static CHARLS_BENCHMARK_NO_INLINE int32_t
get_predicted_value_optimized(const int32_t ra, const int32_t rb, const int32_t rc) noexcept
{
    // sign trick reduces the number of if statements (branches)
    const int32_t sign{bit_wise_sign(rb - ra)};

    // is Ra between Rc and Rb?
    if ((sign ^ (rc - ra)) < 0)
    {
        return rb;
    }
    if ((sign ^ (rb - rc)) < 0)
    {
        return ra;
    }

    // default case, valid if Rc element of [Ra,Rb]
    return ra + rb - rc;
}
|
|
|
|
|
|
#if defined(_M_X64) || defined(_M_ARM64)
|
|
inline static int countl_zero(const uint64_t value) noexcept
|
|
{
|
|
if (value == 0)
|
|
return 64;
|
|
|
|
unsigned long index;
|
|
_BitScanReverse64(&index, value);
|
|
|
|
return 63 - static_cast<int>(index);
|
|
}
|
|
#endif
|
|
|
|
|
|
static void bm_get_predicted_value_default(benchmark::State& state)
|
|
{
|
|
for (const auto _ : state)
|
|
{
|
|
benchmark::DoNotOptimize(get_predicted_value_default(100, 200, 300));
|
|
benchmark::DoNotOptimize(get_predicted_value_default(200, 100, 300));
|
|
}
|
|
}
|
|
BENCHMARK(bm_get_predicted_value_default);
|
|
|
|
static void bm_get_predicted_value_optimized(benchmark::State& state)
|
|
{
|
|
for (const auto _ : state)
|
|
{
|
|
benchmark::DoNotOptimize(get_predicted_value_optimized(100, 200, 300));
|
|
benchmark::DoNotOptimize(get_predicted_value_default(200, 100, 300));
|
|
}
|
|
}
|
|
BENCHMARK(bm_get_predicted_value_optimized);
|
|
|
|
static void bm_quantize_gradient_calculated(benchmark::State& state)
|
|
{
|
|
const scan_decoder<lossless_traits> sd({}, 8);
|
|
|
|
for (const auto _ : state)
|
|
{
|
|
benchmark::DoNotOptimize(sd.quantize_gradient_org(0));
|
|
benchmark::DoNotOptimize(sd.quantize_gradient_org(127));
|
|
benchmark::DoNotOptimize(sd.quantize_gradient_org(255));
|
|
}
|
|
}
|
|
BENCHMARK(bm_quantize_gradient_calculated);
|
|
|
|
static void bm_quantize_gradient_lut(benchmark::State& state)
|
|
{
|
|
for (const auto _ : state)
|
|
{
|
|
benchmark::DoNotOptimize(quantization_lut_lossless_8[0]);
|
|
benchmark::DoNotOptimize(quantization_lut_lossless_8[127]);
|
|
benchmark::DoNotOptimize(quantization_lut_lossless_8[255]);
|
|
}
|
|
}
|
|
BENCHMARK(bm_quantize_gradient_lut);
|
|
|
|
|
|
// Returns the number of zero bits before the first set bit, scanning from the most
// significant bit of `val_test`. Only the top 16 bits are inspected; returns -1 when
// none of them is set.
static int peek_zero_bits(uint64_t val_test) noexcept
{
    constexpr int32_t max_zero_bits{16};

    // Slide a single-bit probe down from bit 63 instead of shifting the value up.
    uint64_t probe{uint64_t{1} << 63};
    for (int32_t leading_zeros{}; leading_zeros < max_zero_bits; ++leading_zeros)
    {
        if ((val_test & probe) != 0)
        {
            return leading_zeros;
        }

        probe >>= 1;
    }

    return -1;
}
|
|
|
|
static void bm_peek_zero_bits(benchmark::State& state)
|
|
{
|
|
for (const auto _ : state)
|
|
{
|
|
benchmark::DoNotOptimize(peek_zero_bits(0));
|
|
benchmark::DoNotOptimize(peek_zero_bits(UINT64_MAX));
|
|
}
|
|
}
|
|
BENCHMARK(bm_peek_zero_bits);
|
|
|
|
|
|
#if defined(_M_X64) || defined(_M_ARM64)
|
|
int peek_zero_bits_intrinsic(const uint64_t value) noexcept
|
|
{
|
|
const auto count = countl_zero(value);
|
|
return count < 16 ? count : -1;
|
|
}
|
|
|
|
|
|
static void bm_peek_zero_bits_intrinsic(benchmark::State& state)
|
|
{
|
|
for (const auto _ : state)
|
|
{
|
|
benchmark::DoNotOptimize(peek_zero_bits_intrinsic(0));
|
|
benchmark::DoNotOptimize(peek_zero_bits_intrinsic(UINT64_MAX));
|
|
}
|
|
}
|
|
BENCHMARK(bm_peek_zero_bits_intrinsic);
|
|
#endif
|
|
|
|
|
|
// Returns a vector of `size` bytes. The vector is created empty and then grown with
// resize(), which value-initializes (zeroes) every element.
static std::vector<uint8_t> allocate_buffer(const size_t size)
{
    std::vector<uint8_t> storage{};
    storage.resize(size);
    return storage;
}
|
|
|
|
static void bm_resize_vector(benchmark::State& state)
|
|
{
|
|
for (const auto _ : state)
|
|
{
|
|
benchmark::DoNotOptimize(allocate_buffer(size_t{512} * 512 * 16));
|
|
benchmark::DoNotOptimize(allocate_buffer(size_t{1024} * 1024 * 8 * 3));
|
|
}
|
|
}
|
|
BENCHMARK(bm_resize_vector);
|
|
|
|
|
|
// Byte buffer meant to be overwritten by its user: when it grows, the new storage is
// allocated with new[] and its contents are left uninitialized.
class overwrite_buffer
{
public:
    // Sets the buffer size to `new_size`.
    // A request that fits within the current size only updates the recorded size and keeps the
    // existing storage. A larger request discards the old storage and allocates a fresh block.
    // Note that the recorded size is lowered on shrink, so growing back reallocates.
    void reset(const size_t new_size)
    {
        if (new_size > size_)
        {
            // Release the old block before allocating, so both are never alive at the same time.
            data_.reset();
            data_.reset(new uint8_t[new_size]);
        }

        size_ = new_size;
    }

    // Pointer to the first byte, or nullptr when nothing has been allocated yet.
    [[nodiscard]]
    uint8_t* data() const noexcept
    {
        return data_.get();
    }

    // Size passed to the most recent successful reset() call (0 initially).
    [[nodiscard]]
    size_t size() const noexcept
    {
        return size_;
    }

private:
    std::unique_ptr<uint8_t[]> data_{};
    size_t size_{};
};
|
|
|
|
|
|
// Returns a fresh overwrite_buffer that has been reset to `size` bytes.
static overwrite_buffer allocate_overwrite_buffer(const size_t size)
{
    overwrite_buffer result{};
    result.reset(size);
    return result;
}
|
|
|
|
static void bm_resize_overwrite_buffer(benchmark::State& state)
|
|
{
|
|
for (const auto _ : state)
|
|
{
|
|
benchmark::DoNotOptimize(allocate_overwrite_buffer(size_t{512} * 512 * 16));
|
|
benchmark::DoNotOptimize(allocate_overwrite_buffer(size_t{1024} * 1024 * 8 * 3));
|
|
}
|
|
}
|
|
BENCHMARK(bm_resize_overwrite_buffer);
|
|
|
|
|
|
// Zeroes the first `size` bytes at `data`.
// Always returns 0; the return value only gives the benchmark something to consume.
static int memset_buffer(uint8_t* data, const size_t size) noexcept
{
    constexpr int fill_value{0};
    static_cast<void>(memset(data, fill_value, size));

    return 0;
}
|
|
static void bm_memset_buffer(benchmark::State& state)
|
|
{
|
|
std::vector<uint8_t> buffer(size_t{1024} * 1024 * 8 * 3);
|
|
|
|
for (const auto _ : state)
|
|
{
|
|
benchmark::DoNotOptimize(memset_buffer(buffer.data(), size_t{512} * 512 * 16));
|
|
benchmark::DoNotOptimize(memset_buffer(buffer.data(), size_t{1024} * 1024 * 8 * 3));
|
|
}
|
|
}
|
|
BENCHMARK(bm_memset_buffer);
|
|
|
|
|
|
// Returns true when at least one of the four low bytes of `value` equals 0xFF.
// Each byte is shifted down and tested in turn, lowest byte first.
constexpr static bool has_ff_byte_classic(const unsigned int value) noexcept
{
    if ((value & 0xFF) == 0xFF)
    {
        return true;
    }
    if (((value >> 8) & 0xFF) == 0xFF)
    {
        return true;
    }
    if (((value >> 16) & 0xFF) == 0xFF)
    {
        return true;
    }

    return ((value >> 24) & 0xFF) == 0xFF;
}
|
|
static void bm_has_ff_byte_classic(benchmark::State& state)
|
|
{
|
|
for (const auto _ : state)
|
|
{
|
|
benchmark::DoNotOptimize(has_ff_byte_classic(0));
|
|
benchmark::DoNotOptimize(has_ff_byte_classic(0xFF));
|
|
}
|
|
}
|
|
BENCHMARK(bm_has_ff_byte_classic);
|
|
|
|
// Returns true when at least one byte of `value` equals 0xFF.
// Tests every byte position in turn by moving a 0xFF mask up one byte per iteration.
static bool has_ff_byte_loop(const unsigned int value) noexcept
{
    // size_t index: avoids comparing a signed int against the unsigned result of sizeof.
    for (size_t i{}; i < sizeof(unsigned int); ++i)
    {
        // Unsigned literal: shifting a signed 0xFF by 24 would move a bit into the sign position.
        const unsigned int byte_mask{0xFFU << (8 * i)};
        if ((value & byte_mask) == byte_mask)
        {
            return true;
        }
    }

    return false;
}
|
|
static void bm_has_ff_byte_loop(benchmark::State& state)
|
|
{
|
|
for (const auto _ : state)
|
|
{
|
|
benchmark::DoNotOptimize(has_ff_byte_loop(0));
|
|
benchmark::DoNotOptimize(has_ff_byte_loop(0xFF));
|
|
}
|
|
}
|
|
BENCHMARK(bm_has_ff_byte_loop);
|
|
|
|
#if !defined(_M_ARM64)
|
|
// Returns true when at least one byte of `value` equals 0xFF, using SSE2 byte-wise comparison.
static bool has_ff_byte_simd(const unsigned int value)
{
    // Replicate the 32-bit value into all four lanes of the register.
    const __m128i xmm_value = _mm_set1_epi32(static_cast<int>(value));

    // The reference must be 0xFF in EVERY byte. A 32-bit lane holding 0x000000FF would make
    // the zero bytes of `value` compare equal to the zero bytes of the reference.
    const __m128i xmm_ff = _mm_set1_epi8(static_cast<char>(0xFF));

    // Each byte of the result is 0xFF where the inputs match and 0x00 where they differ.
    const __m128i comparison = _mm_cmpeq_epi8(xmm_value, xmm_ff);

    // Gather the top bit of every byte; any set bit means some byte matched.
    // movemask is SSE2, so no SSE4.1 support is required.
    return _mm_movemask_epi8(comparison) != 0;
}
|
|
static void bm_has_ff_byte_simd(benchmark::State& state)
|
|
{
|
|
for (const auto _ : state)
|
|
{
|
|
benchmark::DoNotOptimize(has_ff_byte_simd(0));
|
|
benchmark::DoNotOptimize(has_ff_byte_simd(0xFF));
|
|
}
|
|
}
|
|
BENCHMARK(bm_has_ff_byte_simd);
|
|
#endif
|
|
|
|
// Searches [position, end_position) for the first 0xFF byte.
// Returns a pointer to that byte, or nullptr when the range contains none.
static const std::byte* find_jpeg_marker_start_byte(const std::byte* position, const std::byte* end_position) noexcept
{
    constexpr std::byte jpeg_marker_start_byte{0xFF};

    // memchr is used because it is optimized on some platforms to search faster.
    const auto byte_count{end_position - position};
    const void* const match{memchr(position, std::to_integer<int>(jpeg_marker_start_byte), byte_count)};

    return static_cast<const std::byte*>(match);
}
|
|
static void bm_find_jpeg_marker_start_byte(benchmark::State& state)
|
|
{
|
|
const std::vector<std::byte> buffer(size_t{1024} * 1024 * 8 * 3);
|
|
|
|
for (const auto _ : state)
|
|
{
|
|
benchmark::DoNotOptimize(find_jpeg_marker_start_byte(buffer.data(), buffer.data() + buffer.size()));
|
|
}
|
|
}
|
|
BENCHMARK(bm_find_jpeg_marker_start_byte);
|
|
|
|
// True when T is an unsigned integral type that occupies exactly BitCount / 8 bytes.
// Plain overloads on uint64_t/uint32_t don't work for macOS, where size_t is not the same
// type as uint64_t, so the dispatch is done on size and signedness instead.
template<int BitCount, typename T>
constexpr bool is_uint_v = std::is_integral_v<T> && !std::is_signed_v<T> && sizeof(T) == BitCount / 8;


// Reverses the byte order of a 16, 32 or 64 bit unsigned integer.
// MSVC builds use the _byteswap_* intrinsics; other compilers get a shift-and-mask
// expression (GCC and Clang optimize that pattern to a built-in intrinsic).
// Any other type is rejected at compile time by the static_assert in the last branch.
template<typename T>
[[nodiscard]]
auto byte_swap(const T value) noexcept
{
    if constexpr (is_uint_v<16, T>)
    {
#ifdef _MSC_VER
        return _byteswap_ushort(value);
#else
        const auto low_to_high{value << 8};
        const auto high_to_low{value >> 8};
        return static_cast<uint16_t>(low_to_high | high_to_low);
#endif
    }
    else if constexpr (is_uint_v<32, T>)
    {
#ifdef _MSC_VER
        return _byteswap_ulong(value);
#else
        // Shift first, then keep only the byte that landed in its target position.
        return (value << 24) | ((value << 8) & 0x00FF0000) | ((value >> 8) & 0x0000FF00) | (value >> 24);
#endif
    }
    else
    {
        static_assert(is_uint_v<64, T>);
#ifdef _MSC_VER
        return _byteswap_uint64(value);
#else
        // Select each source byte with a mask, then move it to its mirrored position.
        return (value >> 56) | ((value & 0x00FF'0000'0000'0000) >> 40) | ((value & 0x0000'FF00'0000'0000) >> 24) |
               ((value & 0x0000'00FF'0000'0000) >> 8) | ((value & 0x0000'0000'FF00'0000) << 8) |
               ((value & 0x0000'0000'00FF'0000) << 24) | ((value & 0x0000'0000'0000'FF00) << 40) | (value << 56);
#endif
    }
}
|
|
|
|
|
|
|
|
// Copies sizeof(T) bytes starting at `buffer` into a T and returns it.
// `buffer` does not need to be aligned for T.
template<typename T>
[[nodiscard]]
T read_unaligned(const void* buffer) noexcept
{
    // Note: MSVC, GCC and clang will replace the memcpy with a direct register read if the
    // CPU architecture allows it. On x86, x64 and ARM64 this will just be 1 register load.
    T result;
    memcpy(&result, buffer, sizeof(result));
    return result;
}
|
|
|
|
template<typename T>
|
|
T read_big_endian_unaligned(const void* buffer) noexcept
|
|
{
|
|
#ifdef LITTLE_ENDIAN_ARCHITECTURE
|
|
return byte_swap(read_unaligned<T>(buffer));
|
|
#else
|
|
return read_unaligned<T>(buffer);
|
|
#endif
|
|
}
|
|
|
|
#if !defined(_M_ARM64)
|
|
static uint32_t read_all_bytes_with_ff_check(const std::byte* position, const std::byte* end_position)
|
|
{
|
|
uint32_t result{};
|
|
|
|
for (; position < end_position; position += sizeof(uint32_t))
|
|
{
|
|
if (const uint32_t value{read_big_endian_unaligned<uint32_t>(position)};
|
|
has_ff_byte_simd(value))
|
|
{
|
|
result++;
|
|
}
|
|
else
|
|
{
|
|
result |= value;
|
|
}
|
|
|
|
}
|
|
|
|
return result;
|
|
}
|
|
static void bm_read_all_bytes_with_ff_check(benchmark::State& state)
|
|
{
|
|
const std::vector<std::byte> buffer(size_t{1024} * 1024 * 8 * 3);
|
|
|
|
for (const auto _ : state)
|
|
{
|
|
benchmark::DoNotOptimize(read_all_bytes_with_ff_check(buffer.data(), buffer.data() + buffer.size()));
|
|
}
|
|
}
|
|
BENCHMARK(bm_read_all_bytes_with_ff_check);
|
|
#endif
|
|
|
|
#if !defined(_M_ARM64)
|
|
// Returns true when at least one of the eight bytes of `value` equals 0xFF,
// using SSE2 byte-wise comparison.
static bool has_ff_byte_simd64(const uint64_t value)
{
    // Replicate the 64-bit value into both halves of the register.
    const __m128i xmm_value = _mm_set1_epi64x(static_cast<long long>(value));

    // The reference must be 0xFF in EVERY byte. A 32-bit lane holding 0x000000FF would make
    // the zero bytes of `value` compare equal to the zero bytes of the reference.
    const __m128i xmm_ff = _mm_set1_epi8(static_cast<char>(0xFF));

    // Each byte of the result is 0xFF where the inputs match and 0x00 where they differ.
    const __m128i comparison = _mm_cmpeq_epi8(xmm_value, xmm_ff);

    // Gather the top bit of every byte; any set bit means some byte matched.
    // movemask is SSE2, so no SSE4.1 support is required.
    return _mm_movemask_epi8(comparison) != 0;
}
|
|
|
|
static uint64_t read_all_bytes_with_ff_check64(const std::byte* position, const std::byte* end_position)
|
|
{
|
|
uint64_t result{};
|
|
|
|
for (; position < end_position; position += sizeof(uint64_t))
|
|
{
|
|
if (const uint64_t value{read_big_endian_unaligned<uint64_t>(position)}; has_ff_byte_simd64(value))
|
|
{
|
|
result++;
|
|
}
|
|
else
|
|
{
|
|
result |= value;
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
static void bm_read_all_bytes_with_ff_check64(benchmark::State& state)
|
|
{
|
|
const std::vector<std::byte> buffer(size_t{1024} * 1024 * 8 * 3);
|
|
|
|
for (const auto _ : state)
|
|
{
|
|
benchmark::DoNotOptimize(read_all_bytes_with_ff_check64(buffer.data(), buffer.data() + buffer.size()));
|
|
}
|
|
}
|
|
BENCHMARK(bm_read_all_bytes_with_ff_check64);
|
|
#endif
|
|
|
|
|
|
static uint32_t read_all_bytes_no_check(const std::byte* position, const std::byte* end_position) noexcept
|
|
{
|
|
uint32_t result{};
|
|
|
|
for (; position < end_position; position += sizeof(uint32_t))
|
|
{
|
|
const uint32_t value{read_big_endian_unaligned<uint32_t>(position)};
|
|
result |= value;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
static void bm_read_all_bytes_no_check(benchmark::State& state)
|
|
{
|
|
const std::vector<std::byte> buffer(size_t{1024} * 1024 * 8 * 3);
|
|
|
|
for (const auto _ : state)
|
|
{
|
|
benchmark::DoNotOptimize(read_all_bytes_no_check(buffer.data(), buffer.data() + buffer.size()));
|
|
}
|
|
}
|
|
BENCHMARK(bm_read_all_bytes_no_check);
|
|
|
|
static uint64_t read_all_bytes_no_check64(const std::byte* position, const std::byte* end_position) noexcept
|
|
{
|
|
uint64_t result{};
|
|
|
|
for (; position < end_position; position += sizeof(uint64_t))
|
|
{
|
|
const uint64_t value{read_big_endian_unaligned<uint64_t>(position)};
|
|
result |= value;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
static void bm_read_all_bytes_no_check64(benchmark::State& state)
|
|
{
|
|
const std::vector<std::byte> buffer(size_t{1024} * 1024 * 8 * 3);
|
|
|
|
for (const auto _ : state)
|
|
{
|
|
benchmark::DoNotOptimize(read_all_bytes_no_check64(buffer.data(), buffer.data() + buffer.size()));
|
|
}
|
|
}
|
|
BENCHMARK(bm_read_all_bytes_no_check64);
|
|
|
|
// Tips to run the benchmark tests:
|
|
|
|
// To run a single benchmark:
|
|
// benchmark --benchmark_filter=bm_decode
|
|
|
|
// Google Benchmark macro that supplies the program's main() and runs the benchmarks
// registered with BENCHMARK in this file.
BENCHMARK_MAIN();
|