Add benchmark project and use memchr (#127)

* Add an initial benchmark project to research optimizations.
* Use memchr to search for 0xFF during decoding.
* Rename function to make intent more clear.
* Use const pointers (decoding only needs to read).
This commit is contained in:
Victor Derks 2022-01-05 18:27:22 +01:00 committed by GitHub
parent 7d1b06ee07
commit 26478a8ba3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 653 additions and 191 deletions

View File

@ -72,6 +72,7 @@
<s:Boolean x:Key="/Default/UserDictionary/Words/=lossless/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=losslesstraits/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=maxval/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=memchr/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=mrfx/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=nightshot/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=NODISCARD/@EntryIndexedValue">True</s:Boolean>

7
benchmark/README.md Normal file
View File

@ -0,0 +1,7 @@
# Benchmark
The Visual Studio project in this folder contains benchmarks to analyze different ways of
implementing the decoding and encoding functions.
The project expects that the Google Benchmark framework has been installed with vcpkg.
This can be done with: ```vcpkg install benchmark```

199
benchmark/benchmark.cpp Normal file
View File

@ -0,0 +1,199 @@
// Copyright (c) Team CharLS.
// SPDX-License-Identifier: BSD-3-Clause
#include <benchmark/benchmark.h>
#include "../src/jpegls_preset_coding_parameters.h"
#include <cstdint>
// Quantizes the local gradient di into one of the 9 regions -4..4, using the three
// thresholds of the passed preset coding parameters. The near lossless value is fixed to 0.
int8_t quantize_gradient_org(const charls::jpegls_pc_parameters& preset, const int32_t di) noexcept
{
    constexpr int32_t near_lossless{};

    // The comparisons run from the most negative region up to the most positive one;
    // when none of them matches the gradient belongs to the outermost positive region.
    int8_t region{4};
    if (di <= -preset.threshold3)
    {
        region = -4;
    }
    else if (di <= -preset.threshold2)
    {
        region = -3;
    }
    else if (di <= -preset.threshold1)
    {
        region = -2;
    }
    else if (di < -near_lossless)
    {
        region = -1;
    }
    else if (di <= near_lossless)
    {
        region = 0;
    }
    else if (di < preset.threshold1)
    {
        region = 1;
    }
    else if (di < preset.threshold2)
    {
        region = 2;
    }
    else if (di < preset.threshold3)
    {
        region = 3;
    }
    return region;
}
// Creates a lookup table with the quantized gradients for samples of bit_count bits.
// With range = maximum sample value + 1, the table has 2 * range entries and
// entry i holds the quantized value of the gradient (i - range).
std::vector<int8_t> create_quantize_lut_lossless(const int32_t bit_count)
{
    const int32_t maximum_sample_value{(1 << static_cast<uint32_t>(bit_count)) - 1};
    const charls::jpegls_pc_parameters preset{charls::compute_default(maximum_sample_value, 0)};
    const int32_t range{preset.maximum_sample_value + 1};

    std::vector<int8_t> lut(static_cast<size_t>(range) * 2);

    // Walk the table in order: the first entry belongs to gradient -range.
    int32_t gradient{-range};
    for (auto& entry : lut)
    {
        entry = quantize_gradient_org(preset, gradient);
        ++gradient;
    }

    return lut;
}

// Quantization lookup table for 8 bit samples, created once during static initialization.
const std::vector<int8_t> quantization_lut_lossless_8 = create_quantize_lut_lossless(8);
template<typename Traits>
struct scan_decoder
{
int32_t t1_{};
int32_t t2_{};
int32_t t3_{};
Traits traits_;
explicit scan_decoder(Traits traits, const int32_t bit_count) noexcept : traits_{std::move(traits)}
{
const charls::jpegls_pc_parameters preset{charls::compute_default((1 << static_cast<uint32_t>(bit_count)) - 1, 0)};
t1_ = preset.threshold1;
t2_ = preset.threshold2;
t3_ = preset.threshold3;
}
int8_t quantize_gradient_org(const int32_t di) const noexcept
{
if (di <= -t3_)
return -4;
if (di <= -t2_)
return -3;
if (di <= -t1_)
return -2;
if (di < -traits_.near_lossless)
return -1;
if (di <= traits_.near_lossless)
return 0;
if (di < t1_)
return 1;
if (di < t2_)
return 2;
if (di < t3_)
return 3;
return 4;
}
};
// Traits for lossless coding: the near lossless value is the compile time constant 0.
struct lossless_traits final
{
    static constexpr int32_t near_lossless = 0;
};
// Straightforward (branching) predictor: returns the larger of Ra and Rb when Rc is below
// both, the smaller of Ra and Rb when Rc is above both, and Ra + Rb - Rc otherwise.
// Kept out of line (noinline) so the benchmark measures the function itself.
__declspec(noinline) int32_t get_predicted_value_default(int32_t Ra, int32_t Rb, int32_t Rc) noexcept
{
    if (Ra < Rb)
    {
        // Ra is the lower bound and Rb the upper bound.
        if (Rc < Ra)
        {
            return Rb;
        }
        if (Rc > Rb)
        {
            return Ra;
        }
        return Ra + Rb - Rc;
    }

    // Rb is the lower bound (or equal to Ra) and Ra the upper bound.
    if (Rc < Rb)
    {
        return Ra;
    }
    if (Rc > Ra)
    {
        return Rb;
    }
    return Ra + Rb - Rc;
}
// Number of bits in an int32_t.
constexpr size_t int32_t_bit_count{sizeof(int32_t) * 8};

// Shifts the sign bit of i over all bit positions.
// With an arithmetic right shift (the behavior of the mainstream compilers) the result is
// -1 (all bits set) for negative values and 0 for all other values.
constexpr int32_t bit_wise_sign(const int32_t i) noexcept
{
    constexpr size_t sign_bit_position{int32_t_bit_count - 1};
    return i >> sign_bit_position;
}
// Predictor variant that uses the sign of (rb - ra) to fold the two orderings of ra and rb
// into one pair of tests, which reduces the number of branches.
// Kept out of line (noinline) so the benchmark measures the function itself.
__declspec(noinline) int32_t get_predicted_value_optimized(const int32_t ra, const int32_t rb, const int32_t rc) noexcept
{
    // Result of bit_wise_sign for (rb - ra); XOR-ing with it flips the sign of the tests below
    // when that difference is negative.
    const int32_t sign_mask{bit_wise_sign(rb - ra)};

    // rc lies beyond ra (seen from rb): predict rb.
    const bool rc_beyond_ra{(sign_mask ^ (rc - ra)) < 0};
    if (rc_beyond_ra)
        return rb;

    // rc lies beyond rb (seen from ra): predict ra.
    const bool rc_beyond_rb{(sign_mask ^ (rb - rc)) < 0};
    if (rc_beyond_rb)
        return ra;

    // Remaining case: rc is an element of the interval formed by ra and rb.
    return ra + rb - rc;
}
// Benchmarks get_predicted_value_default with two fixed input triples per iteration:
// one with Ra < Rb and one with Ra > Rb, so both outer branches of the function are executed.
static void bm_get_predicted_value_default(benchmark::State& state)
{
for (const auto _ : state)
{
// DoNotOptimize prevents that the call is removed as dead code.
benchmark::DoNotOptimize(get_predicted_value_default(100, 200, 300));
benchmark::DoNotOptimize(get_predicted_value_default(200, 100, 300));
}
}
BENCHMARK(bm_get_predicted_value_default);
// Benchmarks get_predicted_value_optimized with the same two input triples that
// bm_get_predicted_value_default uses (one with ra < rb, one with ra > rb), so the
// results of both benchmarks can be compared directly.
static void bm_get_predicted_value_optimized(benchmark::State& state)
{
    for (const auto _ : state)
    {
        // Both calls must target the optimized implementation; calling the default
        // implementation here would mix the two algorithms in one measurement.
        // DoNotOptimize prevents that the calls are removed as dead code.
        benchmark::DoNotOptimize(get_predicted_value_optimized(100, 200, 300));
        benchmark::DoNotOptimize(get_predicted_value_optimized(200, 100, 300));
    }
}
BENCHMARK(bm_get_predicted_value_optimized);
// Benchmarks the calculated (comparison chain) gradient quantization of an 8 bit lossless
// scan_decoder for the gradients 0, 127 and 255.
static void bm_quantize_gradient_calculated(benchmark::State& state)
{
// Constructed outside the loop: creating the decoder is not part of the measurement.
const scan_decoder<lossless_traits> sd({}, 8);
for (const auto _ : state)
{
// DoNotOptimize prevents that the calls are removed as dead code.
benchmark::DoNotOptimize(sd.quantize_gradient_org(0));
benchmark::DoNotOptimize(sd.quantize_gradient_org(127));
benchmark::DoNotOptimize(sd.quantize_gradient_org(255));
}
}
BENCHMARK(bm_quantize_gradient_calculated);
// Benchmarks the gradient quantization by table lookup for the same gradients (0, 127 and 255)
// that bm_quantize_gradient_calculated passes to the calculated version.
static void bm_quantize_gradient_lut(benchmark::State& state)
{
    // Entry i of the table holds the result for the gradient (i - range) and the table has
    // 2 * range entries: the entry for gradient 0 is located in the middle of the table.
    const size_t zero_gradient_index{quantization_lut_lossless_8.size() / 2};

    for (const auto _ : state)
    {
        // DoNotOptimize prevents that the lookups are removed as dead code.
        benchmark::DoNotOptimize(quantization_lut_lossless_8[zero_gradient_index]);
        benchmark::DoNotOptimize(quantization_lut_lossless_8[zero_gradient_index + 127]);
        benchmark::DoNotOptimize(quantization_lut_lossless_8[zero_gradient_index + 255]);
    }
}
BENCHMARK(bm_quantize_gradient_lut);
BENCHMARK_MAIN();

31
benchmark/benchmark.sln Normal file
View File

@ -0,0 +1,31 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.1.31911.260
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "benchmark", "benchmark.vcxproj", "{F961EC29-4ACE-4D5E-B7ED-55681A678A90}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Debug|x86 = Debug|x86
Release|x64 = Release|x64
Release|x86 = Release|x86
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Debug|x64.ActiveCfg = Debug|x64
{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Debug|x64.Build.0 = Debug|x64
{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Debug|x86.ActiveCfg = Debug|Win32
{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Debug|x86.Build.0 = Debug|Win32
{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Release|x64.ActiveCfg = Release|x64
{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Release|x64.Build.0 = Release|x64
{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Release|x86.ActiveCfg = Release|Win32
{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Release|x86.Build.0 = Release|Win32
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {57D87A68-949C-476E-A240-13953EE8CA8C}
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,3 @@
<wpf:ResourceDictionary xml:space="preserve" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:s="clr-namespace:System;assembly=mscorlib" xmlns:ss="urn:shemas-jetbrains-com:settings-storage-xaml" xmlns:wpf="http://schemas.microsoft.com/winfx/2006/xaml/presentation">
<s:String x:Key="/Default/CodeStyle/Naming/CppNaming/Rules/=Class_0020and_0020struct_0020public_0020fields/@EntryIndexedValue">&lt;NamingElement Priority="12"&gt;&lt;Descriptor Static="Indeterminate" Constexpr="Indeterminate" Const="Indeterminate" Volatile="Indeterminate" Accessibility="PUBLIC"&gt;&lt;type Name="class field" /&gt;&lt;type Name="struct field" /&gt;&lt;/Descriptor&gt;&lt;Policy Inspect="True" Prefix="" Suffix="_" Style="aa_bb" /&gt;&lt;/NamingElement&gt;</s:String>
<s:Boolean x:Key="/Default/UserDictionary/Words/=lossless/@EntryIndexedValue">True</s:Boolean></wpf:ResourceDictionary>

151
benchmark/benchmark.vcxproj Normal file
View File

@ -0,0 +1,151 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|Win32">
<Configuration>Release</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<VCProjectVersion>16.0</VCProjectVersion>
<Keyword>Win32Proj</Keyword>
<ProjectGuid>{f961ec29-4ace-4d5e-b7ed-55681a678a90}</ProjectGuid>
<RootNamespace>benchmark</RootNamespace>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v143</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v143</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v143</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v143</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Label="Shared">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<SDLCheck>true</SDLCheck>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<ConformanceMode>true</ConformanceMode>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Shlwapi.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<SDLCheck>true</SDLCheck>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<ConformanceMode>true</ConformanceMode>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Shlwapi.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<SDLCheck>true</SDLCheck>
<PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<ConformanceMode>true</ConformanceMode>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Shlwapi.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<SDLCheck>true</SDLCheck>
<PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<ConformanceMode>true</ConformanceMode>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Shlwapi.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="benchmark.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>

View File

@ -0,0 +1,22 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
<Extensions>cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
<Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="benchmark.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
</Project>

View File

@ -170,6 +170,28 @@ One of the missing features of C++ is a standard Package Manager. The following
* Library to read Anymap files (for example Netpbm)
* Library to parse command line parameters (for example Clara, CLI11)
### Performance
#### Decoding the bitstream
There are 2 main ways to decode the bitstream:
* Basic: Read it byte for byte and store the results in a cache variable
* Improved: when possible, read a complete register in 1 step. The problem is that 0xFF can exist in the
bitstream. If such a 0xFF exists, the next bit needs to be ignored. There are a couple of ways to do this:
* A) Search for the first position with a 0xFF and remember the position. 0xFFs are rare.
* B) Search for the first 0xFF with memchr. memchr can leverage special CPU instructions when possible.
* C) Load a register and check if it contains a 0xFF byte.
Measurements conclusion: option B is the fastest on x64. This is the original algorithm. There is not a large difference between the different options.
Examples of decoding performance on an AMD 5950X x64 CPU:
| Image | Basic | Improved A | Improved B |Improved C |
| -------------- | ------- | ---------- | ---------- |---------- |
| 16 bit 512 * 512 (CT image) | 3.09 ms | 3.17 ms | 3.06 ms | 3.10 ms |
| 8 bit 5412 * 7216 | 517 ms | 509 ms | 507 ms | 512 ms |
### Supported C++ Compilers
#### Clang

View File

@ -17,7 +17,7 @@ namespace charls {
class decoder_strategy
{
public:
explicit decoder_strategy(const frame_info& frame, const coding_parameters& parameters) noexcept :
decoder_strategy(const frame_info& frame, const coding_parameters& parameters) noexcept :
frame_info_{frame}, parameters_{parameters}
{
}
@ -35,14 +35,11 @@ public:
void initialize(const byte_span source)
{
valid_bits_ = 0;
read_cache_ = 0;
position_ = source.data;
end_position_ = position_ + source.size;
next_ff_position_ = find_next_ff();
make_valid();
find_jpeg_marker_start_byte();
fill_read_cache();
}
void reset()
@ -50,8 +47,8 @@ public:
valid_bits_ = 0;
read_cache_ = 0;
next_ff_position_ = find_next_ff();
make_valid();
find_jpeg_marker_start_byte();
fill_read_cache();
}
FORCE_INLINE void skip(const int32_t length) noexcept
@ -82,84 +79,10 @@ public:
impl::throw_jpegls_error(jpegls_errc::too_much_encoded_data);
}
// Fast path to fill the read cache: when the next sizeof(bufType) bytes all lie before the
// next 0xFF byte, they are loaded in one big endian read as no bit stuffing can occur.
// Returns true when the cache was filled, false when the byte for byte path is needed.
FORCE_INLINE bool optimized_read() noexcept
{
    // A 0xFF byte is too close (or the end of the data is near): no fast path possible.
    if (position_ >= next_ff_position_ - (sizeof(bufType) - 1))
        return false;

    // Easy & fast: append the loaded bits behind the bits that are already valid.
    read_cache_ |= from_big_endian<sizeof(bufType)>::read(position_) >> valid_bits_;

    // Only whole bytes that fit in the free part of the cache are consumed.
    const int consumed_byte_count{(bufType_bit_count - valid_bits_) >> 3};
    position_ += consumed_byte_count;
    valid_bits_ += consumed_byte_count * 8;
    ASSERT(valid_bits_ >= bufType_bit_count - 8);
    return true;
}
// Refills read_cache_ with bits from the encoded data.
// First tries optimized_read(); when that is not possible the bytes are added one by one
// until at least bufType_bit_count - 8 bits are valid, the end of the data is reached or a
// marker is detected. Throws jpegls_errc::invalid_encoded_data when the refill stops at the
// end of the data or at a marker while the cache holds no valid bits.
void make_valid()
{
ASSERT(valid_bits_ <= bufType_bit_count - 8);
if (optimized_read())
return;
do
{
// No more bytes available: only acceptable when the cache still holds some bits.
if (position_ >= end_position_)
{
if (valid_bits_ <= 0)
impl::throw_jpegls_error(jpegls_errc::invalid_encoded_data);
return;
}
const bufType value_new{position_[0]};
if (value_new == jpeg_marker_start_byte)
{
// JPEG bit stream rule: no FF may be followed by 0x80 or higher
// A 0xFF as last byte or followed by a byte with the high bit set is not consumed.
if (position_ == end_position_ - 1 || (position_[1] & 0x80) != 0)
{
if (valid_bits_ <= 0)
impl::throw_jpegls_error(jpegls_errc::invalid_encoded_data);
return;
}
}
// Place the new byte directly behind the bits that are already valid.
read_cache_ |= value_new << (bufType_bit_count - 8 - valid_bits_);
position_ += 1;
valid_bits_ += 8;
if (value_new == jpeg_marker_start_byte)
{
// After a 0xFF byte one bit less is counted as valid.
--valid_bits_;
}
} while (valid_bits_ < bufType_bit_count - 8);
// The byte for byte path may have passed the remembered 0xFF position: search again.
next_ff_position_ = find_next_ff();
}
// Searches, starting at position_, for the first byte that is equal to jpeg_marker_start_byte.
// Returns a pointer to that byte, or the position where the search stopped
// (end_position_ when the search started before it) if no such byte exists.
uint8_t* find_next_ff() const noexcept
{
    auto* candidate{position_};
    for (; candidate < end_position_; ++candidate)
    {
        if (*candidate == jpeg_marker_start_byte)
            return candidate;
    }

    return candidate;
}
uint8_t* get_cur_byte_pos() const noexcept
const uint8_t* get_cur_byte_pos() const noexcept
{
int32_t valid_bits{valid_bits_};
uint8_t* compressed_bytes{position_};
const uint8_t* compressed_bytes{position_};
for (;;)
{
@ -177,14 +100,14 @@ public:
{
if (valid_bits_ < length)
{
make_valid();
fill_read_cache();
if (valid_bits_ < length)
impl::throw_jpegls_error(jpegls_errc::invalid_encoded_data);
}
ASSERT(length != 0 && length <= valid_bits_);
ASSERT(length < 32);
const auto result = static_cast<int32_t>(read_cache_ >> (bufType_bit_count - length));
const auto result = static_cast<int32_t>(read_cache_ >> (cache_t_bit_count - length));
skip(length);
return result;
}
@ -193,20 +116,20 @@ public:
{
if (valid_bits_ < 8)
{
make_valid();
fill_read_cache();
}
return static_cast<int32_t>(read_cache_ >> (bufType_bit_count - 8));
return static_cast<int32_t>(read_cache_ >> max_readable_cache_bits);
}
FORCE_INLINE bool read_bit()
{
if (valid_bits_ <= 0)
{
make_valid();
fill_read_cache();
}
const bool set = (read_cache_ & (static_cast<bufType>(1) << (bufType_bit_count - 1))) != 0;
const bool set = (read_cache_ & (static_cast<cache_t>(1) << (cache_t_bit_count - 1))) != 0;
skip(1);
return set;
}
@ -215,13 +138,13 @@ public:
{
if (valid_bits_ < 16)
{
make_valid();
fill_read_cache();
}
bufType val_test = read_cache_;
cache_t val_test = read_cache_;
for (int32_t count{}; count < 16; ++count)
{
if ((val_test & (static_cast<bufType>(1) << (bufType_bit_count - 1))) != 0)
if ((val_test & (static_cast<cache_t>(1) << (cache_t_bit_count - 1))) != 0)
return count;
val_test <<= 1;
@ -268,17 +191,97 @@ protected:
std::unique_ptr<process_line> process_line_;
private:
using bufType = size_t;
static constexpr auto bufType_bit_count = static_cast<int32_t>(sizeof(bufType) * 8);
using cache_t = size_t;
// Refills read_cache_ with bits from the encoded data.
// First tries fill_read_cache_optimistic(); when that is not possible the bytes are added one
// by one until at least max_readable_cache_bits bits are valid, the end of the data is reached
// or a marker is detected. Throws jpegls_errc::invalid_encoded_data when the refill stops at
// the end of the data or at a marker while the cache holds no valid bits.
void fill_read_cache()
{
ASSERT(valid_bits_ <= max_readable_cache_bits);
if (fill_read_cache_optimistic())
return;
do
{
// No more bytes available: only acceptable when the cache still holds some bits.
if (position_ >= end_position_)
{
if (valid_bits_ == 0)
{
// Decoding process expects at least some bits to be added to the cache.
impl::throw_jpegls_error(jpegls_errc::invalid_encoded_data);
}
return;
}
const cache_t new_byte_value{*position_};
if (new_byte_value == jpeg_marker_start_byte)
{
// JPEG-LS bit stream rule: if FF is followed by a 1 bit then it is a marker
// A 0xFF as the last available byte is handled in the same way.
if (position_ == end_position_ - 1 || (position_[1] & 0x80) != 0)
{
if (valid_bits_ <= 0)
{
// Decoding process expects at least some bits to be added to the cache.
impl::throw_jpegls_error(jpegls_errc::invalid_encoded_data);
}
// Marker detected, typical EOI, SOS (next scan) or RSTm.
return;
}
}
// Place the new byte directly behind the bits that are already valid.
read_cache_ |= new_byte_value << (max_readable_cache_bits - valid_bits_);
valid_bits_ += 8;
++position_;
if (new_byte_value == jpeg_marker_start_byte)
{
// The next bit after an 0xFF needs to be ignored, compensate for the next read (see ISO/IEC 14495-1,A.1)
--valid_bits_;
}
} while (valid_bits_ < max_readable_cache_bits);
// The byte for byte path may have passed the remembered 0xFF position: search again.
find_jpeg_marker_start_byte();
}
// Fast path to fill the read cache: when the next sizeof(cache_t) bytes all lie before the
// remembered 0xFF position, they are loaded in one unaligned read and byte swapped.
// Returns true when the cache was filled, false when the byte for byte path is needed.
FORCE_INLINE bool fill_read_cache_optimistic() noexcept
{
    // A 0xFF byte is too close (or the end of the data is near): no fast path possible.
    if (position_ >= position_ff_ - (sizeof(cache_t) - 1))
        return false;

    // Easy & fast: there is no 0xFF byte in sight, the bits can be read without bit stuffing
    // and are appended behind the bits that are already valid.
    const auto loaded_bits{byte_swap(read_unaligned<cache_t>(position_))};
    read_cache_ |= loaded_bits >> valid_bits_;

    // Only whole bytes that fit in the free part of the cache are consumed.
    const int consumed_byte_count{(cache_t_bit_count - valid_bits_) / 8};
    position_ += consumed_byte_count;
    valid_bits_ += consumed_byte_count * 8;
    ASSERT(valid_bits_ >= max_readable_cache_bits);
    return true;
}
// Searches the not yet consumed bytes [position_, end_position_) for the next marker start
// byte (0xFF) and stores its address in position_ff_.
// When no such byte exists position_ff_ is set to end_position_.
void find_jpeg_marker_start_byte() noexcept
{
    // The number of bytes to search is the distance from the current position to the end.
    // (The reversed subtraction is negative and converts to a huge size_t, which makes
    // memchr read far beyond the end of the source buffer.)
    const auto remaining_byte_count{static_cast<size_t>(end_position_ - position_)};

    // Use memchr to find next start byte (0xFF). memchr is optimized on some platforms to search faster.
    position_ff_ = static_cast<const uint8_t*>(memchr(position_, jpeg_marker_start_byte, remaining_byte_count));
    if (!position_ff_)
    {
        // memchr returns a null pointer when the byte is not found.
        position_ff_ = end_position_;
    }
}
static constexpr auto cache_t_bit_count = static_cast<int32_t>(sizeof(cache_t) * 8);
static constexpr int32_t max_readable_cache_bits{cache_t_bit_count - 8};
std::vector<uint8_t> buffer_;
// decoding
bufType read_cache_{};
cache_t read_cache_{};
int32_t valid_bits_{};
uint8_t* position_{};
uint8_t* next_ff_position_{};
uint8_t* end_position_{};
const uint8_t* position_{};
const uint8_t* end_position_{};
const uint8_t* position_ff_{};
};
} // namespace charls

View File

@ -543,8 +543,12 @@ void jpeg_stream_reader::skip_byte()
uint16_t jpeg_stream_reader::read_uint16()
{
const uint32_t value{read_byte() * 256U};
return static_cast<uint16_t>(value + read_byte());
if (source_.size < 2)
throw_jpegls_error(jpegls_errc::source_buffer_too_small);
const auto value{read_unaligned<uint16_t>(source_.data)};
skip_bytes(source_, 2);
return byte_swap(value);
}
@ -557,11 +561,12 @@ uint32_t jpeg_stream_reader::read_uint24()
uint32_t jpeg_stream_reader::read_uint32()
{
uint32_t value{read_uint16()};
value = value << 16U;
value += read_uint16();
if (source_.size < 4)
throw_jpegls_error(jpegls_errc::source_buffer_too_small);
return value;
const auto value{read_unaligned<uint32_t>(source_.data)};
skip_bytes(source_, 4);
return byte_swap(value);
}

View File

@ -30,6 +30,7 @@ void jpeg_stream_writer::write_end_of_image(const bool even_destination_size)
{
if (even_destination_size && bytes_written() % 2 != 0)
{
// Write an additional 0xFF byte to ensure that the encoded bit stream has an even size.
write_uint8(jpeg_marker_start_byte);
}
@ -49,22 +50,21 @@ void jpeg_stream_writer::write_spiff_header_segment(const spiff_header& header)
write_bytes(spiff_magic_id.data(), spiff_magic_id.size());
write_uint8(spiff_major_revision_number);
write_uint8(spiff_minor_revision_number);
write_uint8(static_cast<uint8_t>(header.profile_id));
write_uint8(static_cast<uint8_t>(header.component_count));
write_uint8(to_underlying_type(header.profile_id));
write_uint8(header.component_count);
write_uint32(header.height);
write_uint32(header.width);
write_uint8(static_cast<uint8_t>(header.color_space));
write_uint8(static_cast<uint8_t>(header.bits_per_sample));
write_uint8(static_cast<uint8_t>(header.compression_type));
write_uint8(static_cast<uint8_t>(header.resolution_units));
write_uint8(to_underlying_type(header.color_space));
write_uint8(header.bits_per_sample);
write_uint8(to_underlying_type(header.compression_type));
write_uint8(to_underlying_type(header.resolution_units));
write_uint32(header.vertical_resolution);
write_uint32(header.horizontal_resolution);
}
USE_DECL_ANNOTATIONS void jpeg_stream_writer::write_spiff_directory_entry(const uint32_t entry_tag,
const void* entry_data,
const size_t entry_data_size_bytes)
USE_DECL_ANNOTATIONS void jpeg_stream_writer::write_spiff_directory_entry(const uint32_t entry_tag, const void* entry_data,
const size_t entry_data_size_bytes)
{
write_segment_header(jpeg_marker_code::application_data8, sizeof(uint32_t) + entry_data_size_bytes);
write_uint32(entry_tag);
@ -78,7 +78,7 @@ void jpeg_stream_writer::write_spiff_end_of_directory_entry()
// but only 6 data bytes. This approach allows to wrap existing bit streams\encoders with a SPIFF header.
// In this implementation the SOI marker is added as data bytes to simplify the design.
static constexpr array<uint8_t, 6> spiff_end_of_directory{
0, 0, 0, spiff_end_of_directory_entry_type, 0xFF, static_cast<uint8_t>(charls::jpeg_marker_code::start_of_image)};
0, 0, 0, spiff_end_of_directory_entry_type, 0xFF, to_underlying_type(charls::jpeg_marker_code::start_of_image)};
write_segment_header(jpeg_marker_code::application_data8, spiff_end_of_directory.size());
write_bytes(spiff_end_of_directory.data(), spiff_end_of_directory.size());
@ -95,20 +95,20 @@ void jpeg_stream_writer::write_start_of_frame_segment(const frame_info& frame)
// Create a Frame Header as defined in ISO/IEC 14495-1, C.2.2 and T.81, B.2.2
const size_t data_size{6 + (static_cast<size_t>(frame.component_count) * 3)};
write_segment_header(jpeg_marker_code::start_of_frame_jpegls, data_size);
write_uint8(static_cast<uint8_t>(frame.bits_per_sample)); // P = Sample precision
write_uint16(static_cast<uint16_t>(frame.height)); // Y = Number of lines
write_uint16(static_cast<uint16_t>(frame.width)); // X = Number of samples per line
write_uint8(frame.bits_per_sample); // P = Sample precision
write_uint16(frame.height); // Y = Number of lines
write_uint16(frame.width); // X = Number of samples per line
// Components
write_uint8(static_cast<uint8_t>(frame.component_count)); // Nf = Number of image components in frame
write_uint8(frame.component_count); // Nf = Number of image components in frame
// Use by default 1 as the start component identifier to remain compatible with the
// code sample of ISO/IEC 14495-1, H.4 and the JPEG-LS ISO conformance sample files.
for (auto component_id{1}; component_id <= frame.component_count; ++component_id)
{
// Component Specification parameters
write_uint8(static_cast<uint8_t>(component_id)); // Ci = Component identifier
write_uint8(0x11); // Hi + Vi = Horizontal sampling factor + Vertical sampling factor
write_uint8(component_id); // Ci = Component identifier
write_uint8(0x11); // Hi + Vi = Horizontal sampling factor + Vertical sampling factor
write_uint8(0); // Tqi = Quantization table destination selector (reserved for JPEG-LS, should be set to 0)
}
}
@ -152,7 +152,7 @@ void jpeg_stream_writer::write_start_of_scan_segment(const int32_t component_cou
// Create a Scan Header as defined in T.87, C.2.3 and T.81, B.2.3
write_segment_header(jpeg_marker_code::start_of_scan, 1 + (static_cast<size_t>(component_count) * 2) + 3);
write_uint8(static_cast<uint8_t>(component_count));
write_uint8(component_count);
for (int32_t i{}; i != component_count; ++i)
{
@ -161,9 +161,9 @@ void jpeg_stream_writer::write_start_of_scan_segment(const int32_t component_cou
++component_id_;
}
write_uint8(static_cast<uint8_t>(near_lossless)); // NEAR parameter
write_uint8(static_cast<uint8_t>(interleave_mode)); // ILV parameter
write_uint8(0); // transformation
write_uint8(near_lossless); // NEAR parameter
write_uint8(to_underlying_type(interleave_mode)); // ILV parameter
write_uint8(0); // transformation
}
@ -179,7 +179,7 @@ void jpeg_stream_writer::write_segment_header(const jpeg_marker_code marker_code
impl::throw_jpegls_error(jpegls_errc::destination_buffer_too_small);
write_marker(marker_code);
write_uint16(static_cast<uint16_t>(data_size + segment_length_size));
write_uint16(static_cast<uint16_t>(segment_length_size + data_size));
}
} // namespace charls

View File

@ -115,6 +115,12 @@ private:
destination_.data[byte_offset_++] = value;
}
// Writes value as a single byte. The value is expected to fit in a uint8_t (checked with ASSERT).
void write_uint8(const int32_t value) noexcept
{
    ASSERT(value >= 0 && value <= std::numeric_limits<uint8_t>::max());

    // Forward to the uint8_t overload with the explicitly narrowed value.
    const auto byte_value{static_cast<uint8_t>(value)};
    write_uint8(byte_value);
}
void write_uint16(const uint16_t value) noexcept
{
write_uint<uint16_t>(value);
@ -126,6 +132,12 @@ private:
write_uint16(static_cast<uint16_t>(value));
}
// Writes value as a 16 bit unsigned integer. The value is expected to fit in a uint16_t (checked with ASSERT).
void write_uint16(const uint32_t value) noexcept
{
    ASSERT(value <= std::numeric_limits<uint16_t>::max());

    // Forward to the uint16_t overload with the explicitly narrowed value.
    const auto narrowed_value{static_cast<uint16_t>(value)};
    write_uint16(narrowed_value);
}
void write_uint32(const uint32_t value) noexcept
{
write_uint<uint32_t>(value);
@ -138,7 +150,7 @@ private:
// Use write_bytes to write to the unaligned byte array.
// The compiler will perform the correct optimization when the target platform support unaligned writes.
const UnsignedIntType big_endian_value{endian_swap(value)};
const UnsignedIntType big_endian_value{byte_swap(value)};
write_bytes(&big_endian_value, sizeof big_endian_value);
}
@ -164,16 +176,6 @@ private:
write_uint8(static_cast<uint8_t>(marker_code));
}
// Reverses the order of the 4 bytes of value.
static constexpr uint32_t endian_swap(const uint32_t value) noexcept
{
    return ((value & 0xFF000000U) >> 24) | ((value & 0x00FF0000U) >> 8) | ((value & 0x0000FF00U) << 8) |
           ((value & 0x000000FFU) << 24);
}
// Exchanges the 2 bytes of value.
static constexpr uint16_t endian_swap(const uint16_t value) noexcept
{
    // Combine in a wider type; the cast keeps only the lower 16 bits.
    const uint32_t widened{value};
    return static_cast<uint16_t>((widened << 8) | (widened >> 8));
}
byte_span destination_{};
size_t byte_offset_{};
uint8_t component_id_{1};

View File

@ -41,34 +41,6 @@ constexpr int32_t apply_sign(const int32_t i, const int32_t sign) noexcept
}
// Two alternatives for GetPredictedValue() (second is slightly faster due to reduced branching)
#if 0
/// <summary>
/// Computes the predicted value from the three samples Ra, Rb and Rc:
/// when Rc lies below both Ra and Rb the larger of the two is returned,
/// when Rc lies above both the smaller is returned, otherwise Ra + Rb - Rc.
/// </summary>
inline int32_t get_predicted_value(int32_t Ra, int32_t Rb, int32_t Rc)
{
    const int32_t lower{Ra < Rb ? Ra : Rb};
    const int32_t upper{Ra < Rb ? Rb : Ra};

    if (Rc < lower)
        return upper;

    if (Rc > upper)
        return lower;

    return Ra + Rb - Rc;
}
#else
inline int32_t get_predicted_value(const int32_t ra, const int32_t rb, const int32_t rc) noexcept
{
// sign trick reduces the number of if statements (branches)
@ -88,7 +60,6 @@ inline int32_t get_predicted_value(const int32_t ra, const int32_t rb, const int
return ra + rb - rc;
}
#endif
/// <summary>
/// This is the optimized inverse algorithm of ISO/IEC 14495-1, A.5.2, Code Segment A.11 (second else branch)
@ -101,6 +72,7 @@ CONSTEXPR int32_t unmap_error_value(const int32_t mapped_error) noexcept
return sign ^ (mapped_error >> 1);
}
/// <summary>
/// This is the algorithm of ISO/IEC 14495-1, A.5.2, Code Segment A.11 (second else branch)
/// It will map signed values to unsigned values. It has been optimized to prevent branching.
@ -113,6 +85,7 @@ CONSTEXPR int32_t map_error_value(const int32_t error_value) noexcept
return mapped_error;
}
constexpr int32_t compute_context_id(const int32_t q1, const int32_t q2, const int32_t q3) noexcept
{
return (q1 * 9 + q2) * 9 + q3;
@ -379,7 +352,7 @@ private:
}
/// <summary>Encodes/Decodes a scan line of samples</summary>
void do_line(sample_type* /*template_selector*/)
FORCE_INLINE void do_line(sample_type* /*template_selector*/)
{
int32_t index{};
int32_t rb{previous_line_[index - 1]};

View File

@ -9,6 +9,7 @@
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <type_traits>
#include <vector>
@ -127,7 +128,8 @@ inline void string_copy(CHARLS_IN_Z const char* source, CHARLS_OUT_WRITES_Z(size
#endif
}
inline jpegls_errc set_error_message(const jpegls_errc error, CHARLS_OUT_WRITES_Z(ErrorMessageSize) char* error_message) noexcept
inline jpegls_errc set_error_message(const jpegls_errc error,
CHARLS_OUT_WRITES_Z(ErrorMessageSize) char* error_message) noexcept
{
if (error_message)
{
@ -264,34 +266,59 @@ struct quad final : triplet<SampleType>
};
template<int Size>
struct from_big_endian final
// C++23 comes with std::byteswap. Use our own byte_swap implementation for now.
template<typename T>
CHARLS_CHECK_RETURN T byte_swap(T /*value*/) noexcept
{
};
ASSERT(false);
return 0;
}
template<>
struct from_big_endian<4> final
inline CHARLS_CHECK_RETURN uint16_t byte_swap<uint16_t>(const uint16_t value) noexcept
{
FORCE_INLINE static unsigned int read(const uint8_t* buffer) noexcept
{
return (static_cast<uint32_t>(buffer[0]) << 24U) + (static_cast<uint32_t>(buffer[1]) << 16U) +
(static_cast<uint32_t>(buffer[2]) << 8U) + (static_cast<uint32_t>(buffer[3]) << 0U);
}
};
#ifdef _MSC_VER
return _byteswap_ushort(value);
#else
// Note: GCC and Clang will optimize this pattern to a built-in intrinsic.
return static_cast<uint16_t>(value << 8 | value >> 8);
#endif
}
template<>
struct from_big_endian<8> final
inline CHARLS_CHECK_RETURN uint32_t byte_swap<uint32_t>(const uint32_t value) noexcept
{
FORCE_INLINE static uint64_t read(const uint8_t* buffer) noexcept
{
return (static_cast<uint64_t>(buffer[0]) << 56U) + (static_cast<uint64_t>(buffer[1]) << 48U) +
(static_cast<uint64_t>(buffer[2]) << 40U) + (static_cast<uint64_t>(buffer[3]) << 32U) +
(static_cast<uint64_t>(buffer[4]) << 24U) + (static_cast<uint64_t>(buffer[5]) << 16U) +
(static_cast<uint64_t>(buffer[6]) << 8U) + (static_cast<uint64_t>(buffer[7]) << 0U);
}
};
#ifdef _MSC_VER
return _byteswap_ulong(value);
#else
// Note: GCC and Clang will optimize this pattern to a built-in intrinsic.
return value >> 24 | (value & 0x00FF0000) >> 8 | (value & 0x0000FF00) << 8 | value << 24;
#endif
}
/// <summary>
/// Explicit specialization of byte_swap for uint64_t: returns the value with the order of its eight bytes reversed.
/// </summary>
template<>
inline CHARLS_CHECK_RETURN uint64_t byte_swap(const uint64_t value) noexcept
{
#ifdef _MSC_VER
// With MSVC the swap is delegated to _byteswap_uint64.
return _byteswap_uint64(value);
#else
// Note: GCC and Clang will optimize this pattern to a built-in intrinsic.
// Each term moves one source byte to its mirrored position (byte 0 <-> byte 7, byte 1 <-> byte 6, ...);
// the masks isolate the single byte that belongs at each destination position.
return (value << 56) | ((value << 40) & 0x00FF'0000'0000'0000) | ((value << 24) & 0x0000'FF00'0000'0000) |
((value << 8) & 0x0000'00FF'0000'0000) | ((value >> 8) & 0x0000'0000'FF00'0000) |
((value >> 24) & 0x0000'0000'00FF'0000) | ((value >> 40) & 0x0000'0000'0000'FF00) | (value >> 56);
#endif
}
/// <summary>
/// Reads a value of type T from a buffer that does not need to be aligned for T.
/// The bytes are copied as-is: no byte order conversion is performed.
/// The buffer must provide at least sizeof(T) readable bytes.
/// </summary>
template<typename T>
T read_unaligned(const void* buffer) noexcept
{
    // Copying raw bytes into an object is only well-defined for trivially copyable types.
    static_assert(std::is_trivially_copyable<T>::value, "read_unaligned requires a trivially copyable type");

    // Note: MSVC, GCC and clang will replace this with a direct register read if the architecture allows it
    // (x86, x64 and ARM64 allow it).
    T value;
    std::memcpy(&value, buffer, sizeof(T));
    return value;
}
inline void skip_bytes(byte_span& stream_info, const size_t count) noexcept
@ -362,6 +389,9 @@ constexpr uint32_t bit_to_byte_count(const int32_t bit_count) noexcept
}
/// <summary>
/// Converts an enumeration to its underlying type. Equivalent to C++23 std::to_underlying
/// </summary>
template<typename Enum>
constexpr auto to_underlying_type(Enum e) noexcept
{

View File

@ -115,14 +115,23 @@ void test_large_image_performance_rgb8(const int loop_count)
}
}
size_t get_destination_size(const vector<uint8_t>& source)
{
const jpegls_decoder decoder{source, true};
return decoder.destination_size();
}
void decode_performance_tests(const int loop_count)
{
cout << "Test decode performance with loop count " << loop_count << "\n";
const vector<uint8_t> encoded_source{read_file("decodetest.jls")};
try
{
// This test expects the file decodetest.jls to exist.
// It can be any valid JPEG-LS file.
// Changing the content of this file allows different performance measurements.
const vector<uint8_t> encoded_source{read_file("decodetest.jls")};
// Pre-allocate the destination outside the measurement loop.
// std::vector initializes its elements and this step needs to be excluded from the measurement.
vector<uint8_t> destination(jpegls_decoder{encoded_source, true}.destination_size());
@ -144,6 +153,10 @@ void decode_performance_tests(const int loop_count)
{
cout << "Decode failure: " << e.what() << "\n";
}
catch (const std::ios_base::failure& e)
{
cout << "IO failure (missing decodetest.jls?): " << e.what() << "\n";
}
}
void encode_performance_tests(const int loop_count)