Add benchmark project and use memchr (#127)

* Add an initial benchmark project to research optimizations. * Use memchr to search for 0xFF during decoding. * Rename function to make intent more clear. * Use const pointers (decoding only need to read).
2025-03-28 21:03:13 +00:00 · 2022-01-05 18:27:22 +01:00 · 2022-01-05 18:27:22 +01:00 · 26478a8ba3
commit 26478a8ba3
parent 7d1b06ee07
15 changed files with 653 additions and 191 deletions
--- a/CharLS.sln.DotSettings
+++ b/CharLS.sln.DotSettings
@ -72,6 +72,7 @@
 	<s:Boolean x:Key="/Default/UserDictionary/Words/=lossless/@EntryIndexedValue">True</s:Boolean>
 	<s:Boolean x:Key="/Default/UserDictionary/Words/=losslesstraits/@EntryIndexedValue">True</s:Boolean>
 	<s:Boolean x:Key="/Default/UserDictionary/Words/=maxval/@EntryIndexedValue">True</s:Boolean>
+	<s:Boolean x:Key="/Default/UserDictionary/Words/=memchr/@EntryIndexedValue">True</s:Boolean>
 	<s:Boolean x:Key="/Default/UserDictionary/Words/=mrfx/@EntryIndexedValue">True</s:Boolean>
 	<s:Boolean x:Key="/Default/UserDictionary/Words/=nightshot/@EntryIndexedValue">True</s:Boolean>
 	<s:Boolean x:Key="/Default/UserDictionary/Words/=NODISCARD/@EntryIndexedValue">True</s:Boolean>
--- a/benchmark/README.md
+++ b/benchmark/README.md
@ -0,0 +1,7 @@
+# Benchmark
+
+The Visual Studio project in this folder contains benchmarks to analyze different ways of
+implementing the decoding and encoding functions.
+
+The project expects that the Google Benchmark framework has been installed with vcpkg.  
+This can be done with: ```vcpkg install benchmark```
--- a/benchmark/benchmark.cpp
+++ b/benchmark/benchmark.cpp
@ -0,0 +1,199 @@
+// Copyright (c) Team CharLS.
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <benchmark/benchmark.h>
+
+#include "../src/jpegls_preset_coding_parameters.h"
+
+#include <cstdint>
+
+
+
+// Reference (non-table) gradient quantizer: maps the gradient `di` to a bucket in the range -4 .. 4
+// using threshold1..threshold3 from `preset`. The comparisons are evaluated top to bottom and the
+// first one that holds decides the result, so their order must not be changed.
+int8_t quantize_gradient_org(const charls::jpegls_pc_parameters& preset, const int32_t di) noexcept
+{
+    constexpr int32_t near_lossless{}; // value-initialized to 0 in this variant
+
+    if (di <= -preset.threshold3)
+        return -4;
+    if (di <= -preset.threshold2)
+        return -3;
+    if (di <= -preset.threshold1)
+        return -2;
+    if (di < -near_lossless)
+        return -1;
+    if (di <= near_lossless)
+        return 0;
+    if (di < preset.threshold1)
+        return 1;
+    if (di < preset.threshold2)
+        return 2;
+    if (di < preset.threshold3)
+        return 3;
+
+    return 4;
+}
+
+// Builds the gradient quantization table for samples of `bit_count` bits.
+// With range = preset.maximum_sample_value + 1 the table has 2 * range entries and
+// entry i holds quantize_gradient_org(preset, i - range).
+std::vector<int8_t> create_quantize_lut_lossless(const int32_t bit_count)
+{
+    const charls::jpegls_pc_parameters preset{charls::compute_default((1 << static_cast<uint32_t>(bit_count)) - 1, 0)};
+    const int32_t range{preset.maximum_sample_value + 1};
+    const size_t entry_count{static_cast<size_t>(range) * 2};
+
+    std::vector<int8_t> lut(entry_count);
+
+    // Walk the table front to back while the gradient runs from -range upwards.
+    int32_t gradient{-range};
+    for (auto& entry : lut)
+    {
+        entry = quantize_gradient_org(preset, gradient);
+        ++gradient;
+    }
+
+    return lut;
+}
+
+
+// Table produced by create_quantize_lut_lossless(8); read by bm_quantize_gradient_lut.
+const std::vector<int8_t> quantization_lut_lossless_8{create_quantize_lut_lossless(8)};
+
+// Minimal stand-in used only by the benchmarks: it stores the three quantization thresholds
+// and a Traits object that supplies `near_lossless`.
+template<typename Traits>
+struct scan_decoder
+{
+    int32_t t1_{};
+    int32_t t2_{};
+    int32_t t3_{};
+    Traits traits_;
+
+    // Copies threshold1..threshold3 from the preset returned by
+    // charls::compute_default((1 << bit_count) - 1, 0).
+    explicit scan_decoder(Traits traits, const int32_t bit_count) noexcept : traits_{std::move(traits)}
+    {
+        const charls::jpegls_pc_parameters preset{charls::compute_default((1 << static_cast<uint32_t>(bit_count)) - 1, 0)};
+
+        t1_ = preset.threshold1;
+        t2_ = preset.threshold2;
+        t3_ = preset.threshold3;
+    }
+
+    // Maps the gradient `di` to a bucket in the range -4 .. 4 by comparing it against the stored
+    // thresholds and traits_.near_lossless. The first comparison that holds decides the result.
+    int8_t quantize_gradient_org(const int32_t di) const noexcept
+    {
+        if (di <= -t3_)
+            return -4;
+        if (di <= -t2_)
+            return -3;
+        if (di <= -t1_)
+            return -2;
+        if (di < -traits_.near_lossless)
+            return -1;
+        if (di <= traits_.near_lossless)
+            return 0;
+        if (di < t1_)
+            return 1;
+        if (di < t2_)
+            return 2;
+        if (di < t3_)
+            return 3;
+
+        return 4;
+    }
+};
+
+// Traits type for the benchmarks: near_lossless is value-initialized, i.e. 0.
+struct lossless_traits final
+{
+    static constexpr int32_t near_lossless{};
+};
+
+
+
+// Branch-based predictor, measured as the baseline. Returns the larger of Ra/Rb when Rc lies
+// below both, the smaller of Ra/Rb when Rc lies above both, and Ra + Rb - Rc otherwise.
+// __declspec(noinline) keeps the call out of line so the benchmark times the function itself.
+__declspec(noinline) int32_t get_predicted_value_default(int32_t Ra, int32_t Rb, int32_t Rc) noexcept
+{
+    if (Ra < Rb)
+    {
+        if (Rc < Ra)
+            return Rb;
+
+        if (Rc > Rb)
+            return Ra;
+    }
+    else
+    {
+        if (Rc < Rb)
+            return Ra;
+
+        if (Rc > Ra)
+            return Rb;
+    }
+
+    return Ra + Rb - Rc;
+}
+
+
+// Number of bits in an int32_t (sizeof * 8).
+constexpr size_t int32_t_bit_count = sizeof(int32_t) * 8;
+
+
+// Shifts the sign bit of `i` across the whole word: the result is 0 for non-negative `i`.
+// For negative `i` the result is -1 on implementations where >> on a negative signed value is an
+// arithmetic shift (implementation-defined before C++20).
+constexpr int32_t bit_wise_sign(const int32_t i) noexcept
+{
+    return i >> (int32_t_bit_count - 1);
+}
+
+
+// Predictor variant that uses the sign of (rb - ra) to fold the two orderings of ra/rb into one
+// pair of tests. Measured against get_predicted_value_default.
+// __declspec(noinline) keeps the call out of line so the benchmark times the function itself.
+__declspec(noinline) int32_t get_predicted_value_optimized(const int32_t ra, const int32_t rb, const int32_t rc) noexcept
+{
+    // sign trick reduces the number of if statements (branches)
+    const int32_t sign{bit_wise_sign(rb - ra)};
+
+    // is Ra between Rc and Rb?
+    if ((sign ^ (rc - ra)) < 0)
+    {
+        return rb;
+    }
+    // is Rb between Ra and Rc?
+    if ((sign ^ (rb - rc)) < 0)
+    {
+        return ra;
+    }
+
+    // default case, valid if Rc element of [Ra,Rb]
+    return ra + rb - rc;
+}
+
+
+// Benchmarks get_predicted_value_default with two fixed argument triples (ra < rb and ra > rb).
+// benchmark::DoNotOptimize keeps the compiler from discarding the results.
+static void bm_get_predicted_value_default(benchmark::State& state)
+{
+    for (const auto _ : state)
+    {
+        benchmark::DoNotOptimize(get_predicted_value_default(100, 200, 300));
+        benchmark::DoNotOptimize(get_predicted_value_default(200, 100, 300));
+    }
+}
+BENCHMARK(bm_get_predicted_value_default);
+
+// Benchmarks get_predicted_value_optimized with the same two argument triples that
+// bm_get_predicted_value_default uses (ra < rb and ra > rb), so the timings are comparable.
+// Both measured calls must target the optimized function; otherwise this benchmark would
+// partly time the default implementation.
+static void bm_get_predicted_value_optimized(benchmark::State& state)
+{
+    for (const auto _ : state)
+    {
+        benchmark::DoNotOptimize(get_predicted_value_optimized(100, 200, 300));
+        benchmark::DoNotOptimize(get_predicted_value_optimized(200, 100, 300));
+    }
+}
+BENCHMARK(bm_get_predicted_value_optimized);
+
+// Benchmarks the comparison-chain quantizer (scan_decoder::quantize_gradient_org) for the
+// gradients 0, 127 and 255 with the thresholds of an 8-bit lossless preset.
+static void bm_quantize_gradient_calculated(benchmark::State& state)
+{
+    const scan_decoder<lossless_traits> sd({}, 8);
+
+    for (const auto _ : state)
+    {
+        benchmark::DoNotOptimize(sd.quantize_gradient_org(0));
+        benchmark::DoNotOptimize(sd.quantize_gradient_org(127));
+        benchmark::DoNotOptimize(sd.quantize_gradient_org(255));
+    }
+}
+BENCHMARK(bm_quantize_gradient_calculated);
+
+// Benchmarks the table-based quantizer: three loads from quantization_lut_lossless_8.
+// No scan_decoder is needed here, the table is the only input.
+// NOTE(review): the table is filled with quantize_gradient_org(preset, i - range), so the raw
+// indices 0/127/255 are not the gradients 0/127/255 used by bm_quantize_gradient_calculated.
+// Confirm that measuring only the cost of the table load is the intent.
+static void bm_quantize_gradient_lut(benchmark::State& state)
+{
+    for (const auto _ : state)
+    {
+        benchmark::DoNotOptimize(quantization_lut_lossless_8[0]);
+        benchmark::DoNotOptimize(quantization_lut_lossless_8[127]);
+        benchmark::DoNotOptimize(quantization_lut_lossless_8[255]);
+    }
+}
+BENCHMARK(bm_quantize_gradient_lut);
+
+
+BENCHMARK_MAIN();
--- a/benchmark/benchmark.sln
+++ b/benchmark/benchmark.sln
@ -0,0 +1,31 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 17
+VisualStudioVersion = 17.1.31911.260
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "benchmark", "benchmark.vcxproj", "{F961EC29-4ACE-4D5E-B7ED-55681A678A90}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Debug|x86 = Debug|x86
+		Release|x64 = Release|x64
+		Release|x86 = Release|x86
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Debug|x64.ActiveCfg = Debug|x64
+		{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Debug|x64.Build.0 = Debug|x64
+		{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Debug|x86.ActiveCfg = Debug|Win32
+		{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Debug|x86.Build.0 = Debug|Win32
+		{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Release|x64.ActiveCfg = Release|x64
+		{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Release|x64.Build.0 = Release|x64
+		{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Release|x86.ActiveCfg = Release|Win32
+		{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Release|x86.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {57D87A68-949C-476E-A240-13953EE8CA8C}
+	EndGlobalSection
+EndGlobal
--- a/benchmark/benchmark.sln.DotSettings
+++ b/benchmark/benchmark.sln.DotSettings
@ -0,0 +1,3 @@
+<wpf:ResourceDictionary xml:space="preserve" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:s="clr-namespace:System;assembly=mscorlib" xmlns:ss="urn:shemas-jetbrains-com:settings-storage-xaml" xmlns:wpf="http://schemas.microsoft.com/winfx/2006/xaml/presentation">
+	<s:String x:Key="/Default/CodeStyle/Naming/CppNaming/Rules/=Class_0020and_0020struct_0020public_0020fields/@EntryIndexedValue">&lt;NamingElement Priority="12"&gt;&lt;Descriptor Static="Indeterminate" Constexpr="Indeterminate" Const="Indeterminate" Volatile="Indeterminate" Accessibility="PUBLIC"&gt;&lt;type Name="class field" /&gt;&lt;type Name="struct field" /&gt;&lt;/Descriptor&gt;&lt;Policy Inspect="True" Prefix="" Suffix="_" Style="aa_bb" /&gt;&lt;/NamingElement&gt;</s:String>
+	<s:Boolean x:Key="/Default/UserDictionary/Words/=lossless/@EntryIndexedValue">True</s:Boolean></wpf:ResourceDictionary>
--- a/benchmark/benchmark.vcxproj
+++ b/benchmark/benchmark.vcxproj
@ -0,0 +1,151 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>16.0</VCProjectVersion>
+    <Keyword>Win32Proj</Keyword>
+    <ProjectGuid>{f961ec29-4ace-4d5e-b7ed-55681a678a90}</ProjectGuid>
+    <RootNamespace>benchmark</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>Shlwapi.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>Shlwapi.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>Shlwapi.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>Shlwapi.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="benchmark.cpp" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/benchmark/benchmark.vcxproj.filters
+++ b/benchmark/benchmark.vcxproj.filters
@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <Filter Include="Source Files">
+      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+      <Extensions>cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+    </Filter>
+    <Filter Include="Header Files">
+      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+      <Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd</Extensions>
+    </Filter>
+    <Filter Include="Resource Files">
+      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="benchmark.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+  </ItemGroup>
+</Project>
--- a/doc/style_and_design.md
+++ b/doc/style_and_design.md
@ -170,6 +170,28 @@ One of the missing features of C++ is a standard Package Manager. The following
 * Library to read Anymap files (for example Netpbm)
 * Library to parse command line parameters (for example Clara, CLI11)

+### Performance
+
+#### Decoding the bitstream
+
+There are 2 main ways to decode the bitstream:
+
+* Basic: Read it byte for byte and store the results in a cache variable
+
+* Improved: when possible, read a whole register in 1 step. The problem is that 0xFF can exist in the
+bitstream. If such a 0xFF exists, the next bit needs to be ignored. There are a couple of ways to do this:
+  * A) Search for the first position with a 0xFF and remember the position. 0xFFs are rare.
+  * B) Search for the first 0xFF with memchr. memchr can leverage special CPU instructions when possible.
+  * C) Load a register and check if it contains a 0xFF byte.
+
+Measurements conclusion: option B is the fastest on x64. This is the original algorithm. There is not a large difference between the different options.
+Examples of decoding performance on an AMD 5950X x64 CPU:
+
+| Image                       | Basic   | Improved A | Improved B |Improved C |
+| --------------              | ------- | ---------- | ---------- |---------- |
+| 16 bit 512 * 512 (CT image) | 3.09 ms | 3.17 ms    | 3.06 ms    | 3.10 ms   |
+|  8 bit 5412 * 7216          | 517 ms  | 509 ms     | 507 ms     | 512 ms    |
+
 ### Supported C++ Compilers

 #### Clang
--- a/src/decoder_strategy.h
+++ b/src/decoder_strategy.h
@ -17,7 +17,7 @@ namespace charls {
 class decoder_strategy
 {
 public:
-    explicit decoder_strategy(const frame_info& frame, const coding_parameters& parameters) noexcept :
+    decoder_strategy(const frame_info& frame, const coding_parameters& parameters) noexcept :
        frame_info_{frame}, parameters_{parameters}
    {
    }
@ -35,14 +35,11 @@ public:

    void initialize(const byte_span source)
    {
-        valid_bits_ = 0;
-        read_cache_ = 0;
-
        position_ = source.data;
        end_position_ = position_ + source.size;

-        next_ff_position_ = find_next_ff();
-        make_valid();
+        find_jpeg_marker_start_byte();
+        fill_read_cache();
    }

    void reset()
@ -50,8 +47,8 @@ public:
        valid_bits_ = 0;
        read_cache_ = 0;

-        next_ff_position_ = find_next_ff();
-        make_valid();
+        find_jpeg_marker_start_byte();
+        fill_read_cache();
    }

    FORCE_INLINE void skip(const int32_t length) noexcept
@ -82,84 +79,10 @@ public:
            impl::throw_jpegls_error(jpegls_errc::too_much_encoded_data);
    }

-    FORCE_INLINE bool optimized_read() noexcept
-    {
-        // Easy & fast: if there is no 0xFF byte in sight, we can read without bit stuffing
-        if (position_ < next_ff_position_ - (sizeof(bufType) - 1))
-        {
-            read_cache_ |= from_big_endian<sizeof(bufType)>::read(position_) >> valid_bits_;
-            const int bytes_to_read{(bufType_bit_count - valid_bits_) >> 3};
-            position_ += bytes_to_read;
-            valid_bits_ += bytes_to_read * 8;
-            ASSERT(valid_bits_ >= bufType_bit_count - 8);
-            return true;
-        }
-        return false;
-    }
-
-    void make_valid()
-    {
-        ASSERT(valid_bits_ <= bufType_bit_count - 8);
-
-        if (optimized_read())
-            return;
-
-        do
-        {
-            if (position_ >= end_position_)
-            {
-                if (valid_bits_ <= 0)
-                    impl::throw_jpegls_error(jpegls_errc::invalid_encoded_data);
-
-                return;
-            }
-
-            const bufType value_new{position_[0]};
-
-            if (value_new == jpeg_marker_start_byte)
-            {
-                // JPEG bit stream rule: no FF may be followed by 0x80 or higher
-                if (position_ == end_position_ - 1 || (position_[1] & 0x80) != 0)
-                {
-                    if (valid_bits_ <= 0)
-                        impl::throw_jpegls_error(jpegls_errc::invalid_encoded_data);
-
-                    return;
-                }
-            }
-
-            read_cache_ |= value_new << (bufType_bit_count - 8 - valid_bits_);
-            position_ += 1;
-            valid_bits_ += 8;
-
-            if (value_new == jpeg_marker_start_byte)
-            {
-                --valid_bits_;
-            }
-        } while (valid_bits_ < bufType_bit_count - 8);
-
-        next_ff_position_ = find_next_ff();
-    }
-
-    uint8_t* find_next_ff() const noexcept
-    {
-        auto* position_next_ff{position_};
-
-        while (position_next_ff < end_position_)
-        {
-            if (*position_next_ff == jpeg_marker_start_byte)
-                break;
-
-            ++position_next_ff;
-        }
-
-        return position_next_ff;
-    }
-
-    uint8_t* get_cur_byte_pos() const noexcept
+    const uint8_t* get_cur_byte_pos() const noexcept
    {
        int32_t valid_bits{valid_bits_};
-        uint8_t* compressed_bytes{position_};
+        const uint8_t* compressed_bytes{position_};

        for (;;)
        {
@ -177,14 +100,14 @@ public:
    {
        if (valid_bits_ < length)
        {
-            make_valid();
+            fill_read_cache();
            if (valid_bits_ < length)
                impl::throw_jpegls_error(jpegls_errc::invalid_encoded_data);
        }

        ASSERT(length != 0 && length <= valid_bits_);
        ASSERT(length < 32);
-        const auto result = static_cast<int32_t>(read_cache_ >> (bufType_bit_count - length));
+        const auto result = static_cast<int32_t>(read_cache_ >> (cache_t_bit_count - length));
        skip(length);
        return result;
    }
@ -193,20 +116,20 @@ public:
    {
        if (valid_bits_ < 8)
        {
-            make_valid();
+            fill_read_cache();
        }

-        return static_cast<int32_t>(read_cache_ >> (bufType_bit_count - 8));
+        return static_cast<int32_t>(read_cache_ >> max_readable_cache_bits);
    }

    FORCE_INLINE bool read_bit()
    {
        if (valid_bits_ <= 0)
        {
-            make_valid();
+            fill_read_cache();
        }

-        const bool set = (read_cache_ & (static_cast<bufType>(1) << (bufType_bit_count - 1))) != 0;
+        const bool set = (read_cache_ & (static_cast<cache_t>(1) << (cache_t_bit_count - 1))) != 0;
        skip(1);
        return set;
    }
@ -215,13 +138,13 @@ public:
    {
        if (valid_bits_ < 16)
        {
-            make_valid();
+            fill_read_cache();
        }
-        bufType val_test = read_cache_;
+        cache_t val_test = read_cache_;

        for (int32_t count{}; count < 16; ++count)
        {
-            if ((val_test & (static_cast<bufType>(1) << (bufType_bit_count - 1))) != 0)
+            if ((val_test & (static_cast<cache_t>(1) << (cache_t_bit_count - 1))) != 0)
                return count;

            val_test <<= 1;
@ -268,17 +191,97 @@ protected:
    std::unique_ptr<process_line> process_line_;

 private:
-    using bufType = size_t;
-    static constexpr auto bufType_bit_count = static_cast<int32_t>(sizeof(bufType) * 8);
+    using cache_t = size_t;
+
+    // Refills read_cache_ from the encoded data at position_.
+    // First tries fill_read_cache_optimistic(); if that is not possible, bytes are added one at a
+    // time until the cache holds at least max_readable_cache_bits, the end of the input is reached,
+    // or a marker is detected. After the byte-wise loop completes, position_ff_ is recomputed.
+    // Throws jpegls_errc::invalid_encoded_data when the cache holds no valid bits and none can be added.
+    void fill_read_cache()
+    {
+        ASSERT(valid_bits_ <= max_readable_cache_bits);
+
+        if (fill_read_cache_optimistic())
+            return;
+
+        do
+        {
+            if (position_ >= end_position_)
+            {
+                if (valid_bits_ == 0)
+                {
+                    // Decoding process expects at least some bits to be added to the cache.
+                    impl::throw_jpegls_error(jpegls_errc::invalid_encoded_data);
+                }
+
+                return;
+            }
+
+            const cache_t new_byte_value{*position_};
+
+            if (new_byte_value == jpeg_marker_start_byte)
+            {
+                // JPEG-LS bit stream rule: if FF is followed by a 1 bit then it is a marker
+                if (position_ == end_position_ - 1 || (position_[1] & 0x80) != 0)
+                {
+                    if (valid_bits_ <= 0)
+                    {
+                        // Decoding process expects at least some bits to be added to the cache.
+                        impl::throw_jpegls_error(jpegls_errc::invalid_encoded_data);
+                    }
+
+                    // Marker detected, typical EOI, SOS (next scan) or RSTm.
+                    return;
+                }
+            }
+
+            // Place the new byte directly below the bits that are already valid.
+            read_cache_ |= new_byte_value << (max_readable_cache_bits - valid_bits_);
+            valid_bits_ += 8;
+            ++position_;
+
+            if (new_byte_value == jpeg_marker_start_byte)
+            {
+                // The next bit after an 0xFF needs to be ignored, compensate for the next read (see ISO/IEC 14495-1,A.1)
+                --valid_bits_;
+            }
+
+        } while (valid_bits_ < max_readable_cache_bits);
+
+        find_jpeg_marker_start_byte();
+    }
+
+    // Fast path of fill_read_cache(): when all sizeof(cache_t) bytes starting at position_ lie
+    // before position_ff_, a whole cache_t is read at once and merged into read_cache_.
+    // Only whole bytes that fit next to the bits already valid are consumed.
+    // Returns true when the cache was filled this way, false when the caller must fall back
+    // to the byte-wise path.
+    // NOTE(review): byte_swap presumably converts the big-endian stream order to host order on
+    // little-endian hosts - confirm against the definition of byte_swap.
+    FORCE_INLINE bool fill_read_cache_optimistic() noexcept
+    {
+        // Easy & fast: if there is no 0xFF byte in sight, we can read without bit stuffing
+        if (position_ < position_ff_ - (sizeof(cache_t) - 1))
+        {
+            read_cache_ |= byte_swap(read_unaligned<cache_t>(position_)) >> valid_bits_;
+            const int bytes_to_read{(cache_t_bit_count - valid_bits_) / 8};
+            position_ += bytes_to_read;
+            valid_bits_ += bytes_to_read * 8;
+            ASSERT(valid_bits_ >= max_readable_cache_bits);
+            return true;
+        }
+        return false;
+    }
+
+    // Updates position_ff_ to point at the first 0xFF byte in [position_, end_position_),
+    // or to end_position_ when the remaining data contains no 0xFF.
+    void find_jpeg_marker_start_byte() noexcept
+    {
+        // Use memchr to find next start byte (0xFF). memchr is optimized on some platforms to search faster.
+        // The count is the number of bytes that remain: end_position_ - position_. The reversed
+        // difference would be negative and turn into a huge size_t, reading past the buffer.
+        position_ff_ = static_cast<const uint8_t*>(
+            memchr(position_, jpeg_marker_start_byte, static_cast<size_t>(end_position_ - position_)));
+        if (!position_ff_)
+        {
+            position_ff_ = end_position_;
+        }
+    }
+
+    static constexpr auto cache_t_bit_count = static_cast<int32_t>(sizeof(cache_t) * 8);
+    static constexpr int32_t max_readable_cache_bits{cache_t_bit_count - 8};

    std::vector<uint8_t> buffer_;

    // decoding
-    bufType read_cache_{};
+    cache_t read_cache_{};
    int32_t valid_bits_{};
-    uint8_t* position_{};
-    uint8_t* next_ff_position_{};
-    uint8_t* end_position_{};
+    const uint8_t* position_{};
+    const uint8_t* end_position_{};
+    const uint8_t* position_ff_{};
 };

 } // namespace charls
--- a/src/jpeg_stream_reader.cpp
+++ b/src/jpeg_stream_reader.cpp
@ -543,8 +543,12 @@ void jpeg_stream_reader::skip_byte()

 uint16_t jpeg_stream_reader::read_uint16()
 {
-    const uint32_t value{read_byte() * 256U};
-    return static_cast<uint16_t>(value + read_byte());
+    if (source_.size < 2)
+        throw_jpegls_error(jpegls_errc::source_buffer_too_small);
+
+    const auto value{read_unaligned<uint16_t>(source_.data)};
+    skip_bytes(source_, 2);
+    return byte_swap(value);
 }


@ -557,11 +561,12 @@ uint32_t jpeg_stream_reader::read_uint24()

 uint32_t jpeg_stream_reader::read_uint32()
 {
-    uint32_t value{read_uint16()};
-    value = value << 16U;
-    value += read_uint16();
+    if (source_.size < 4)
+        throw_jpegls_error(jpegls_errc::source_buffer_too_small);

-    return value;
+    const auto value{read_unaligned<uint32_t>(source_.data)};
+    skip_bytes(source_, 4);
+    return byte_swap(value);
 }


--- a/src/jpeg_stream_writer.cpp
+++ b/src/jpeg_stream_writer.cpp
@ -30,6 +30,7 @@ void jpeg_stream_writer::write_end_of_image(const bool even_destination_size)
 {
    if (even_destination_size && bytes_written() % 2 != 0)
    {
+        // Write an additional 0xFF byte to ensure that the encoded bit stream has an even size.
        write_uint8(jpeg_marker_start_byte);
    }

@ -49,22 +50,21 @@ void jpeg_stream_writer::write_spiff_header_segment(const spiff_header& header)
    write_bytes(spiff_magic_id.data(), spiff_magic_id.size());
    write_uint8(spiff_major_revision_number);
    write_uint8(spiff_minor_revision_number);
-    write_uint8(static_cast<uint8_t>(header.profile_id));
-    write_uint8(static_cast<uint8_t>(header.component_count));
+    write_uint8(to_underlying_type(header.profile_id));
+    write_uint8(header.component_count);
    write_uint32(header.height);
    write_uint32(header.width);
-    write_uint8(static_cast<uint8_t>(header.color_space));
-    write_uint8(static_cast<uint8_t>(header.bits_per_sample));
-    write_uint8(static_cast<uint8_t>(header.compression_type));
-    write_uint8(static_cast<uint8_t>(header.resolution_units));
+    write_uint8(to_underlying_type(header.color_space));
+    write_uint8(header.bits_per_sample);
+    write_uint8(to_underlying_type(header.compression_type));
+    write_uint8(to_underlying_type(header.resolution_units));
    write_uint32(header.vertical_resolution);
    write_uint32(header.horizontal_resolution);
 }


-USE_DECL_ANNOTATIONS void jpeg_stream_writer::write_spiff_directory_entry(const uint32_t entry_tag,
-                                                     const void* entry_data,
-                                                     const size_t entry_data_size_bytes)
+USE_DECL_ANNOTATIONS void jpeg_stream_writer::write_spiff_directory_entry(const uint32_t entry_tag, const void* entry_data,
+                                                                          const size_t entry_data_size_bytes)
 {
    write_segment_header(jpeg_marker_code::application_data8, sizeof(uint32_t) + entry_data_size_bytes);
    write_uint32(entry_tag);
@ -78,7 +78,7 @@ void jpeg_stream_writer::write_spiff_end_of_directory_entry()
    // but only 6 data bytes. This approach allows to wrap existing bit streams\encoders with a SPIFF header.
    // In this implementation the SOI marker is added as data bytes to simplify the design.
    static constexpr array<uint8_t, 6> spiff_end_of_directory{
-        0, 0, 0, spiff_end_of_directory_entry_type, 0xFF, static_cast<uint8_t>(charls::jpeg_marker_code::start_of_image)};
+        0, 0, 0, spiff_end_of_directory_entry_type, 0xFF, to_underlying_type(charls::jpeg_marker_code::start_of_image)};

    write_segment_header(jpeg_marker_code::application_data8, spiff_end_of_directory.size());
    write_bytes(spiff_end_of_directory.data(), spiff_end_of_directory.size());
@ -95,20 +95,20 @@ void jpeg_stream_writer::write_start_of_frame_segment(const frame_info& frame)
    // Create a Frame Header as defined in ISO/IEC 14495-1, C.2.2 and T.81, B.2.2
    const size_t data_size{6 + (static_cast<size_t>(frame.component_count) * 3)};
    write_segment_header(jpeg_marker_code::start_of_frame_jpegls, data_size);
-    write_uint8(static_cast<uint8_t>(frame.bits_per_sample)); // P = Sample precision
-    write_uint16(static_cast<uint16_t>(frame.height));        // Y = Number of lines
-    write_uint16(static_cast<uint16_t>(frame.width));         // X = Number of samples per line
+    write_uint8(frame.bits_per_sample); // P = Sample precision
+    write_uint16(frame.height);         // Y = Number of lines
+    write_uint16(frame.width);          // X = Number of samples per line

    // Components
-    write_uint8(static_cast<uint8_t>(frame.component_count)); // Nf = Number of image components in frame
+    write_uint8(frame.component_count); // Nf = Number of image components in frame

    // Use by default 1 as the start component identifier to remain compatible with the
    // code sample of ISO/IEC 14495-1, H.4 and the JPEG-LS ISO conformance sample files.
    for (auto component_id{1}; component_id <= frame.component_count; ++component_id)
    {
        // Component Specification parameters
-        write_uint8(static_cast<uint8_t>(component_id)); // Ci = Component identifier
-        write_uint8(0x11);                               // Hi + Vi = Horizontal sampling factor + Vertical sampling factor
+        write_uint8(component_id); // Ci = Component identifier
+        write_uint8(0x11);         // Hi + Vi = Horizontal sampling factor + Vertical sampling factor
        write_uint8(0); // Tqi = Quantization table destination selector (reserved for JPEG-LS, should be set to 0)
    }
 }
@ -152,7 +152,7 @@ void jpeg_stream_writer::write_start_of_scan_segment(const int32_t component_cou

    // Create a Scan Header as defined in T.87, C.2.3 and T.81, B.2.3
    write_segment_header(jpeg_marker_code::start_of_scan, 1 + (static_cast<size_t>(component_count) * 2) + 3);
-    write_uint8(static_cast<uint8_t>(component_count));
+    write_uint8(component_count);

    for (int32_t i{}; i != component_count; ++i)
    {
@ -161,9 +161,9 @@ void jpeg_stream_writer::write_start_of_scan_segment(const int32_t component_cou
        ++component_id_;
    }

-    write_uint8(static_cast<uint8_t>(near_lossless));   // NEAR parameter
-    write_uint8(static_cast<uint8_t>(interleave_mode)); // ILV parameter
-    write_uint8(0);                                     // transformation
+    write_uint8(near_lossless);                       // NEAR parameter
+    write_uint8(to_underlying_type(interleave_mode)); // ILV parameter
+    write_uint8(0);                                   // transformation
 }


@ -179,7 +179,7 @@ void jpeg_stream_writer::write_segment_header(const jpeg_marker_code marker_code
        impl::throw_jpegls_error(jpegls_errc::destination_buffer_too_small);

    write_marker(marker_code);
-    write_uint16(static_cast<uint16_t>(data_size + segment_length_size));
+    write_uint16(static_cast<uint16_t>(segment_length_size + data_size));
 }

 } // namespace charls
--- a/src/jpeg_stream_writer.h
+++ b/src/jpeg_stream_writer.h
@ -115,6 +115,12 @@ private:
        destination_.data[byte_offset_++] = value;
    }

+    void write_uint8(const int32_t value) noexcept
+    {
+        ASSERT(value >= 0 && value <= std::numeric_limits<uint8_t>::max());
+        write_uint8(static_cast<uint8_t>(value));
+    }
+
    void write_uint16(const uint16_t value) noexcept
    {
        write_uint<uint16_t>(value);
@ -126,6 +132,12 @@ private:
        write_uint16(static_cast<uint16_t>(value));
    }

+    void write_uint16(const uint32_t value) noexcept
+    {
+        ASSERT(value <= std::numeric_limits<uint16_t>::max());
+        write_uint16(static_cast<uint16_t>(value));
+    }
+
    void write_uint32(const uint32_t value) noexcept
    {
        write_uint<uint32_t>(value);
@ -138,7 +150,7 @@ private:

        // Use write_bytes to write to the unaligned byte array.
        // The compiler will perform the correct optimization when the target platform support unaligned writes.
-        const UnsignedIntType big_endian_value{endian_swap(value)};
+        const UnsignedIntType big_endian_value{byte_swap(value)};
        write_bytes(&big_endian_value, sizeof big_endian_value);
    }

@ -164,16 +176,6 @@ private:
        write_uint8(static_cast<uint8_t>(marker_code));
    }

-    static constexpr uint32_t endian_swap(const uint32_t value) noexcept
-    {
-        return value >> 24 | (value & 0x00FF0000) >> 8 | (value & 0x0000FF00) << 8 | value << 24;
-    }
-
-    static constexpr uint16_t endian_swap(const uint16_t value) noexcept
-    {
-        return static_cast<uint16_t>(value >> 8 | value << 8);
-    }
-
    byte_span destination_{};
    size_t byte_offset_{};
    uint8_t component_id_{1};
--- a/src/scan.h
+++ b/src/scan.h
@ -41,34 +41,6 @@ constexpr int32_t apply_sign(const int32_t i, const int32_t sign) noexcept
 }


-// Two alternatives for GetPredictedValue() (second is slightly faster due to reduced branching)
-
-#if 0
-
-inline int32_t get_predicted_value(int32_t Ra, int32_t Rb, int32_t Rc)
-{
-    if (Ra < Rb)
-    {
-        if (Rc < Ra)
-            return Rb;
-
-        if (Rc > Rb)
-            return Ra;
-    }
-    else
-    {
-        if (Rc < Rb)
-            return Ra;
-
-        if (Rc > Ra)
-            return Rb;
-    }
-
-    return Ra + Rb - Rc;
-}
-
-#else
-
 inline int32_t get_predicted_value(const int32_t ra, const int32_t rb, const int32_t rc) noexcept
 {
    // sign trick reduces the number of if statements (branches)
@ -88,7 +60,6 @@ inline int32_t get_predicted_value(const int32_t ra, const int32_t rb, const int
    return ra + rb - rc;
 }

-#endif

 /// <summary>
 /// This is the optimized inverse algorithm of ISO/IEC 14495-1, A.5.2, Code Segment A.11 (second else branch)
@ -101,6 +72,7 @@ CONSTEXPR int32_t unmap_error_value(const int32_t mapped_error) noexcept
    return sign ^ (mapped_error >> 1);
 }

+
 /// <summary>
 /// This is the algorithm of ISO/IEC 14495-1, A.5.2, Code Segment A.11 (second else branch)
 /// It will map signed values to unsigned values. It has been optimized to prevent branching.
@ -113,6 +85,7 @@ CONSTEXPR int32_t map_error_value(const int32_t error_value) noexcept
    return mapped_error;
 }

+
 constexpr int32_t compute_context_id(const int32_t q1, const int32_t q2, const int32_t q3) noexcept
 {
    return (q1 * 9 + q2) * 9 + q3;
@ -379,7 +352,7 @@ private:
    }

    /// <summary>Encodes/Decodes a scan line of samples</summary>
-    void do_line(sample_type* /*template_selector*/)
+    FORCE_INLINE void do_line(sample_type* /*template_selector*/)
    {
        int32_t index{};
        int32_t rb{previous_line_[index - 1]};
--- a/src/util.h
+++ b/src/util.h
@ -9,6 +9,7 @@

 #include <algorithm>
 #include <cassert>
+#include <cstdlib>
 #include <cstring>
 #include <type_traits>
 #include <vector>
@ -127,7 +128,8 @@ inline void string_copy(CHARLS_IN_Z const char* source, CHARLS_OUT_WRITES_Z(size
 #endif
 }

-inline jpegls_errc set_error_message(const jpegls_errc error, CHARLS_OUT_WRITES_Z(ErrorMessageSize) char* error_message) noexcept
+inline jpegls_errc set_error_message(const jpegls_errc error,
+                                     CHARLS_OUT_WRITES_Z(ErrorMessageSize) char* error_message) noexcept
 {
    if (error_message)
    {
@ -264,34 +266,59 @@ struct quad final : triplet<SampleType>
 };


-template<int Size>
-struct from_big_endian final
+// C++23 comes with std::byteswap. Use our own byte_swap implementation for now.
+template<typename T>
+CHARLS_CHECK_RETURN T byte_swap(T /*value*/) noexcept
 {
-};
-
+    ASSERT(false);
+    return 0;
+}

 template<>
-struct from_big_endian<4> final
+inline CHARLS_CHECK_RETURN uint16_t byte_swap<uint16_t>(const uint16_t value) noexcept
 {
-    FORCE_INLINE static unsigned int read(const uint8_t* buffer) noexcept
-    {
-        return (static_cast<uint32_t>(buffer[0]) << 24U) + (static_cast<uint32_t>(buffer[1]) << 16U) +
-               (static_cast<uint32_t>(buffer[2]) << 8U) + (static_cast<uint32_t>(buffer[3]) << 0U);
-    }
-};
-
+#ifdef _MSC_VER
+    return _byteswap_ushort(value);
+#else
+    // Note: GCC and Clang will optimize this pattern to a built-in intrinsic.
+    return static_cast<uint16_t>(value << 8 | value >> 8);
+#endif
+}

 template<>
-struct from_big_endian<8> final
+inline CHARLS_CHECK_RETURN uint32_t byte_swap<uint32_t>(const uint32_t value) noexcept
 {
-    FORCE_INLINE static uint64_t read(const uint8_t* buffer) noexcept
-    {
-        return (static_cast<uint64_t>(buffer[0]) << 56U) + (static_cast<uint64_t>(buffer[1]) << 48U) +
-               (static_cast<uint64_t>(buffer[2]) << 40U) + (static_cast<uint64_t>(buffer[3]) << 32U) +
-               (static_cast<uint64_t>(buffer[4]) << 24U) + (static_cast<uint64_t>(buffer[5]) << 16U) +
-               (static_cast<uint64_t>(buffer[6]) << 8U) + (static_cast<uint64_t>(buffer[7]) << 0U);
-    }
-};
+#ifdef _MSC_VER
+    return _byteswap_ulong(value);
+#else
+    // Note: GCC and Clang will optimize this pattern to a built-in intrinsic.
+    return value >> 24 | (value & 0x00FF0000) >> 8 | (value & 0x0000FF00) << 8 | value << 24;
+#endif
+}
+
+template<>
+inline CHARLS_CHECK_RETURN uint64_t byte_swap(const uint64_t value) noexcept
+{
+#ifdef _MSC_VER
+    return _byteswap_uint64(value);
+#else
+    // Note: GCC and Clang will optimize this pattern to a built-in intrinsic.
+    return (value << 56) | ((value << 40) & 0x00FF'0000'0000'0000) | ((value << 24) & 0x0000'FF00'0000'0000) |
+           ((value << 8) & 0x0000'00FF'0000'0000) | ((value >> 8) & 0x0000'0000'FF00'0000) |
+           ((value >> 24) & 0x0000'0000'00FF'0000) | ((value >> 40) & 0x0000'0000'0000'FF00) | (value >> 56);
+#endif
+}
+
+
+template<typename T>
+T read_unaligned(const void* buffer) noexcept
+{
+    // Note: MSVC, GCC and Clang will replace this with a direct register read if the architecture allows it
+    // (x86, x64 and ARM64 do).
+    T value;
+    memcpy(&value, buffer, sizeof(T));
+    return value;
+}


 inline void skip_bytes(byte_span& stream_info, const size_t count) noexcept
@ -362,6 +389,9 @@ constexpr uint32_t bit_to_byte_count(const int32_t bit_count) noexcept
 }


+/// <summary>
+/// Converts an enumeration to its underlying type. Equivalent to C++23 std::to_underlying.
+/// </summary>
 template<typename Enum>
 constexpr auto to_underlying_type(Enum e) noexcept
 {
--- a/test/performance.cpp
+++ b/test/performance.cpp
@ -115,14 +115,23 @@ void test_large_image_performance_rgb8(const int loop_count)
    }
 }

+size_t get_destination_size(const vector<uint8_t>& source)
+{
+    const jpegls_decoder decoder{source, true};
+    return decoder.destination_size();
+}
+
 void decode_performance_tests(const int loop_count)
 {
    cout << "Test decode performance with loop count " << loop_count << "\n";

-    const vector<uint8_t> encoded_source{read_file("decodetest.jls")};
-
    try
    {
+        // This test expects the file decodetest.jls to exist.
+        // It can be any valid JPEG-LS file.
+        // Changing the content of this file allows different performance measurements.
+        const vector<uint8_t> encoded_source{read_file("decodetest.jls")};
+
        // Pre-allocate the destination outside the measurement loop.
        // std::vector initializes its elements and this step needs to be excluded from the measurement.
        vector<uint8_t> destination(jpegls_decoder{encoded_source, true}.destination_size());
@ -144,6 +153,10 @@ void decode_performance_tests(const int loop_count)
    {
        cout << "Decode failure: " << e.what() << "\n";
    }
+    catch (const std::ios_base::failure& e)
+    {
+        cout << "IO failure (missing decodetest.jls?): " << e.what() << "\n";
+    }
 }

 void encode_performance_tests(const int loop_count)