Skip to content

Instantly share code, notes, and snippets.

@rafa-br34
Last active August 9, 2024 23:47
Show Gist options
  • Select an option

  • Save rafa-br34/40076a242c70f57b92cdc8b53750bc1e to your computer and use it in GitHub Desktop.

Select an option

Save rafa-br34/40076a242c70f57b92cdc8b53750bc1e to your computer and use it in GitHub Desktop.
A C++17 benchmark to test branched vs branchless performance.
// Compile with
// g++ -O3 -Ofast -m64 -lstdc++ -std=c++17 Benchmark.cpp -o Benchmark
// or
// g++ -O3 -Ofast -m64 -lstdc++ -lpthread -std=c++17 Benchmark.cpp -o Benchmark
#include <iostream>
#include <cstring>
#include <vector>
#include <thread>
#include <random>
#include <chrono>
#include <atomic>
#include <mutex>
#define TIME_START() std::chrono::high_resolution_clock::now()
#define TIME_STOP(Start) (size_t)(std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - Start).count())
// __attribute__((always_inline)) for GCC __forceinline for MSVC
#define FORCE_INLINE __attribute__((always_inline))
#define VOLATILE volatile
const size_t c_BufferSize = size_t(1024 * 1024) * size_t(1024 * 1);
const size_t c_Passes = 150;
const uint8_t* c_AlphaNumericDictionary = (const uint8_t*)"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
const size_t c_AlphaNumericDictionarySize = 62;
const uint8_t* c_00_FF_Dictionary = (const uint8_t*)"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54\x55\x56\x57\x58\x59\x5a\x5b\x5c\x5d\x5e\x5f\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe";
const size_t c_00_FF_DictionarySize = 255;
const size_t g_Threads = std::thread::hardware_concurrency();
// XorShift64
struct XorShift64 {
// Must Not Be Zero
uint64_t State = 1;
FORCE_INLINE void Seed(uint64_t Seed) {
this->State = Seed;
};
FORCE_INLINE uint64_t Next() {
uint64_t X = this->State;
X ^= X << 13;
X ^= X >> 7;
X ^= X << 17;
return this->State = X;
}
};
double ByteCountToMeasurement(double Count, const char** Output) {
const char* Measurements[] = {
"B",
"KB",
"MB",
"GB",
"TB",
"EB",
"ZB",
"YB",
"RB",
"QB",
"UNK"
};
uint8_t Iteration = 0;
while (Count >= 1024.0) {
Count /= 1024.0;
Iteration++;
}
if (const auto ListSize = sizeof(Measurements) / sizeof(void*); Iteration > ListSize)
*Output = Measurements[ListSize - 1];
else
*Output = Measurements[Iteration];
return Count;
}
size_t PopulateBytes(uint8_t* Buffer, size_t BufferSize, const uint8_t* Dictionary, size_t DictionarySize) {
auto Start = TIME_START();
size_t PerThread = BufferSize / g_Threads;
size_t Remainder = BufferSize % g_Threads;
std::vector<std::thread*> Threads = {};
for (size_t t = 0; t < g_Threads; t++) {
uint8_t* Start = Buffer + (t * PerThread);
size_t TaskSize = PerThread + ((t + 1 == g_Threads) * Remainder);
Threads.push_back(new std::thread(
[Start, TaskSize, Dictionary, DictionarySize]() {
XorShift64 PRNG = {};
PRNG.Seed(std::chrono::high_resolution_clock::now().time_since_epoch().count());
for (size_t i = 0; i < TaskSize; i++)
Start[i] = Dictionary[DictionarySize > 1 ? (PRNG.Next() % DictionarySize) : 0];
}
));
}
for (auto* Thread : Threads) {
Thread->join();
delete Thread;
}
return TIME_STOP(Start);
}
#define TEST_CASE(Name) size_t Name(uint8_t* Buffer, size_t BufferSize)
#define TEST_START auto Start = TIME_START();
#define TEST_STOP return TIME_STOP(Start);
TEST_CASE(BranchedInvertCase) {
TEST_START
for (size_t i = 0; i < BufferSize; i++) {
uint8_t Character = Buffer[i];
if (Character >= 'a' && Character <= 'w')
Character += uint8_t(32);
else if (Character >= 'A' && Character <= 'W')
Character -= uint8_t(32);
Buffer[i] = Character;
}
TEST_STOP
}
TEST_CASE(BranchlessInvertCase) {
TEST_START
for (size_t i = 0; i < BufferSize; i++) {
Buffer[i] += uint8_t(32) * uint8_t(Buffer[i] >= 'a' && Buffer[i] <= 'w');
Buffer[i] -= uint8_t(32) * uint8_t(Buffer[i] >= 'A' && Buffer[i] <= 'W');
}
TEST_STOP
}
TEST_CASE(BranchedToLowerCase) {
TEST_START
for (size_t i = 0; i < BufferSize; i++)
if (Buffer[i] >= 'A' && Buffer[i] <= 'W')
Buffer[i] -= uint8_t(32);
TEST_STOP
}
TEST_CASE(BranchlessToLowerCase) {
TEST_START
for (size_t i = 0; i < BufferSize; i++)
Buffer[i] -= uint8_t(32) * uint8_t(Buffer[i] >= 'A' && Buffer[i] <= 'W');
TEST_STOP
}
TEST_CASE(BranchedToUpperCase) {
TEST_START
for (size_t i = 0; i < BufferSize; i++)
if (Buffer[i] >= 'a' && Buffer[i] <= 'w')
Buffer[i] += uint8_t(32);
TEST_STOP
}
TEST_CASE(BranchlessToUpperCase) {
TEST_START
for (size_t i = 0; i < BufferSize; i++)
Buffer[i] += uint8_t(32) * uint8_t(Buffer[i] >= 'a' && Buffer[i] <= 'w');
TEST_STOP
}
TEST_CASE(BranchedConditionalMath1P) {
TEST_START
VOLATILE uint32_t Result = 0;
for (size_t i = 0; i < BufferSize; i++)
if (Buffer[i] > 127)
Result += Buffer[i] << 3;
TEST_STOP
}
TEST_CASE(BranchlessConditionalMath1P) {
TEST_START
VOLATILE uint32_t Result = 0;
for (size_t i = 0; i < BufferSize; i++)
Result += (Buffer[i] << 3) * (Buffer[i] > 127);
TEST_STOP
}
TEST_CASE(BranchedConditionalMath2P) {
TEST_START
VOLATILE uint32_t Result = 0;
for (size_t i = 0; i < BufferSize; i++)
if (Result + Buffer[i] > 0xFFFF)
Result -= Buffer[i];
else if (Result + Buffer[i] < 0xFFFF)
Result += Buffer[i] << 3;
TEST_STOP
}
TEST_CASE(BranchlessConditionalMath2P) {
TEST_START
VOLATILE uint32_t Result = 0;
for (size_t i = 0; i < BufferSize; i++) {
Result -= Buffer[i] * (Result + Buffer[i] > 0xFFFF);
Result += (Buffer[i] << 3) * (Result + Buffer[i] < 0xFFFF);
}
TEST_STOP
}
TEST_CASE(BranchedConditionalMath3P) {
TEST_START
VOLATILE uint32_t Result = 0;
for (size_t i = 0; i < BufferSize; i++)
if (Result + Buffer[i] > 0x10050)
Result -= Buffer[i];
else if (Result + Buffer[i] < 0x10000)
Result += Buffer[i] << 1;
else
Result <<= 3;
TEST_STOP
}
TEST_CASE(BranchlessConditionalMath3P) {
TEST_START
VOLATILE uint32_t Result = 0;
for (size_t i = 0; i < BufferSize; i++) {
Result -= Buffer[i] * (Result + Buffer[i] > 0x10050);
Result += (Buffer[i] << 1) * (Result + Buffer[i] < 0x10000);
Result <<= 3 * !(Result + Buffer[i] > 0x10050) * !(Result + Buffer[i] < 0x10000);
}
TEST_STOP
}
TEST_CASE(XOR) {
TEST_START
for (size_t i = 0; i < BufferSize; i++)
Buffer[i] += Buffer[i] ^ (i > 0 ? Buffer[i - 1] : Buffer[0]);
TEST_STOP
}
typedef size_t _PopulateFunction(uint8_t* Buffer, size_t BufferSize, const uint8_t* Dictionary, size_t DictionarySize);
typedef size_t _BenchFunction(uint8_t* Buffer, size_t BufferSize);
struct Benchmark {
const char* Name;
_BenchFunction* Benchmark;
_PopulateFunction* Populate;
const uint8_t* Dictionary;
size_t DictionarySize;
};
Benchmark c_Benchmarks[] = {
{"B[N] ^ B[N-1] using native XOR (Reference)", XOR, PopulateBytes, c_00_FF_Dictionary, c_00_FF_DictionarySize},
{"Branched To Lower Case", BranchedToLowerCase, PopulateBytes, c_AlphaNumericDictionary, c_AlphaNumericDictionarySize},
{"Branchless To Lower Case", BranchlessToLowerCase, PopulateBytes, c_AlphaNumericDictionary, c_AlphaNumericDictionarySize},
{"Branched To Upper Case", BranchedToUpperCase, PopulateBytes, c_AlphaNumericDictionary, c_AlphaNumericDictionarySize},
{"Branchless To Upper Case", BranchlessToUpperCase, PopulateBytes, c_AlphaNumericDictionary, c_AlphaNumericDictionarySize},
{"Branched Invert Case", BranchedInvertCase, PopulateBytes, c_AlphaNumericDictionary, c_AlphaNumericDictionarySize},
{"Branchless Invert Case", BranchlessInvertCase, PopulateBytes, c_AlphaNumericDictionary, c_AlphaNumericDictionarySize},
{"Passive Branched Conditional Math (1 condition)", BranchedConditionalMath1P, PopulateBytes, c_00_FF_Dictionary, c_00_FF_DictionarySize},
{"Passive Branchless Conditional Math (1 condition)", BranchlessConditionalMath1P, PopulateBytes, c_00_FF_Dictionary, c_00_FF_DictionarySize},
{"Passive Branched Conditional Math (2 conditions)", BranchedConditionalMath2P, PopulateBytes, c_00_FF_Dictionary, c_00_FF_DictionarySize},
{"Passive Branchless Conditional Math (2 conditions)", BranchlessConditionalMath2P, PopulateBytes, c_00_FF_Dictionary, c_00_FF_DictionarySize},
{"Passive Branched Conditional Math (3 conditions)", BranchedConditionalMath3P, PopulateBytes, c_00_FF_Dictionary, c_00_FF_DictionarySize},
{"Passive Branchless Conditional Math (3 conditions)", BranchlessConditionalMath3P, PopulateBytes, c_00_FF_Dictionary, c_00_FF_DictionarySize},
};
size_t MultithreadedBenchmark(_BenchFunction Function, uint8_t* Buffer, size_t BufferSize) {
size_t PerThread = BufferSize / g_Threads;
size_t Remainder = BufferSize % g_Threads;
std::vector<std::thread*> Threads = {};
std::atomic<size_t> Timing = {0};
for (size_t t = 0; t < g_Threads; t++) {
uint8_t* Start = Buffer + (t * PerThread);
size_t TaskSize = PerThread + ((t + 1 == g_Threads) * Remainder);
Threads.push_back(new std::thread([&Timing, Function, Start, TaskSize]() { Timing += Function(Start, TaskSize); }));
}
for (auto* Thread : Threads) {
Thread->join();
delete Thread;
}
return Timing / g_Threads;
}
int main() {
const char* Measurement = nullptr;
std::cout << g_Threads << " Threads\nAllocating " << ByteCountToMeasurement((double)c_BufferSize, &Measurement) << Measurement << '\n';
uint8_t* Buffer = new uint8_t[c_BufferSize];
for (size_t i = 0; i < sizeof(c_Benchmarks) / sizeof(Benchmark); i++) {
auto Bench = c_Benchmarks[i];
std::cout << "\nStarting benchmark: " << Bench.Name << '\n';
size_t BenchTimes[c_Passes];
size_t PopulateTimes[c_Passes];
for (size_t p = 0; p < c_Passes; p++) {
PopulateTimes[p] = Bench.Populate(Buffer, c_BufferSize, Bench.Dictionary, Bench.DictionarySize);
BenchTimes[p] = MultithreadedBenchmark(Bench.Benchmark, Buffer, c_BufferSize); // Multi Threaded
// BenchTimes[p] = Bench.Benchmark(Buffer, c_BufferSize); // Single Threaded
std::cout << "Pass " << p + 1 << '/' << c_Passes <<
" Populated in " << PopulateTimes[p] << " microseconds(" << ByteCountToMeasurement(double(c_BufferSize) / (double(PopulateTimes[p]) / 1000000), &Measurement) << Measurement << "/s)"
" Benchmarked in " << BenchTimes[p] << " microseconds(" << ByteCountToMeasurement(double(c_BufferSize) / (double(BenchTimes[p]) / 1000000), &Measurement) << Measurement << "/s) \r";
}
size_t AveragePopulateTime = 0;
size_t AverageBenchTime = 0;
for (size_t a = 0; a < c_Passes; a++) {
AveragePopulateTime += PopulateTimes[a];
AverageBenchTime += BenchTimes[a];
}
AveragePopulateTime /= c_Passes;
AverageBenchTime /= c_Passes;
std::cout << "\nOn Average:"
<< "\n\tPopulated In " << AveragePopulateTime << " Microseconds(" << ByteCountToMeasurement(double(c_BufferSize) / (double(AveragePopulateTime) / 1000000), &Measurement) << Measurement << "/s)"
<< "\n\tBenchmarked In " << AverageBenchTime << " Microseconds(" << ByteCountToMeasurement(double(c_BufferSize) / (double(AverageBenchTime) / 1000000), &Measurement) << Measurement << "/s)\n";
}
delete[] Buffer;
// std::cin.get();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment