Last active
August 9, 2024 23:47
-
-
Save rafa-br34/40076a242c70f57b92cdc8b53750bc1e to your computer and use it in GitHub Desktop.
A C++17 benchmark to test branched vs branchless performance.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // Compile with | |
| // g++ -O3 -Ofast -m64 -lstdc++ -std=c++17 Benchmark.cpp -o Benchmark | |
| // or | |
| // g++ -O3 -Ofast -m64 -lstdc++ -lpthread -std=c++17 Benchmark.cpp -o Benchmark | |
| #include <iostream> | |
| #include <cstring> | |
| #include <vector> | |
| #include <thread> | |
| #include <random> | |
| #include <chrono> | |
| #include <atomic> | |
| #include <mutex> | |
| #define TIME_START() std::chrono::high_resolution_clock::now() | |
| #define TIME_STOP(Start) (size_t)(std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - Start).count()) | |
| // __attribute__((always_inline)) for GCC __forceinline for MSVC | |
| #define FORCE_INLINE __attribute__((always_inline)) | |
| #define VOLATILE volatile | |
| const size_t c_BufferSize = size_t(1024 * 1024) * size_t(1024 * 1); | |
| const size_t c_Passes = 150; | |
| const uint8_t* c_AlphaNumericDictionary = (const uint8_t*)"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; | |
| const size_t c_AlphaNumericDictionarySize = 62; | |
| const uint8_t* c_00_FF_Dictionary = (const uint8_t*)"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54\x55\x56\x57\x58\x59\x5a\x5b\x5c\x5d\x5e\x5f\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe"; | |
| const size_t c_00_FF_DictionarySize = 255; | |
| const size_t g_Threads = std::thread::hardware_concurrency(); | |
| // XorShift64 | |
| struct XorShift64 { | |
| // Must Not Be Zero | |
| uint64_t State = 1; | |
| FORCE_INLINE void Seed(uint64_t Seed) { | |
| this->State = Seed; | |
| }; | |
| FORCE_INLINE uint64_t Next() { | |
| uint64_t X = this->State; | |
| X ^= X << 13; | |
| X ^= X >> 7; | |
| X ^= X << 17; | |
| return this->State = X; | |
| } | |
| }; | |
| double ByteCountToMeasurement(double Count, const char** Output) { | |
| const char* Measurements[] = { | |
| "B", | |
| "KB", | |
| "MB", | |
| "GB", | |
| "TB", | |
| "EB", | |
| "ZB", | |
| "YB", | |
| "RB", | |
| "QB", | |
| "UNK" | |
| }; | |
| uint8_t Iteration = 0; | |
| while (Count >= 1024.0) { | |
| Count /= 1024.0; | |
| Iteration++; | |
| } | |
| if (const auto ListSize = sizeof(Measurements) / sizeof(void*); Iteration > ListSize) | |
| *Output = Measurements[ListSize - 1]; | |
| else | |
| *Output = Measurements[Iteration]; | |
| return Count; | |
| } | |
| size_t PopulateBytes(uint8_t* Buffer, size_t BufferSize, const uint8_t* Dictionary, size_t DictionarySize) { | |
| auto Start = TIME_START(); | |
| size_t PerThread = BufferSize / g_Threads; | |
| size_t Remainder = BufferSize % g_Threads; | |
| std::vector<std::thread*> Threads = {}; | |
| for (size_t t = 0; t < g_Threads; t++) { | |
| uint8_t* Start = Buffer + (t * PerThread); | |
| size_t TaskSize = PerThread + ((t + 1 == g_Threads) * Remainder); | |
| Threads.push_back(new std::thread( | |
| [Start, TaskSize, Dictionary, DictionarySize]() { | |
| XorShift64 PRNG = {}; | |
| PRNG.Seed(std::chrono::high_resolution_clock::now().time_since_epoch().count()); | |
| for (size_t i = 0; i < TaskSize; i++) | |
| Start[i] = Dictionary[DictionarySize > 1 ? (PRNG.Next() % DictionarySize) : 0]; | |
| } | |
| )); | |
| } | |
| for (auto* Thread : Threads) { | |
| Thread->join(); | |
| delete Thread; | |
| } | |
| return TIME_STOP(Start); | |
| } | |
| #define TEST_CASE(Name) size_t Name(uint8_t* Buffer, size_t BufferSize) | |
| #define TEST_START auto Start = TIME_START(); | |
| #define TEST_STOP return TIME_STOP(Start); | |
| TEST_CASE(BranchedInvertCase) { | |
| TEST_START | |
| for (size_t i = 0; i < BufferSize; i++) { | |
| uint8_t Character = Buffer[i]; | |
| if (Character >= 'a' && Character <= 'w') | |
| Character += uint8_t(32); | |
| else if (Character >= 'A' && Character <= 'W') | |
| Character -= uint8_t(32); | |
| Buffer[i] = Character; | |
| } | |
| TEST_STOP | |
| } | |
| TEST_CASE(BranchlessInvertCase) { | |
| TEST_START | |
| for (size_t i = 0; i < BufferSize; i++) { | |
| Buffer[i] += uint8_t(32) * uint8_t(Buffer[i] >= 'a' && Buffer[i] <= 'w'); | |
| Buffer[i] -= uint8_t(32) * uint8_t(Buffer[i] >= 'A' && Buffer[i] <= 'W'); | |
| } | |
| TEST_STOP | |
| } | |
| TEST_CASE(BranchedToLowerCase) { | |
| TEST_START | |
| for (size_t i = 0; i < BufferSize; i++) | |
| if (Buffer[i] >= 'A' && Buffer[i] <= 'W') | |
| Buffer[i] -= uint8_t(32); | |
| TEST_STOP | |
| } | |
| TEST_CASE(BranchlessToLowerCase) { | |
| TEST_START | |
| for (size_t i = 0; i < BufferSize; i++) | |
| Buffer[i] -= uint8_t(32) * uint8_t(Buffer[i] >= 'A' && Buffer[i] <= 'W'); | |
| TEST_STOP | |
| } | |
| TEST_CASE(BranchedToUpperCase) { | |
| TEST_START | |
| for (size_t i = 0; i < BufferSize; i++) | |
| if (Buffer[i] >= 'a' && Buffer[i] <= 'w') | |
| Buffer[i] += uint8_t(32); | |
| TEST_STOP | |
| } | |
| TEST_CASE(BranchlessToUpperCase) { | |
| TEST_START | |
| for (size_t i = 0; i < BufferSize; i++) | |
| Buffer[i] += uint8_t(32) * uint8_t(Buffer[i] >= 'a' && Buffer[i] <= 'w'); | |
| TEST_STOP | |
| } | |
| TEST_CASE(BranchedConditionalMath1P) { | |
| TEST_START | |
| VOLATILE uint32_t Result = 0; | |
| for (size_t i = 0; i < BufferSize; i++) | |
| if (Buffer[i] > 127) | |
| Result += Buffer[i] << 3; | |
| TEST_STOP | |
| } | |
| TEST_CASE(BranchlessConditionalMath1P) { | |
| TEST_START | |
| VOLATILE uint32_t Result = 0; | |
| for (size_t i = 0; i < BufferSize; i++) | |
| Result += (Buffer[i] << 3) * (Buffer[i] > 127); | |
| TEST_STOP | |
| } | |
| TEST_CASE(BranchedConditionalMath2P) { | |
| TEST_START | |
| VOLATILE uint32_t Result = 0; | |
| for (size_t i = 0; i < BufferSize; i++) | |
| if (Result + Buffer[i] > 0xFFFF) | |
| Result -= Buffer[i]; | |
| else if (Result + Buffer[i] < 0xFFFF) | |
| Result += Buffer[i] << 3; | |
| TEST_STOP | |
| } | |
| TEST_CASE(BranchlessConditionalMath2P) { | |
| TEST_START | |
| VOLATILE uint32_t Result = 0; | |
| for (size_t i = 0; i < BufferSize; i++) { | |
| Result -= Buffer[i] * (Result + Buffer[i] > 0xFFFF); | |
| Result += (Buffer[i] << 3) * (Result + Buffer[i] < 0xFFFF); | |
| } | |
| TEST_STOP | |
| } | |
| TEST_CASE(BranchedConditionalMath3P) { | |
| TEST_START | |
| VOLATILE uint32_t Result = 0; | |
| for (size_t i = 0; i < BufferSize; i++) | |
| if (Result + Buffer[i] > 0x10050) | |
| Result -= Buffer[i]; | |
| else if (Result + Buffer[i] < 0x10000) | |
| Result += Buffer[i] << 1; | |
| else | |
| Result <<= 3; | |
| TEST_STOP | |
| } | |
| TEST_CASE(BranchlessConditionalMath3P) { | |
| TEST_START | |
| VOLATILE uint32_t Result = 0; | |
| for (size_t i = 0; i < BufferSize; i++) { | |
| Result -= Buffer[i] * (Result + Buffer[i] > 0x10050); | |
| Result += (Buffer[i] << 1) * (Result + Buffer[i] < 0x10000); | |
| Result <<= 3 * !(Result + Buffer[i] > 0x10050) * !(Result + Buffer[i] < 0x10000); | |
| } | |
| TEST_STOP | |
| } | |
| TEST_CASE(XOR) { | |
| TEST_START | |
| for (size_t i = 0; i < BufferSize; i++) | |
| Buffer[i] += Buffer[i] ^ (i > 0 ? Buffer[i - 1] : Buffer[0]); | |
| TEST_STOP | |
| } | |
| typedef size_t _PopulateFunction(uint8_t* Buffer, size_t BufferSize, const uint8_t* Dictionary, size_t DictionarySize); | |
| typedef size_t _BenchFunction(uint8_t* Buffer, size_t BufferSize); | |
| struct Benchmark { | |
| const char* Name; | |
| _BenchFunction* Benchmark; | |
| _PopulateFunction* Populate; | |
| const uint8_t* Dictionary; | |
| size_t DictionarySize; | |
| }; | |
| Benchmark c_Benchmarks[] = { | |
| {"B[N] ^ B[N-1] using native XOR (Reference)", XOR, PopulateBytes, c_00_FF_Dictionary, c_00_FF_DictionarySize}, | |
| {"Branched To Lower Case", BranchedToLowerCase, PopulateBytes, c_AlphaNumericDictionary, c_AlphaNumericDictionarySize}, | |
| {"Branchless To Lower Case", BranchlessToLowerCase, PopulateBytes, c_AlphaNumericDictionary, c_AlphaNumericDictionarySize}, | |
| {"Branched To Upper Case", BranchedToUpperCase, PopulateBytes, c_AlphaNumericDictionary, c_AlphaNumericDictionarySize}, | |
| {"Branchless To Upper Case", BranchlessToUpperCase, PopulateBytes, c_AlphaNumericDictionary, c_AlphaNumericDictionarySize}, | |
| {"Branched Invert Case", BranchedInvertCase, PopulateBytes, c_AlphaNumericDictionary, c_AlphaNumericDictionarySize}, | |
| {"Branchless Invert Case", BranchlessInvertCase, PopulateBytes, c_AlphaNumericDictionary, c_AlphaNumericDictionarySize}, | |
| {"Passive Branched Conditional Math (1 condition)", BranchedConditionalMath1P, PopulateBytes, c_00_FF_Dictionary, c_00_FF_DictionarySize}, | |
| {"Passive Branchless Conditional Math (1 condition)", BranchlessConditionalMath1P, PopulateBytes, c_00_FF_Dictionary, c_00_FF_DictionarySize}, | |
| {"Passive Branched Conditional Math (2 conditions)", BranchedConditionalMath2P, PopulateBytes, c_00_FF_Dictionary, c_00_FF_DictionarySize}, | |
| {"Passive Branchless Conditional Math (2 conditions)", BranchlessConditionalMath2P, PopulateBytes, c_00_FF_Dictionary, c_00_FF_DictionarySize}, | |
| {"Passive Branched Conditional Math (3 conditions)", BranchedConditionalMath3P, PopulateBytes, c_00_FF_Dictionary, c_00_FF_DictionarySize}, | |
| {"Passive Branchless Conditional Math (3 conditions)", BranchlessConditionalMath3P, PopulateBytes, c_00_FF_Dictionary, c_00_FF_DictionarySize}, | |
| }; | |
| size_t MultithreadedBenchmark(_BenchFunction Function, uint8_t* Buffer, size_t BufferSize) { | |
| size_t PerThread = BufferSize / g_Threads; | |
| size_t Remainder = BufferSize % g_Threads; | |
| std::vector<std::thread*> Threads = {}; | |
| std::atomic<size_t> Timing = {0}; | |
| for (size_t t = 0; t < g_Threads; t++) { | |
| uint8_t* Start = Buffer + (t * PerThread); | |
| size_t TaskSize = PerThread + ((t + 1 == g_Threads) * Remainder); | |
| Threads.push_back(new std::thread([&Timing, Function, Start, TaskSize]() { Timing += Function(Start, TaskSize); })); | |
| } | |
| for (auto* Thread : Threads) { | |
| Thread->join(); | |
| delete Thread; | |
| } | |
| return Timing / g_Threads; | |
| } | |
| int main() { | |
| const char* Measurement = nullptr; | |
| std::cout << g_Threads << " Threads\nAllocating " << ByteCountToMeasurement((double)c_BufferSize, &Measurement) << Measurement << '\n'; | |
| uint8_t* Buffer = new uint8_t[c_BufferSize]; | |
| for (size_t i = 0; i < sizeof(c_Benchmarks) / sizeof(Benchmark); i++) { | |
| auto Bench = c_Benchmarks[i]; | |
| std::cout << "\nStarting benchmark: " << Bench.Name << '\n'; | |
| size_t BenchTimes[c_Passes]; | |
| size_t PopulateTimes[c_Passes]; | |
| for (size_t p = 0; p < c_Passes; p++) { | |
| PopulateTimes[p] = Bench.Populate(Buffer, c_BufferSize, Bench.Dictionary, Bench.DictionarySize); | |
| BenchTimes[p] = MultithreadedBenchmark(Bench.Benchmark, Buffer, c_BufferSize); // Multi Threaded | |
| // BenchTimes[p] = Bench.Benchmark(Buffer, c_BufferSize); // Single Threaded | |
| std::cout << "Pass " << p + 1 << '/' << c_Passes << | |
| " Populated in " << PopulateTimes[p] << " microseconds(" << ByteCountToMeasurement(double(c_BufferSize) / (double(PopulateTimes[p]) / 1000000), &Measurement) << Measurement << "/s)" | |
| " Benchmarked in " << BenchTimes[p] << " microseconds(" << ByteCountToMeasurement(double(c_BufferSize) / (double(BenchTimes[p]) / 1000000), &Measurement) << Measurement << "/s) \r"; | |
| } | |
| size_t AveragePopulateTime = 0; | |
| size_t AverageBenchTime = 0; | |
| for (size_t a = 0; a < c_Passes; a++) { | |
| AveragePopulateTime += PopulateTimes[a]; | |
| AverageBenchTime += BenchTimes[a]; | |
| } | |
| AveragePopulateTime /= c_Passes; | |
| AverageBenchTime /= c_Passes; | |
| std::cout << "\nOn Average:" | |
| << "\n\tPopulated In " << AveragePopulateTime << " Microseconds(" << ByteCountToMeasurement(double(c_BufferSize) / (double(AveragePopulateTime) / 1000000), &Measurement) << Measurement << "/s)" | |
| << "\n\tBenchmarked In " << AverageBenchTime << " Microseconds(" << ByteCountToMeasurement(double(c_BufferSize) / (double(AverageBenchTime) / 1000000), &Measurement) << Measurement << "/s)\n"; | |
| } | |
| delete[] Buffer; | |
| // std::cin.get(); | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment