Skip to content

Commit 0de4082

Browse files
author
Wei Chen
committed
add specific perf test for struct.unpack_from
1 parent f12ee38 commit 0de4082

File tree

1 file changed

+189
-3
lines changed

1 file changed

+189
-3
lines changed

src/test_deserialize_simple.cpp

Lines changed: 189 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#include <pybind11/numpy.h>
3838

3939
namespace py = pybind11;
40+
using namespace pybind11::literals;
4041

4142
// Include the exception class
4243
class PythonBackendException : public std::exception {
@@ -224,6 +225,188 @@ bool compare_arrays(const py::array& cpp_result, const py::array& python_result,
224225
return true;
225226
}
226227

228+
// Focused benchmark for struct.unpack_from operation
229+
void benchmark_struct_unpack_operations() {
230+
std::cout << "\n\n=== PERFORMANCE BENCHMARK: struct.unpack_from vs C++ ===" << std::endl;
231+
std::cout << "=========================================================" << std::endl;
232+
233+
// Test different string sizes
234+
std::vector<size_t> test_sizes = {10, 100, 1000, 10000, 100000};
235+
236+
for (size_t string_size : test_sizes) {
237+
std::cout << "\nTesting single string of size: " << string_size << " bytes" << std::endl;
238+
239+
// Create test data - single string with length prefix
240+
std::vector<uint8_t> test_data;
241+
242+
// Add 4-byte length prefix (little-endian)
243+
uint32_t length = static_cast<uint32_t>(string_size);
244+
test_data.push_back(length & 0xFF);
245+
test_data.push_back((length >> 8) & 0xFF);
246+
test_data.push_back((length >> 16) & 0xFF);
247+
test_data.push_back((length >> 24) & 0xFF);
248+
249+
// Add string data
250+
for (size_t i = 0; i < string_size; i++) {
251+
test_data.push_back('A' + (i % 26));
252+
}
253+
254+
py::bytes py_data = py::bytes(reinterpret_cast<const char*>(test_data.data()), test_data.size());
255+
256+
// Measure multiple iterations for stability
257+
const int iterations = 10000;
258+
259+
// Benchmark Python struct.unpack_from for the exact line 117 operation
260+
{
261+
std::cout << " Testing: sb = struct.unpack_from(\"<{}s\".format(l), val_buf, offset)[0]" << std::endl;
262+
263+
// Setup Python code that mimics the exact operation
264+
py::exec(R"(
265+
import struct
266+
def single_unpack_string(val_buf, l, offset):
267+
# This is the exact operation from line 117
268+
sb = struct.unpack_from("<{}s".format(l), val_buf, offset)[0]
269+
return sb
270+
)");
271+
272+
py::object py_func = py::globals()["single_unpack_string"];
273+
274+
// Warm up
275+
for (int i = 0; i < 100; i++) {
276+
py::bytes result = py_func(py_data, length, 4);
277+
}
278+
279+
// Measure Python version
280+
auto py_start = std::chrono::high_resolution_clock::now();
281+
for (int i = 0; i < iterations; i++) {
282+
py::bytes result = py_func(py_data, length, 4);
283+
}
284+
auto py_end = std::chrono::high_resolution_clock::now();
285+
286+
// Measure C++ equivalent
287+
auto cpp_start = std::chrono::high_resolution_clock::now();
288+
for (int i = 0; i < iterations; i++) {
289+
// Create py::bytes object - more comparable to struct.unpack_from
290+
py::bytes bytes_obj(reinterpret_cast<const char*>(test_data.data() + 4), length);
291+
}
292+
auto cpp_end = std::chrono::high_resolution_clock::now();
293+
294+
auto py_time = std::chrono::duration_cast<std::chrono::nanoseconds>(py_end - py_start);
295+
auto cpp_time = std::chrono::duration_cast<std::chrono::nanoseconds>(cpp_end - cpp_start);
296+
297+
double py_per_call = static_cast<double>(py_time.count()) / iterations;
298+
double cpp_per_call = static_cast<double>(cpp_time.count()) / iterations;
299+
300+
std::cout << " Python struct.unpack_from: " << py_per_call << " ns per call" << std::endl;
301+
std::cout << " C++ direct access: " << cpp_per_call << " ns per call" << std::endl;
302+
std::cout << " Overhead per call: " << (py_per_call - cpp_per_call) << " ns" << std::endl;
303+
std::cout << " Speedup: " << std::fixed << std::setprecision(2)
304+
<< py_per_call / cpp_per_call << "x" << std::endl;
305+
}
306+
307+
// Also test just the length unpacking (line 115)
308+
{
309+
std::cout << "\n Testing: l = struct.unpack_from(\"<I\", val_buf, offset)[0]" << std::endl;
310+
311+
py::exec(R"(
312+
import struct
313+
def single_unpack_length(val_buf, offset):
314+
# This is from line 115
315+
l = struct.unpack_from("<I", val_buf, offset)[0]
316+
return l
317+
)");
318+
319+
py::object py_func = py::globals()["single_unpack_length"];
320+
321+
// Warm up
322+
for (int i = 0; i < 100; i++) {
323+
py::int_ result = py_func(py_data, 0);
324+
}
325+
326+
// Measure Python version
327+
auto py_start = std::chrono::high_resolution_clock::now();
328+
for (int i = 0; i < iterations; i++) {
329+
py::int_ result = py_func(py_data, 0);
330+
}
331+
auto py_end = std::chrono::high_resolution_clock::now();
332+
333+
// Measure C++ equivalent
334+
auto cpp_start = std::chrono::high_resolution_clock::now();
335+
for (int i = 0; i < iterations; i++) {
336+
volatile uint32_t result = *reinterpret_cast<const uint32_t*>(test_data.data());
337+
(void)result; // Prevent optimization
338+
}
339+
auto cpp_end = std::chrono::high_resolution_clock::now();
340+
341+
auto py_time = std::chrono::duration_cast<std::chrono::nanoseconds>(py_end - py_start);
342+
auto cpp_time = std::chrono::duration_cast<std::chrono::nanoseconds>(cpp_end - cpp_start);
343+
344+
double py_per_call = static_cast<double>(py_time.count()) / iterations;
345+
double cpp_per_call = static_cast<double>(cpp_time.count()) / iterations;
346+
347+
std::cout << " Python struct.unpack_from: " << py_per_call << " ns per call" << std::endl;
348+
std::cout << " C++ direct access: " << cpp_per_call << " ns per call" << std::endl;
349+
std::cout << " Overhead per call: " << (py_per_call - cpp_per_call) << " ns" << std::endl;
350+
std::cout << " Speedup: " << std::fixed << std::setprecision(2)
351+
<< py_per_call / cpp_per_call << "x" << std::endl;
352+
}
353+
}
354+
355+
// Test with a realistic workload
356+
std::cout << "\n\nRealistic Workload Test (15000 strings)" << std::endl;
357+
std::cout << "========================================" << std::endl;
358+
359+
size_t num_strings = 15000;
360+
std::vector<std::string> test_strings;
361+
test_strings.reserve(num_strings);
362+
for (size_t i = 0; i < num_strings; i++) {
363+
test_strings.push_back("string_" + std::to_string(i));
364+
}
365+
366+
std::vector<uint8_t> serialized = serialize_strings(test_strings);
367+
py::bytes py_serialized = py::bytes(reinterpret_cast<const char*>(serialized.data()), serialized.size());
368+
369+
py::module triton_pb_utils = py::module::import("triton_python_backend_utils");
370+
371+
// Full function comparison
372+
const int iterations = 100;
373+
374+
// Warm up
375+
for (int i = 0; i < 5; i++) {
376+
py::array py_result = triton_pb_utils.attr("deserialize_bytes_tensor")(py_serialized);
377+
py::array cpp_result = deserialize_bytes_tensor_cpp(serialized.data(), serialized.size());
378+
}
379+
380+
// Python version
381+
auto py_start = std::chrono::high_resolution_clock::now();
382+
for (int i = 0; i < iterations; i++) {
383+
py::array py_result = triton_pb_utils.attr("deserialize_bytes_tensor")(py_serialized);
384+
}
385+
auto py_end = std::chrono::high_resolution_clock::now();
386+
387+
// C++ version
388+
auto cpp_start = std::chrono::high_resolution_clock::now();
389+
for (int i = 0; i < iterations; i++) {
390+
py::array cpp_result = deserialize_bytes_tensor_cpp(serialized.data(), serialized.size());
391+
}
392+
auto cpp_end = std::chrono::high_resolution_clock::now();
393+
394+
auto py_time = std::chrono::duration_cast<std::chrono::microseconds>(py_end - py_start);
395+
auto cpp_time = std::chrono::duration_cast<std::chrono::microseconds>(cpp_end - cpp_start);
396+
397+
std::cout << " Python deserialize_bytes_tensor: " << py_time.count() / iterations << " μs per call" << std::endl;
398+
std::cout << " C++ deserialize_bytes_tensor: " << cpp_time.count() / iterations << " μs per call" << std::endl;
399+
std::cout << " Speedup: " << std::fixed << std::setprecision(2)
400+
<< static_cast<double>(py_time.count()) / cpp_time.count() << "x" << std::endl;
401+
402+
// Calculate estimated impact of struct.unpack_from
403+
double estimated_unpack_overhead = num_strings * 2 * 500; // ~500ns per unpack call (estimate from above)
404+
std::cout << "\n Estimated struct.unpack_from overhead: ~" << estimated_unpack_overhead / 1000 << " μs" << std::endl;
405+
std::cout << " Actual performance difference: " << (py_time.count() - cpp_time.count()) / iterations << " μs" << std::endl;
406+
407+
std::cout << "\n=== END OF PERFORMANCE BENCHMARK ===" << std::endl;
408+
}
409+
227410
int main() {
228411
std::cout << "Simple Deserialize Function Equivalence Test" << std::endl;
229412
std::cout << "=============================================" << std::endl;
@@ -366,13 +549,16 @@ int main() {
366549
std::cout << "Passed: " << passed << std::endl;
367550
std::cout << "Failed: " << failed << std::endl;
368551
std::cout << "Total time: " << total_time.count() << "ms" << std::endl;
369-
std::cout << "Success rate: " << std::fixed << std::setprecision(1)
552+
std::cout << "Success rate: " << std::fixed << std::setprecision(1)
370553
<< (static_cast<double>(passed) / test_cases.size() * 100) << "%" << std::endl;
371-
554+
372555
if (passed > 0) {
373556
std::cout << "\nAll functional equivalence tests passed!" << std::endl;
374557
std::cout << "The C++ and Python implementations produce identical results." << std::endl;
375558
}
376-
559+
560+
// Run the performance benchmark
561+
benchmark_struct_unpack_operations();
562+
377563
return failed > 0 ? 1 : 0;
378564
}

0 commit comments

Comments
 (0)