37
37
#include < pybind11/numpy.h>
38
38
39
39
namespace py = pybind11;
40
+ using namespace pybind11 ::literals;
40
41
41
42
// Include the exception class
42
43
class PythonBackendException : public std ::exception {
@@ -224,6 +225,188 @@ bool compare_arrays(const py::array& cpp_result, const py::array& python_result,
224
225
return true ;
225
226
}
226
227
228
+ // Focused benchmark for struct.unpack_from operation
229
+ void benchmark_struct_unpack_operations () {
230
+ std::cout << " \n\n === PERFORMANCE BENCHMARK: struct.unpack_from vs C++ ===" << std::endl;
231
+ std::cout << " =========================================================" << std::endl;
232
+
233
+ // Test different string sizes
234
+ std::vector<size_t > test_sizes = {10 , 100 , 1000 , 10000 , 100000 };
235
+
236
+ for (size_t string_size : test_sizes) {
237
+ std::cout << " \n Testing single string of size: " << string_size << " bytes" << std::endl;
238
+
239
+ // Create test data - single string with length prefix
240
+ std::vector<uint8_t > test_data;
241
+
242
+ // Add 4-byte length prefix (little-endian)
243
+ uint32_t length = static_cast <uint32_t >(string_size);
244
+ test_data.push_back (length & 0xFF );
245
+ test_data.push_back ((length >> 8 ) & 0xFF );
246
+ test_data.push_back ((length >> 16 ) & 0xFF );
247
+ test_data.push_back ((length >> 24 ) & 0xFF );
248
+
249
+ // Add string data
250
+ for (size_t i = 0 ; i < string_size; i++) {
251
+ test_data.push_back (' A' + (i % 26 ));
252
+ }
253
+
254
+ py::bytes py_data = py::bytes (reinterpret_cast <const char *>(test_data.data ()), test_data.size ());
255
+
256
+ // Measure multiple iterations for stability
257
+ const int iterations = 10000 ;
258
+
259
+ // Benchmark Python struct.unpack_from for the exact line 117 operation
260
+ {
261
+ std::cout << " Testing: sb = struct.unpack_from(\" <{}s\" .format(l), val_buf, offset)[0]" << std::endl;
262
+
263
+ // Setup Python code that mimics the exact operation
264
+ py::exec (R"(
265
+ import struct
266
+ def single_unpack_string(val_buf, l, offset):
267
+ # This is the exact operation from line 117
268
+ sb = struct.unpack_from("<{}s".format(l), val_buf, offset)[0]
269
+ return sb
270
+ )" );
271
+
272
+ py::object py_func = py::globals ()[" single_unpack_string" ];
273
+
274
+ // Warm up
275
+ for (int i = 0 ; i < 100 ; i++) {
276
+ py::bytes result = py_func (py_data, length, 4 );
277
+ }
278
+
279
+ // Measure Python version
280
+ auto py_start = std::chrono::high_resolution_clock::now ();
281
+ for (int i = 0 ; i < iterations; i++) {
282
+ py::bytes result = py_func (py_data, length, 4 );
283
+ }
284
+ auto py_end = std::chrono::high_resolution_clock::now ();
285
+
286
+ // Measure C++ equivalent
287
+ auto cpp_start = std::chrono::high_resolution_clock::now ();
288
+ for (int i = 0 ; i < iterations; i++) {
289
+ // Create py::bytes object - more comparable to struct.unpack_from
290
+ py::bytes bytes_obj (reinterpret_cast <const char *>(test_data.data () + 4 ), length);
291
+ }
292
+ auto cpp_end = std::chrono::high_resolution_clock::now ();
293
+
294
+ auto py_time = std::chrono::duration_cast<std::chrono::nanoseconds>(py_end - py_start);
295
+ auto cpp_time = std::chrono::duration_cast<std::chrono::nanoseconds>(cpp_end - cpp_start);
296
+
297
+ double py_per_call = static_cast <double >(py_time.count ()) / iterations;
298
+ double cpp_per_call = static_cast <double >(cpp_time.count ()) / iterations;
299
+
300
+ std::cout << " Python struct.unpack_from: " << py_per_call << " ns per call" << std::endl;
301
+ std::cout << " C++ direct access: " << cpp_per_call << " ns per call" << std::endl;
302
+ std::cout << " Overhead per call: " << (py_per_call - cpp_per_call) << " ns" << std::endl;
303
+ std::cout << " Speedup: " << std::fixed << std::setprecision (2 )
304
+ << py_per_call / cpp_per_call << " x" << std::endl;
305
+ }
306
+
307
+ // Also test just the length unpacking (line 115)
308
+ {
309
+ std::cout << " \n Testing: l = struct.unpack_from(\" <I\" , val_buf, offset)[0]" << std::endl;
310
+
311
+ py::exec (R"(
312
+ import struct
313
+ def single_unpack_length(val_buf, offset):
314
+ # This is from line 115
315
+ l = struct.unpack_from("<I", val_buf, offset)[0]
316
+ return l
317
+ )" );
318
+
319
+ py::object py_func = py::globals ()[" single_unpack_length" ];
320
+
321
+ // Warm up
322
+ for (int i = 0 ; i < 100 ; i++) {
323
+ py::int_ result = py_func (py_data, 0 );
324
+ }
325
+
326
+ // Measure Python version
327
+ auto py_start = std::chrono::high_resolution_clock::now ();
328
+ for (int i = 0 ; i < iterations; i++) {
329
+ py::int_ result = py_func (py_data, 0 );
330
+ }
331
+ auto py_end = std::chrono::high_resolution_clock::now ();
332
+
333
+ // Measure C++ equivalent
334
+ auto cpp_start = std::chrono::high_resolution_clock::now ();
335
+ for (int i = 0 ; i < iterations; i++) {
336
+ volatile uint32_t result = *reinterpret_cast <const uint32_t *>(test_data.data ());
337
+ (void )result; // Prevent optimization
338
+ }
339
+ auto cpp_end = std::chrono::high_resolution_clock::now ();
340
+
341
+ auto py_time = std::chrono::duration_cast<std::chrono::nanoseconds>(py_end - py_start);
342
+ auto cpp_time = std::chrono::duration_cast<std::chrono::nanoseconds>(cpp_end - cpp_start);
343
+
344
+ double py_per_call = static_cast <double >(py_time.count ()) / iterations;
345
+ double cpp_per_call = static_cast <double >(cpp_time.count ()) / iterations;
346
+
347
+ std::cout << " Python struct.unpack_from: " << py_per_call << " ns per call" << std::endl;
348
+ std::cout << " C++ direct access: " << cpp_per_call << " ns per call" << std::endl;
349
+ std::cout << " Overhead per call: " << (py_per_call - cpp_per_call) << " ns" << std::endl;
350
+ std::cout << " Speedup: " << std::fixed << std::setprecision (2 )
351
+ << py_per_call / cpp_per_call << " x" << std::endl;
352
+ }
353
+ }
354
+
355
+ // Test with a realistic workload
356
+ std::cout << " \n\n Realistic Workload Test (15000 strings)" << std::endl;
357
+ std::cout << " ========================================" << std::endl;
358
+
359
+ size_t num_strings = 15000 ;
360
+ std::vector<std::string> test_strings;
361
+ test_strings.reserve (num_strings);
362
+ for (size_t i = 0 ; i < num_strings; i++) {
363
+ test_strings.push_back (" string_" + std::to_string (i));
364
+ }
365
+
366
+ std::vector<uint8_t > serialized = serialize_strings (test_strings);
367
+ py::bytes py_serialized = py::bytes (reinterpret_cast <const char *>(serialized.data ()), serialized.size ());
368
+
369
+ py::module triton_pb_utils = py::module::import (" triton_python_backend_utils" );
370
+
371
+ // Full function comparison
372
+ const int iterations = 100 ;
373
+
374
+ // Warm up
375
+ for (int i = 0 ; i < 5 ; i++) {
376
+ py::array py_result = triton_pb_utils.attr (" deserialize_bytes_tensor" )(py_serialized);
377
+ py::array cpp_result = deserialize_bytes_tensor_cpp (serialized.data (), serialized.size ());
378
+ }
379
+
380
+ // Python version
381
+ auto py_start = std::chrono::high_resolution_clock::now ();
382
+ for (int i = 0 ; i < iterations; i++) {
383
+ py::array py_result = triton_pb_utils.attr (" deserialize_bytes_tensor" )(py_serialized);
384
+ }
385
+ auto py_end = std::chrono::high_resolution_clock::now ();
386
+
387
+ // C++ version
388
+ auto cpp_start = std::chrono::high_resolution_clock::now ();
389
+ for (int i = 0 ; i < iterations; i++) {
390
+ py::array cpp_result = deserialize_bytes_tensor_cpp (serialized.data (), serialized.size ());
391
+ }
392
+ auto cpp_end = std::chrono::high_resolution_clock::now ();
393
+
394
+ auto py_time = std::chrono::duration_cast<std::chrono::microseconds>(py_end - py_start);
395
+ auto cpp_time = std::chrono::duration_cast<std::chrono::microseconds>(cpp_end - cpp_start);
396
+
397
+ std::cout << " Python deserialize_bytes_tensor: " << py_time.count () / iterations << " μs per call" << std::endl;
398
+ std::cout << " C++ deserialize_bytes_tensor: " << cpp_time.count () / iterations << " μs per call" << std::endl;
399
+ std::cout << " Speedup: " << std::fixed << std::setprecision (2 )
400
+ << static_cast <double >(py_time.count ()) / cpp_time.count () << " x" << std::endl;
401
+
402
+ // Calculate estimated impact of struct.unpack_from
403
+ double estimated_unpack_overhead = num_strings * 2 * 500 ; // ~500ns per unpack call (estimate from above)
404
+ std::cout << " \n Estimated struct.unpack_from overhead: ~" << estimated_unpack_overhead / 1000 << " μs" << std::endl;
405
+ std::cout << " Actual performance difference: " << (py_time.count () - cpp_time.count ()) / iterations << " μs" << std::endl;
406
+
407
+ std::cout << " \n === END OF PERFORMANCE BENCHMARK ===" << std::endl;
408
+ }
409
+
227
410
int main () {
228
411
std::cout << " Simple Deserialize Function Equivalence Test" << std::endl;
229
412
std::cout << " =============================================" << std::endl;
@@ -366,13 +549,16 @@ int main() {
366
549
std::cout << " Passed: " << passed << std::endl;
367
550
std::cout << " Failed: " << failed << std::endl;
368
551
std::cout << " Total time: " << total_time.count () << " ms" << std::endl;
369
- std::cout << " Success rate: " << std::fixed << std::setprecision (1 )
552
+ std::cout << " Success rate: " << std::fixed << std::setprecision (1 )
370
553
<< (static_cast <double >(passed) / test_cases.size () * 100 ) << " %" << std::endl;
371
-
554
+
372
555
if (passed > 0 ) {
373
556
std::cout << " \n All functional equivalence tests passed!" << std::endl;
374
557
std::cout << " The C++ and Python implementations produce identical results." << std::endl;
375
558
}
376
-
559
+
560
+ // Run the performance benchmark
561
+ benchmark_struct_unpack_operations ();
562
+
377
563
return failed > 0 ? 1 : 0 ;
378
564
}
0 commit comments