@@ -477,7 +477,7 @@ void initConfigBindings(nb::module_& m)
477
477
c.getExtendedRuntimePerfKnobConfig (), c.getDebugConfig (), c.getRecvPollPeriodMs (),
478
478
c.getMaxSeqIdleMicroseconds (), c.getSpecDecConfig (), c.getGuidedDecodingConfig (),
479
479
c.getAdditionalModelOutputs (), c.getCacheTransceiverConfig (), c.getGatherGenerationLogits (),
480
- c.getPromptTableOffloading (), c.getEnableTrtOverlap ());
480
+ c.getPromptTableOffloading (), c.getEnableTrtOverlap (), c. getFailFastOnAttentionWindowTooLarge () );
481
481
auto pickle_tuple = nb::make_tuple (cpp_states, nb::getattr (self, " __dict__" ));
482
482
return pickle_tuple;
483
483
};
@@ -490,7 +490,7 @@ void initConfigBindings(nb::module_& m)
490
490
}
491
491
492
492
auto cpp_states = nb::cast<nb::tuple>(state[0 ]);
493
- if (cpp_states.size () != 28 )
493
+ if (cpp_states.size () != 29 )
494
494
{
495
495
throw std::runtime_error (" Invalid cpp_states!" );
496
496
}
@@ -525,7 +525,8 @@ void initConfigBindings(nb::module_& m)
525
525
nb::cast<std::optional<tle::CacheTransceiverConfig>>(cpp_states[24 ]), // CacheTransceiverConfig
526
526
nb::cast<bool >(cpp_states[25 ]), // GatherGenerationLogits
527
527
nb::cast<bool >(cpp_states[26 ]), // PromptTableOffloading
528
- nb::cast<bool >(cpp_states[27 ]) // EnableTrtOverlap
528
+ nb::cast<bool >(cpp_states[27 ]), // EnableTrtOverlap
529
+ nb::cast<bool >(cpp_states[28 ]) // FailFastOnAttentionWindowTooLarge
529
530
);
530
531
531
532
// Restore Python data
@@ -564,7 +565,8 @@ void initConfigBindings(nb::module_& m)
564
565
std::optional<tle::CacheTransceiverConfig>, // CacheTransceiverConfig
565
566
bool , // GatherGenerationLogits
566
567
bool , // PromptTableOffloading
567
- bool // EnableTrtOverlap
568
+ bool , // EnableTrtOverlap
569
+ bool // FailFastOnAttentionWindowTooLarge
568
570
>(),
569
571
nb::arg (" max_beam_width" ) = 1 , nb::arg (" scheduler_config" ) = tle::SchedulerConfig (),
570
572
nb::arg (" kv_cache_config" ) = tle::KvCacheConfig (), nb::arg (" enable_chunked_context" ) = false ,
@@ -582,7 +584,7 @@ void initConfigBindings(nb::module_& m)
582
584
nb::arg (" spec_dec_config" ) = nb::none (), nb::arg (" guided_decoding_config" ) = nb::none (),
583
585
nb::arg (" additional_model_outputs" ) = nb::none (), nb::arg (" cache_transceiver_config" ) = nb::none (),
584
586
nb::arg (" gather_generation_logits" ) = false , nb::arg (" mm_embedding_offloading" ) = false ,
585
- nb::arg (" enable_trt_overlap" ) = false )
587
+ nb::arg (" enable_trt_overlap" ) = false , nb::arg ( " fail_fast_on_attention_window_too_large " ) = false )
586
588
.def_prop_rw (" max_beam_width" , &tle::ExecutorConfig::getMaxBeamWidth, &tle::ExecutorConfig::setMaxBeamWidth)
587
589
.def_prop_rw (" max_batch_size" , &tle::ExecutorConfig::getMaxBatchSize, &tle::ExecutorConfig::setMaxBatchSize)
588
590
.def_prop_rw (" max_num_tokens" , &tle::ExecutorConfig::getMaxNumTokens, &tle::ExecutorConfig::setMaxNumTokens)
@@ -632,6 +634,9 @@ void initConfigBindings(nb::module_& m)
632
634
&tle::ExecutorConfig::setPromptTableOffloading)
633
635
.def_prop_rw (
634
636
" enable_trt_overlap" , &tle::ExecutorConfig::getEnableTrtOverlap, &tle::ExecutorConfig::setEnableTrtOverlap)
637
+ .def_prop_rw (" fail_fast_on_attention_window_too_large" ,
638
+ &tle::ExecutorConfig::getFailFastOnAttentionWindowTooLarge,
639
+ &tle::ExecutorConfig::setFailFastOnAttentionWindowTooLarge)
635
640
.def (" __getstate__" , executorConfigGetState)
636
641
.def (" __setstate__" , executorConfigSetState);
637
642
}
0 commit comments