triton-inference-server · BasicCoder · Nov 3, 2023 · Nov 3, 2023
diff --git a/all_models/inflight_batcher_llm/ensemble/config.pbtxt b/all_models/inflight_batcher_llm/ensemble/config.pbtxt
@@ -241,6 +241,10 @@ ensemble_scheduling {
         key: "TOKENS_BATCH"
         value: "_TOKENS_BATCH"
       }
+      input_map {
+        key: "REQUEST_INPUT_LEN"
+        value: "_REQUEST_INPUT_LEN"
+      }
       input_map {
         key: "SEQUENCE_LENGTH"
         value: "_SEQUENCE_LENGTH"

diff --git a/all_models/inflight_batcher_llm/postprocessing/1/model.py b/all_models/inflight_batcher_llm/postprocessing/1/model.py
@@ -109,6 +109,8 @@ def execute(self, requests):
             tokens_batch = pb_utils.get_input_tensor_by_name(
                 request, 'TOKENS_BATCH').as_numpy()
 
+            request_input_lens = pb_utils.get_input_tensor_by_name(
+                request, 'REQUEST_INPUT_LEN').as_numpy()
             # Get sequence length
             sequence_lengths = pb_utils.get_input_tensor_by_name(
                 request, 'SEQUENCE_LENGTH').as_numpy()
@@ -118,7 +120,7 @@ def execute(self, requests):
             # tokens_batch = tokens_batch.T
 
             # Postprocessing output data.
-            outputs = self._postprocessing(tokens_batch, sequence_lengths)
+            outputs = self._postprocessing(tokens_batch, request_input_lens, sequence_lengths)
 
             # Create output tensors. You need pb_utils.Tensor
             # objects to create pb_utils.InferenceResponse.
@@ -148,11 +150,24 @@ def finalize(self):
         """
         print('Cleaning up...')
 
-    def _postprocessing(self, tokens_batch, sequence_lengths):
+    def _postprocessing(self, tokens_batch, request_input_lens, sequence_lengths):
+        """Decode the generated part of each beam into UTF-8 bytes.
+
+        tokens_batch is iterated as [batch][beam] token sequences;
+        request_input_lens[batch_idx][0] is the number of leading tokens
+        skipped for that batch entry, and sequence_lengths[batch_idx][beam_idx]
+        bounds the end of the decoded slice. Returns a flat list with one
+        encoded string per (batch, beam) pair, in iteration order.
+        """
         outputs = []
         for batch_idx, beam_tokens in enumerate(tokens_batch):
+            # The input length depends only on the batch entry, so read it
+            # once here instead of once per beam.
+            request_input_len = int(request_input_lens[batch_idx][0])
             for beam_idx, tokens in enumerate(beam_tokens):
                 seq_len = sequence_lengths[batch_idx][beam_idx]
-                output = self.tokenizer.decode(tokens[:seq_len])
+                # The end bound is seq_len - 1 (but at least 1), so the last counted
+                # token is not decoded. NOTE(review): presumably the end token - confirm.
+                output = self.tokenizer.decode(tokens[request_input_len:max(seq_len - 1, 1)])
                 outputs.append(output.encode('utf8'))
         return outputs
diff --git a/all_models/inflight_batcher_llm/postprocessing/config.pbtxt b/all_models/inflight_batcher_llm/postprocessing/config.pbtxt
@@ -33,6 +33,11 @@ input [
     data_type: TYPE_INT32
     dims: [ -1, -1 ]
   },
+  {
+    name: "REQUEST_INPUT_LEN"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  },
   {
     name: "SEQUENCE_LENGTH"
     data_type: TYPE_INT32

diff --git a/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt b/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt
@@ -29,7 +29,7 @@ backend: "tensorrtllm"
 max_batch_size: 128
 
 model_transaction_policy {
-  decoupled: ${decoupled_mode}
+  decoupled: False
 }
 
 input [

diff --git a/tools/inflight_batcher_llm/end_to_end_test.py b/tools/inflight_batcher_llm/end_to_end_test.py
@@ -46,13 +46,13 @@ def test_functionality(client, prompts, output_lens):
         ]
         result = client.infer(model_name, inputs, request_id=str(i))
         output0 = result.as_numpy("INPUT_ID")
-        output1 = result.as_numpy("REQUEST_INPUT_LEN")
+        request_input_len = result.as_numpy("REQUEST_INPUT_LEN")
         output2 = result.as_numpy("REQUEST_OUTPUT_LEN")
 
         model_name = "tensorrt_llm"
         inputs = [
             utils.prepare_tensor("input_ids", output0, FLAGS.protocol),
-            utils.prepare_tensor("input_lengths", output1, FLAGS.protocol),
+            utils.prepare_tensor("input_lengths", request_input_len, FLAGS.protocol),
             utils.prepare_tensor("request_output_len", output2,
                                  FLAGS.protocol),
         ]
@@ -63,11 +63,12 @@ def test_functionality(client, prompts, output_lens):
         model_name = "postprocessing"
         inputs = [
             utils.prepare_tensor("TOKENS_BATCH", output0, FLAGS.protocol),
+            utils.prepare_tensor("REQUEST_INPUT_LEN", request_input_len, FLAGS.protocol),
             utils.prepare_tensor("SEQUENCE_LENGTH", seq_lengths,
                                  FLAGS.protocol)
         ]
         inputs[0].set_data_from_numpy(output0)
-        inputs[1].set_data_from_numpy(seq_lengths)
+        inputs[2].set_data_from_numpy(seq_lengths)
 
         result = client.infer(model_name, inputs, request_id=str(i))
         output0 = result.as_numpy("OUTPUT")