@@ -2637,7 +2637,6 @@ static __global__ void mul_mat_q(
26372637
26382638 ids_dst_shared[j] = j;
26392639 }
2640- __syncthreads ();
26412640
26422641 // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
26432642#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
@@ -2666,7 +2665,6 @@ static __global__ void mul_mat_q(
26662665 return ;
26672666 }
26682667
2669- // __syncthreads(); // There is no previous tile that could cause a race condition.
26702668#pragma unroll
26712669 for (int j0 = 0 ; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
26722670 const int j = j0 + threadIdx .y *WARP_SIZE + threadIdx .x ;
@@ -2677,7 +2675,6 @@ static __global__ void mul_mat_q(
26772675
26782676 ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
26792677 }
2680- __syncthreads ();
26812678 }
26822679
26832680 offset_y += (col_low + jt*mmq_x)*(sizeof (block_q8_1_mmq)/sizeof (int ));
@@ -2744,7 +2741,6 @@ static __global__ void mul_mat_q(
27442741 continue ;
27452742 }
27462743
2747- __syncthreads ();
27482744#pragma unroll
27492745 for (int j0 = 0 ; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
27502746 const int j = j0 + threadIdx .y *WARP_SIZE + threadIdx .x ;
@@ -2755,7 +2751,6 @@ static __global__ void mul_mat_q(
27552751
27562752 ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
27572753 }
2758- __syncthreads ();
27592754 }
27602755
27612756 offset_y += (col_low + jt*mmq_x)*(sizeof (block_q8_1_mmq)/sizeof (int ));
@@ -2811,7 +2806,6 @@ static __global__ void mul_mat_q(
28112806 }
28122807
28132808 // The memory layout for the fixup buffer is always contiguous, therefore reset ids:
2814- __syncthreads ();
28152809#pragma unroll
28162810 for (int j0 = 0 ; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
28172811 const int j = j0 + threadIdx .y *WARP_SIZE + threadIdx .x ;
@@ -2822,7 +2816,6 @@ static __global__ void mul_mat_q(
28222816
28232817 ids_dst_shared[j] = j;
28242818 }
2825- __syncthreads ();
28262819 }
28272820
28282821 offset_y += (col_low + jt*mmq_x)*(sizeof (block_q8_1_mmq)/sizeof (int ));
0 commit comments