3939#define  cudaDeviceCanAccessPeer  hipDeviceCanAccessPeer
4040#define  cudaDeviceDisablePeerAccess  hipDeviceDisablePeerAccess
4141#define  cudaDeviceEnablePeerAccess  hipDeviceEnablePeerAccess
42- #define  cudaDeviceGetMemPool  hipDeviceGetMemPool
43- #define  cudaMemPoolAttrReleaseThreshold  hipMemPoolAttrReleaseThreshold
44- #define  cudaMemPoolSetAttribute  hipMemPoolSetAttribute
45- #define  cudaMemPool_t  hipMemPool_t
4642#define  cudaDeviceProp  hipDeviceProp_t
4743#define  cudaDeviceSynchronize  hipDeviceSynchronize
4844#define  cudaError_t  hipError_t
5248#define  cudaEvent_t  hipEvent_t
5349#define  cudaEventDestroy  hipEventDestroy
5450#define  cudaFree  hipFree
55- #define  cudaFreeAsync  hipFreeAsync
5651#define  cudaFreeHost  hipHostFree
5752#define  cudaGetDevice  hipGetDevice
5853#define  cudaGetDeviceCount  hipGetDeviceCount
5954#define  cudaGetDeviceProperties  hipGetDeviceProperties
6055#define  cudaGetErrorString  hipGetErrorString
6156#define  cudaGetLastError  hipGetLastError
6257#define  cudaMalloc  hipMalloc
63- #define  cudaMallocFromPoolAsync  hipMallocFromPoolAsync
6458#define  cudaMallocHost (ptr, size ) hipHostMalloc(ptr, size, hipHostMallocDefault)
6559#define  cudaMemcpy  hipMemcpy
6660#define  cudaMemcpy2DAsync  hipMemcpy2DAsync
@@ -187,11 +181,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
187181    do  {                                                                                \
188182        cudaError_t err_ = (err);                                                       \
189183        if  (err_ != cudaSuccess) {                                                      \
190-             int  dev_id ;                                                                     \
191-             cudaGetDevice (&dev_id );                                                         \
184+             int  id ;                                                                     \
185+             cudaGetDevice (&id );                                                         \
192186            fprintf (stderr, " \n CUDA error %d at %s:%d: %s\n "  , err_, __FILE__, __LINE__, \
193187                cudaGetErrorString (err_));                                              \
194-             fprintf (stderr, " current device: %d\n "  , dev_id );                                \
188+             fprintf (stderr, " current device: %d\n "  , id );                                \
195189            exit (1 );                                                                    \
196190        }                                                                               \
197191    } while  (0 )
@@ -201,11 +195,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
201195    do  {                                                                                \
202196        cublasStatus_t err_ = (err);                                                    \
203197        if  (err_ != CUBLAS_STATUS_SUCCESS) {                                            \
204-             int  dev_id ;                                                                     \
205-             cudaGetDevice (&dev_id );                                                         \
198+             int  id ;                                                                     \
199+             cudaGetDevice (&id );                                                         \
206200            fprintf (stderr, " \n cuBLAS error %d at %s:%d: %s\n "  ,                         \
207201                    err_, __FILE__, __LINE__, cublasGetStatusString (err_));             \
208-             fprintf (stderr, " current device: %d\n "  , dev_id );                                \
202+             fprintf (stderr, " current device: %d\n "  , id );                                \
209203            exit (1 );                                                                    \
210204        }                                                                               \
211205    } while  (0 )
@@ -471,7 +465,6 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
471465
472466#define  MAX_STREAMS  8 
473467static  cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr  };
474- static  cudaMemPool_t g_cudaMemPools[GGML_CUDA_MAX_DEVICES] = { nullptr  };
475468
476469struct  ggml_tensor_extra_gpu  {
477470    void  * data_device[GGML_CUDA_MAX_DEVICES]; //  1 pointer for each device for split tensors
@@ -5780,16 +5773,6 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
57805773    return  ptr;
57815774}
57825775
5783- static  void  * ggml_cuda_pool_malloc_async (size_t  size, size_t  * actual_size, int  id, cudaStream_t stream) {
5784-     if  (g_cudaMemPools[id] == nullptr ) {
5785-         return  ggml_cuda_pool_malloc (size, actual_size);
5786-     }
5787-     void  *ptr;
5788-     CUDA_CHECK (cudaMallocFromPoolAsync (&ptr, size, g_cudaMemPools[id], stream));
5789-     *actual_size = size;
5790-     return  ptr;
5791- }
5792- 
57935776static  void  ggml_cuda_pool_free (void  * ptr, size_t  size) {
57945777    scoped_spin_lock lock (g_cuda_pool_lock);
57955778    int  id;
@@ -5808,13 +5791,6 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
58085791}
58095792
58105793
5811- static  void  ggml_cuda_pool_free_async (void  * ptr, size_t  actual_size, int  id, cudaStream_t stream) {
5812-     if  (g_cudaMemPools[id] == nullptr ) {
5813-         return  ggml_cuda_pool_free (ptr, actual_size);
5814-     }
5815-     CUDA_CHECK (cudaFreeAsync (ptr, stream));
5816- }
5817- 
58185794void  ggml_init_cublas () {
58195795    static  bool  initialized = false ;
58205796
@@ -5869,13 +5845,6 @@ void ggml_init_cublas() {
58695845            //  create cublas handle
58705846            CUBLAS_CHECK (cublasCreate (&g_cublas_handles[id]));
58715847            CUBLAS_CHECK (cublasSetMathMode (g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
5872- 
5873-             //  configure memory pool
5874-             cudaError_t err = cudaDeviceGetMemPool (&g_cudaMemPools[id], id);
5875-             if  (err == cudaSuccess) {
5876-                 size_t  treshold = UINT64_MAX;
5877-                 CUDA_CHECK (cudaMemPoolSetAttribute (g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &treshold));
5878-             }
58795848        }
58805849
58815850        //  configure logging to stdout
@@ -6469,7 +6438,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
64696438            const  to_fp16_cuda_t  to_fp16_cuda = ggml_get_to_fp16_cuda (src0->type );
64706439            GGML_ASSERT (to_fp16_cuda != nullptr );
64716440            size_t  ne = row_diff*ne00;
6472-             src0_as_f16 = (half *) ggml_cuda_pool_malloc_async (ne * sizeof (half), &src0_as, id, stream );
6441+             src0_as_f16 = (half *) ggml_cuda_pool_malloc (ne * sizeof (half), &src0_as);
64736442            to_fp16_cuda (src0_dd_i, src0_as_f16, ne, stream);
64746443        }
64756444        const  half * src0_ptr = src0->type  == GGML_TYPE_F16 ? (const  half *) src0_dd_i : src0_as_f16;
@@ -6480,12 +6449,13 @@ inline void ggml_cuda_op_mul_mat_cublas(
64806449            const  to_fp16_cuda_t  to_fp16_cuda = ggml_get_to_fp16_cuda (src1->type );
64816450            GGML_ASSERT (to_fp16_cuda != nullptr );
64826451            size_t  ne = src1_ncols*ne10;
6483-             src1_as_f16 = (half *) ggml_cuda_pool_malloc_async (ne * sizeof (half), &src1_as, id, stream );
6452+             src1_as_f16 = (half *) ggml_cuda_pool_malloc (ne * sizeof (half), &src1_as);
64846453            to_fp16_cuda (src1_ddf_i, src1_as_f16, ne, stream);
64856454        }
64866455        const  half * src1_ptr = src1->type  == GGML_TYPE_F16 ? (const  half *) src1_ddq_i : src1_as_f16;
6487-         size_t  dst_f16_as = 0 ;
6488-         half * dst_f16 = (half *) ggml_cuda_pool_malloc_async (row_diff*src1_ncols * sizeof (half), &dst_f16_as, id, stream);
6456+ 
6457+         size_t  dst_as = 0 ;
6458+         half * dst_f16 = (half *) ggml_cuda_pool_malloc (row_diff*src1_ncols * sizeof (half), &dst_as);
64896459
64906460        const  half alpha_f16 = 1 .0f ;
64916461        const  half beta_f16 = 0 .0f ;
@@ -6503,15 +6473,14 @@ inline void ggml_cuda_op_mul_mat_cublas(
65036473        const  to_fp32_cuda_t  to_fp32_cuda = ggml_get_to_fp32_cuda (GGML_TYPE_F16);
65046474        to_fp32_cuda (dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
65056475
6506-         if  (dst_f16_as != 0 ) {
6507-             ggml_cuda_pool_free_async (dst_f16, dst_f16_as, id, stream);
6508-         }
6476+         ggml_cuda_pool_free (dst_f16, dst_as);
65096477
65106478        if  (src0_as != 0 ) {
6511-             ggml_cuda_pool_free_async (src0_as_f16, src0_as, id, stream );
6479+             ggml_cuda_pool_free (src0_as_f16, src0_as);
65126480        }
6481+ 
65136482        if  (src1_as != 0 ) {
6514-             ggml_cuda_pool_free_async (src1_as_f16, src1_as, id, stream );
6483+             ggml_cuda_pool_free (src1_as_f16, src1_as);
65156484        }
65166485    }
65176486    else  {
@@ -6521,7 +6490,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
65216490        if  (src0->type  != GGML_TYPE_F32) {
65226491            const  to_fp32_cuda_t  to_fp32_cuda = ggml_get_to_fp32_cuda (src0->type );
65236492            GGML_ASSERT (to_fp32_cuda != nullptr );
6524-             src0_ddq_as_f32 = (float  *) ggml_cuda_pool_malloc_async (row_diff*ne00 * sizeof (float ), &src0_as, id, stream ); //  NOLINT
6493+             src0_ddq_as_f32 = (float  *) ggml_cuda_pool_malloc (row_diff*ne00 * sizeof (float ), &src0_as); //  NOLINT
65256494            to_fp32_cuda (src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
65266495        }
65276496        const  float  * src0_ddf_i = src0->type  == GGML_TYPE_F32 ? (const  float  *) src0_dd_i : src0_ddq_as_f32;
@@ -6538,7 +6507,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
65386507                    &beta,  dst_dd_i,   ldc));
65396508
65406509        if  (src0_as != 0 ) {
6541-             ggml_cuda_pool_free_async (src0_ddq_as_f32, src0_as, id, stream );
6510+             ggml_cuda_pool_free (src0_ddq_as_f32, src0_as);
65426511        }
65436512    }
65446513
@@ -6961,30 +6930,29 @@ static void ggml_cuda_op_mul_mat(
69616930            src0_dd[id] = (char  *) src0_extra->data_device [id];
69626931        } else  {
69636932            const  size_t  size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes (src0);
6964-             src0_dd[id] = (char  *) ggml_cuda_pool_malloc_async (ggml_nbytes (src0), &src0_as[id], id, stream );
6933+             src0_dd[id] = (char  *) ggml_cuda_pool_malloc (ggml_nbytes (src0), &src0_as[id]);
69656934        }
69666935
69676936        if  (src1_on_device && src1_is_contiguous) {
69686937            src1_ddf[id] = (float  *) src1_extra->data_device [id];
69696938        } else  {
6970-             src1_ddf[id] = (float  *) ggml_cuda_pool_malloc_async (ggml_nbytes (src1), &src1_asf[id], id, stream );
6939+             src1_ddf[id] = (float  *) ggml_cuda_pool_malloc (ggml_nbytes (src1), &src1_asf[id]);
69716940        }
69726941
69736942        if  (convert_src1_to_q8_1) {
6974-             const  size_t  size_dst_ddq = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
6975-             src1_ddq[id] = (char  *) ggml_cuda_pool_malloc_async (size_dst_ddq, &src1_asq[id], id, stream);
6943+             src1_ddq[id] = (char  *) ggml_cuda_pool_malloc (nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
69766944
69776945            if  (src1_on_device && src1_is_contiguous) {
69786946                quantize_row_q8_1_cuda (src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
6979-                 //   CUDA_CHECK(cudaGetLastError());
6947+                 CUDA_CHECK (cudaGetLastError ());
69806948            }
69816949        }
69826950
69836951        if  (dst_on_device) {
69846952            dst_dd[id] = (float  *) dst_extra->data_device [id];
69856953        } else  {
69866954            const  size_t  size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof (float ) : ggml_nbytes (dst);
6987-             dst_dd[id] = (float  *) ggml_cuda_pool_malloc_async (size_dst_ddf, &dst_as[id], id,  stream );
6955+             dst_dd[id] = (float  *) ggml_cuda_pool_malloc (size_dst_ddf, &dst_as[id]);
69886956        }
69896957    }
69906958
@@ -7110,6 +7078,24 @@ static void ggml_cuda_op_mul_mat(
71107078        }
71117079    }
71127080
7081+     for  (int64_t  id = 0 ; id < g_device_count; ++id) {
7082+         CUDA_CHECK (ggml_cuda_set_device (id));
7083+ 
7084+         //  free buffers again when done
7085+         if  (src0_as[id] > 0 ) {
7086+             ggml_cuda_pool_free (src0_dd[id], src0_as[id]);
7087+         }
7088+         if  (src1_asf[id] > 0 ) {
7089+             ggml_cuda_pool_free (src1_ddf[id], src1_asf[id]);
7090+         }
7091+         if  (src1_asq[id] > 0 ) {
7092+             ggml_cuda_pool_free (src1_ddq[id], src1_asq[id]);
7093+         }
7094+         if  (dst_as[id] > 0 ) {
7095+             ggml_cuda_pool_free (dst_dd[id], dst_as[id]);
7096+         }
7097+     }
7098+ 
71137099    //  main device waits for all other devices to be finished
71147100    if  (split && g_device_count > 1 ) {
71157101        int64_t  is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1 ) / MUL_MAT_SRC1_COL_STRIDE;
@@ -7127,21 +7113,6 @@ static void ggml_cuda_op_mul_mat(
71277113        CUDA_CHECK (ggml_cuda_set_device (g_main_device));
71287114        CUDA_CHECK (cudaDeviceSynchronize ());
71297115    }
7130- 
7131-     for  (int64_t  id = 0 ; id < g_device_count; ++id) {
7132-         if  (src0_as[id] > 0 ) {
7133-             ggml_cuda_pool_free_async (src0_dd[id], src0_as[id], id, g_cudaStreams[id][0 ]);
7134-         }
7135-         if  (src1_asf[id] > 0 ) {
7136-             ggml_cuda_pool_free_async (src1_ddf[id], src1_asf[id], id, g_cudaStreams[id][0 ]);
7137-         }
7138-         if  (src1_asq[id] > 0 ) {
7139-             ggml_cuda_pool_free_async (src1_ddq[id], src1_asq[id], id, g_cudaStreams[id][0 ]);
7140-         }
7141-         if  (dst_as[id] > 0 ) {
7142-             ggml_cuda_pool_free_async (dst_dd[id], dst_as[id], id, g_cudaStreams[id][0 ]);
7143-         }
7144-     }
71457116}
71467117
71477118static  void  ggml_cuda_repeat (const  ggml_tensor * src0, const  ggml_tensor * src1, ggml_tensor * dst) {
@@ -7328,11 +7299,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
73287299    GGML_ASSERT (to_fp16_cuda != nullptr );
73297300
73307301    size_t  src1_as = 0 ;
7331-     half * src1_as_f16 = (half *) ggml_cuda_pool_malloc_async (ne1 * sizeof (half), &src1_as, id, main_stream );
7302+     half * src1_as_f16 = (half *) ggml_cuda_pool_malloc (ne1 * sizeof (half), &src1_as);
73327303    to_fp16_cuda (src1_ddf, src1_as_f16, ne1, main_stream);
73337304
73347305    size_t  dst_as = 0 ;
7335-     half * dst_f16 = (half *) ggml_cuda_pool_malloc_async (ne * sizeof (half), &dst_as, id, main_stream );
7306+     half * dst_f16 = (half *) ggml_cuda_pool_malloc (ne * sizeof (half), &dst_as);
73367307
73377308    GGML_ASSERT (ne12 % ne02 == 0 );
73387309    GGML_ASSERT (ne13 % ne03 == 0 );
@@ -7386,8 +7357,8 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
73867357        size_t  ptrs_src_s = 0 ;
73877358        size_t  ptrs_dst_s = 0 ;
73887359
7389-         ptrs_src = (const  void  **) ggml_cuda_pool_malloc_async (2 *ne23*sizeof (void  *), &ptrs_src_s, id, main_stream );
7390-         ptrs_dst = (      void  **) ggml_cuda_pool_malloc_async (1 *ne23*sizeof (void  *), &ptrs_dst_s, id, main_stream );
7360+         ptrs_src = (const  void  **) ggml_cuda_pool_malloc (2 *ne23*sizeof (void  *), &ptrs_src_s);
7361+         ptrs_dst = (      void  **) ggml_cuda_pool_malloc (1 *ne23*sizeof (void  *), &ptrs_dst_s);
73917362
73927363        dim3  block_dims (ne13, ne12);
73937364        k_compute_batched_ptrs<<<1 , block_dims, 0 , main_stream>>> (
@@ -7400,6 +7371,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
74007371                dst->nb [2 ], dst->nb [3 ],
74017372                r2, r3);
74027373        CUDA_CHECK (cudaGetLastError ());
7374+ 
74037375        CUBLAS_CHECK (
74047376        cublasGemmBatchedEx (g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
74057377                ne01, ne11, ne10,
@@ -7411,22 +7383,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
74117383                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
74127384
74137385        if  (ptrs_src_s != 0 ) {
7414-             ggml_cuda_pool_free_async (ptrs_src, ptrs_src_s, id, main_stream );
7386+             ggml_cuda_pool_free (ptrs_src, ptrs_src_s);
74157387        }
74167388        if  (ptrs_dst_s != 0 ) {
7417-             ggml_cuda_pool_free_async (ptrs_dst, ptrs_dst_s, id, main_stream );
7389+             ggml_cuda_pool_free (ptrs_dst, ptrs_dst_s);
74187390        }
74197391    }
74207392#endif 
74217393
74227394    const  to_fp32_cuda_t  to_fp32_cuda = ggml_get_to_fp32_cuda (GGML_TYPE_F16);
74237395    to_fp32_cuda (dst_f16, dst_ddf, ne, main_stream);
7424-     if  (src1_as != 0 ) {
7425-         ggml_cuda_pool_free_async (src1_as_f16, src1_as, id, main_stream);
7426-     }
7427-     if  (dst_as != 0 ) {
7428-         ggml_cuda_pool_free_async (dst_f16, dst_as, id, main_stream);
7429-     }
7396+ 
7397+     ggml_cuda_pool_free (src1_as_f16, src1_as);
7398+     ggml_cuda_pool_free (dst_f16, dst_as);
74307399}
74317400
74327401static  void  ggml_cuda_mul_mat (const  ggml_tensor * src0, const  ggml_tensor * src1, ggml_tensor * dst) {
0 commit comments