diff --git a/src/interface/multilinear.cxx b/src/interface/multilinear.cxx index e45a9c24..4c6b5ad4 100644 --- a/src/interface/multilinear.cxx +++ b/src/interface/multilinear.cxx @@ -277,7 +277,6 @@ namespace CTF { free(op_lens); free(arrs); t_tttp.stop(); - } @@ -702,6 +701,9 @@ namespace CTF { IASSERT(!mat_list[i]->is_sparse); } dtype ** arrs = (dtype**)malloc(sizeof(dtype*)*T->order); + // Note: Buffer for safe re-distribution of lst_mode[buffer] data + int arrs_barrier_len = mat_list[mode]->lens[0]*mat_list[mode]->lens[1]; + dtype * arrs_barrier = (dtype*)malloc(arrs_barrier_len*sizeof(dtype)); int64_t * ldas = (int64_t*)malloc(T->order*sizeof(int64_t)); int * phys_phase = (int*)malloc(T->order*sizeof(int)); int * mat_strides = NULL; @@ -744,6 +746,88 @@ namespace CTF { Timer t_solve_remap("Solve_remap_mats"); t_solve_remap.start(); + // Must distribute factor matrix for 'mode' in order to incorporate barrier terms into Hessian diagonals + // Distribute it first + if (mu>0){ + Tensor mmat; + Tensor * mat ; + mat = mat_list[mode]; + int64_t tot_sz; + if (is_vec) + tot_sz = T->lens[mode]; + else + tot_sz = T->lens[mode]*kd; + if (!is_vec) { + if (aux_mode_first){ + mat_strides[2*mode+0] = k; + mat_strides[2*mode+1] = 1; + } else { + mat_strides[2*mode+0] = 1; + mat_strides[2*mode+1] = T->lens[mode]; + } + } + int nrow, ncol; + if (aux_mode_first){ + nrow = kd; + ncol = T->lens[mode]; + } else { + nrow = T->lens[mode]; + ncol = kd; + } + if (phys_phase[mode] == 1){ + redist_mats[mode] = NULL; + if (T->wrld->np == 1){ + IASSERT(div == 1); + std::copy(mat_list[mode]->data,mat_list[mode]->data+arrs_barrier_len,arrs_barrier); + //arrs_barrier = (dtype*)mat_list[mode]->data; + mat->read_all(arrs_barrier, true); + } + else{ //(i!=mode) + assert(0); // TODO: might not be right + //std::copy(mat_list[mode]->data,mat_list[mode]->data+arrs_barrier_len,arrs_barrier); + arrs_barrier = (dtype*)T->sr->alloc(tot_sz); + mat->read_all(arrs_barrier, true); + } + } + else { + int topo_dim = T->edge_map[mode].cdt; + IASSERT(T->edge_map[mode].type == CTF_int::PHYSICAL_MAP); + IASSERT(!T->edge_map[mode].has_child || T->edge_map[mode].child->type != CTF_int::PHYSICAL_MAP); + if (aux_mode_first){ + mat_idx[0] = 'a'; + mat_idx[1] = par_idx[topo_dim]; + } else { + mat_idx[0] = par_idx[topo_dim]; + mat_idx[1] = 'a'; + } + + int comm_lda = 1; + for (int l=0; ltopo->dim_comm[l].np; + } + CTF_int::CommData cmdt(T->wrld->rank-comm_lda*T->topo->dim_comm[topo_dim].rank,T->topo->dim_comm[topo_dim].rank,T->wrld->cdt); + if (is_vec){ + Vector * v = new Vector(mat_list[mode]->lens[0], par_idx[topo_dim], par[par_idx], Idx_Partition(), 0, *T->wrld, *T->sr); + v->operator[]("i") += mat_list[mode]->operator[]("i"); + redist_mats[mode] = v; + arrs_barrier = (dtype*)v->data; + cmdt.bcast(v->data,v->size,T->sr->mdtype(),0); + } else { + Matrix * m = new Matrix(nrow, ncol, mat_idx, par[par_idx], Idx_Partition(), 0, *T->wrld, *T->sr); + m->operator[]("ij") += mat->operator[]("ij"); + redist_mats[mode] = m; + arrs_barrier = (dtype*)m->data; + cmdt.bcast(m->data,m->size,T->sr->mdtype(),0); + if (aux_mode_first){ + mat_strides[2*mode+0] = kd; + mat_strides[2*mode+1] = 1; + } else { + mat_strides[2*mode+0] = 1; + mat_strides[2*mode+1] = m->pad_edge_len[0]/phys_phase[mode]; + } + } + } + } for (int i=0; iorder; i++){ Tensor mmat; Tensor * mat ; @@ -882,6 +966,291 @@ namespace CTF { int I = T->pad_edge_len[mode]/T->edge_map[mode].np ; int R = mat_list[0]->lens[1-aux_mode_first]; + + Timer t_trav("Sort_nnz"); + Timer t_LHS_work("LHS_work"); + Timer t_solve_lhs("LHS_solves"); + t_trav.start() ; + + /* + std::sort(pairs,pairs+ npair,[T,phys_phase,mode,ldas](Pair i1, Pair i2){ + return (((i1.k/ldas[mode])%T->lens[mode])/phys_phase[mode] < ((i2.k/ldas[mode])%T->lens[mode])/phys_phase[mode] ) ; + }) ; + */ + + Pair * pairs_copy = (Pair*)malloc(npair*2*sizeof(int64_t)) ; + //int * indices = (int *)malloc(npair*sizeof(int64_t)) ; + int64_t * indices = (int64_t *)malloc(npair*sizeof(int64_t)) ; + int64_t * c = (int64_t *) calloc(I+1,sizeof(int64_t)); + int64_t * count = (int64_t *) calloc(I,sizeof(int64_t)); + + for (int64_t i=0; ilens[mode])/phys_phase[mode]; + ++c[indices[i]]; + ++count[indices[i]] ; + } + + for(int64_t i=1;i<=I;i++){ + c[i]+=c[i-1]; + } + + for(int64_t i=npair-1;i>=0;i--){ + pairs_copy[c[indices[i]]-1]=pairs[i]; + --c[indices[i]] ; + } + + std::copy(pairs_copy, pairs_copy+npair,pairs) ; + + free(c); + free(pairs_copy); + free(indices); + + t_trav.stop(); + + //int64_t I_s = std::ceil(float(I)/cm_size) ; + int batches = 1 ; + int64_t batched_I = I ; + int64_t max_memuse = CTF_int::proc_bytes_available() ; + int64_t I_s ; + int64_t rows ; + int buffer = 2048*5; + + while (true){ + if (max_memuse > ((std::ceil(float(I/batches)/cm_size)*cm_size*(R+1)*R) + buffer*R + 10 )*(int64_t)sizeof(dtype) *(int64_t)sizeof(dtype) ) { + break ; + } + else{ + batches +=1 ; + } + } + MPI_Allreduce(MPI_IN_PLACE, &batches, 1, MPI_INT, MPI_MAX, T->wrld->comm); + batched_I = I/batches ; + + int64_t total= 0; + for (int b =0 ; bwrld->rank); + } + + //define how the symmetric arrays are referenced, keep this consistent throughout + char* uplo = "L" ; + char* trans = "N" ; //if want to incorporate column major then change this + char* trans1 = "N" ; //if want to incorporate column major then change this + char* trans2 = "T" ; //if want to incorporate column major then change this + int scale = 1 ; + double alpha = 1.0 ; + double beta = 1.0 ; + int info =0 ; + + t_LHS_work.start(); + Timer t_scatter("Scatter and Sc_reduce"); + //int * inds = (int*)malloc(T->order*sizeof(int)); + + int sweeps ; + + //double * row = (double *) malloc(R* sizeof(double) ); + dtype * H = (dtype *) calloc(buffer*R,sizeof(dtype) ) ; + dtype * H2 = (dtype *) calloc(buffer*R,sizeof(dtype) ) ; // Added by Edgward Hutter to account for possibility of negative phi'' elements + + for (int64_t j =0 ; j< rows ; j++){ + sweeps = count[j+ b*batched_I]/buffer ; + + if (sweeps >0){ + for (int s = 0 ; s< sweeps ; s++){ +#ifdef _OPENMP + #pragma omp parallel for +#endif + for(int q = 0 ; qorder ; i++){ + if (i!=mode){ + int64_t ke= key/ldas[i]; + int index = (ke%T->lens[i])/phys_phase[i]; + CTF_int::default_vec_mul(&arrs[i][index*R], H+q*R, H+q*R, R) ; + } + } + } + CTF_BLAS::syrk(uplo,trans,&R,&buffer,&alpha,H,&R,&alpha,&LHS_list[j*R*R],&R) ; + std::fill( + H, + H+ buffer*R, + 0.); + total+=buffer ; + } + } + sweeps = count[j+ b*batched_I]%buffer ; + if (sweeps>0){ + //printf("Sweep count: %d\n",sweeps); +#ifdef _OPENMP + #pragma omp parallel for +#endif + for(int q = 0 ; qorder ; i++){ + if (i!=mode){ + int64_t ke= key/ldas[i]; + int index = (ke%T->lens[i])/phys_phase[i]; + CTF_int::default_vec_mul(&arrs[i][index*R], H+q*R, H+q*R, R) ; + CTF_int::default_vec_mul(&arrs[i][index*R], H2+q*R, H2+q*R, R) ; + } + } + } + //CTF_BLAS::syrk(uplo,trans,&R,&sweeps,&alpha,H,&R,&alpha,&LHS_list[j*R*R],&R) ; + CTF_BLAS::gemm(trans1,trans2,&R,&R,&sweeps,&alpha,H,&R,H2,&R,&beta,&LHS_list[j*R*R],&R) ; + std::fill( + H, + H+ sweeps*R, + 0.); + total+=sweeps ; + } + } + + free(H) ; + //free(inds) ; + + t_LHS_work.stop(); + + //scatter reduce left hand sides and scatter right hand sides in a buffer + int* Recv_count = (int*) malloc(sizeof(int)*cm_size) ; + std::fill( + Recv_count, + Recv_count + cm_size, + I_s*R*R); + + t_scatter.start() ; + MPI_Reduce_scatter( MPI_IN_PLACE, LHS_list, Recv_count , MPI_DOUBLE, MPI_SUM, slice_comm ); + free(Recv_count); + + for(int64_t i =0 ; i< I_s ; i++){ + for(int r = 0 ; r 0) LHS_list[R*R*i +r*R +r]+=regu2; + if (mu>0) LHS_list[R*R*i +r*R +r]+=(mu/(arrs_barrier[i*R+r]*arrs_barrier[i*R+r])); + } + } + dtype * arrs_buf = (dtype *) calloc(I_s*cm_size*R,sizeof(dtype) ); + + if (b == batches-1){ + std::copy(&arrs[mode][b*batched_I*R],&arrs[mode][I*R],arrs_buf) ; + } + else{ + std::copy(&arrs[mode][b*batched_I*R],&arrs[mode][(b+1)*batched_I*R],arrs_buf) ; + } + + if (cm_rank == 0){ + MPI_Scatter(arrs_buf, I_s*R, MPI_DOUBLE, MPI_IN_PLACE, I_s*R, + MPI_DOUBLE, 0, slice_comm); + } + else{ + MPI_Scatter(NULL, I_s*R, MPI_DOUBLE, arrs_buf, I_s*R, + MPI_DOUBLE, 0, slice_comm); + } + t_scatter.stop() ; + + //call local spd solve on I/cm_size different systems locally (avoid calling solve on padding in lhs) + int* ipiv = (int*)malloc(sizeof(int)*R); + + t_solve_lhs.start() ; + for (int i=0; ilens[mode] % T->edge_map[mode].np > 0 ) + (jr< T->lens[mode] % T->edge_map[mode].np )){ + //CTF_BLAS::posv(uplo,&R,&scale,&LHS_list[i*R*R],&R,&arrs_buf[i*R],&R,&info) ; + //CTF_BLAS::sysv(uplo,&R,&scale,&LHS_list[i*R*R],&R,ipiv,&arrs_buf[i*R],&R,&info) ; + CTF_BLAS::gesv(&R,&scale,&LHS_list[i*R*R],&R,ipiv,&arrs_buf[i*R],&R,&info) ; + //arrs_buf[i*R] /= LHS_list[i*R*R]; + if (info>0){ + printf("ref-%g, mu-%g\n",regu,mu); + for (int j=0; jorder ; j++){ + if (j==mode){ + if (redist_mats[j] != NULL){ + mat_list[j]->set_zero(); + mat_list[j]->operator[]("ij") += redist_mats[j]->operator[]("ij"); + delete redist_mats[j]; + } + else { + IASSERT((dtype*)mat_list[j]->data == arrs[j]); + } + } + else { + if (redist_mats[j] != NULL){ + if (redist_mats[j]->data != (char*)arrs[j]) + T->sr->dealloc((char*)arrs[j]); + delete redist_mats[j]; + } else { + if (arrs[j] != (dtype*)mat_list[j]->data) + T->sr->dealloc((char*)arrs[j]); + } + } + } + } + free(redist_mats); + if (mat_strides != NULL) free(mat_strides); + free(par_idx); + free(phys_phase); + free(ldas); + free(arrs_barrier); + free(arrs); + if (!T->is_sparse) + T->sr->pair_dealloc((char*)pairs); + t_solve_factor.stop(); + } + +/* int I_s = std::ceil(float(I)/cm_size) ; double * LHS_list = (double *) malloc(I_s*cm_size*R*R* sizeof(double) ); @@ -997,8 +1366,10 @@ namespace CTF { free(phys_phase); free(ldas); free(arrs); + free(arrs_barrier); if (!T->is_sparse) T->sr->pair_dealloc((char*)pairs); t_solve_factor.stop(); } +*/ } diff --git a/src/interface/multilinear.h b/src/interface/multilinear.h index 0816cd64..58d00e95 100644 --- a/src/interface/multilinear.h +++ b/src/interface/multilinear.h @@ -57,7 +57,22 @@ namespace CTF { */ template - void Solve_Factor(Tensor * T, Tensor ** mat_list, Tensor * RHS, int mode, bool aux_mode_first); + void Solve_Factor(Tensor * T, Tensor ** mat_list, Tensor * RHS, int mode, bool aux_mode_first, double regu, double regu2, double mu); + + template + void Sparse_add(Tensor * T, Tensor * M,double alpha, double beta); + + template + void Sparse_mul(Tensor * T, Tensor * M); + + template + void Sparse_exp(Tensor * T); + + template + void Sparse_log(Tensor * T); + + template + void get_index_tensor(Tensor * T); } #endif diff --git a/src/shared/blas_symbs.cxx b/src/shared/blas_symbs.cxx index 36af207d..5f72efa0 100644 --- a/src/shared/blas_symbs.cxx +++ b/src/shared/blas_symbs.cxx @@ -71,6 +71,42 @@ namespace CTF_BLAS { INST_SYR(std::complex,C) INST_SYR(std::complex,Z) #undef INST_GEMM + + template + void syrk(const char * UPLO , + const char * TRANS, + const int * N , + const int * K, + const dtype * ALPHA, + const dtype * A , + const int * LDA , + const dtype * BETA, + dtype * C, + const int * LDC){ + printf("CTF ERROR SYRK not available for this type.\n"); + ASSERT(0); + assert(0); + } + +#define INST_SYRK(dtype,s) \ + template <> \ + void syrk(const char * a , \ + const char * b, \ + const int * c , \ + const int * d, \ + const dtype * e, \ + const dtype * f , \ + const int * g , \ + const dtype * h, \ + dtype * i, \ + const int * j){ \ + s ## SYRK(a,b,c,d,e,f,g,h,i,j); \ + } + INST_SYRK(float,S) + INST_SYRK(double,D) + INST_SYRK(std::complex,C) + INST_SYRK(std::complex,Z) +#undef INST_SYRK template @@ -107,6 +143,37 @@ namespace CTF_BLAS { INST_POSV(std::complex,Z) #undef INST_GEMM + template + void gesv( + const int * N, + const int * NRHS, + dtype * A, + const int * LDA, + int * IPIV, + dtype * B, + const int * LDB, + int * INFO){ + printf("CTF ERROR SYSV not available for this type.\n"); + ASSERT(0); + assert(0); + } + +#define INST_GESV(dtype,s) \ + template <> \ + void gesv(\ + const int * b, \ + const int * c, \ + dtype * d, \ + const int * e, \ + int * e1, \ + dtype * f, \ + const int * g, \ + int * h){ \ + s ## GESV(b,c,d,e,e1,f,g,h); \ + } + INST_GESV(float,S) + INST_GESV(double,D) +#undef INST_GESV #ifdef USE_BATCH_GEMM diff --git a/src/shared/blas_symbs.h b/src/shared/blas_symbs.h index cfd9ac69..3a6764cb 100644 --- a/src/shared/blas_symbs.h +++ b/src/shared/blas_symbs.h @@ -20,10 +20,16 @@ #define DSYR dsyr_ #define CSYR csyr_ #define ZSYR zsyr_ +#define SSYRK ssyrk_ +#define DSYRK dsyrk_ +#define CSYRK csyrk_ +#define ZSYRK zsyrk_ #define SPOSV sposv_ #define DPOSV dposv_ #define CPOSV cposv_ #define ZPOSV zposv_ +#define SGESV sgesv_ +#define DGESV dgesv_ #define SSCAL sscal_ #define DSCAL dscal_ #define CSCAL cscal_ @@ -49,6 +55,12 @@ #define DSYR dsyr #define CSYR csyr #define ZSYR zsyr +#define SGESV sgesv +#define DGESV dgesv +#define SSYRK ssyrk +#define DSYRK dsyrk +#define CSYRK csyrk +#define ZSYRK zsyrk #define SPOSV sposv #define DPOSV dposv #define CPOSV cposv @@ -238,8 +250,100 @@ namespace CTF_BLAS { dtype * A , const int * LDA ); + extern "C" + void SGESV( + const int * N, + const int * NRHS, + float * A, + const int * LDA, + int* ipiv, + float * B, + const int * LDB, + int * INFO); + + extern "C" + void DGESV( + const int * N, + const int * NRHS, + double * A, + const int * LDA, + int* ipiv, + double * B, + const int * LDB, + int * INFO); + template + void gesv( + const int * N, + const int * NRHS, + dtype * A, + const int * LDA, + int * IPIV, + dtype * B, + const int * LDB, + int * INFO); + + extern "C" + void SSYRK(const char * UPLO , + const char * TRANS, + const int * N , + const int * K, + const float * ALPHA, + const float * A , + const int * LDA , + const float * BETA, + float * C, + const int * LDC); + + extern "C" + void DSYRK(const char * UPLO , + const char * TRANS, + const int * N , + const int * K, + const double * ALPHA, + const double * A , + const int * LDA , + const double * BETA, + double * C, + const int * LDC); + + + extern "C" + void CSYRK(const char * UPLO , + const char * TRANS, + const int * N , + const int * K, + const std::complex * ALPHA, + const std::complex * A , + const int * LDA , + const std::complex * BETA, + std::complex * C, + const int * LDC); + + extern "C" + void ZSYRK(const char * UPLO , + const char * TRANS, + const int * N , + const int * K, + const std::complex * ALPHA, + const std::complex * A , + const int * LDA , + const std::complex * BETA, + std::complex * C, + const int * LDC); + + template + void syrk(const char * UPLO , + const char * TRANS, + const int * N , + const int * K, + const dtype * ALPHA, + const dtype * A , + const int * LDA , + const dtype * BETA, + dtype * C, + const int * LDC); extern "C" void SPOSV(char const * UPLO , diff --git a/src_python/ctf/multilinear.pyx b/src_python/ctf/multilinear.pyx index a7811191..b51912ec 100644 --- a/src_python/ctf/multilinear.pyx +++ b/src_python/ctf/multilinear.pyx @@ -7,7 +7,13 @@ from profile import timer cdef extern from "ctf.hpp" namespace "CTF": cdef void TTTP_ "CTF::TTTP"[dtype](Tensor[dtype] * T, int num_ops, int * modes, Tensor[dtype] ** mat_list, bool aux_mode_first) cdef void MTTKRP_ "CTF::MTTKRP"[dtype](Tensor[dtype] * T, Tensor[dtype] ** mat_list, int mode, bool aux_mode_first) - cdef void Solve_Factor_ "CTF::Solve_Factor"[dtype](Tensor[dtype] * T, Tensor[dtype] ** mat_list,Tensor[dtype] * RHS, int mode, bool aux_mode_first) + cdef void Solve_Factor_ "CTF::Solve_Factor"[dtype](Tensor[dtype] * T, Tensor[dtype] ** mat_list,Tensor[dtype] * RHS, int mode, bool aux_mode_first, double regu, double regu2, double mu) + cdef void Sparse_add_ "CTF::Sparse_add"[dtype](Tensor[dtype] * T, Tensor[dtype] * M, double alpha, double beta) + cdef void Sparse_mul_ "CTF::Sparse_mul"[dtype](Tensor[dtype] * T, Tensor[dtype] * M ) + cdef void Sparse_exp_ "CTF::Sparse_exp"[dtype](Tensor[dtype] * T) + cdef void Sparse_log_ "CTF::Sparse_log"[dtype](Tensor[dtype] * T) + cdef void get_index_tensor_ "CTF::get_index_tensor"[dtype](Tensor[dtype] * T) + def TTTP(tensor A, mat_list): """ TTTP(A, mat_list) @@ -128,9 +134,9 @@ def MTTKRP(tensor A, mat_list, mode): free(tsrs) t_mttkrp.stop() -def Solve_Factor(tensor A, mat_list, tensor R, mode): +def Solve_Factor(tensor A, mat_list, tensor R, mode, regu, regu2, mu): """ - Solve_Factor(A, mat_list,R, mode) + Solve_Factor(A, mat_list,R, mode, regu, regu2, mu) solves for a factor matrix parallelizing over rows given rhs, sparse tensor and list of factor matrices eg. for mode=0 order 3 tensor Computes LHS = einsum("ijk,jr,jz,kr,kz->irz",T,B,B,C,C) and solves each row with rhs in parallel @@ -178,10 +184,78 @@ def Solve_Factor(tensor A, mat_list, tensor R, mode): B = tensor(copy=A) RHS = tensor(copy=R) if A.dtype == np.float64: - Solve_Factor_[double](B.dt,tsrs,RHS.dt,A.ndim-mode-1,1) + Solve_Factor_[double](B.dt,tsrs,RHS.dt,A.ndim-mode-1,1,regu,regu2,mu) else: raise ValueError('CTF PYTHON ERROR: Solve_Factor does not support this dtype') free(tsrs) t_solve_factor.stop() +def Sparse_add(tensor A, tensor B,alpha=1.0,beta=1.0): + """ + Add two sparse tensors A and B with identical distributions + + """ + t_sp_add = timer("pySparse_add") + t_sp_add.start() + + if A.dtype == np.float64: + Sparse_add_[double](A.dt,B.dt,alpha,beta) + else: + raise ValueError('CTF PYTHON ERROR: Sparse_add does not support this dtype') + t_sp_add.stop() + +def Sparse_mul(tensor A, tensor B): + """ + Multiply two sparse tensors A and B with identical distributions + + """ + t_sp_add = timer("pySparse_mul") + t_sp_add.start() + + if A.dtype == np.float64: + Sparse_mul_[double](A.dt,B.dt) + else: + raise ValueError('CTF PYTHON ERROR: Sparse_mul does not support this dtype') + t_sp_add.stop() + +def Sparse_exp(tensor A): + """ + Multiply two sparse tensors A and B with identical distributions + + """ + t_sp_exp = timer("pySparse_exp") + t_sp_exp.start() + if A.dtype == np.float64: + Sparse_exp_[double](A.dt) + else: + raise ValueError('CTF PYTHON ERROR: Sparse_exp does not support this dtype') + t_sp_exp.stop() + +def Sparse_log(tensor A): + """ + Multiply two sparse tensors A and B with identical distributions + + """ + t_sp_log = timer("pySparse_log") + t_sp_log.start() + + if A.dtype == np.float64: + Sparse_log_[double](A.dt) + else: + raise ValueError('CTF PYTHON ERROR: Sparse_log does not support this dtype') + t_sp_log.stop() + +def get_index_tensor(tensor A): + """ + Multiply two sparse tensors A and B with identical distributions + + """ + t_ind_tnsr = timer("pyget_index_tensor") + t_ind_tnsr.start() + + if A.dtype == np.float64: + get_index_tensor_[double](A.dt) + else: + raise ValueError('CTF PYTHON ERROR: get_index_tensor does not support this dtype') + t_ind_tnsr.stop()