@@ -37,114 +37,114 @@ double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
         vfloat32m2_t vx, vy;
         unsigned int gvl = 0;
         vfloat64m1_t v_res, v_z0;
-        gvl = vsetvlmax_e64m1();
-        v_res = vfmv_v_f_f64m1(0, gvl);
-        v_z0 = vfmv_v_f_f64m1(0, gvl);
+        gvl = __riscv_vsetvlmax_e64m1();
+        v_res = __riscv_vfmv_v_f_f64m1(0, gvl);
+        v_z0 = __riscv_vfmv_v_f_f64m1(0, gvl);
 
         if(inc_x == 1 && inc_y == 1){
-                gvl = vsetvl_e64m4(n);
-                vr = vfmv_v_f_f64m4(0, gvl);
+                gvl = __riscv_vsetvl_e64m4(n);
+                vr = __riscv_vfmv_v_f_f64m4(0, gvl);
                 for(i=0,j=0; i<n/gvl; i++){
-                        vx = vle32_v_f32m2(&x[j], gvl);
-                        vy = vle32_v_f32m2(&y[j], gvl);
-                        vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+                        vx = __riscv_vle32_v_f32m2(&x[j], gvl);
+                        vy = __riscv_vle32_v_f32m2(&y[j], gvl);
+                        vr = __riscv_vfwmacc_vv_f64m4(vr, vx, vy, gvl);
                         j += gvl;
                 }
                 if(j > 0){
-                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+                        v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+                        dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
                 }
                 //tail
                 if(j < n){
-                        gvl = vsetvl_e64m4(n-j);
-                        vx = vle32_v_f32m2(&x[j], gvl);
-                        vy = vle32_v_f32m2(&y[j], gvl);
-                        vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
-                        //vr = vfdot_vv_f32m2(vx, vy, gvl);
-                        vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
-                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+                        gvl = __riscv_vsetvl_e64m4(n-j);
+                        vx = __riscv_vle32_v_f32m2(&x[j], gvl);
+                        vy = __riscv_vle32_v_f32m2(&y[j], gvl);
+                        vfloat64m4_t vz = __riscv_vfmv_v_f_f64m4(0, gvl);
+                        //vr = __riscv_vfdot_vv_f32m2(vx, vy, gvl);
+                        vr = __riscv_vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+                        v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+                        dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
                 }
         }else if(inc_y == 1){
-                gvl = vsetvl_e64m4(n);
-                vr = vfmv_v_f_f64m4(0, gvl);
+                gvl = __riscv_vsetvl_e64m4(n);
+                vr = __riscv_vfmv_v_f_f64m4(0, gvl);
                 int stride_x = inc_x * sizeof(FLOAT);
                 for(i=0,j=0; i<n/gvl; i++){
-                        vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
-                        vy = vle32_v_f32m2(&y[j], gvl);
-                        vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+                        vx = __riscv_vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+                        vy = __riscv_vle32_v_f32m2(&y[j], gvl);
+                        vr = __riscv_vfwmacc_vv_f64m4(vr, vx, vy, gvl);
                         j += gvl;
                 }
                 if(j > 0){
-                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+                        v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+                        dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
 
                 }
                 //tail
                 if(j < n){
-                        gvl = vsetvl_e64m4(n-j);
-                        vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
-                        vy = vle32_v_f32m2(&y[j], gvl);
-                        vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
-                        //vr = vfdot_vv_f32m2(vx, vy, gvl);
-                        vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
-                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+                        gvl = __riscv_vsetvl_e64m4(n-j);
+                        vx = __riscv_vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+                        vy = __riscv_vle32_v_f32m2(&y[j], gvl);
+                        vfloat64m4_t vz = __riscv_vfmv_v_f_f64m4(0, gvl);
+                        //vr = __riscv_vfdot_vv_f32m2(vx, vy, gvl);
+                        vr = __riscv_vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+                        v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+                        dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
 
                 }
         }else if(inc_x == 1){
-                gvl = vsetvl_e64m4(n);
-                vr = vfmv_v_f_f64m4(0, gvl);
+                gvl = __riscv_vsetvl_e64m4(n);
+                vr = __riscv_vfmv_v_f_f64m4(0, gvl);
                 int stride_y = inc_y * sizeof(FLOAT);
                 for(i=0,j=0; i<n/gvl; i++){
-                        vx = vle32_v_f32m2(&x[j], gvl);
-                        vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
-                        vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+                        vx = __riscv_vle32_v_f32m2(&x[j], gvl);
+                        vy = __riscv_vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+                        vr = __riscv_vfwmacc_vv_f64m4(vr, vx, vy, gvl);
                         j += gvl;
                 }
                 if(j > 0){
-                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+                        v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+                        dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
 
                 }
                 //tail
                 if(j < n){
-                        gvl = vsetvl_e64m4(n-j);
-                        vx = vle32_v_f32m2(&x[j], gvl);
-                        vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
-                        vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
-                        //vr = vfdot_vv_f32m2(vx, vy, gvl);
-                        vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
-                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+                        gvl = __riscv_vsetvl_e64m4(n-j);
+                        vx = __riscv_vle32_v_f32m2(&x[j], gvl);
+                        vy = __riscv_vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+                        vfloat64m4_t vz = __riscv_vfmv_v_f_f64m4(0, gvl);
+                        //vr = __riscv_vfdot_vv_f32m2(vx, vy, gvl);
+                        vr = __riscv_vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+                        v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+                        dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
 
                 }
         }else{
-                gvl = vsetvl_e64m4(n);
-                vr = vfmv_v_f_f64m4(0, gvl);
+                gvl = __riscv_vsetvl_e64m4(n);
+                vr = __riscv_vfmv_v_f_f64m4(0, gvl);
                 int stride_x = inc_x * sizeof(FLOAT);
                 int stride_y = inc_y * sizeof(FLOAT);
                 for(i=0,j=0; i<n/gvl; i++){
-                        vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
-                        vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
-                        vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+                        vx = __riscv_vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+                        vy = __riscv_vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+                        vr = __riscv_vfwmacc_vv_f64m4(vr, vx, vy, gvl);
                         j += gvl;
                 }
                 if(j > 0){
-                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+                        v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+                        dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
 
                 }
                 //tail
                 if(j < n){
-                        gvl = vsetvl_e64m4(n-j);
-                        vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
-                        vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
-                        vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
-                        //vr = vfdot_vv_f32m2(vx, vy, gvl);
-                        vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
-                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
-                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+                        gvl = __riscv_vsetvl_e64m4(n-j);
+                        vx = __riscv_vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+                        vy = __riscv_vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+                        vfloat64m4_t vz = __riscv_vfmv_v_f_f64m4(0, gvl);
+                        //vr = __riscv_vfdot_vv_f32m2(vx, vy, gvl);
+                        vr = __riscv_vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+                        v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl);
+                        dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res);
 
                 }
         }
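For context, the patch moves the kernel to the `__riscv_`-prefixed RVV intrinsics, where the non-policy `vfredusum` form takes only the source vector, the scalar operand, and `vl` (no destination/maskedoff argument). Below is a minimal standalone sketch of the same widening dot-product pattern under those intrinsics; the helper name `dot_f32_widen` and its contiguous-only signature are illustrative and not part of the patch, and for brevity it reduces each strip-mined block immediately rather than keeping the running vector accumulator used in the kernel above.

```c
#include <riscv_vector.h>
#include <stddef.h>

/* Illustrative helper (not from the patch): contiguous f32 inputs,
 * f64 accumulation, mirroring the inc_x == 1 && inc_y == 1 path. */
static double dot_f32_widen(size_t n, const float *x, const float *y)
{
        double dot = 0.0;
        size_t vlmax = __riscv_vsetvlmax_e64m1();
        /* Scalar operand of the reduction: a single zero element. */
        vfloat64m1_t v_z0 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);

        for (size_t i = 0; i < n; ) {
                size_t vl = __riscv_vsetvl_e32m2(n - i);
                vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
                vfloat32m2_t vy = __riscv_vle32_v_f32m2(&y[i], vl);
                /* Widening multiply-accumulate: f32 x f32 -> f64 (m2 -> m4),
                 * starting from a zeroed f64 accumulator. */
                vfloat64m4_t vr = __riscv_vfwmacc_vv_f64m4(
                        __riscv_vfmv_v_f_f64m4(0.0, vl), vx, vy, vl);
                /* New-style unordered sum reduction: (vector, scalar, vl). */
                vfloat64m1_t v_res =
                        __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, vl);
                dot += __riscv_vfmv_f_s_f64m1_f64(v_res);
                i += vl;
        }
        return dot;
}
```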