Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 3 additions & 9 deletions kernel/power/casum.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#ifndef HAVE_KERNEL_16

static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
static FLOAT casum_kernel_16(BLASLONG n, FLOAT *x1)
{

BLASLONG i=0;
Expand Down Expand Up @@ -92,11 +92,7 @@ static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)

}

svec[0] = sum0+sum1+sum2+sum3;
svec[1] = 0.0;
svec[2] = 0.0;
svec[3] = 0.0;

return sum0+sum1+sum2+sum3;
}

#endif
Expand All @@ -106,7 +102,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG i=0;
BLASLONG ip=0;
FLOAT sumf = 0.0;
FLOAT svec[4] __attribute__ ((aligned (16)));;
BLASLONG n1;
BLASLONG inc_x2;

Expand All @@ -119,8 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( n1 > 0 )
{

casum_kernel_16(n1, x, svec);
sumf = svec[0] + svec[1]+svec[2]+svec[3];
sumf = casum_kernel_16(n1, x);
i=n1;
ip = 2 * n1;
}
Expand Down
279 changes: 140 additions & 139 deletions kernel/power/casum_microk_power8.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,144 +34,145 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/

#define HAVE_KERNEL_16 1
static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));

static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec)
static float casum_kernel_16 (long n, float *x)
{


BLASLONG i = n;
BLASLONG o16 = 16;
BLASLONG o32 = 32;
BLASLONG o48 = 48;
BLASLONG o64 = 64;
BLASLONG o80 = 80;
BLASLONG o96 = 96;
BLASLONG o112 = 112;
FLOAT *x1=x;
BLASLONG pre = 384;

__asm__ __volatile__
(

"dcbt %2 , %4 \n\t"

"xxlxor 32,32,32 \n\t"
"xxlxor 33,33,33 \n\t"
"xxlxor 34,34,34 \n\t"
"xxlxor 35,35,35 \n\t"
"xxlxor 36,36,36 \n\t"
"xxlxor 37,37,37 \n\t"
"xxlxor 38,38,38 \n\t"
"xxlxor 39,39,39 \n\t"

"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t"
"lxvw4x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t"
"lxvw4x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t"
"lxvw4x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t"

"addi %2, %2, 128 \n\t"

"addic. %0 , %0 , -16 \n\t"
"ble 2f \n\t"

".align 5 \n\t"
"1: \n\t"

"dcbt %2 , %4 \n\t"

"xvabssp 48, 40 \n\t"
"xvabssp 49, 41 \n\t"
"xvabssp 50, 42 \n\t"
"xvabssp 51, 43 \n\t"

"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t"

"xvabssp 52, 44 \n\t"
"xvabssp 53, 45 \n\t"

"lxvw4x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t"

"xvabssp 54, 46 \n\t"
"xvabssp 55, 47 \n\t"

"lxvw4x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t"

"xvaddsp 32, 32, 48 \n\t"
"xvaddsp 33, 33, 49 \n\t"

"lxvw4x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t"

"xvaddsp 34, 34, 50 \n\t"
"xvaddsp 35, 35, 51 \n\t"
"addi %2, %2, 128 \n\t"
"xvaddsp 36, 36, 52 \n\t"
"xvaddsp 37, 37, 53 \n\t"
"addic. %0 , %0 , -16 \n\t"
"xvaddsp 38, 38, 54 \n\t"
"xvaddsp 39, 39, 55 \n\t"

"bgt 1b \n\t"

"2: \n\t"


"xvabssp 48, 40 \n\t"
"xvabssp 49, 41 \n\t"
"xvabssp 50, 42 \n\t"
"xvabssp 51, 43 \n\t"
"xvabssp 52, 44 \n\t"
"xvabssp 53, 45 \n\t"
"xvabssp 54, 46 \n\t"
"xvabssp 55, 47 \n\t"

"xvaddsp 32, 32, 48 \n\t"
"xvaddsp 33, 33, 49 \n\t"
"xvaddsp 34, 34, 50 \n\t"
"xvaddsp 35, 35, 51 \n\t"
"xvaddsp 36, 36, 52 \n\t"
"xvaddsp 37, 37, 53 \n\t"
"xvaddsp 38, 38, 54 \n\t"
"xvaddsp 39, 39, 55 \n\t"

"xvaddsp 32, 32, 33 \n\t"
"xvaddsp 34, 34, 35 \n\t"
"xvaddsp 36, 36, 37 \n\t"
"xvaddsp 38, 38, 39 \n\t"

"xvaddsp 32, 32, 34 \n\t"
"xvaddsp 36, 36, 38 \n\t"

"xvaddsp 32, 32, 36 \n\t"


"stxvw4x 32, 0, %3 \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"r" (x1), // 2
"r" (svec), // 3
"r" (pre), // 4
"r" (o16), // 5
"r" (o32), // 6
"r" (o48), // 7
"r" (o64), // 8
"r" (o80), // 9
"r" (o96), // 10
"r" (o112) // 11
: "cr0", "%0", "%2", "memory"
);

}


// Body of casum_kernel_16: returns the sum of the absolute values of the
// floats read from x. Each unit of n accounts for two floats: 128 bytes
// (32 floats) are consumed for every 16 subtracted from n.
// NOTE(review): the first 128 bytes are loaded before n is tested and n only
// ever steps by 16, so this presumably expects n to be a positive multiple
// of 16 -- confirm against the caller.
float sum;
// Scratch vectors; the compiler picks VSX registers for them via the "wa"
// output constraints below. They hold four of the eight |x| temporaries.
__vector float t0;
__vector float t1;
__vector float t2;
__vector float t3;

__asm__
(
// Cache-touch hint for the block at x.
"dcbt 0, %2 \n\t"

// Zero the eight vector accumulators vs32..vs39.
"xxlxor 32, 32, 32 \n\t"
"xxlxor 33, 33, 33 \n\t"
"xxlxor 34, 34, 34 \n\t"
"xxlxor 35, 35, 35 \n\t"
"xxlxor 36, 36, 36 \n\t"
"xxlxor 37, 37, 37 \n\t"
"xxlxor 38, 38, 38 \n\t"
"xxlxor 39, 39, 39 \n\t"

// Preload the first 128 bytes (eight 16-byte vectors) into vs40..vs47.
// %8..%14 are the byte offsets 16..112.
"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %8, %2 \n\t"
"lxvw4x 42, %9, %2 \n\t"
"lxvw4x 43, %10, %2 \n\t"
"lxvw4x 44, %11, %2 \n\t"
"lxvw4x 45, %12, %2 \n\t"
"lxvw4x 46, %13, %2 \n\t"
"lxvw4x 47, %14, %2 \n\t"

// Advance x past the block just loaded.
"addi %2, %2, 128 \n\t"

// n -= 16 (sets cr0); if nothing is left, skip the loop and only run the
// epilogue on the preloaded block.
"addic. %1, %1, -16 \n\t"
"ble 2f \n\t"

// Main loop, software-pipelined: take |.| of the block loaded on the previous
// pass and accumulate it while the loads for the next block are issued.
".p2align 5 \n"
"1: \n\t"

"xvabssp 48, 40 \n\t"
"xvabssp 49, 41 \n\t"
"xvabssp 50, 42 \n\t"
"xvabssp 51, 43 \n\t"

"lxvw4x 40, 0, %2 \n\t"
"lxvw4x 41, %8, %2 \n\t"

"xvabssp %x3, 44 \n\t"
"xvabssp %x4, 45 \n\t"

"lxvw4x 42, %9, %2 \n\t"
"lxvw4x 43, %10, %2 \n\t"

"xvabssp %x5, 46 \n\t"
"xvabssp %x6, 47 \n\t"

"lxvw4x 44, %11, %2 \n\t"
"lxvw4x 45, %12, %2 \n\t"

"xvaddsp 32, 32, 48 \n\t"
"xvaddsp 33, 33, 49 \n\t"

"lxvw4x 46, %13, %2 \n\t"
"lxvw4x 47, %14, %2 \n\t"

"xvaddsp 34, 34, 50 \n\t"
"xvaddsp 35, 35, 51 \n\t"
"addi %2, %2, 128 \n\t"
"xvaddsp 36, 36, %x3 \n\t"
"xvaddsp 37, 37, %x4 \n\t"
"addic. %1, %1, -16 \n\t"
"xvaddsp 38, 38, %x5 \n\t"
"xvaddsp 39, 39, %x6 \n\t"

// Loop while the decremented n is still positive.
"bgt 1b \n"

// Epilogue: fold in the last block that was loaded but not yet accumulated.
"2: \n\t"

"xvabssp 48, 40 \n\t"
"xvabssp 49, 41 \n\t"
"xvabssp 50, 42 \n\t"
"xvabssp 51, 43 \n\t"
"xvabssp %x3, 44 \n\t"
"xvabssp %x4, 45 \n\t"
"xvabssp %x5, 46 \n\t"
"xvabssp %x6, 47 \n\t"

"xvaddsp 32, 32, 48 \n\t"
"xvaddsp 33, 33, 49 \n\t"
"xvaddsp 34, 34, 50 \n\t"
"xvaddsp 35, 35, 51 \n\t"
"xvaddsp 36, 36, %x3 \n\t"
"xvaddsp 37, 37, %x4 \n\t"
"xvaddsp 38, 38, %x5 \n\t"
"xvaddsp 39, 39, %x6 \n\t"

// Tree-reduce the eight accumulators into vs32 (8 -> 4 -> 2 -> 1).
"xvaddsp 32, 32, 33 \n\t"
"xvaddsp 34, 34, 35 \n\t"
"xvaddsp 36, 36, 37 \n\t"
"xvaddsp 38, 38, 39 \n\t"

"xvaddsp 32, 32, 34 \n\t"
"xvaddsp 36, 36, 38 \n\t"

"xvaddsp 32, 32, 36 \n\t"

// Horizontal sum of the four lanes of vs32: rotate by two words and add,
// then rotate by one word and add, so every lane holds the total.
"xxsldwi 33, 32, 32, 2 \n\t"
"xvaddsp 32, 32, 33 \n\t"

"xxsldwi 33, 32, 32, 1 \n\t"
"xvaddsp 32, 32, 33 \n\t"

// Convert the single-precision total to the scalar register chosen for sum.
"xscvspdp %0, 32 \n"

// The two lines below are assembler comments ('#') that only record the
// operand-to-register assignment in the generated assembly.
// NOTE(review): "x=%3" prints operand 3, which is t0; the memory operand for
// *x is operand 7. Looks like a stale index -- harmless, but confirm.
"#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
"#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
:
// Outputs. n and x are read-write ("+") because the asm updates both.
"=f" (sum), // 0
"+r" (n), // 1
"+b" (x), // 2
"=wa" (t0), // 3
"=wa" (t1), // 4
"=wa" (t2), // 5
"=wa" (t3) // 6
:
// Inputs. Operand 7 is the memory operand; 8..14 are the byte offsets used
// as index registers by the lxvw4x loads.
// NOTE(review): "m" (*x) describes a single float, while the asm reads the
// whole region walked through %2 -- confirm whether a wider memory operand
// (or a "memory" clobber) is needed to keep the compiler from reordering
// stores to x around this statement.
"m" (*x),
"b" (16), // 8
"b" (32), // 9
"b" (48), // 10
"b" (64), // 11
"b" (80), // 12
"b" (96), // 13
"b" (112) // 14
:
// Clobbers: cr0 is written by addic.; vs32..vs51 are the hard-coded
// accumulator, load and temporary registers named in the template above.
"cr0",
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
"vs48","vs49","vs50","vs51"
);

return sum;
}
Loading