Skip to content
Open
203 changes: 153 additions & 50 deletions parmys/parmys-plugin/core/multiplier.cc
Original file line number Diff line number Diff line change
Expand Up @@ -937,7 +937,7 @@ void init_multiplier_adder(nnode_t *node, nnode_t *parent, int a, int b)
*-----------------------------------------------------------------------*/
void split_multiplier(nnode_t *node, int a0, int b0, int a1, int b1, netlist_t *netlist)
{
nnode_t *a0b0, *a0b1, *a1b0, *a1b1, *addsmall, *addbig;
nnode_t *a0b0, *a0b1, *a1b0, *a1b1, *addsmall, *addsmall2, *addbig;
int size;

/* Check for a legitimate split */
Expand Down Expand Up @@ -976,50 +976,151 @@ void split_multiplier(nnode_t *node, int a0, int b0, int a1, int b1, netlist_t *
init_split_multiplier(node, a1b0, a0, a1, 0, b0, a1b1, a0b0);
mult_list = insert_in_vptr_list(mult_list, a1b0);

/* New node for the initial add */
addsmall = allocate_nnode(node->loc);
addsmall->name = (char *)vtr::malloc(strlen(node->name) + 6);
strcpy(addsmall->name, node->name);
strcat(addsmall->name, "-add0");
// This addition will have a carry out in the worst case, so add one extra input pin per port and connect them to gnd.
init_multiplier_adder(addsmall, a1b0, a1b0->num_output_pins + 1, a0b1->num_output_pins + 1);

/* New node for the BIG add */
addbig = allocate_nnode(node->loc);
addbig->name = (char *)vtr::malloc(strlen(node->name) + 6);
strcpy(addbig->name, node->name);
strcat(addbig->name, "-add1");
init_multiplier_adder(addbig, addsmall, addsmall->num_output_pins, a0b0->num_output_pins - b0 + a1b1->num_output_pins);

// connect inputs to port a of addsmall
for (int i = 0; i < a1b0->num_output_pins; i++)
connect_nodes(a1b0, i, addsmall, i);
add_input_pin_to_node(addsmall, get_zero_pin(netlist), a1b0->num_output_pins);
// connect inputs to port b of addsmall
for (int i = 0; i < a0b1->num_output_pins; i++)
connect_nodes(a0b1, i, addsmall, i + addsmall->input_port_sizes[0]);
add_input_pin_to_node(addsmall, get_zero_pin(netlist), a0b1->num_output_pins + addsmall->input_port_sizes[0]);

// connect inputs to port a of addbig
size = addsmall->num_output_pins;
for (int i = 0; i < size; i++)
connect_nodes(addsmall, i, addbig, i);

// connect inputs to port b of addbig
for (int i = b0; i < a0b0->output_port_sizes[0]; i++)
connect_nodes(a0b0, i, addbig, i - b0 + size);
size = size + a0b0->output_port_sizes[0] - b0;
for (int i = 0; i < a1b1->output_port_sizes[0]; i++)
connect_nodes(a1b1, i, addbig, i + size);

// remap the multiplier outputs coming directly from a0b0
for (int i = 0; i < b0; i++) {
remap_pin_to_new_node(node->output_pins[i], a0b0, i);
}
// The balanced addition method only works if a0 and b0 are the same size
// (i.e. if the input ports on the hardware multiplier are equal).
if (b0 == a0) {
/* New node for the initial add */
addsmall = allocate_nnode(node->loc);
addsmall->name = (char *)vtr::malloc(strlen(node->name) + 6);
strcpy(addsmall->name, node->name);
strcat(addsmall->name, "-add0");
// This addition will have a carry out in the worst case, so add one extra input pin per port and connect them to gnd.
init_multiplier_adder(addsmall, a1b0, a1b0->num_output_pins + 1, a0b1->num_output_pins + 1);

// connect inputs to port a of addsmall
for (int i = 0; i < a1b0->num_output_pins; i++)
connect_nodes(a1b0, i, addsmall, i);

add_input_pin_to_node(addsmall, get_zero_pin(netlist), a1b0->num_output_pins);
// connect inputs to port b of addsmall
for (int i = 0; i < a0b1->num_output_pins; i++)
connect_nodes(a0b1, i, addsmall, i + addsmall->input_port_sizes[0]);
add_input_pin_to_node(addsmall, get_zero_pin(netlist), a0b1->num_output_pins + addsmall->input_port_sizes[0]);

/* New node for the BIG add */
addbig = allocate_nnode(node->loc);
addbig->name = (char *)vtr::malloc(strlen(node->name) + 6);
strcpy(addbig->name, node->name);
strcat(addbig->name, "-add1");
init_multiplier_adder(addbig, addsmall, addsmall->num_output_pins, a0b0->num_output_pins - b0 + a1b1->num_output_pins);

// connect inputs to port a of addbig
size = addsmall->num_output_pins;
for (int i = 0; i < size; i++)
connect_nodes(addsmall, i, addbig, i);

// connect inputs to port b of addbig
for (int i = b0; i < a0b0->output_port_sizes[0]; i++)
connect_nodes(a0b0, i, addbig, i - b0 + size);
size = size + a0b0->output_port_sizes[0] - b0;
for (int i = 0; i < a1b1->output_port_sizes[0]; i++)
connect_nodes(a1b1, i, addbig, i + size);

// remap the multiplier outputs coming directly from a0b0
for (int i = 0; i < b0; i++) {
remap_pin_to_new_node(node->output_pins[i], a0b0, i);
}

// remap the multiplier outputs coming from addbig
for (int i = 0; i < addbig->num_output_pins; i++) {
remap_pin_to_new_node(node->output_pins[i + b0], addbig, i);
}
} else {
/* Expounding upon the description of the method used in this function.
If we have two numbers A and B and a hardware multiplier of size a0 x b0,
we can split each of them into two parts:
A = (A1 << a0) + A0
B = (B1 << b0) + B0
where A1 and B1 are the high bits of A and B, and A0 and B0 are the low bits.
Note that len(A0) = a0 and len(B0) = b0 by definition.
The multiplication of A and B can be expressed as:
A * B = ((A1 << a0) + A0) * ((B1 << b0) + B0)
= {(A1 * B1) << (a0 + b0)} + {((A1 * B0) << a0) + ((A0 * B1) << b0)} + {A0 * B0}
We split the additions up like so:
addsmall = ((A1 * B0) << a0) + ((A0 * B1) << b0) // can have carry
addsmall2 = ((A1 * B1) << (a0 + b0)) + (A0 * B0) // will not have carry
addbig = addsmall + addsmall2
This is a slightly modified version of the Karatsuba algorithm.
*/
/////////////// Addsmall /////////////////////
addsmall = allocate_nnode(node->loc);
addsmall->name = (char *)vtr::malloc(strlen(node->name) + 6);
strcpy(addsmall->name, node->name);
strcat(addsmall->name, "-add0");
init_multiplier_adder(addsmall, a1b0, a1b0->num_output_pins + a0 + 1, a0b1->num_output_pins + b0 + 1);

// The first a0 pins of addsmall input connecting to a1b0 are connected to zero
for (int i = 0; i < a0; i++) {
add_input_pin_to_node(addsmall, get_zero_pin(netlist), i);
}

// connect inputs to port a of addsmall
for (int i = 0; i < a1b0->num_output_pins; i++) {
connect_nodes(a1b0, i, addsmall, i + a0);
}

// add zero pin for carry
add_input_pin_to_node(addsmall, get_zero_pin(netlist), a1b0->num_output_pins + a0);

// The first b0 pins of addsmall input connecting to a0b1 are connected to zero
for (int i = 0; i < b0; i++) {
add_input_pin_to_node(addsmall, get_zero_pin(netlist), i + addsmall->input_port_sizes[0]);
}

// connect inputs to port b of addsmall
for (int i = 0; i < a0b1->num_output_pins; i++) {
connect_nodes(a0b1, i, addsmall, i + addsmall->input_port_sizes[0] + b0);
}

// add zero pin for carry
add_input_pin_to_node(addsmall, get_zero_pin(netlist), a0b1->num_output_pins + addsmall->input_port_sizes[0] + b0);

/////////////// Addsmall2 /////////////////////
addsmall2 = allocate_nnode(node->loc);
addsmall2->name = (char *)vtr::malloc(strlen(node->name) + 6);
strcpy(addsmall2->name, node->name);
strcat(addsmall2->name, "-add1");
init_multiplier_adder(addsmall2, a1b1, a1b1->num_output_pins + a0 + b0, a0b0->num_output_pins);

// remap the multiplier outputs coming from addbig
for (int i = 0; i < addbig->num_output_pins; i++) {
remap_pin_to_new_node(node->output_pins[i + b0], addbig, i);
// connect the first a0 + b0 pins of addsmall2 to zero
for (int i = 0; i < a0 + b0; i++) {
add_input_pin_to_node(addsmall2, get_zero_pin(netlist), i);
}

// connect inputs to port a of addsmall2
for (int i = 0; i < a1b1->num_output_pins; i++) {
connect_nodes(a1b1, i, addsmall2, i + a0 + b0);
}

// connect inputs to port b of addsmall2
for (int i = 0; i < a0b0->output_port_sizes[0]; i++) {
connect_nodes(a0b0, i, addsmall2, i + addsmall2->input_port_sizes[0]);
}

/////////////// Addbig /////////////////////
addbig = allocate_nnode(node->loc);
addbig->name = (char *)vtr::malloc(strlen(node->name) + 6);
strcpy(addbig->name, node->name);
strcat(addbig->name, "-add2");
init_multiplier_adder(addbig, addsmall, addsmall->num_output_pins, addsmall2->num_output_pins);
// Here the final addition can have a carry out in the worst case. However,
// our final product will always only be the length of the longest input port, so regardless of the carry,
// the final add's carry will always drop out.

// connect inputs to port a of addbig
for (int i = 0; i < addsmall->num_output_pins; i++) {
connect_nodes(addsmall, i, addbig, i);
}

// connect inputs to port b of addbig
for (int i = 0; i < addsmall2->num_output_pins; i++) {
connect_nodes(addsmall2, i, addbig, i + addbig->input_port_sizes[0]);
}

// remap the multiplier outputs coming from addbig
for (int i = 0; i < addbig->num_output_pins; i++) {
remap_pin_to_new_node(node->output_pins[i], addbig, i);
}
}

// CLEAN UP
Expand Down Expand Up @@ -1060,7 +1161,6 @@ void split_multiplier_a(nnode_t *node, int a0, int a1, int b)
strcat(a0b->name, "-0");
init_split_multiplier(node, a0b, 0, a0, 0, b, nullptr, nullptr);
mult_list = insert_in_vptr_list(mult_list, a0b);

/* New node for a1b multiply */
a1b = allocate_nnode(node->loc);
a1b->name = (char *)vtr::malloc(strlen(node->name) + 3);
Expand Down Expand Up @@ -1184,7 +1284,6 @@ void pad_multiplier(nnode_t *node, netlist_t *netlist)

oassert(node->type == MULTIPLY);
oassert(hard_multipliers != NULL);

sizea = node->input_port_sizes[0];
sizeb = node->input_port_sizes[1];
sizeout = node->output_port_sizes[0];
Expand All @@ -1199,6 +1298,13 @@ void pad_multiplier(nnode_t *node, netlist_t *netlist)
}
diffa = ina - sizea;
diffb = inb - sizeb;
// The input multiplier size falls in the middle range of an unequal hard block size (e.g. mul_size > 18 && mul_size < 25)
if (diffb < 0) {
std::swap(ina, inb);
diffa = ina - sizea;
diffb = inb - sizeb;
}

diffout = hard_multipliers->outputs->size - sizeout;

if (configuration.split_hard_multiplier == 1) {
Expand Down Expand Up @@ -1281,11 +1387,9 @@ void iterate_multipliers(netlist_t *netlist)
int mula, mulb;
int a0, a1, b0, b1;
nnode_t *node;

/* Can only perform the optimisation if hard multipliers exist! */
if (hard_multipliers == NULL)
return;

sizea = hard_multipliers->inputs->size;
sizeb = hard_multipliers->inputs->next->size;
if (sizea < sizeb) {
Expand Down Expand Up @@ -1313,7 +1417,6 @@ void iterate_multipliers(netlist_t *netlist)
sizea = sizeb;
sizeb = swap;
}

/* Do I need to split the multiplier on both inputs? */
if ((mula > sizea) && (mulb > sizeb)) {
a0 = sizea;
Expand Down Expand Up @@ -1890,4 +1993,4 @@ void free_multipliers()

hard_multipliers->instances = NULL;
}
}
}
1 change: 1 addition & 0 deletions parmys/parmys-plugin/netlist/netlist_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,7 @@ void remap_pin_to_new_net(npin_t *pin, nnet_t *new_net)
*-----------------------------------------------------------------------*/
void remap_pin_to_new_node(npin_t *pin, nnode_t *new_node, int pin_idx)
{
oassert(pin != NULL);
if (pin->type == INPUT) {
/* clean out the entry in the old net */
pin->node->input_pins[pin->pin_node_idx] = NULL;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ circuits_dir=benchmarks/verilog
arch_list_add=7series_BRAM_DSP_carry.xml

# Add circuits to list to sweep
circuit_list_add=mcml.v
circuit_list_add=LU32PEEng.v
circuit_list_add=LU8PEEng.v
circuit_list_add=bgm.v
Expand Down
Loading
Loading