I am trying to create a custom pipelined configuration such that it can be used to calculate a 2 by 2 matrix calculation using Strassen's Algorithm. I can successfully make the configuration but I'm not sure whether it is a pipelined configuration. I compared another sequential 2 by 2 custom configuration with it but it seems that the pipelined configuration takes more cycles than the sequential one to complete 128 times of 2 by 2 matrix calculation. I'm not sure whether I did it correctly to make the pipelined configuration. Please help me.
Below are my code:
// 1.1 Path: chipyard/generators/rocket-chip/src/main/scala/tile/LazyRoCC.scala
class MatmulStrassenpipelinedver3 extends Module {
val io = IO(new Bundle {
val in_a11 = Input(SInt(8.W))
val in_a12 = Input(SInt(8.W))
val in_a21 = Input(SInt(8.W))
val in_a22 = Input(SInt(8.W))
val in_b11 = Input(SInt(8.W))
val in_b12 = Input(SInt(8.W))
val in_b21 = Input(SInt(8.W))
val in_b22 = Input(SInt(8.W))
val valid_in = Input(Bool())
val ready_in = Output(Bool())
val out_c11 = Output(SInt(8.W))
val out_c12 = Output(SInt(8.W))
val out_c21 = Output(SInt(8.W))
val out_c22 = Output(SInt(8.W))
val valid_out = Output(Bool())
})
val pipelineDepth = 5
// Pipeline Valid Signals
val validRegs = RegInit(VecInit(Seq.fill(pipelineDepth)(false.B)))
// Stage 1: Input Registers
val reg_a11 = Reg(SInt(8.W))
val reg_a12 = Reg(SInt(8.W))
val reg_a21 = Reg(SInt(8.W))
val reg_a22 = Reg(SInt(8.W))
val reg_b11 = Reg(SInt(8.W))
val reg_b12 = Reg(SInt(8.W))
val reg_b21 = Reg(SInt(8.W))
val reg_b22 = Reg(SInt(8.W))
io.ready_in := !validRegs(0)
when(io.valid_in && io.ready_in) {
reg_a11 := io.in_a11
reg_a12 := io.in_a12
reg_a21 := io.in_a21
reg_a22 := io.in_a22
reg_b11 := io.in_b11
reg_b12 := io.in_b12
reg_b21 := io.in_b21
reg_b22 := io.in_b22
}
validRegs(0) := io.valid_in && io.ready_in
// Stage 2: Intermediate calculations
val sum_a11_a22 = RegNext(reg_a11 + reg_a22)
val sum_b11_b22 = RegNext(reg_b11 + reg_b22)
val sum_a21_a22 = RegNext(reg_a21 + reg_a22)
val sum_b11_b12 = RegNext(reg_b11 + reg_b12)
val diff_b12_b22 = RegNext(reg_b12 - reg_b22)
val diff_b21_b11 = RegNext(reg_b21 - reg_b11)
val diff_a21_a11 = RegNext(reg_a21 - reg_a11)
val diff_a12_a22 = RegNext(reg_a12 - reg_a22)
validRegs(1) := validRegs(0)
// Stage 3: Partial products
val m1 = RegNext(sum_a11_a22 * sum_b11_b22)
val m2 = RegNext(sum_a21_a22 * reg_b11)
val m3 = RegNext(reg_a11 * diff_b12_b22)
val m4 = RegNext(reg_a22 * diff_b21_b11)
val m5 = RegNext((reg_a11 + reg_a12) * reg_b22)
val m6 = RegNext(diff_a21_a11 * sum_b11_b12)
val m7 = RegNext(diff_a12_a22 * (reg_b21 + reg_b22))
validRegs(2) := validRegs(1)
// Stage 4: Accumulate final results
val c11 = RegNext(m1 + m4 - m5 + m7)
val c12 = RegNext(m3 + m5)
val c21 = RegNext(m2 + m4)
val c22 = RegNext(m1 - m2 + m3 + m6)
validRegs(3) := validRegs(2)
// Stage 5: Output Registers
io.out_c11 := RegNext(c11)
io.out_c12 := RegNext(c12)
io.out_c21 := RegNext(c21)
io.out_c22 := RegNext(c22)
validRegs(4) := validRegs(3)
io.valid_out := validRegs(4)
}
// 1.2 Path: chipyard/generators/rocket-chip/src/main/scala/tile/LazyRoCC.scala
class MatmulStrassenpipelinedver3RoCC(opcodes: OpcodeSet)(implicit p: Parameters) extends LazyRoCC(opcodes) {
override lazy val module = new MatmulStrassenpipelinedver3RoCCModule(this)
}
// 1.3
Path: chipyard/generators/rocket-chip/src/main/scala/tile/LazyRoCC.scala
class MatmulStrassenpipelinedver3RoCCModule(outer: MatmulStrassenpipelinedver3RoCC)(implicit p: Parameters)
extends LazyRoCCModuleImp(outer) with HasCoreParameters {
val matmul = Module(new MatmulStrassenpipelinedver3)
val pipelineDepth = 5
val cmdQueue = Module(new Queue(chiselTypeOf(io.cmd.bits), entries = pipelineDepth))
// Push incoming commands into queue
cmdQueue.io.enq.bits := io.cmd.bits
cmdQueue.io.enq.valid := io.cmd.valid
io.cmd.ready := cmdQueue.io.enq.ready
// Connect queue to matmul pipeline
matmul.io.in_a11 := cmdQueue.io.deq.bits.rs1(7, 0).asSInt
matmul.io.in_a12 := cmdQueue.io.deq.bits.rs1(15, 8).asSInt
matmul.io.in_a21 := cmdQueue.io.deq.bits.rs1(23, 16).asSInt
matmul.io.in_a22 := cmdQueue.io.deq.bits.rs1(31, 24).asSInt
matmul.io.in_b11 := cmdQueue.io.deq.bits.rs2(7, 0).asSInt
matmul.io.in_b12 := cmdQueue.io.deq.bits.rs2(15, 8).asSInt
matmul.io.in_b21 := cmdQueue.io.deq.bits.rs2(23, 16).asSInt
matmul.io.in_b22 := cmdQueue.io.deq.bits.rs2(31, 24).asSInt
matmul.io.valid_in := cmdQueue.io.deq.valid
cmdQueue.io.deq.ready := matmul.io.ready_in
// Response FIFO to match cmd and response
val respQueue = Module(new Queue(UInt(5.W), pipelineDepth))
respQueue.io.enq.bits := cmdQueue.io.deq.bits.inst.rd
respQueue.io.enq.valid := cmdQueue.io.deq.fire()
io.resp.valid := matmul.io.valid_out && respQueue.io.deq.valid
io.resp.bits.rd := respQueue.io.deq.bits
io.resp.bits.data := Cat(
matmul.io.out_c22.asUInt,
matmul.io.out_c21.asUInt,
matmul.io.out_c12.asUInt,
matmul.io.out_c11.asUInt
)
respQueue.io.deq.ready := io.resp.fire()
io.busy := !cmdQueue.io.enq.ready
io.interrupt := false.B
}
// 2. Path: chipyard/generators/rocket-chip/src/main/scala/subsystem/Configs.scala
class WithMatmulStrassenpipelinedver3RoCC extends Config((site, here, up) => {
case BuildRoCC => Seq((p: Parameters) => {
val matmul = LazyModule(new MatmulStrassenpipelinedver3RoCC(OpcodeSet.custom0)(p))
matmul
})
})
// 3. Path: chipyard/generators/chipyard/src/main/scala/config/RocketConfigs.scala
class MatmulStrassenpipelinedver3RocketConfig extends Config(
new freechips.rocketchip.subsystem.WithMatmulStrassenpipelinedver3RoCC ++
new freechips.rocketchip.subsystem.WithNBigCores(1) ++
new chipyard.config.AbstractConfig)