How to make a custom instruction with a pipelined configuration?

37 views
Skip to first unread message

hui jackson

unread,
Mar 30, 2025, 8:38:44 AMMar 30
to Chipyard
I am trying to create a custom pipelined configuration such that it can be used to calculate a 2 by 2 matrix calculation using Strassen's Algorithm. I can successfully make the configuration but I'm not sure whether it is a pipelined configuration. I compared another sequential 2 by 2 custom configuration with it but it seems that the pipelined configuration takes more cycles than the sequential one to complete 128 times of 2 by 2 matrix calculation. I'm not sure whether I did it correctly to make the pipelined configuration. Please help me. 
Below are my code:
// 1.1 Path: chipyard/generators/rocket-chip/src/main/scala/tile/LazyRoCC.scala
class MatmulStrassenpipelinedver3 extends Module {
  val io = IO(new Bundle {
    val in_a11 = Input(SInt(8.W))
    val in_a12 = Input(SInt(8.W))
    val in_a21 = Input(SInt(8.W))
    val in_a22 = Input(SInt(8.W))

    val in_b11 = Input(SInt(8.W))
    val in_b12 = Input(SInt(8.W))
    val in_b21 = Input(SInt(8.W))
    val in_b22 = Input(SInt(8.W))

    val valid_in = Input(Bool())
    val ready_in = Output(Bool())

    val out_c11 = Output(SInt(8.W))
    val out_c12 = Output(SInt(8.W))
    val out_c21 = Output(SInt(8.W))
    val out_c22 = Output(SInt(8.W))

    val valid_out = Output(Bool())
  })

  val pipelineDepth = 5

  // Pipeline Valid Signals
  val validRegs = RegInit(VecInit(Seq.fill(pipelineDepth)(false.B)))

  // Stage 1: Input Registers
  val reg_a11 = Reg(SInt(8.W))
  val reg_a12 = Reg(SInt(8.W))
  val reg_a21 = Reg(SInt(8.W))
  val reg_a22 = Reg(SInt(8.W))
  val reg_b11 = Reg(SInt(8.W))
  val reg_b12 = Reg(SInt(8.W))
  val reg_b21 = Reg(SInt(8.W))
  val reg_b22 = Reg(SInt(8.W))

  io.ready_in := !validRegs(0)

  when(io.valid_in && io.ready_in) {
    reg_a11 := io.in_a11
    reg_a12 := io.in_a12
    reg_a21 := io.in_a21
    reg_a22 := io.in_a22
    reg_b11 := io.in_b11
    reg_b12 := io.in_b12
    reg_b21 := io.in_b21
    reg_b22 := io.in_b22
  }

  validRegs(0) := io.valid_in && io.ready_in

  // Stage 2: Intermediate calculations
  val sum_a11_a22 = RegNext(reg_a11 + reg_a22)
  val sum_b11_b22 = RegNext(reg_b11 + reg_b22)
  val sum_a21_a22 = RegNext(reg_a21 + reg_a22)
  val sum_b11_b12 = RegNext(reg_b11 + reg_b12)
  val diff_b12_b22 = RegNext(reg_b12 - reg_b22)
  val diff_b21_b11 = RegNext(reg_b21 - reg_b11)
  val diff_a21_a11 = RegNext(reg_a21 - reg_a11)
  val diff_a12_a22 = RegNext(reg_a12 - reg_a22)

  validRegs(1) := validRegs(0)

  // Stage 3: Partial products
  val m1 = RegNext(sum_a11_a22 * sum_b11_b22)
  val m2 = RegNext(sum_a21_a22 * reg_b11)
  val m3 = RegNext(reg_a11 * diff_b12_b22)
  val m4 = RegNext(reg_a22 * diff_b21_b11)
  val m5 = RegNext((reg_a11 + reg_a12) * reg_b22)
  val m6 = RegNext(diff_a21_a11 * sum_b11_b12)
  val m7 = RegNext(diff_a12_a22 * (reg_b21 + reg_b22))

  validRegs(2) := validRegs(1)

  // Stage 4: Accumulate final results
  val c11 = RegNext(m1 + m4 - m5 + m7)
  val c12 = RegNext(m3 + m5)
  val c21 = RegNext(m2 + m4)
  val c22 = RegNext(m1 - m2 + m3 + m6)

  validRegs(3) := validRegs(2)

  // Stage 5: Output Registers
  io.out_c11 := RegNext(c11)
  io.out_c12 := RegNext(c12)
  io.out_c21 := RegNext(c21)
  io.out_c22 := RegNext(c22)

  validRegs(4) := validRegs(3)
  io.valid_out := validRegs(4)
}

// 1.2 Path: chipyard/generators/rocket-chip/src/main/scala/tile/LazyRoCC.scala
class MatmulStrassenpipelinedver3RoCC(opcodes: OpcodeSet)(implicit p: Parameters) extends LazyRoCC(opcodes) {
  override lazy val module = new MatmulStrassenpipelinedver3RoCCModule(this)
}

// 1.3  Path: chipyard/generators/rocket-chip/src/main/scala/tile/LazyRoCC.scala
class MatmulStrassenpipelinedver3RoCCModule(outer: MatmulStrassenpipelinedver3RoCC)(implicit p: Parameters)
  extends LazyRoCCModuleImp(outer) with HasCoreParameters {

  val matmul = Module(new MatmulStrassenpipelinedver3)

  val pipelineDepth = 5
  val cmdQueue = Module(new Queue(chiselTypeOf(io.cmd.bits), entries = pipelineDepth))

  // Push incoming commands into queue
  cmdQueue.io.enq.bits := io.cmd.bits
  cmdQueue.io.enq.valid := io.cmd.valid
  io.cmd.ready := cmdQueue.io.enq.ready

  // Connect queue to matmul pipeline
  matmul.io.in_a11 := cmdQueue.io.deq.bits.rs1(7, 0).asSInt
  matmul.io.in_a12 := cmdQueue.io.deq.bits.rs1(15, 8).asSInt
  matmul.io.in_a21 := cmdQueue.io.deq.bits.rs1(23, 16).asSInt
  matmul.io.in_a22 := cmdQueue.io.deq.bits.rs1(31, 24).asSInt
  matmul.io.in_b11 := cmdQueue.io.deq.bits.rs2(7, 0).asSInt
  matmul.io.in_b12 := cmdQueue.io.deq.bits.rs2(15, 8).asSInt
  matmul.io.in_b21 := cmdQueue.io.deq.bits.rs2(23, 16).asSInt
  matmul.io.in_b22 := cmdQueue.io.deq.bits.rs2(31, 24).asSInt
  matmul.io.valid_in := cmdQueue.io.deq.valid
  cmdQueue.io.deq.ready := matmul.io.ready_in

  // Response FIFO to match cmd and response
  val respQueue = Module(new Queue(UInt(5.W), pipelineDepth))
  respQueue.io.enq.bits := cmdQueue.io.deq.bits.inst.rd
  respQueue.io.enq.valid := cmdQueue.io.deq.fire()

  io.resp.valid := matmul.io.valid_out && respQueue.io.deq.valid
  io.resp.bits.rd := respQueue.io.deq.bits
  io.resp.bits.data := Cat(
    matmul.io.out_c22.asUInt,
    matmul.io.out_c21.asUInt,
    matmul.io.out_c12.asUInt,
    matmul.io.out_c11.asUInt
  )
  respQueue.io.deq.ready := io.resp.fire()

  io.busy := !cmdQueue.io.enq.ready
  io.interrupt := false.B
}


// 2. Path: chipyard/generators/rocket-chip/src/main/scala/subsystem/Configs.scala
class WithMatmulStrassenpipelinedver3RoCC extends Config((site, here, up) => {
  case BuildRoCC => Seq((p: Parameters) => {
    val matmul = LazyModule(new MatmulStrassenpipelinedver3RoCC(OpcodeSet.custom0)(p))
    matmul
  })
})


// 3. Path: chipyard/generators/chipyard/src/main/scala/config/RocketConfigs.scala
class MatmulStrassenpipelinedver3RocketConfig extends Config(
  new freechips.rocketchip.subsystem.WithMatmulStrassenpipelinedver3RoCC ++
  new freechips.rocketchip.subsystem.WithNBigCores(1) ++
  new chipyard.config.AbstractConfig)
Reply all
Reply to author
Forward
0 new messages