From 98d8a7051d8caa9b5aebebe5bf16f9d731c34c0e Mon Sep 17 00:00:00 2001
From: Tony Gutierrez <anthony.gutierrez@amd.com>
Date: Wed, 26 Oct 2016 22:47:30 -0400
Subject: [PATCH] gpu-compute: add instruction mix stats for the gpu

---
 src/gpu-compute/compute_unit.cc | 141 ++++++++++++++++++++++++++++++++
 src/gpu-compute/compute_unit.hh |  25 ++++++
 src/gpu-compute/wavefront.cc    |   4 +
 3 files changed, 170 insertions(+)

diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index abf8ff2c5..f05ecc1b2 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -1408,6 +1408,114 @@ ComputeUnit::regStats()
 {
     MemObject::regStats();
 
+    vALUInsts
+        .name(name() + ".valu_insts")
+        .desc("Number of vector ALU insts issued.")
+        ;
+    vALUInstsPerWF
+        .name(name() + ".valu_insts_per_wf")
+        .desc("The avg. number of vector ALU insts issued per-wavefront.")
+        ;
+    sALUInsts
+        .name(name() + ".salu_insts")
+        .desc("Number of scalar ALU insts issued.")
+        ;
+    sALUInstsPerWF
+        .name(name() + ".salu_insts_per_wf")
+        .desc("The avg. number of scalar ALU insts issued per-wavefront.")
+        ;
+    instCyclesVALU
+        .name(name() + ".inst_cycles_valu")
+        .desc("Number of cycles needed to execute VALU insts.")
+        ;
+    instCyclesSALU
+        .name(name() + ".inst_cycles_salu")
+        .desc("Number of cycles needed to execute SALU insts.")
+        ;
+    threadCyclesVALU
+        .name(name() + ".thread_cycles_valu")
+        .desc("Number of thread cycles used to execute vector ALU ops. "
+              "Similar to instCyclesVALU but multiplied by the number of "
+              "active threads.")
+        ;
+    vALUUtilization
+        .name(name() + ".valu_utilization")
+        .desc("Percentage of active vector ALU threads in a wave.")
+        ;
+    ldsNoFlatInsts
+        .name(name() + ".lds_no_flat_insts")
+        .desc("Number of LDS insts issued, not including FLAT "
+              "accesses that resolve to LDS.")
+        ;
+    ldsNoFlatInstsPerWF
+        .name(name() + ".lds_no_flat_insts_per_wf")
+        .desc("The avg. number of LDS insts (not including FLAT "
+              "accesses that resolve to LDS) per-wavefront.")
+        ;
+    flatVMemInsts
+        .name(name() + ".flat_vmem_insts")
+        .desc("The number of FLAT insts that resolve to vmem issued.")
+        ;
+    flatVMemInstsPerWF
+        .name(name() + ".flat_vmem_insts_per_wf")
+        .desc("The average number of FLAT insts that resolve to vmem "
+              "issued per-wavefront.")
+        ;
+    flatLDSInsts
+        .name(name() + ".flat_lds_insts")
+        .desc("The number of FLAT insts that resolve to LDS issued.")
+        ;
+    flatLDSInstsPerWF
+        .name(name() + ".flat_lds_insts_per_wf")
+        .desc("The average number of FLAT insts that resolve to LDS "
+              "issued per-wavefront.")
+        ;
+    vectorMemWrites
+        .name(name() + ".vector_mem_writes")
+        .desc("Number of vector mem write insts (excluding FLAT insts).")
+        ;
+    vectorMemWritesPerWF
+        .name(name() + ".vector_mem_writes_per_wf")
+        .desc("The average number of vector mem write insts "
+              "(excluding FLAT insts) per-wavefront.")
+        ;
+    vectorMemReads
+        .name(name() + ".vector_mem_reads")
+        .desc("Number of vector mem read insts (excluding FLAT insts).")
+        ;
+    vectorMemReadsPerWF
+        .name(name() + ".vector_mem_reads_per_wf")
+        .desc("The avg. number of vector mem read insts (excluding "
+              "FLAT insts) per-wavefront.")
+        ;
+    scalarMemWrites
+        .name(name() + ".scalar_mem_writes")
+        .desc("Number of scalar mem write insts.")
+        ;
+    scalarMemWritesPerWF
+        .name(name() + ".scalar_mem_writes_per_wf")
+        .desc("The average number of scalar mem write insts per-wavefront.")
+        ;
+    scalarMemReads
+        .name(name() + ".scalar_mem_reads")
+        .desc("Number of scalar mem read insts.")
+        ;
+    scalarMemReadsPerWF
+        .name(name() + ".scalar_mem_reads_per_wf")
+        .desc("The average number of scalar mem read insts per-wavefront.")
+        ;
+
+    vALUInstsPerWF = vALUInsts / completedWfs;
+    sALUInstsPerWF = sALUInsts / completedWfs;
+    vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
+    ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
+    flatVMemInstsPerWF = flatVMemInsts / completedWfs;
+    flatLDSInstsPerWF = flatLDSInsts / completedWfs;
+    vectorMemWritesPerWF = vectorMemWrites / completedWfs;
+    vectorMemReadsPerWF = vectorMemReads / completedWfs;
+    scalarMemWritesPerWF = scalarMemWrites / completedWfs;
+    scalarMemReadsPerWF = scalarMemReads / completedWfs;
+
     tlbCycles
         .name(name() + ".tlb_cycles")
         .desc("total number of cycles for all uncoalesced requests")
@@ -1566,6 +1674,39 @@ ComputeUnit::regStats()
     localMemoryPipe.regStats();
 }
 
+void
+ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
+{
+    if (gpuDynInst->isScalar()) {
+        if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
+            sALUInsts++;
+            instCyclesSALU++;
+        } else if (gpuDynInst->isLoad()) {
+            scalarMemReads++;
+        } else if (gpuDynInst->isStore()) {
+            scalarMemWrites++;
+        }
+    } else {
+        if (gpuDynInst->isALU()) {
+            vALUInsts++;
+            instCyclesVALU++;
+            threadCyclesVALU += gpuDynInst->wavefront()->execMask().count();
+        } else if (gpuDynInst->isFlat()) {
+            if (gpuDynInst->isLocalMem()) {
+                flatLDSInsts++;
+            } else {
+                flatVMemInsts++;
+            }
+        } else if (gpuDynInst->isLocalMem()) {
+            ldsNoFlatInsts++;
+        } else if (gpuDynInst->isLoad()) {
+            vectorMemReads++;
+        } else if (gpuDynInst->isStore()) {
+            vectorMemWrites++;
+        }
+    }
+}
+
 void
 ComputeUnit::updatePageDivergenceDist(Addr addr)
 {
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index 938658fd1..2187bec38 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -301,6 +301,31 @@ class ComputeUnit : public MemObject
     LdsState &lds;
 
   public:
+    Stats::Scalar vALUInsts;
+    Stats::Formula vALUInstsPerWF;
+    Stats::Scalar sALUInsts;
+    Stats::Formula sALUInstsPerWF;
+    Stats::Scalar instCyclesVALU;
+    Stats::Scalar instCyclesSALU;
+    Stats::Scalar threadCyclesVALU;
+    Stats::Formula vALUUtilization;
+    Stats::Scalar ldsNoFlatInsts;
+    Stats::Formula ldsNoFlatInstsPerWF;
+    Stats::Scalar flatVMemInsts;
+    Stats::Formula flatVMemInstsPerWF;
+    Stats::Scalar flatLDSInsts;
+    Stats::Formula flatLDSInstsPerWF;
+    Stats::Scalar vectorMemWrites;
+    Stats::Formula vectorMemWritesPerWF;
+    Stats::Scalar vectorMemReads;
+    Stats::Formula vectorMemReadsPerWF;
+    Stats::Scalar scalarMemWrites;
+    Stats::Formula scalarMemWritesPerWF;
+    Stats::Scalar scalarMemReads;
+    Stats::Formula scalarMemReadsPerWF;
+
+    void updateInstStats(GPUDynInstPtr gpuDynInst);
+
     // the following stats compute the avg. TLB accesslatency per
     // uncoalesced request (only for data)
     Stats::Scalar tlbRequests;
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index 96f0d0e96..99ac24900 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -656,7 +656,11 @@ Wavefront::exec()
     DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
             "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
             ii->disassemble(), old_pc);
+
+    // update the instruction stats in the CU
+
     ii->execute(ii);
+    computeUnit->updateInstStats(ii);
     // access the VRF
     computeUnit->vrf[simdId]->exec(ii, this);
     srcRegOpDist.sample(ii->numSrcRegOperands());