From 98d8a7051d8caa9b5aebebe5bf16f9d731c34c0e Mon Sep 17 00:00:00 2001
From: Tony Gutierrez
Date: Wed, 26 Oct 2016 22:47:30 -0400
Subject: [PATCH] gpu-compute: add instruction mix stats for the gpu

---
 src/gpu-compute/compute_unit.cc | 159 ++++++++++++++++++++++++++++++++
 src/gpu-compute/compute_unit.hh |  28 ++++++
 src/gpu-compute/wavefront.cc    |   4 +
 3 files changed, 191 insertions(+)

diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index abf8ff2c5..f05ecc1b2 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -1408,6 +1408,119 @@ ComputeUnit::regStats()
 {
     MemObject::regStats();
 
+    // instruction mix stats: raw per-category counters, followed by the
+    // formulas derived from them (the formulas are defined further below)
+    vALUInsts
+        .name(name() + ".valu_insts")
+        .desc("Number of vector ALU insts issued.")
+        ;
+    vALUInstsPerWF
+        .name(name() + ".valu_insts_per_wf")
+        .desc("The avg. number of vector ALU insts issued per-wavefront.")
+        ;
+    sALUInsts
+        .name(name() + ".salu_insts")
+        .desc("Number of scalar ALU insts issued.")
+        ;
+    sALUInstsPerWF
+        .name(name() + ".salu_insts_per_wf")
+        .desc("The avg. number of scalar ALU insts issued per-wavefront.")
+        ;
+    instCyclesVALU
+        .name(name() + ".inst_cycles_valu")
+        .desc("Number of cycles needed to execute VALU insts.")
+        ;
+    instCyclesSALU
+        .name(name() + ".inst_cycles_salu")
+        .desc("Number of cycles needed to execute SALU insts.")
+        ;
+    threadCyclesVALU
+        .name(name() + ".thread_cycles_valu")
+        .desc("Number of thread cycles used to execute vector ALU ops. "
+              "Similar to instCyclesVALU but multiplied by the number of "
+              "active threads.")
+        ;
+    vALUUtilization
+        .name(name() + ".valu_utilization")
+        .desc("Percentage of active vector ALU threads in a wave.")
+        ;
+    ldsNoFlatInsts
+        .name(name() + ".lds_no_flat_insts")
+        .desc("Number of LDS insts issued, not including FLAT "
+              "accesses that resolve to LDS.")
+        ;
+    ldsNoFlatInstsPerWF
+        .name(name() + ".lds_no_flat_insts_per_wf")
+        .desc("The avg. number of LDS insts (not including FLAT "
+              "accesses that resolve to LDS) per-wavefront.")
+        ;
+    flatVMemInsts
+        .name(name() + ".flat_vmem_insts")
+        .desc("The number of FLAT insts that resolve to vmem issued.")
+        ;
+    flatVMemInstsPerWF
+        .name(name() + ".flat_vmem_insts_per_wf")
+        .desc("The average number of FLAT insts that resolve to vmem "
+              "issued per-wavefront.")
+        ;
+    flatLDSInsts
+        .name(name() + ".flat_lds_insts")
+        .desc("The number of FLAT insts that resolve to LDS issued.")
+        ;
+    flatLDSInstsPerWF
+        .name(name() + ".flat_lds_insts_per_wf")
+        .desc("The average number of FLAT insts that resolve to LDS "
+              "issued per-wavefront.")
+        ;
+    vectorMemWrites
+        .name(name() + ".vector_mem_writes")
+        .desc("Number of vector mem write insts (excluding FLAT insts).")
+        ;
+    vectorMemWritesPerWF
+        .name(name() + ".vector_mem_writes_per_wf")
+        .desc("The average number of vector mem write insts "
+              "(excluding FLAT insts) per-wavefront.")
+        ;
+    vectorMemReads
+        .name(name() + ".vector_mem_reads")
+        .desc("Number of vector mem read insts (excluding FLAT insts).")
+        ;
+    vectorMemReadsPerWF
+        .name(name() + ".vector_mem_reads_per_wf")
+        .desc("The avg. number of vector mem read insts (excluding "
+              "FLAT insts) per-wavefront.")
+        ;
+    scalarMemWrites
+        .name(name() + ".scalar_mem_writes")
+        .desc("Number of scalar mem write insts.")
+        ;
+    scalarMemWritesPerWF
+        .name(name() + ".scalar_mem_writes_per_wf")
+        .desc("The average number of scalar mem write insts per-wavefront.")
+        ;
+    scalarMemReads
+        .name(name() + ".scalar_mem_reads")
+        .desc("Number of scalar mem read insts.")
+        ;
+    scalarMemReadsPerWF
+        .name(name() + ".scalar_mem_reads_per_wf")
+        .desc("The average number of scalar mem read insts per-wavefront.")
+        ;
+
+    // every *PerWF formula divides its raw counter by completedWfs
+    vALUInstsPerWF = vALUInsts / completedWfs;
+    sALUInstsPerWF = sALUInsts / completedWfs;
+    // NOTE(review): 64 is presumably the number of threads per wavefront;
+    // confirm it matches the wavefront size this CU is configured with.
+    vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
+    ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
+    flatVMemInstsPerWF = flatVMemInsts / completedWfs;
+    flatLDSInstsPerWF = flatLDSInsts / completedWfs;
+    vectorMemWritesPerWF = vectorMemWrites / completedWfs;
+    vectorMemReadsPerWF = vectorMemReads / completedWfs;
+    scalarMemWritesPerWF = scalarMemWrites / completedWfs;
+    scalarMemReadsPerWF = scalarMemReads / completedWfs;
+
     tlbCycles
         .name(name() + ".tlb_cycles")
         .desc("total number of cycles for all uncoalesced requests")
@@ -1566,6 +1679,52 @@ ComputeUnit::regStats()
     localMemoryPipe.regStats();
 }
 
+/**
+ * Classify an instruction and bump the matching instruction mix stat.
+ *
+ * Scalar insts are counted as scalar ALU (waitcnt insts are excluded
+ * from that category), scalar mem reads, or scalar mem writes. All other
+ * insts are counted as vector ALU, FLAT (split on isLocalMem() into LDS
+ * and vmem), LDS without FLAT, vector mem reads, or vector mem writes.
+ * Categories are tested in that order and only the first match is
+ * counted; an inst that matches none of them leaves every stat untouched.
+ *
+ * @param gpuDynInst the dynamic instruction to account for
+ */
+void
+ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
+{
+    if (gpuDynInst->isScalar()) {
+        if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
+            sALUInsts++;
+            instCyclesSALU++;
+        } else if (gpuDynInst->isLoad()) {
+            scalarMemReads++;
+        } else if (gpuDynInst->isStore()) {
+            scalarMemWrites++;
+        }
+    } else {
+        if (gpuDynInst->isALU()) {
+            vALUInsts++;
+            instCyclesVALU++;
+            // scale by execMask().count(), taken as the active threads
+            threadCyclesVALU += gpuDynInst->wavefront()->execMask().count();
+        } else if (gpuDynInst->isFlat()) {
+            if (gpuDynInst->isLocalMem()) {
+                flatLDSInsts++;
+            } else {
+                flatVMemInsts++;
+            }
+        } else if (gpuDynInst->isLocalMem()) {
+            ldsNoFlatInsts++;
+        } else if (gpuDynInst->isLoad()) {
+            vectorMemReads++;
+        } else if (gpuDynInst->isStore()) {
+            vectorMemWrites++;
+        }
+    }
+}
+
 void
 ComputeUnit::updatePageDivergenceDist(Addr addr)
 {
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index 938658fd1..2187bec38 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -301,6 +301,34 @@ class ComputeUnit : public MemObject
     LdsState &lds;
 
   public:
+    // instruction mix stats: issue counts per instruction category, the
+    // per-wavefront averages derived from them, and vector ALU utilization
+    Stats::Scalar vALUInsts;
+    Stats::Formula vALUInstsPerWF;
+    Stats::Scalar sALUInsts;
+    Stats::Formula sALUInstsPerWF;
+    Stats::Scalar instCyclesVALU;
+    Stats::Scalar instCyclesSALU;
+    Stats::Scalar threadCyclesVALU;
+    Stats::Formula vALUUtilization;
+    Stats::Scalar ldsNoFlatInsts;
+    Stats::Formula ldsNoFlatInstsPerWF;
+    Stats::Scalar flatVMemInsts;
+    Stats::Formula flatVMemInstsPerWF;
+    Stats::Scalar flatLDSInsts;
+    Stats::Formula flatLDSInstsPerWF;
+    Stats::Scalar vectorMemWrites;
+    Stats::Formula vectorMemWritesPerWF;
+    Stats::Scalar vectorMemReads;
+    Stats::Formula vectorMemReadsPerWF;
+    Stats::Scalar scalarMemWrites;
+    Stats::Formula scalarMemWritesPerWF;
+    Stats::Scalar scalarMemReads;
+    Stats::Formula scalarMemReadsPerWF;
+
+    // bump the instruction mix stat that matches gpuDynInst's category
+    void updateInstStats(GPUDynInstPtr gpuDynInst);
+
     // the following stats compute the avg. TLB accesslatency per
     // uncoalesced request (only for data)
     Stats::Scalar tlbRequests;
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index 96f0d0e96..99ac24900 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -656,7 +656,11 @@ Wavefront::exec()
     DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
             "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
             ii->disassemble(), old_pc);
+
     ii->execute(ii);
+
+    // update the instruction stats in the CU
+    computeUnit->updateInstStats(ii);
     // access the VRF
     computeUnit->vrf[simdId]->exec(ii, this);
     srcRegOpDist.sample(ii->numSrcRegOperands());