gpu-compute: add instruction mix stats for the gpu

This commit is contained in:
Tony Gutierrez 2016-10-26 22:47:30 -04:00
parent c7a79c9a42
commit 98d8a7051d
3 changed files with 170 additions and 0 deletions

View file

@ -1408,6 +1408,114 @@ ComputeUnit::regStats()
{ {
MemObject::regStats(); MemObject::regStats();
vALUInsts
.name(name() + ".valu_insts")
.desc("Number of vector ALU insts issued.")
;
vALUInstsPerWF
.name(name() + ".valu_insts_per_wf")
.desc("The avg. number of vector ALU insts issued per-wavefront.")
;
sALUInsts
.name(name() + ".salu_insts")
.desc("Number of scalar ALU insts issued.")
;
sALUInstsPerWF
.name(name() + ".salu_insts_per_wf")
.desc("The avg. number of scalar ALU insts issued per-wavefront.")
;
instCyclesVALU
.name(name() + ".inst_cycles_valu")
.desc("Number of cycles needed to execute VALU insts.")
;
instCyclesSALU
.name(name() + ".inst_cycles_salu")
.desc("Number of cycles needed to execute SALU insts.")
;
threadCyclesVALU
.name(name() + ".thread_cycles_valu")
.desc("Number of thread cycles used to execute vector ALU ops. "
"Similar to instCyclesVALU but multiplied by the number of "
"active threads.")
;
vALUUtilization
.name(name() + ".valu_utilization")
.desc("Percentage of active vector ALU threads in a wave.")
;
ldsNoFlatInsts
.name(name() + ".lds_no_flat_insts")
.desc("Number of LDS insts issued, not including FLAT "
"accesses that resolve to LDS.")
;
ldsNoFlatInstsPerWF
.name(name() + ".lds_no_flat_insts_per_wf")
.desc("The avg. number of LDS insts (not including FLAT "
"accesses that resolve to LDS) per-wavefront.")
;
flatVMemInsts
.name(name() + ".flat_vmem_insts")
.desc("The number of FLAT insts that resolve to vmem issued.")
;
flatVMemInstsPerWF
.name(name() + ".flat_vmem_insts_per_wf")
.desc("The average number of FLAT insts that resolve to vmem "
"issued per-wavefront.")
;
flatLDSInsts
.name(name() + ".flat_lds_insts")
.desc("The number of FLAT insts that resolve to LDS issued.")
;
flatLDSInstsPerWF
.name(name() + ".flat_lds_insts_per_wf")
.desc("The average number of FLAT insts that resolve to LDS "
"issued per-wavefront.")
;
vectorMemWrites
.name(name() + ".vector_mem_writes")
.desc("Number of vector mem write insts (excluding FLAT insts).")
;
vectorMemWritesPerWF
.name(name() + ".vector_mem_writes_per_wf")
.desc("The average number of vector mem write insts "
"(excluding FLAT insts) per-wavefront.")
;
vectorMemReads
.name(name() + ".vector_mem_reads")
.desc("Number of vector mem read insts (excluding FLAT insts).")
;
vectorMemReadsPerWF
.name(name() + ".vector_mem_reads_per_wf")
.desc("The avg. number of vector mem read insts (excluding "
"FLAT insts) per-wavefront.")
;
scalarMemWrites
.name(name() + ".scalar_mem_writes")
.desc("Number of scalar mem write insts.")
;
scalarMemWritesPerWF
.name(name() + ".scalar_mem_writes_per_wf")
.desc("The average number of scalar mem write insts per-wavefront.")
;
scalarMemReads
.name(name() + ".scalar_mem_reads")
.desc("Number of scalar mem read insts.")
;
scalarMemReadsPerWF
.name(name() + ".scalar_mem_reads_per_wf")
.desc("The average number of scalar mem read insts per-wavefront.")
;
vALUInstsPerWF = vALUInsts / completedWfs;
sALUInstsPerWF = sALUInsts / completedWfs;
vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
flatVMemInstsPerWF = flatVMemInsts / completedWfs;
flatLDSInstsPerWF = flatLDSInsts / completedWfs;
vectorMemWritesPerWF = vectorMemWrites / completedWfs;
vectorMemReadsPerWF = vectorMemReads / completedWfs;
scalarMemWritesPerWF = scalarMemWrites / completedWfs;
scalarMemReadsPerWF = scalarMemReads / completedWfs;
tlbCycles tlbCycles
.name(name() + ".tlb_cycles") .name(name() + ".tlb_cycles")
.desc("total number of cycles for all uncoalesced requests") .desc("total number of cycles for all uncoalesced requests")
@ -1566,6 +1674,39 @@ ComputeUnit::regStats()
localMemoryPipe.regStats(); localMemoryPipe.regStats();
} }
/**
 * Classify a dynamic instruction and bump the matching instruction-mix
 * counter of this compute unit.
 *
 * Scalar instructions are counted as scalar ALU ops (unless they are
 * waitcnt instructions), scalar loads, or scalar stores. All other
 * instructions are counted as vector ALU ops, FLAT accesses (split by
 * whether isLocalMem() holds), non-FLAT local memory accesses, vector
 * loads, or vector stores. The checks are made in that order and only
 * the first match is counted; an instruction matching none of them
 * leaves every counter untouched.
 *
 * @param gpuDynInst the dynamic instruction to account for.
 */
void
ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
{
    if (gpuDynInst->isScalar()) {
        // a scalar ALU op only counts when it is not a waitcnt
        if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
            sALUInsts++;
            instCyclesSALU++;
            return;
        }

        if (gpuDynInst->isLoad()) {
            scalarMemReads++;
            return;
        }

        if (gpuDynInst->isStore()) {
            scalarMemWrites++;
        }

        return;
    }

    // everything below handles non-scalar (vector) instructions
    if (gpuDynInst->isALU()) {
        vALUInsts++;
        instCyclesVALU++;
        // weight the cycle by the number of lanes set in the exec mask
        const auto activeLanes = gpuDynInst->wavefront()->execMask().count();
        threadCyclesVALU += activeLanes;
        return;
    }

    if (gpuDynInst->isFlat()) {
        // FLAT accesses are split by the memory they are flagged for
        Stats::Scalar &flatCounter =
            gpuDynInst->isLocalMem() ? flatLDSInsts : flatVMemInsts;
        flatCounter++;
        return;
    }

    if (gpuDynInst->isLocalMem()) {
        ldsNoFlatInsts++;
        return;
    }

    if (gpuDynInst->isLoad()) {
        vectorMemReads++;
        return;
    }

    if (gpuDynInst->isStore()) {
        vectorMemWrites++;
    }
}
void void
ComputeUnit::updatePageDivergenceDist(Addr addr) ComputeUnit::updatePageDivergenceDist(Addr addr)
{ {

View file

@ -301,6 +301,31 @@ class ComputeUnit : public MemObject
LdsState &lds; LdsState &lds;
public: public:
// Instruction mix stats. The Scalar counters are incremented in
// updateInstStats(); each *PerWF Formula is the matching counter
// divided by completedWfs (see regStats()).

// vector ALU insts issued, and the average per wavefront
Stats::Scalar vALUInsts;
Stats::Formula vALUInstsPerWF;
// scalar ALU insts issued (waitcnt insts are not counted), and the
// average per wavefront
Stats::Scalar sALUInsts;
Stats::Formula sALUInstsPerWF;
// incremented once for every counted vector ALU inst
Stats::Scalar instCyclesVALU;
// incremented once for every counted scalar ALU inst
Stats::Scalar instCyclesSALU;
// like instCyclesVALU, but each vector ALU inst adds the number of
// lanes set in the wavefront's exec mask
Stats::Scalar threadCyclesVALU;
// (threadCyclesVALU / (64 * instCyclesVALU)) * 100
Stats::Formula vALUUtilization;
// LDS insts issued, not including FLAT accesses that resolve to LDS,
// and the average per wavefront
Stats::Scalar ldsNoFlatInsts;
Stats::Formula ldsNoFlatInstsPerWF;
// FLAT insts that resolve to vmem, and the average per wavefront
Stats::Scalar flatVMemInsts;
Stats::Formula flatVMemInstsPerWF;
// FLAT insts that resolve to LDS, and the average per wavefront
Stats::Scalar flatLDSInsts;
Stats::Formula flatLDSInstsPerWF;
// vector mem write insts (excluding FLAT insts), and the average per
// wavefront
Stats::Scalar vectorMemWrites;
Stats::Formula vectorMemWritesPerWF;
// vector mem read insts (excluding FLAT insts), and the average per
// wavefront
Stats::Scalar vectorMemReads;
Stats::Formula vectorMemReadsPerWF;
// scalar mem write insts, and the average per wavefront
Stats::Scalar scalarMemWrites;
Stats::Formula scalarMemWritesPerWF;
// scalar mem read insts, and the average per wavefront
Stats::Scalar scalarMemReads;
Stats::Formula scalarMemReadsPerWF;

/**
 * Update the instruction mix counters above for one dynamic
 * instruction.
 *
 * @param gpuDynInst the dynamic instruction to account for.
 */
void updateInstStats(GPUDynInstPtr gpuDynInst);
// the following stats compute the avg. TLB accesslatency per // the following stats compute the avg. TLB accesslatency per
// uncoalesced request (only for data) // uncoalesced request (only for data)
Stats::Scalar tlbRequests; Stats::Scalar tlbRequests;

View file

@ -656,7 +656,11 @@ Wavefront::exec()
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s " DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
"(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId, "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
ii->disassemble(), old_pc); ii->disassemble(), old_pc);
// execute the instruction, then update the instruction stats in the CU
ii->execute(ii); ii->execute(ii);
computeUnit->updateInstStats(ii);
// access the VRF // access the VRF
computeUnit->vrf[simdId]->exec(ii, this); computeUnit->vrf[simdId]->exec(ii, this);
srcRegOpDist.sample(ii->numSrcRegOperands()); srcRegOpDist.sample(ii->numSrcRegOperands());