gpu-compute: add instruction mix stats for the gpu
This commit is contained in:
parent
c7a79c9a42
commit
98d8a7051d
3 changed files with 170 additions and 0 deletions
|
@ -1408,6 +1408,114 @@ ComputeUnit::regStats()
|
|||
{
|
||||
MemObject::regStats();
|
||||
|
||||
vALUInsts
|
||||
.name(name() + ".valu_insts")
|
||||
.desc("Number of vector ALU insts issued.")
|
||||
;
|
||||
vALUInstsPerWF
|
||||
.name(name() + ".valu_insts_per_wf")
|
||||
.desc("The avg. number of vector ALU insts issued per-wavefront.")
|
||||
;
|
||||
sALUInsts
|
||||
.name(name() + ".salu_insts")
|
||||
.desc("Number of scalar ALU insts issued.")
|
||||
;
|
||||
sALUInstsPerWF
|
||||
.name(name() + ".salu_insts_per_wf")
|
||||
.desc("The avg. number of scalar ALU insts issued per-wavefront.")
|
||||
;
|
||||
instCyclesVALU
|
||||
.name(name() + ".inst_cycles_valu")
|
||||
.desc("Number of cycles needed to execute VALU insts.")
|
||||
;
|
||||
instCyclesSALU
|
||||
.name(name() + ".inst_cycles_salu")
|
||||
.desc("Number of cycles needed to execute SALU insts.")
|
||||
;
|
||||
threadCyclesVALU
|
||||
.name(name() + ".thread_cycles_valu")
|
||||
.desc("Number of thread cycles used to execute vector ALU ops. "
|
||||
"Similar to instCyclesVALU but multiplied by the number of "
|
||||
"active threads.")
|
||||
;
|
||||
vALUUtilization
|
||||
.name(name() + ".valu_utilization")
|
||||
.desc("Percentage of active vector ALU threads in a wave.")
|
||||
;
|
||||
ldsNoFlatInsts
|
||||
.name(name() + ".lds_no_flat_insts")
|
||||
.desc("Number of LDS insts issued, not including FLAT "
|
||||
"accesses that resolve to LDS.")
|
||||
;
|
||||
ldsNoFlatInstsPerWF
|
||||
.name(name() + ".lds_no_flat_insts_per_wf")
|
||||
.desc("The avg. number of LDS insts (not including FLAT "
|
||||
"accesses that resolve to LDS) per-wavefront.")
|
||||
;
|
||||
flatVMemInsts
|
||||
.name(name() + ".flat_vmem_insts")
|
||||
.desc("The number of FLAT insts that resolve to vmem issued.")
|
||||
;
|
||||
flatVMemInstsPerWF
|
||||
.name(name() + ".flat_vmem_insts_per_wf")
|
||||
.desc("The average number of FLAT insts that resolve to vmem "
|
||||
"issued per-wavefront.")
|
||||
;
|
||||
flatLDSInsts
|
||||
.name(name() + ".flat_lds_insts")
|
||||
.desc("The number of FLAT insts that resolve to LDS issued.")
|
||||
;
|
||||
flatLDSInstsPerWF
|
||||
.name(name() + ".flat_lds_insts_per_wf")
|
||||
.desc("The average number of FLAT insts that resolve to LDS "
|
||||
"issued per-wavefront.")
|
||||
;
|
||||
vectorMemWrites
|
||||
.name(name() + ".vector_mem_writes")
|
||||
.desc("Number of vector mem write insts (excluding FLAT insts).")
|
||||
;
|
||||
vectorMemWritesPerWF
|
||||
.name(name() + ".vector_mem_writes_per_wf")
|
||||
.desc("The average number of vector mem write insts "
|
||||
"(excluding FLAT insts) per-wavefront.")
|
||||
;
|
||||
vectorMemReads
|
||||
.name(name() + ".vector_mem_reads")
|
||||
.desc("Number of vector mem read insts (excluding FLAT insts).")
|
||||
;
|
||||
vectorMemReadsPerWF
|
||||
.name(name() + ".vector_mem_reads_per_wf")
|
||||
.desc("The avg. number of vector mem read insts (excluding "
|
||||
"FLAT insts) per-wavefront.")
|
||||
;
|
||||
scalarMemWrites
|
||||
.name(name() + ".scalar_mem_writes")
|
||||
.desc("Number of scalar mem write insts.")
|
||||
;
|
||||
scalarMemWritesPerWF
|
||||
.name(name() + ".scalar_mem_writes_per_wf")
|
||||
.desc("The average number of scalar mem write insts per-wavefront.")
|
||||
;
|
||||
scalarMemReads
|
||||
.name(name() + ".scalar_mem_reads")
|
||||
.desc("Number of scalar mem read insts.")
|
||||
;
|
||||
scalarMemReadsPerWF
|
||||
.name(name() + ".scalar_mem_reads_per_wf")
|
||||
.desc("The average number of scalar mem read insts per-wavefront.")
|
||||
;
|
||||
|
||||
vALUInstsPerWF = vALUInsts / completedWfs;
|
||||
sALUInstsPerWF = sALUInsts / completedWfs;
|
||||
vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
|
||||
ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
|
||||
flatVMemInstsPerWF = flatVMemInsts / completedWfs;
|
||||
flatLDSInstsPerWF = flatLDSInsts / completedWfs;
|
||||
vectorMemWritesPerWF = vectorMemWrites / completedWfs;
|
||||
vectorMemReadsPerWF = vectorMemReads / completedWfs;
|
||||
scalarMemWritesPerWF = scalarMemWrites / completedWfs;
|
||||
scalarMemReadsPerWF = scalarMemReads / completedWfs;
|
||||
|
||||
tlbCycles
|
||||
.name(name() + ".tlb_cycles")
|
||||
.desc("total number of cycles for all uncoalesced requests")
|
||||
|
@ -1566,6 +1674,39 @@ ComputeUnit::regStats()
|
|||
localMemoryPipe.regStats();
|
||||
}
|
||||
|
||||
void
|
||||
ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
if (gpuDynInst->isScalar()) {
|
||||
if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
|
||||
sALUInsts++;
|
||||
instCyclesSALU++;
|
||||
} else if (gpuDynInst->isLoad()) {
|
||||
scalarMemReads++;
|
||||
} else if (gpuDynInst->isStore()) {
|
||||
scalarMemWrites++;
|
||||
}
|
||||
} else {
|
||||
if (gpuDynInst->isALU()) {
|
||||
vALUInsts++;
|
||||
instCyclesVALU++;
|
||||
threadCyclesVALU += gpuDynInst->wavefront()->execMask().count();
|
||||
} else if (gpuDynInst->isFlat()) {
|
||||
if (gpuDynInst->isLocalMem()) {
|
||||
flatLDSInsts++;
|
||||
} else {
|
||||
flatVMemInsts++;
|
||||
}
|
||||
} else if (gpuDynInst->isLocalMem()) {
|
||||
ldsNoFlatInsts++;
|
||||
} else if (gpuDynInst->isLoad()) {
|
||||
vectorMemReads++;
|
||||
} else if (gpuDynInst->isStore()) {
|
||||
vectorMemWrites++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ComputeUnit::updatePageDivergenceDist(Addr addr)
|
||||
{
|
||||
|
|
|
@ -301,6 +301,31 @@ class ComputeUnit : public MemObject
|
|||
LdsState &lds;
|
||||
|
||||
public:
|
||||
Stats::Scalar vALUInsts;
|
||||
Stats::Formula vALUInstsPerWF;
|
||||
Stats::Scalar sALUInsts;
|
||||
Stats::Formula sALUInstsPerWF;
|
||||
Stats::Scalar instCyclesVALU;
|
||||
Stats::Scalar instCyclesSALU;
|
||||
Stats::Scalar threadCyclesVALU;
|
||||
Stats::Formula vALUUtilization;
|
||||
Stats::Scalar ldsNoFlatInsts;
|
||||
Stats::Formula ldsNoFlatInstsPerWF;
|
||||
Stats::Scalar flatVMemInsts;
|
||||
Stats::Formula flatVMemInstsPerWF;
|
||||
Stats::Scalar flatLDSInsts;
|
||||
Stats::Formula flatLDSInstsPerWF;
|
||||
Stats::Scalar vectorMemWrites;
|
||||
Stats::Formula vectorMemWritesPerWF;
|
||||
Stats::Scalar vectorMemReads;
|
||||
Stats::Formula vectorMemReadsPerWF;
|
||||
Stats::Scalar scalarMemWrites;
|
||||
Stats::Formula scalarMemWritesPerWF;
|
||||
Stats::Scalar scalarMemReads;
|
||||
Stats::Formula scalarMemReadsPerWF;
|
||||
|
||||
void updateInstStats(GPUDynInstPtr gpuDynInst);
|
||||
|
||||
// the following stats compute the avg. TLB access latency per
|
||||
// uncoalesced request (only for data)
|
||||
Stats::Scalar tlbRequests;
|
||||
|
|
|
@ -656,7 +656,11 @@ Wavefront::exec()
|
|||
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
|
||||
"(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
|
||||
ii->disassemble(), old_pc);
|
||||
|
||||
// update the instruction stats in the CU
|
||||
|
||||
ii->execute(ii);
|
||||
computeUnit->updateInstStats(ii);
|
||||
// access the VRF
|
||||
computeUnit->vrf[simdId]->exec(ii, this);
|
||||
srcRegOpDist.sample(ii->numSrcRegOperands());
|
||||
|
|
Loading…
Reference in a new issue