gpu-compute: add instruction mix stats for the gpu
This commit is contained in:
parent
c7a79c9a42
commit
98d8a7051d
3 changed files with 170 additions and 0 deletions
|
@ -1408,6 +1408,114 @@ ComputeUnit::regStats()
|
||||||
{
|
{
|
||||||
MemObject::regStats();
|
MemObject::regStats();
|
||||||
|
|
||||||
|
vALUInsts
|
||||||
|
.name(name() + ".valu_insts")
|
||||||
|
.desc("Number of vector ALU insts issued.")
|
||||||
|
;
|
||||||
|
vALUInstsPerWF
|
||||||
|
.name(name() + ".valu_insts_per_wf")
|
||||||
|
.desc("The avg. number of vector ALU insts issued per-wavefront.")
|
||||||
|
;
|
||||||
|
sALUInsts
|
||||||
|
.name(name() + ".salu_insts")
|
||||||
|
.desc("Number of scalar ALU insts issued.")
|
||||||
|
;
|
||||||
|
sALUInstsPerWF
|
||||||
|
.name(name() + ".salu_insts_per_wf")
|
||||||
|
.desc("The avg. number of scalar ALU insts issued per-wavefront.")
|
||||||
|
;
|
||||||
|
instCyclesVALU
|
||||||
|
.name(name() + ".inst_cycles_valu")
|
||||||
|
.desc("Number of cycles needed to execute VALU insts.")
|
||||||
|
;
|
||||||
|
instCyclesSALU
|
||||||
|
.name(name() + ".inst_cycles_salu")
|
||||||
|
.desc("Number of cycles needed to execute SALU insts.")
|
||||||
|
;
|
||||||
|
threadCyclesVALU
|
||||||
|
.name(name() + ".thread_cycles_valu")
|
||||||
|
.desc("Number of thread cycles used to execute vector ALU ops. "
|
||||||
|
"Similar to instCyclesVALU but multiplied by the number of "
|
||||||
|
"active threads.")
|
||||||
|
;
|
||||||
|
vALUUtilization
|
||||||
|
.name(name() + ".valu_utilization")
|
||||||
|
.desc("Percentage of active vector ALU threads in a wave.")
|
||||||
|
;
|
||||||
|
ldsNoFlatInsts
|
||||||
|
.name(name() + ".lds_no_flat_insts")
|
||||||
|
.desc("Number of LDS insts issued, not including FLAT "
|
||||||
|
"accesses that resolve to LDS.")
|
||||||
|
;
|
||||||
|
ldsNoFlatInstsPerWF
|
||||||
|
.name(name() + ".lds_no_flat_insts_per_wf")
|
||||||
|
.desc("The avg. number of LDS insts (not including FLAT "
|
||||||
|
"accesses that resolve to LDS) per-wavefront.")
|
||||||
|
;
|
||||||
|
flatVMemInsts
|
||||||
|
.name(name() + ".flat_vmem_insts")
|
||||||
|
.desc("The number of FLAT insts that resolve to vmem issued.")
|
||||||
|
;
|
||||||
|
flatVMemInstsPerWF
|
||||||
|
.name(name() + ".flat_vmem_insts_per_wf")
|
||||||
|
.desc("The average number of FLAT insts that resolve to vmem "
|
||||||
|
"issued per-wavefront.")
|
||||||
|
;
|
||||||
|
flatLDSInsts
|
||||||
|
.name(name() + ".flat_lds_insts")
|
||||||
|
.desc("The number of FLAT insts that resolve to LDS issued.")
|
||||||
|
;
|
||||||
|
flatLDSInstsPerWF
|
||||||
|
.name(name() + ".flat_lds_insts_per_wf")
|
||||||
|
.desc("The average number of FLAT insts that resolve to LDS "
|
||||||
|
"issued per-wavefront.")
|
||||||
|
;
|
||||||
|
vectorMemWrites
|
||||||
|
.name(name() + ".vector_mem_writes")
|
||||||
|
.desc("Number of vector mem write insts (excluding FLAT insts).")
|
||||||
|
;
|
||||||
|
vectorMemWritesPerWF
|
||||||
|
.name(name() + ".vector_mem_writes_per_wf")
|
||||||
|
.desc("The average number of vector mem write insts "
|
||||||
|
"(excluding FLAT insts) per-wavefront.")
|
||||||
|
;
|
||||||
|
vectorMemReads
|
||||||
|
.name(name() + ".vector_mem_reads")
|
||||||
|
.desc("Number of vector mem read insts (excluding FLAT insts).")
|
||||||
|
;
|
||||||
|
vectorMemReadsPerWF
|
||||||
|
.name(name() + ".vector_mem_reads_per_wf")
|
||||||
|
.desc("The avg. number of vector mem read insts (excluding "
|
||||||
|
"FLAT insts) per-wavefront.")
|
||||||
|
;
|
||||||
|
scalarMemWrites
|
||||||
|
.name(name() + ".scalar_mem_writes")
|
||||||
|
.desc("Number of scalar mem write insts.")
|
||||||
|
;
|
||||||
|
scalarMemWritesPerWF
|
||||||
|
.name(name() + ".scalar_mem_writes_per_wf")
|
||||||
|
.desc("The average number of scalar mem write insts per-wavefront.")
|
||||||
|
;
|
||||||
|
scalarMemReads
|
||||||
|
.name(name() + ".scalar_mem_reads")
|
||||||
|
.desc("Number of scalar mem read insts.")
|
||||||
|
;
|
||||||
|
scalarMemReadsPerWF
|
||||||
|
.name(name() + ".scalar_mem_reads_per_wf")
|
||||||
|
.desc("The average number of scalar mem read insts per-wavefront.")
|
||||||
|
;
|
||||||
|
|
||||||
|
vALUInstsPerWF = vALUInsts / completedWfs;
|
||||||
|
sALUInstsPerWF = sALUInsts / completedWfs;
|
||||||
|
vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
|
||||||
|
ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
|
||||||
|
flatVMemInstsPerWF = flatVMemInsts / completedWfs;
|
||||||
|
flatLDSInstsPerWF = flatLDSInsts / completedWfs;
|
||||||
|
vectorMemWritesPerWF = vectorMemWrites / completedWfs;
|
||||||
|
vectorMemReadsPerWF = vectorMemReads / completedWfs;
|
||||||
|
scalarMemWritesPerWF = scalarMemWrites / completedWfs;
|
||||||
|
scalarMemReadsPerWF = scalarMemReads / completedWfs;
|
||||||
|
|
||||||
tlbCycles
|
tlbCycles
|
||||||
.name(name() + ".tlb_cycles")
|
.name(name() + ".tlb_cycles")
|
||||||
.desc("total number of cycles for all uncoalesced requests")
|
.desc("total number of cycles for all uncoalesced requests")
|
||||||
|
@ -1566,6 +1674,39 @@ ComputeUnit::regStats()
|
||||||
localMemoryPipe.regStats();
|
localMemoryPipe.regStats();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
|
||||||
|
{
|
||||||
|
if (gpuDynInst->isScalar()) {
|
||||||
|
if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
|
||||||
|
sALUInsts++;
|
||||||
|
instCyclesSALU++;
|
||||||
|
} else if (gpuDynInst->isLoad()) {
|
||||||
|
scalarMemReads++;
|
||||||
|
} else if (gpuDynInst->isStore()) {
|
||||||
|
scalarMemWrites++;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (gpuDynInst->isALU()) {
|
||||||
|
vALUInsts++;
|
||||||
|
instCyclesVALU++;
|
||||||
|
threadCyclesVALU += gpuDynInst->wavefront()->execMask().count();
|
||||||
|
} else if (gpuDynInst->isFlat()) {
|
||||||
|
if (gpuDynInst->isLocalMem()) {
|
||||||
|
flatLDSInsts++;
|
||||||
|
} else {
|
||||||
|
flatVMemInsts++;
|
||||||
|
}
|
||||||
|
} else if (gpuDynInst->isLocalMem()) {
|
||||||
|
ldsNoFlatInsts++;
|
||||||
|
} else if (gpuDynInst->isLoad()) {
|
||||||
|
vectorMemReads++;
|
||||||
|
} else if (gpuDynInst->isStore()) {
|
||||||
|
vectorMemWrites++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
ComputeUnit::updatePageDivergenceDist(Addr addr)
|
ComputeUnit::updatePageDivergenceDist(Addr addr)
|
||||||
{
|
{
|
||||||
|
|
|
@ -301,6 +301,31 @@ class ComputeUnit : public MemObject
|
||||||
LdsState &lds;
|
LdsState &lds;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
Stats::Scalar vALUInsts;
|
||||||
|
Stats::Formula vALUInstsPerWF;
|
||||||
|
Stats::Scalar sALUInsts;
|
||||||
|
Stats::Formula sALUInstsPerWF;
|
||||||
|
Stats::Scalar instCyclesVALU;
|
||||||
|
Stats::Scalar instCyclesSALU;
|
||||||
|
Stats::Scalar threadCyclesVALU;
|
||||||
|
Stats::Formula vALUUtilization;
|
||||||
|
Stats::Scalar ldsNoFlatInsts;
|
||||||
|
Stats::Formula ldsNoFlatInstsPerWF;
|
||||||
|
Stats::Scalar flatVMemInsts;
|
||||||
|
Stats::Formula flatVMemInstsPerWF;
|
||||||
|
Stats::Scalar flatLDSInsts;
|
||||||
|
Stats::Formula flatLDSInstsPerWF;
|
||||||
|
Stats::Scalar vectorMemWrites;
|
||||||
|
Stats::Formula vectorMemWritesPerWF;
|
||||||
|
Stats::Scalar vectorMemReads;
|
||||||
|
Stats::Formula vectorMemReadsPerWF;
|
||||||
|
Stats::Scalar scalarMemWrites;
|
||||||
|
Stats::Formula scalarMemWritesPerWF;
|
||||||
|
Stats::Scalar scalarMemReads;
|
||||||
|
Stats::Formula scalarMemReadsPerWF;
|
||||||
|
|
||||||
|
/**
 * Classify a dynamic instruction (scalar vs. non-scalar; ALU, FLAT,
 * LDS, load, store) and increment the matching instruction-mix
 * counter(s) declared above.
 *
 * @param gpuDynInst the dynamic instruction to account for.
 */
void updateInstStats(GPUDynInstPtr gpuDynInst);
|
||||||
|
|
||||||
// the following stats compute the avg. TLB access latency per
|
// the following stats compute the avg. TLB access latency per
|
||||||
// uncoalesced request (only for data)
|
// uncoalesced request (only for data)
|
||||||
Stats::Scalar tlbRequests;
|
Stats::Scalar tlbRequests;
|
||||||
|
|
|
@ -656,7 +656,11 @@ Wavefront::exec()
|
||||||
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
|
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
|
||||||
"(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
|
"(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
|
||||||
ii->disassemble(), old_pc);
|
ii->disassemble(), old_pc);
|
||||||
|
|
||||||
|
// update the instruction stats in the CU
|
||||||
|
|
||||||
ii->execute(ii);
|
ii->execute(ii);
|
||||||
|
computeUnit->updateInstStats(ii);
|
||||||
// access the VRF
|
// access the VRF
|
||||||
computeUnit->vrf[simdId]->exec(ii, this);
|
computeUnit->vrf[simdId]->exec(ii, this);
|
||||||
srcRegOpDist.sample(ii->numSrcRegOperands());
|
srcRegOpDist.sample(ii->numSrcRegOperands());
|
||||||
|
|
Loading…
Reference in a new issue