/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: John Kalamatianos, Anthony Gutierrez
 */

#ifndef __COMPUTE_UNIT_HH__
#define __COMPUTE_UNIT_HH__

#include <deque>
#include <list>
#include <map>
#include <queue>
#include <unordered_map>
#include <vector>

#include "base/callback.hh"
#include "base/statistics.hh"
#include "base/types.hh"
#include "enums/PrefetchType.hh"
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/fetch_stage.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/local_memory_pipeline.hh"
#include "gpu-compute/qstruct.hh"
#include "gpu-compute/schedule_stage.hh"
#include "gpu-compute/scoreboard_check_stage.hh"
#include "mem/mem_object.hh"
#include "mem/port.hh"

static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
static const int MAX_WIDTH_FOR_MEM_INST = 32;

class NDRange;
class Shader;
class VectorRegisterFile;

struct ComputeUnitParams;

enum EXEC_POLICY
{
    OLDEST = 0,
    RR
};

// List of execution units
enum EXEC_UNIT
{
    SIMD0 = 0,
    SIMD1,
    SIMD2,
    SIMD3,
    GLBMEM_PIPE,
    LDSMEM_PIPE,
    NUM_UNITS
};

enum TLB_CACHE
{
    TLB_MISS_CACHE_MISS = 0,
    TLB_MISS_CACHE_HIT,
    TLB_HIT_CACHE_MISS,
    TLB_HIT_CACHE_HIT
};

class ComputeUnit : public MemObject
{
  public:
    FetchStage fetchStage;
    ScoreboardCheckStage scoreboardCheckStage;
    ScheduleStage scheduleStage;
    ExecStage execStage;
    GlobalMemPipeline globalMemoryPipe;
    LocalMemPipeline localMemoryPipe;

    // Buffers used to communicate between various pipeline stages

    // List of waves which are ready to be scheduled.
    // Each execution resource has a ready list. readyList is
    // used to communicate between scoreboardCheck stage and
    // schedule stage
    // TODO: make enum to index readyList
    std::vector<std::vector<Wavefront*>> readyList;

    // Stores the status of waves. A READY implies the
    // wave is ready to be scheduled this cycle and
    // is already present in the readyList. waveStatusList is
    // used to communicate between scoreboardCheck stage and
    // schedule stage
    // TODO: convert std::pair to a class to increase readability
    std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>>
        waveStatusList;

    // List of waves which will be dispatched to
    // each execution resource. A FILLED implies
    // dispatch list is non-empty and
    // execution unit has something to execute
    // this cycle. Currently, the dispatch list of
    // an execution resource can hold only one wave because
    // an execution resource can execute only one wave in a cycle.
    // dispatchList is used to communicate between schedule
    // and exec stage
    // TODO: convert std::pair to a class to increase readability
    std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
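
    // Rough sketch of how these buffers tie the front-end stages together
    // (illustrative only; see the stage implementations for details):
    //
    //   scoreboardCheckStage --(readyList, waveStatusList)--> scheduleStage
    //   scheduleStage        --(dispatchList)---------------> execStage
    //
    // readyList and dispatchList hold one entry per execution resource,
    // i.e. the SIMDs plus the global and local memory pipes (see EXEC_UNIT).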

    int rrNextMemID; // used by RR WF exec policy to cycle through WF's
    int rrNextALUWp;
    typedef ComputeUnitParams Params;
    std::vector<std::vector<Wavefront*>> wfList;
    int cu_id;

    // array of vector register files, one per SIMD
    std::vector<VectorRegisterFile*> vrf;
    // Number of vector ALU units (SIMDs) in CU
    int numSIMDs;
    // number of pipe stages for bypassing data to next dependent single
    // precision vector instruction inside the vector ALU pipeline
    int spBypassPipeLength;
    // number of pipe stages for bypassing data to next dependent double
    // precision vector instruction inside the vector ALU pipeline
    int dpBypassPipeLength;
    // number of cycles per issue period
    int issuePeriod;

    // Number of global and local memory execution resources in CU
    int numGlbMemUnits;
    int numLocMemUnits;
    // tracks the last cycle a vector instruction was executed on a SIMD
    std::vector<uint64_t> lastExecCycle;

    // true if we allow a separate TLB per lane
    bool perLaneTLB;
    // if 0, TLB prefetching is off.
    int prefetchDepth;
    // if fixed-stride prefetching, this is the stride.
    int prefetchStride;

    // Last virtual address accessed, tracked at CU, SIMD, and wavefront
    // granularity; presumably consumed by the TLB prefetching logic
    // configured above.
    std::vector<Addr> lastVaddrCU;
    std::vector<std::vector<Addr>> lastVaddrSimd;
    std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
    Enums::PrefetchType prefetchType;
    EXEC_POLICY exec_policy;

    bool xact_cas_mode;
    bool debugSegFault;
    bool functionalTLB;
    bool localMemBarrier;

    /*
     * for Counting page accesses
     *
     * cuExitCallback inherits from Callback. When you register a callback
     * function as an exit callback, it will get added to an exit callback
     * queue, such that on simulation exit, all callbacks in the callback
     * queue will have their process() function called.
     */
    bool countPages;

    Shader *shader;
    uint32_t barrier_id;
    // vector of Vector ALU (MACC) pipelines
    std::vector<WaitClass> aluPipe;
    // minimum issue period per SIMD unit (in cycles)
    std::vector<WaitClass> wfWait;

    // Resource control for Vector Register File->Global Memory pipe buses
    std::vector<WaitClass> vrfToGlobalMemPipeBus;
    // Resource control for Vector Register File->Local Memory pipe buses
    std::vector<WaitClass> vrfToLocalMemPipeBus;
    int nextGlbMemBus;
    int nextLocMemBus;
    // Resource control for global memory to VRF data/address bus
    WaitClass glbMemToVrfBus;
    // Resource control for local memory to VRF data/address bus
    WaitClass locMemToVrfBus;

    uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
    uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
    uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store
    uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load

    Tick req_tick_latency;
    Tick resp_tick_latency;

    // number of vector registers being reserved for each SIMD unit
    std::vector<int> vectorRegsReserved;
    // number of vector registers per SIMD unit
    uint32_t numVecRegsPerSimd;
    // Support for scheduling VGPR status update events
    std::vector<std::pair<uint32_t, uint32_t> > regIdxVec;
    std::vector<uint64_t> timestampVec;
    std::vector<uint8_t> statusVec;

    void
    registerEvent(uint32_t simdId,
                  uint32_t regIdx,
                  uint32_t operandSize,
                  uint64_t when,
                  uint8_t newStatus) {
        regIdxVec.push_back(std::make_pair(simdId, regIdx));
        timestampVec.push_back(when);
        statusVec.push_back(newStatus);
        if (operandSize > 4) {
            regIdxVec.push_back(std::make_pair(simdId,
                                               ((regIdx + 1) %
                                                numVecRegsPerSimd)));
            timestampVec.push_back(when);
            statusVec.push_back(newStatus);
        }
    }
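
    // Example (from the code above): registering a status update for an
    // 8-byte operand at (simdId, regIdx) queues two entries -- one for
    // regIdx and one for (regIdx + 1) % numVecRegsPerSimd -- both stamped
    // with the same 'when' and 'newStatus'.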

    void updateEvents();

    // this hash map will keep track of page divergence
    // per memory instruction per wavefront. The hash map
    // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
    std::map<Addr, int> pagesTouched;

    ComputeUnit(const Params *p);
    ~ComputeUnit();
    int spBypassLength() { return spBypassPipeLength; };
    int dpBypassLength() { return dpBypassPipeLength; };
    int storeBusLength() { return numCyclesPerStoreTransfer; };
    int loadBusLength() { return numCyclesPerLoadTransfer; };
    int wfSize() const { return wavefrontSize; };

    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
    void exec();
    void initiateFetch(Wavefront *wavefront);
    void fetch(PacketPtr pkt, Wavefront *wavefront);
    void fillKernelState(Wavefront *w, NDRange *ndr);

    void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
                        NDRange *ndr);

    void StartWorkgroup(NDRange *ndr);
    int ReadyWorkgroup(NDRange *ndr);

    bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
    bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
    bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
    int GlbMemUnitId() { return GLBMEM_PIPE; }
    int ShrMemUnitId() { return LDSMEM_PIPE; }
    int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
    int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
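
    // Note: the two helpers above hand out read buses round-robin; with two
    // buses, for example, successive calls simply alternate between them.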

    /* This function cycles through all the wavefronts in all the phases to
     * see if all of the wavefronts which should be associated with one
     * barrier (denoted with _barrier_id), are all at the same barrier in the
     * program (denoted by bcnt). When the number at the barrier matches
     * bslots, then return true.
     */
    int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
    bool cedeSIMD(int simdId, int wfSlotId);

    template<typename c0, typename c1>
    void doSmReturn(GPUDynInstPtr gpuDynInst);
    virtual void init();
    void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
    void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
    void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
                              bool kernelLaunch=true,
                              RequestPtr req=nullptr);
    void handleMemPacket(PacketPtr pkt, int memport_index);
    bool processTimingPacket(PacketPtr pkt);
    void processFetchReturn(PacketPtr pkt);
    void updatePageDivergenceDist(Addr addr);

    MasterID masterId() { return _masterId; }

    bool isDone() const;
    bool isSimdDone(uint32_t) const;

  protected:
    MasterID _masterId;

    LdsState &lds;

  public:
    Stats::Scalar vALUInsts;
    Stats::Formula vALUInstsPerWF;
    Stats::Scalar sALUInsts;
    Stats::Formula sALUInstsPerWF;
    Stats::Scalar instCyclesVALU;
    Stats::Scalar instCyclesSALU;
    Stats::Scalar threadCyclesVALU;
    Stats::Formula vALUUtilization;
    Stats::Scalar ldsNoFlatInsts;
    Stats::Formula ldsNoFlatInstsPerWF;
    Stats::Scalar flatVMemInsts;
    Stats::Formula flatVMemInstsPerWF;
    Stats::Scalar flatLDSInsts;
    Stats::Formula flatLDSInstsPerWF;
    Stats::Scalar vectorMemWrites;
    Stats::Formula vectorMemWritesPerWF;
    Stats::Scalar vectorMemReads;
    Stats::Formula vectorMemReadsPerWF;
    Stats::Scalar scalarMemWrites;
    Stats::Formula scalarMemWritesPerWF;
    Stats::Scalar scalarMemReads;
    Stats::Formula scalarMemReadsPerWF;

    void updateInstStats(GPUDynInstPtr gpuDynInst);

    // the following stats compute the avg. TLB access latency per
    // uncoalesced request (only for data)
    Stats::Scalar tlbRequests;
    Stats::Scalar tlbCycles;
    Stats::Formula tlbLatency;
    // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
    Stats::Vector hitsPerTLBLevel;

    Stats::Scalar ldsBankAccesses;
    Stats::Distribution ldsBankConflictDist;

    // over all memory instructions executed over all wavefronts
    // how many touched 0-4 pages, 4-8, ..., 60-64 pages
    Stats::Distribution pageDivergenceDist;
    Stats::Scalar dynamicGMemInstrCnt;
    Stats::Scalar dynamicLMemInstrCnt;

    Stats::Scalar wgBlockedDueLdsAllocation;
    // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
    // active when the instruction is committed, this number is still
    // incremented by 1
    Stats::Scalar numInstrExecuted;
    // Number of cycles among successive instruction executions across all
    // wavefronts of the same CU
    Stats::Distribution execRateDist;
    // number of individual vector operations executed
    Stats::Scalar numVecOpsExecuted;
    // Total cycles that something is running on the GPU
    Stats::Scalar totalCycles;
    Stats::Formula vpc; // vector ops per cycle
    Stats::Formula ipc; // vector instructions per cycle
    Stats::Distribution controlFlowDivergenceDist;
    Stats::Distribution activeLanesPerGMemInstrDist;
    Stats::Distribution activeLanesPerLMemInstrDist;
    // number of vector ALU instructions received
    Stats::Formula numALUInstsExecuted;
    // number of times a WG can not start due to lack of free VGPRs in SIMDs
    Stats::Scalar numTimesWgBlockedDueVgprAlloc;
    Stats::Scalar numCASOps;
    Stats::Scalar numFailedCASOps;
    Stats::Scalar completedWfs;
    // flag per vector SIMD unit that is set when there is at least one
    // WV that has a vector ALU instruction as the oldest in its
    // Instruction Buffer: Defined in the Scoreboard stage, consumed
    // by the Execute stage.
    std::vector<bool> vectorAluInstAvail;
    // number of available (oldest) LDS instructions that could have
    // been issued to the LDS at a specific issue slot
    int shrMemInstAvail;
    // number of available Global memory instructions that could have
    // been issued to TCP at a specific issue slot
    int glbMemInstAvail;

    void
    regStats();

    LdsState &
    getLds() const
    {
        return lds;
    }

    int32_t
    getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;

    bool
    sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));

    typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
    pageDataStruct pageAccesses;

    class CUExitCallback : public Callback
    {
      private:
        ComputeUnit *computeUnit;

      public:
        virtual ~CUExitCallback() { }

        CUExitCallback(ComputeUnit *_cu)
        {
            computeUnit = _cu;
        }

        virtual void
        process();
    };

    CUExitCallback *cuExitCallback;
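
    // Registered as a simulation-exit callback (see the comment above
    // countPages), so its process() method runs at exit; presumably this is
    // where the pageAccesses data collected when countPages is set gets
    // dumped.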

    /** Data access Port **/
    class DataPort : public MasterPort
    {
      public:
        DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index) { }

        bool snoopRangeSent;

        struct SenderState : public Packet::SenderState
        {
            GPUDynInstPtr _gpuDynInst;
            int port_index;
            Packet::SenderState *saved;

            SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
                        Packet::SenderState *sender_state=nullptr)
                : _gpuDynInst(gpuDynInst),
                  port_index(_port_index),
                  saved(sender_state) { }
        };

        class MemReqEvent : public Event
        {
          private:
            DataPort *dataPort;
            PacketPtr pkt;

          public:
            MemReqEvent(DataPort *_data_port, PacketPtr _pkt)
                : Event(), dataPort(_data_port), pkt(_pkt)
            {
                setFlags(Event::AutoDelete);
            }

            void process();
            const char *description() const;
        };

        class MemRespEvent : public Event
        {
          private:
            DataPort *dataPort;
            PacketPtr pkt;

          public:
            MemRespEvent(DataPort *_data_port, PacketPtr _pkt)
                : Event(), dataPort(_data_port), pkt(_pkt)
            {
                setFlags(Event::AutoDelete);
            }

            void process();
            const char *description() const;
        };

        std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;

      protected:
        ComputeUnit *computeUnit;
        int index;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }
    };

    // Instruction cache access port
    class SQCPort : public MasterPort
    {
      public:
        SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index) { }

        bool snoopRangeSent;

        struct SenderState : public Packet::SenderState
        {
            Wavefront *wavefront;
            Packet::SenderState *saved;

            SenderState(Wavefront *_wavefront, Packet::SenderState
                        *sender_state=nullptr)
                : wavefront(_wavefront), saved(sender_state) { }
        };

        std::deque<std::pair<PacketPtr, Wavefront*>> retries;

      protected:
        ComputeUnit *computeUnit;
        int index;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }
    };

    /** Data TLB port **/
    class DTLBPort : public MasterPort
    {
      public:
        DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index), stalled(false)
        { }

        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were
         * not successfully sent.
         */
        std::deque<PacketPtr> retries;

        /** SenderState is information carried along with the packet
         * throughout the TLB hierarchy
         */
        struct SenderState: public Packet::SenderState
        {
            // the memInst that this is associated with
            GPUDynInstPtr _gpuDynInst;

            // the lane in the memInst this is associated with, so we send
            // the memory request down the right port
            int portIndex;

            // constructor used for packets involved in timing accesses
            SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
                : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
        };

      protected:
        ComputeUnit *computeUnit;
        int index;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    class ITLBPort : public MasterPort
    {
      public:
        ITLBPort(const std::string &_name, ComputeUnit *_cu)
            : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { }

        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were
         * not successfully sent.
         */
        std::deque<PacketPtr> retries;

        /** SenderState is information carried along with the packet
         * throughout the TLB hierarchy
         */
        struct SenderState: public Packet::SenderState
        {
            // The wavefront associated with this request
            Wavefront *wavefront;

            SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
        };

      protected:
        ComputeUnit *computeUnit;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    /**
     * the port intended to communicate between the CU and its LDS
     */
    class LDSPort : public MasterPort
    {
      public:
        LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
            : MasterPort(_name, _cu, _id), computeUnit(_cu)
        {
        }

        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the requests that were
         * not successfully sent.
         */
        std::queue<PacketPtr> retries;

        /**
         * SenderState is information carried along with the packet, esp. the
         * GPUDynInstPtr
         */
        class SenderState: public Packet::SenderState
        {
          protected:
            // The actual read/write/atomic request that goes with this command
            GPUDynInstPtr _gpuDynInst = nullptr;

          public:
            SenderState(GPUDynInstPtr gpuDynInst):
                _gpuDynInst(gpuDynInst)
            {
            }

            GPUDynInstPtr
            getMemInst() const
            {
                return _gpuDynInst;
            }
        };

        virtual bool
        sendTimingReq(PacketPtr pkt);
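
        // Overridden so that a request that cannot be sent right now (or
        // that arrives while the port is stalled) can be parked on the
        // 'retries' queue above instead of being dropped; recvReqRetry is
        // presumably what drains that queue later.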

      protected:

        bool stalled = false; ///< whether or not it is stalled

        ComputeUnit *computeUnit;

        virtual bool
        recvTimingResp(PacketPtr pkt);

        virtual Tick
        recvAtomic(PacketPtr pkt) { return 0; }

        virtual void
        recvFunctional(PacketPtr pkt)
        {
        }

        virtual void
        recvRangeChange()
        {
        }

        virtual void
        recvReqRetry();
    };

    /** The port to access the Local Data Store
     *  Can be connected to a LDS object
     */
    LDSPort *ldsPort = nullptr;

    LDSPort *
    getLdsPort() const
    {
        return ldsPort;
    }

    /** The memory port for SIMD data accesses.
     *  Can be connected to PhysMem for Ruby for timing simulations
     */
    std::vector<DataPort*> memPort;
    // port to the TLB hierarchy (i.e., the L1 TLB)
    std::vector<DTLBPort*> tlbPort;
    // port to the SQC (i.e. the I-cache)
    SQCPort *sqcPort;
    // port to the SQC TLB (there's a separate TLB for each I-cache)
    ITLBPort *sqcTLBPort;

    virtual BaseMasterPort&
    getMasterPort(const std::string &if_name, PortID idx)
    {
        if (if_name == "memory_port") {
            memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
                                        this, idx);
            return *memPort[idx];
        } else if (if_name == "translation_port") {
            tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
                                        this, idx);
            return *tlbPort[idx];
        } else if (if_name == "sqc_port") {
            sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
                                  this, idx);
            return *sqcPort;
        } else if (if_name == "sqc_tlb_port") {
            sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
            return *sqcTLBPort;
        } else if (if_name == "ldsPort") {
            if (ldsPort) {
                fatal("an LDS port was already allocated");
            }
            ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
            return *ldsPort;
        } else {
            panic("incorrect port name");
        }
    }
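
    // Note: ports are constructed lazily here, the first time the connection
    // code asks for them by name; memPort and tlbPort therefore need to have
    // been sized (presumably in the constructor) before getMasterPort() is
    // called with "memory_port" or "translation_port".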

    // xact_cas_load()
    class waveIdentifier
    {
      public:
        waveIdentifier() { }
        waveIdentifier(int _simdId, int _wfSlotId)
            : simdId(_simdId), wfSlotId(_wfSlotId) { }

        int simdId;
        int wfSlotId;
    };

    class waveQueue
    {
      public:
        std::list<waveIdentifier> waveIDQueue;
    };
    std::map<unsigned, waveQueue> xactCasLoadMap;

    uint64_t getAndIncSeqNum() { return globalSeqNum++; }

  private:
    uint64_t globalSeqNum;
    int wavefrontSize;
    GPUStaticInst *kernelLaunchInst;
};

#endif // __COMPUTE_UNIT_HH__