/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#include "gpu-compute/shader.hh"

#include <limits>

#include "arch/x86/linux/linux.hh"
#include "base/chunk_generator.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUMem.hh"
#include "debug/HSAIL.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/qstruct.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "sim/sim_exit.hh"

Shader::Shader(const Params *p) : SimObject(p),
    clock(p->clk_domain->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr),
    cpuPointer(p->cpu_pointer), tickEvent(this), timingSim(p->timing),
    hsail_mode(SIMT), impl_kern_boundary_sync(p->impl_kern_boundary_sync),
    separate_acquire_release(p->separate_acquire_release), coissue_return(1),
    trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
    globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0),
    box_tick_cnt(0), start_tick_cnt(0)
{
    cuList.resize(n_cu);

    for (int i = 0; i < n_cu; ++i) {
        cuList[i] = p->CUs[i];
        assert(i == cuList[i]->cu_id);
        cuList[i]->shader = this;
    }
}
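
// Carve a page-aligned, device-visible buffer out of the host process's mmap
// region, backed via proc->allocateMem(). A hypothetical call site (names
// illustrative, not from this file):
//   Addr gpuBuf = shader->mmap(bufSizeInBytes);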
Addr
Shader::mmap(int length)
{
    Addr start;

    // round up length to the next page
    length = roundUp(length, TheISA::PageBytes);

    Process *proc = gpuTc->getProcessPtr();

    if (proc->mmapGrowsDown()) {
        DPRINTF(HSAIL, "GROWS DOWN");
        start = proc->mmap_end - length;
        proc->mmap_end = start;
    } else {
        DPRINTF(HSAIL, "GROWS UP");
        start = proc->mmap_end;
        proc->mmap_end += length;

        // assertion to make sure we don't overwrite the stack (it grows down)
        assert(proc->mmap_end < proc->stack_base - proc->max_stack_size);
    }

    DPRINTF(HSAIL, "Shader::mmap start= %#x, %#x\n", start, length);

    proc->allocateMem(start, length);

    return start;
}

void
Shader::init()
{
    // grab the threadContext of the thread running on the CPU
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(0);
    assert(gpuTc);
}

Shader::~Shader()
{
    for (int j = 0; j < n_cu; ++j)
        delete cuList[j];
}

void
Shader::updateContext(int cid) {
    // context of the thread which dispatched work
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(cid);
    assert(gpuTc);
}

void
Shader::hostWakeUp(BaseCPU *cpu) {
    if (cpuPointer == cpu) {
        if (gpuTc->status() == ThreadContext::Suspended)
            cpu->activateContext(gpuTc->threadId());
    } else {
        // Make sure the dispatcher and the shader are trying to wake up
        // the same host. This is a hack to enable kernel launches from
        // multiple CPUs.
        panic("Dispatcher wants to wakeup a different host");
    }
}

Shader*
ShaderParams::create()
{
    return new Shader(this);
}

void
Shader::exec()
{
    tick_cnt = curTick();
    box_tick_cnt = curTick() - start_tick_cnt;

    // apply any scheduled adds that have come due; erasing compacts the
    // vectors, so back i up to revisit the element that slid into slot i
    for (int i = 0; i < sa_n; ++i) {
        if (sa_when[i] <= tick_cnt) {
            *sa_val[i] += sa_x[i];
            sa_val.erase(sa_val.begin() + i);
            sa_x.erase(sa_x.begin() + i);
            sa_when.erase(sa_when.begin() + i);
            --sa_n;
            --i;
        }
    }

    // clock all of the CUs
    for (int i = 0; i < n_cu; ++i)
        cuList[i]->exec();
}
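
// Round-robin dispatch: starting from nextSchedCu, visit each CU at most
// once per call and hand one workgroup to every CU that is ready. wgId[0..2]
// walk the grid x-fastest, carrying into y and then z as each dimension
// fills up.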
bool
Shader::dispatch_workgroups(NDRange *ndr)
{
    bool scheduledSomething = false;
    int cuCount = 0;
    int curCu = nextSchedCu;

    while (cuCount < n_cu) {
        // every time we try a CU, update nextSchedCu
        nextSchedCu = (nextSchedCu + 1) % n_cu;

        // dispatch workgroup iff the following two conditions are met:
        // (a) wg_disp_rem is true - there are unassigned workgroups in the grid
        // (b) there are enough free slots in cuList[curCu] for this wg
        if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) {
            scheduledSomething = true;
            DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu);

            // ticks() member function translates cycles to simulation ticks.
            if (!tickEvent.scheduled()) {
                schedule(tickEvent, curTick() + this->ticks(1));
            }

            cuList[curCu]->StartWorkgroup(ndr);
            ndr->wgId[0]++;
            ndr->globalWgId++;
            if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) {
                ndr->wgId[0] = 0;
                ndr->wgId[1]++;

                if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) {
                    ndr->wgId[1] = 0;
                    ndr->wgId[2]++;

                    if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) {
                        ndr->wg_disp_rem = false;
                        break;
                    }
                }
            }
        }

        ++cuCount;
        curCu = nextSchedCu;
    }

    return scheduledSomething;
}

void
Shader::handshake(GpuDispatcher *_dispatcher)
{
    dispatcher = _dispatcher;
}
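
// Functional (debug) memory access on behalf of a CU: translate the request
// through the TLB, then send it through CU 0's data port. An access that
// crosses a cache-line boundary is split into two packets at the boundary.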
void
Shader::doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data,
                           bool suppress_func_errors, int cu_id)
{
    int block_size = cuList.at(cu_id)->cacheLineSize();
    unsigned size = req->getSize();

    Addr tmp_addr;
    BaseTLB::Mode trans_mode;

    if (cmd == MemCmd::ReadReq) {
        trans_mode = BaseTLB::Read;
    } else if (cmd == MemCmd::WriteReq) {
        trans_mode = BaseTLB::Write;
    } else {
        fatal("unexpected MemCmd\n");
    }

    tmp_addr = req->getVaddr();
    Addr split_addr = roundDown(tmp_addr + size - 1, block_size);

    assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);

    // Misaligned access
    if (split_addr > tmp_addr) {
        RequestPtr req1, req2;
        req->splitOnVaddr(split_addr, req1, req2);

        // req1 covers [vaddr, split_addr) and req2 covers the remainder;
        // keep the packets in that order so the data offsets below line up
        PacketPtr pkt1 = new Packet(req1, cmd);
        PacketPtr pkt2 = new Packet(req2, cmd);

        functionalTLBAccess(pkt1, cu_id, trans_mode);
        functionalTLBAccess(pkt2, cu_id, trans_mode);

        PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
        PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);

        new_pkt1->dataStatic(data);
        new_pkt2->dataStatic((uint8_t*)data + req1->getSize());

        if (suppress_func_errors) {
            new_pkt1->setSuppressFuncError();
            new_pkt2->setSuppressFuncError();
        }

        // fixme: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0]->sendFunctional(new_pkt1);
        cuList[0]->memPort[0]->sendFunctional(new_pkt2);

        delete new_pkt1;
        delete new_pkt2;
        delete pkt1;
        delete pkt2;
    } else {
        PacketPtr pkt = new Packet(req, cmd);
        functionalTLBAccess(pkt, cu_id, trans_mode);
        PacketPtr new_pkt = new Packet(pkt->req, cmd);
        new_pkt->dataStatic(data);

        if (suppress_func_errors) {
            new_pkt->setSuppressFuncError();
        }

        // fixme: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0]->sendFunctional(new_pkt);

        delete new_pkt;
        delete pkt;
    }
}

bool
Shader::busy()
{
    for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
        if (!cuList[i_cu]->isDone()) {
            return true;
        }
    }

    return false;
}
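
// Register a deferred increment: add x to *val once tick_cnt + when has been
// reached; Shader::exec() applies and retires due entries. A hypothetical
// caller (names illustrative): a memory pipeline crediting a wavefront's
// outstanding-request counter after a modeled latency:
//   shader->ScheduleAdd(&wf->outstanding_reqs, latency, -1);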
void
Shader::ScheduleAdd(uint32_t *val, Tick when, int x)
{
    sa_val.push_back(val);
    sa_when.push_back(tick_cnt + when);
    sa_x.push_back(x);
    ++sa_n;
}

Shader::TickEvent::TickEvent(Shader *_shader)
    : Event(CPU_Tick_Pri), shader(_shader)
{
}

void
Shader::TickEvent::process()
{
    if (shader->busy()) {
        shader->exec();
        shader->schedule(this, curTick() + shader->ticks(1));
    }
}

const char*
Shader::TickEvent::description() const
{
    return "Shader tick";
}
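
// Walk the [address, address + size) range in cache-line-sized chunks so
// that no single request crosses a line boundary, issuing one functional
// access per chunk and advancing the host-side buffer pointer in step.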
void
Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                  MemCmd cmd, bool suppress_func_errors)
{
    uint8_t *data_buf = (uint8_t*)ptr;

    for (ChunkGenerator gen(address, size, cuList.at(cu_id)->cacheLineSize());
         !gen.done(); gen.next()) {
        Request *req = new Request(0, gen.addr(), gen.size(), 0,
                                   cuList[0]->masterId(), 0, 0, 0);

        doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
        data_buf += gen.size();
        delete req;
    }
}

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
}

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq,
              suppress_func_errors);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                 bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
              suppress_func_errors);
}

/*
 * Send a packet through the appropriate TLB functional port.
 * If cu_id=n_cu, then this is the dispatcher's TLB.
 * Otherwise it's the TLB of the cu_id compute unit.
 */
void
Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
{
    // update senderState. Need to know the gpuTc and the TLB mode
    pkt->senderState =
        new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);

    if (cu_id == n_cu) {
        dispatcher->tlbPort->sendFunctional(pkt);
    } else {
        // even when the perLaneTLB flag is turned on, it's ok to send
        // all accesses through lane 0, since the lane # is not known
        // here. This isn't important since these are functional accesses.
        cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
    }

    /* safe_cast the senderState */
    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    delete sender_state->tlbEntry;
    delete pkt->senderState;
}