gpu-compute: parametrize Wavefront size
Eliminate the VSZ constant that defined the Wavefront size (in number of work items) and replace it with a parameter in the GPU.py configuration script. Change all data structures that depend on the Wavefront size to be dynamically sized. Legal Wavefront sizes are currently 16, 32, and 64; the value is checked at initialization time.
This commit is contained in:
parent
e5b7b6780f
commit
3724fb15fa
25 changed files with 256 additions and 193 deletions
|
@ -250,7 +250,8 @@ for i in xrange(n_cu):
|
||||||
vrfs = []
|
vrfs = []
|
||||||
for j in xrange(options.simds_per_cu):
|
for j in xrange(options.simds_per_cu):
|
||||||
for k in xrange(shader.n_wf):
|
for k in xrange(shader.n_wf):
|
||||||
wavefronts.append(Wavefront(simdId = j, wf_slot_id = k))
|
wavefronts.append(Wavefront(simdId = j, wf_slot_id = k,
|
||||||
|
wfSize = options.wf_size))
|
||||||
vrfs.append(VectorRegisterFile(simd_id=j,
|
vrfs.append(VectorRegisterFile(simd_id=j,
|
||||||
num_regs_per_simd=options.vreg_file_size))
|
num_regs_per_simd=options.vreg_file_size))
|
||||||
compute_units[-1].wavefronts = wavefronts
|
compute_units[-1].wavefronts = wavefronts
|
||||||
|
|
|
@ -235,7 +235,7 @@ $class_name::execute(GPUDynInstPtr gpuDynInst)
|
||||||
|
|
||||||
const VectorMask &mask = w->get_pred();
|
const VectorMask &mask = w->get_pred();
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
DestCType dest_val = $expr;
|
DestCType dest_val = $expr;
|
||||||
this->dest.set(w, lane, dest_val);
|
this->dest.set(w, lane, dest_val);
|
||||||
|
@ -256,7 +256,7 @@ $class_name::execute(GPUDynInstPtr gpuDynInst)
|
||||||
|
|
||||||
const VectorMask &mask = w->get_pred();
|
const VectorMask &mask = w->get_pred();
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
SrcCType src_val0 = this->src0.get<SrcCType>(w, lane);
|
SrcCType src_val0 = this->src0.get<SrcCType>(w, lane);
|
||||||
DestCType dest_val = $expr;
|
DestCType dest_val = $expr;
|
||||||
|
@ -277,7 +277,7 @@ $class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
|
||||||
|
|
||||||
const VectorMask &mask = w->get_pred();
|
const VectorMask &mask = w->get_pred();
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
CType dest_val;
|
CType dest_val;
|
||||||
if ($dest_is_src_flag) {
|
if ($dest_is_src_flag) {
|
||||||
|
@ -312,7 +312,7 @@ $class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
|
||||||
|
|
||||||
const VectorMask &mask = w->get_pred();
|
const VectorMask &mask = w->get_pred();
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
CType dest_val;
|
CType dest_val;
|
||||||
|
|
||||||
|
@ -346,7 +346,7 @@ $class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
|
||||||
|
|
||||||
const VectorMask &mask = w->get_pred();
|
const VectorMask &mask = w->get_pred();
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
DestT dest_val;
|
DestT dest_val;
|
||||||
if ($dest_is_src_flag) {
|
if ($dest_is_src_flag) {
|
||||||
|
@ -372,7 +372,7 @@ $class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
|
||||||
Wavefront *w = gpuDynInst->wavefront();
|
Wavefront *w = gpuDynInst->wavefront();
|
||||||
|
|
||||||
const VectorMask &mask = w->get_pred();
|
const VectorMask &mask = w->get_pred();
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
CType dest_val;
|
CType dest_val;
|
||||||
|
|
||||||
|
@ -401,7 +401,7 @@ $class_name<DestDataType, SrcDataType>::execute(GPUDynInstPtr gpuDynInst)
|
||||||
|
|
||||||
const VectorMask &mask = w->get_pred();
|
const VectorMask &mask = w->get_pred();
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
DestCType dest_val;
|
DestCType dest_val;
|
||||||
SrcCType src_val[$num_srcs];
|
SrcCType src_val[$num_srcs];
|
||||||
|
|
|
@ -279,7 +279,7 @@ namespace HsailISA
|
||||||
// taken branch
|
// taken branch
|
||||||
const uint32_t true_pc = getTargetPc();
|
const uint32_t true_pc = getTargetPc();
|
||||||
VectorMask true_mask;
|
VectorMask true_mask;
|
||||||
for (unsigned int lane = 0; lane < VSZ; ++lane) {
|
for (unsigned int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
true_mask[lane] = cond.get<bool>(w, lane) & curr_mask[lane];
|
true_mask[lane] = cond.get<bool>(w, lane) & curr_mask[lane];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -134,7 +134,7 @@ namespace HsailISA
|
||||||
const VectorMask &mask = w->get_pred();
|
const VectorMask &mask = w->get_pred();
|
||||||
|
|
||||||
// mask off completed work-items
|
// mask off completed work-items
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
w->init_mask[lane] = 0;
|
w->init_mask[lane] = 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -457,7 +457,7 @@ namespace HsailISA
|
||||||
gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
|
gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
|
||||||
|
|
||||||
if (num_dest_operands > 1) {
|
if (num_dest_operands > 1) {
|
||||||
for (int i = 0; i < VSZ; ++i)
|
for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
|
||||||
if (gpuDynInst->exec_mask[i])
|
if (gpuDynInst->exec_mask[i])
|
||||||
gpuDynInst->statusVector.push_back(num_dest_operands);
|
gpuDynInst->statusVector.push_back(num_dest_operands);
|
||||||
else
|
else
|
||||||
|
@ -466,9 +466,10 @@ namespace HsailISA
|
||||||
|
|
||||||
for (int k = 0; k < num_dest_operands; ++k) {
|
for (int k = 0; k < num_dest_operands; ++k) {
|
||||||
|
|
||||||
c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ];
|
c0 *d = &((c0*)gpuDynInst->d_data)
|
||||||
|
[k * gpuDynInst->computeUnit()->wfSize()];
|
||||||
|
|
||||||
for (int i = 0; i < VSZ; ++i) {
|
for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
|
||||||
if (gpuDynInst->exec_mask[i]) {
|
if (gpuDynInst->exec_mask[i]) {
|
||||||
Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
|
Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
|
||||||
|
|
||||||
|
@ -1004,7 +1005,7 @@ namespace HsailISA
|
||||||
gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
|
gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
|
||||||
|
|
||||||
if (num_src_operands > 1) {
|
if (num_src_operands > 1) {
|
||||||
for (int i = 0; i < VSZ; ++i)
|
for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
|
||||||
if (gpuDynInst->exec_mask[i])
|
if (gpuDynInst->exec_mask[i])
|
||||||
gpuDynInst->statusVector.push_back(num_src_operands);
|
gpuDynInst->statusVector.push_back(num_src_operands);
|
||||||
else
|
else
|
||||||
|
@ -1012,9 +1013,10 @@ namespace HsailISA
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int k = 0; k < num_src_operands; ++k) {
|
for (int k = 0; k < num_src_operands; ++k) {
|
||||||
c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ];
|
c0 *d = &((c0*)gpuDynInst->d_data)
|
||||||
|
[k * gpuDynInst->computeUnit()->wfSize()];
|
||||||
|
|
||||||
for (int i = 0; i < VSZ; ++i) {
|
for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
|
||||||
if (gpuDynInst->exec_mask[i]) {
|
if (gpuDynInst->exec_mask[i]) {
|
||||||
Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
|
Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
|
||||||
|
|
||||||
|
@ -1402,7 +1404,7 @@ namespace HsailISA
|
||||||
c0 *e = &((c0*) gpuDynInst->a_data)[0];
|
c0 *e = &((c0*) gpuDynInst->a_data)[0];
|
||||||
c0 *f = &((c0*) gpuDynInst->x_data)[0];
|
c0 *f = &((c0*) gpuDynInst->x_data)[0];
|
||||||
|
|
||||||
for (int i = 0; i < VSZ; ++i) {
|
for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
|
||||||
if (gpuDynInst->exec_mask[i]) {
|
if (gpuDynInst->exec_mask[i]) {
|
||||||
Addr vaddr = gpuDynInst->addr[i];
|
Addr vaddr = gpuDynInst->addr[i];
|
||||||
|
|
||||||
|
|
|
@ -60,14 +60,16 @@ namespace HsailISA
|
||||||
|
|
||||||
typedef typename DestDataType::CType CType M5_VAR_USED;
|
typedef typename DestDataType::CType CType M5_VAR_USED;
|
||||||
const VectorMask &mask = w->get_pred();
|
const VectorMask &mask = w->get_pred();
|
||||||
uint64_t addr_vec[VSZ];
|
std::vector<Addr> addr_vec;
|
||||||
|
addr_vec.resize(w->computeUnit->wfSize(), (Addr)0);
|
||||||
this->addr.calcVector(w, addr_vec);
|
this->addr.calcVector(w, addr_vec);
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
this->dest.set(w, lane, addr_vec[lane]);
|
this->dest.set(w, lane, addr_vec[lane]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
addr_vec.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename MemDataType, typename DestDataType,
|
template<typename MemDataType, typename DestDataType,
|
||||||
|
@ -121,8 +123,8 @@ namespace HsailISA
|
||||||
i->parent->findSymbol(Brig::BrigPrivateSpace, addr);
|
i->parent->findSymbol(Brig::BrigPrivateSpace, addr);
|
||||||
assert(se);
|
assert(se);
|
||||||
|
|
||||||
return w->wfSlotId * w->privSizePerItem * VSZ +
|
return w->wfSlotId * w->privSizePerItem * w->computeUnit->wfSize() +
|
||||||
se->offset * VSZ +
|
se->offset * w->computeUnit->wfSize() +
|
||||||
lane * se->size;
|
lane * se->size;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
@ -139,9 +141,11 @@ namespace HsailISA
|
||||||
Addr addr_div8 = addr / 8;
|
Addr addr_div8 = addr / 8;
|
||||||
Addr addr_mod8 = addr % 8;
|
Addr addr_mod8 = addr % 8;
|
||||||
|
|
||||||
Addr ret = addr_div8 * 8 * VSZ + lane * 8 + addr_mod8 + w->privBase;
|
Addr ret = addr_div8 * 8 * w->computeUnit->wfSize() + lane * 8 +
|
||||||
|
addr_mod8 + w->privBase;
|
||||||
|
|
||||||
assert(ret < w->privBase + (w->privSizePerItem * VSZ));
|
assert(ret < w->privBase +
|
||||||
|
(w->privSizePerItem * w->computeUnit->wfSize()));
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -175,7 +179,7 @@ namespace HsailISA
|
||||||
|
|
||||||
DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val);
|
DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val);
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
this->dest.set(w, lane, val);
|
this->dest.set(w, lane, val);
|
||||||
}
|
}
|
||||||
|
@ -184,7 +188,7 @@ namespace HsailISA
|
||||||
return;
|
return;
|
||||||
} else if (this->segment == Brig::BRIG_SEGMENT_ARG) {
|
} else if (this->segment == Brig::BRIG_SEGMENT_ARG) {
|
||||||
uint64_t address = this->addr.calcUniform();
|
uint64_t address = this->addr.calcUniform();
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
MemCType val = w->readCallArgMem<MemCType>(lane, address);
|
MemCType val = w->readCallArgMem<MemCType>(lane, address);
|
||||||
|
|
||||||
|
@ -239,7 +243,7 @@ namespace HsailISA
|
||||||
// this is a complete hack to get around a compiler bug
|
// this is a complete hack to get around a compiler bug
|
||||||
// (the compiler currently generates global access for private
|
// (the compiler currently generates global access for private
|
||||||
// addresses (starting from 0). We need to add the private offset)
|
// addresses (starting from 0). We need to add the private offset)
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (m->addr[lane] < w->privSizePerItem) {
|
if (m->addr[lane] < w->privSizePerItem) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
// what is the size of the object we are accessing?
|
// what is the size of the object we are accessing?
|
||||||
|
@ -267,7 +271,7 @@ namespace HsailISA
|
||||||
m->pipeId = GLBMEM_PIPE;
|
m->pipeId = GLBMEM_PIPE;
|
||||||
m->latency.set(w->computeUnit->shader->ticks(1));
|
m->latency.set(w->computeUnit->shader->ticks(1));
|
||||||
{
|
{
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
// note: this calculation will NOT WORK if the compiler
|
// note: this calculation will NOT WORK if the compiler
|
||||||
// ever generates loads/stores to the same address with
|
// ever generates loads/stores to the same address with
|
||||||
// different widths (e.g., a ld_u32 addr and a ld_u16 addr)
|
// different widths (e.g., a ld_u32 addr and a ld_u16 addr)
|
||||||
|
@ -301,7 +305,7 @@ namespace HsailISA
|
||||||
m->pipeId = GLBMEM_PIPE;
|
m->pipeId = GLBMEM_PIPE;
|
||||||
m->latency.set(w->computeUnit->shader->ticks(1));
|
m->latency.set(w->computeUnit->shader->ticks(1));
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
assert(m->addr[lane] + sizeof(MemCType) <= w->roSize);
|
assert(m->addr[lane] + sizeof(MemCType) <= w->roSize);
|
||||||
m->addr[lane] += w->roBase;
|
m->addr[lane] += w->roBase;
|
||||||
|
@ -318,7 +322,7 @@ namespace HsailISA
|
||||||
m->pipeId = GLBMEM_PIPE;
|
m->pipeId = GLBMEM_PIPE;
|
||||||
m->latency.set(w->computeUnit->shader->ticks(1));
|
m->latency.set(w->computeUnit->shader->ticks(1));
|
||||||
{
|
{
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
assert(m->addr[lane] < w->privSizePerItem);
|
assert(m->addr[lane] < w->privSizePerItem);
|
||||||
|
|
||||||
|
@ -360,7 +364,7 @@ namespace HsailISA
|
||||||
if (this->segment == Brig::BRIG_SEGMENT_ARG) {
|
if (this->segment == Brig::BRIG_SEGMENT_ARG) {
|
||||||
uint64_t address = this->addr.calcUniform();
|
uint64_t address = this->addr.calcUniform();
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
CType data = this->src.template get<CType>(w, lane);
|
CType data = this->src.template get<CType>(w, lane);
|
||||||
DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data);
|
DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data);
|
||||||
|
@ -378,7 +382,7 @@ namespace HsailISA
|
||||||
this->addr.calcVector(w, m->addr);
|
this->addr.calcVector(w, m->addr);
|
||||||
|
|
||||||
if (num_src_operands == 1) {
|
if (num_src_operands == 1) {
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
((CType*)m->d_data)[lane] =
|
((CType*)m->d_data)[lane] =
|
||||||
this->src.template get<CType>(w, lane);
|
this->src.template get<CType>(w, lane);
|
||||||
|
@ -386,9 +390,9 @@ namespace HsailISA
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (int k= 0; k < num_src_operands; ++k) {
|
for (int k= 0; k < num_src_operands; ++k) {
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
((CType*)m->d_data)[k * VSZ + lane] =
|
((CType*)m->d_data)[k * w->computeUnit->wfSize() + lane] =
|
||||||
this->src_vect[k].template get<CType>(w, lane);
|
this->src_vect[k].template get<CType>(w, lane);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -428,7 +432,7 @@ namespace HsailISA
|
||||||
// this is a complete hack to get around a compiler bug
|
// this is a complete hack to get around a compiler bug
|
||||||
// (the compiler currently generates global access for private
|
// (the compiler currently generates global access for private
|
||||||
// addresses (starting from 0). We need to add the private offset)
|
// addresses (starting from 0). We need to add the private offset)
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
if (m->addr[lane] < w->privSizePerItem) {
|
if (m->addr[lane] < w->privSizePerItem) {
|
||||||
|
|
||||||
|
@ -454,7 +458,7 @@ namespace HsailISA
|
||||||
m->pipeId = GLBMEM_PIPE;
|
m->pipeId = GLBMEM_PIPE;
|
||||||
m->latency.set(w->computeUnit->shader->ticks(1));
|
m->latency.set(w->computeUnit->shader->ticks(1));
|
||||||
{
|
{
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
assert(m->addr[lane] < w->spillSizePerItem);
|
assert(m->addr[lane] < w->spillSizePerItem);
|
||||||
|
|
||||||
|
@ -483,7 +487,7 @@ namespace HsailISA
|
||||||
m->pipeId = GLBMEM_PIPE;
|
m->pipeId = GLBMEM_PIPE;
|
||||||
m->latency.set(w->computeUnit->shader->ticks(1));
|
m->latency.set(w->computeUnit->shader->ticks(1));
|
||||||
{
|
{
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
assert(m->addr[lane] < w->privSizePerItem);
|
assert(m->addr[lane] < w->privSizePerItem);
|
||||||
m->addr[lane] = m->addr[lane] + lane *
|
m->addr[lane] = m->addr[lane] + lane *
|
||||||
|
@ -558,14 +562,14 @@ namespace HsailISA
|
||||||
|
|
||||||
this->addr.calcVector(w, m->addr);
|
this->addr.calcVector(w, m->addr);
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
((CType *)m->a_data)[lane] =
|
((CType *)m->a_data)[lane] =
|
||||||
this->src[0].template get<CType>(w, lane);
|
this->src[0].template get<CType>(w, lane);
|
||||||
}
|
}
|
||||||
|
|
||||||
// load second source operand for CAS
|
// load second source operand for CAS
|
||||||
if (NumSrcOperands > 1) {
|
if (NumSrcOperands > 1) {
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
((CType*)m->x_data)[lane] =
|
((CType*)m->x_data)[lane] =
|
||||||
this->src[1].template get<CType>(w, lane);
|
this->src[1].template get<CType>(w, lane);
|
||||||
}
|
}
|
||||||
|
|
|
@ -84,7 +84,7 @@ namespace HsailISA
|
||||||
int op = 0;
|
int op = 0;
|
||||||
bool got_op = false;
|
bool got_op = false;
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
int src_val0 = src1.get<int>(w, lane, 0);
|
int src_val0 = src1.get<int>(w, lane, 0);
|
||||||
if (got_op) {
|
if (got_op) {
|
||||||
|
@ -182,7 +182,7 @@ namespace HsailISA
|
||||||
{
|
{
|
||||||
#if TRACING_ON
|
#if TRACING_ON
|
||||||
const VectorMask &mask = w->get_pred();
|
const VectorMask &mask = w->get_pred();
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
int src_val1 = src1.get<int>(w, lane, 1);
|
int src_val1 = src1.get<int>(w, lane, 1);
|
||||||
int src_val2 = src1.get<int>(w, lane, 2);
|
int src_val2 = src1.get<int>(w, lane, 2);
|
||||||
|
@ -205,7 +205,7 @@ namespace HsailISA
|
||||||
{
|
{
|
||||||
#if TRACING_ON
|
#if TRACING_ON
|
||||||
const VectorMask &mask = w->get_pred();
|
const VectorMask &mask = w->get_pred();
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
|
int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
|
||||||
int src_val2 = src1.get<int>(w, lane, 2);
|
int src_val2 = src1.get<int>(w, lane, 2);
|
||||||
|
@ -231,7 +231,7 @@ namespace HsailISA
|
||||||
std::string res_str;
|
std::string res_str;
|
||||||
res_str = csprintf("krl_prt (%s)\n", disassemble());
|
res_str = csprintf("krl_prt (%s)\n", disassemble());
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (!(lane & 7)) {
|
if (!(lane & 7)) {
|
||||||
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
|
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
|
||||||
}
|
}
|
||||||
|
@ -270,7 +270,7 @@ namespace HsailISA
|
||||||
int src_val3 = -1;
|
int src_val3 = -1;
|
||||||
res_str = csprintf("krl_prt (%s)\n", disassemble());
|
res_str = csprintf("krl_prt (%s)\n", disassemble());
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (!(lane & 7)) {
|
if (!(lane & 7)) {
|
||||||
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
|
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
|
||||||
}
|
}
|
||||||
|
@ -311,7 +311,7 @@ namespace HsailISA
|
||||||
std::string res_str;
|
std::string res_str;
|
||||||
res_str = csprintf("krl_prt (%s)\n", disassemble());
|
res_str = csprintf("krl_prt (%s)\n", disassemble());
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (!(lane & 3)) {
|
if (!(lane & 3)) {
|
||||||
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
|
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
|
||||||
}
|
}
|
||||||
|
@ -350,7 +350,7 @@ namespace HsailISA
|
||||||
int src_val3 = -1;
|
int src_val3 = -1;
|
||||||
res_str = csprintf("krl_prt (%s)\n", disassemble());
|
res_str = csprintf("krl_prt (%s)\n", disassemble());
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (!(lane & 3)) {
|
if (!(lane & 3)) {
|
||||||
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
|
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
|
||||||
}
|
}
|
||||||
|
@ -391,7 +391,7 @@ namespace HsailISA
|
||||||
std::string res_str;
|
std::string res_str;
|
||||||
res_str = csprintf("krl_prt (%s)\n", disassemble());
|
res_str = csprintf("krl_prt (%s)\n", disassemble());
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (!(lane & 7)) {
|
if (!(lane & 7)) {
|
||||||
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
|
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
|
||||||
}
|
}
|
||||||
|
@ -430,7 +430,7 @@ namespace HsailISA
|
||||||
res_str += csprintf(" Executing on CU #%i\n", w->computeUnit->cu_id);
|
res_str += csprintf(" Executing on CU #%i\n", w->computeUnit->cu_id);
|
||||||
res_str += csprintf(" Exec mask: ");
|
res_str += csprintf(" Exec mask: ");
|
||||||
|
|
||||||
for (int i = VSZ - 1; i >= 0; --i) {
|
for (int i = w->computeUnit->wfSize() - 1; i >= 0; --i) {
|
||||||
if (w->execMask(i))
|
if (w->execMask(i))
|
||||||
res_str += "1";
|
res_str += "1";
|
||||||
else
|
else
|
||||||
|
@ -458,7 +458,7 @@ namespace HsailISA
|
||||||
const VectorMask &mask = w->get_pred();
|
const VectorMask &mask = w->get_pred();
|
||||||
int res = 0;
|
int res = 0;
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
int src_val1 = src1.get<int>(w, lane, 1);
|
int src_val1 = src1.get<int>(w, lane, 1);
|
||||||
dest.set<int>(w, lane, res);
|
dest.set<int>(w, lane, res);
|
||||||
|
@ -477,14 +477,14 @@ namespace HsailISA
|
||||||
const VectorMask &mask = w->get_pred();
|
const VectorMask &mask = w->get_pred();
|
||||||
int res = 0;
|
int res = 0;
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
int src_val1 = src1.get<int>(w, lane, 1);
|
int src_val1 = src1.get<int>(w, lane, 1);
|
||||||
res += src_val1;
|
res += src_val1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
dest.set<int>(w, lane, res);
|
dest.set<int>(w, lane, res);
|
||||||
}
|
}
|
||||||
|
@ -497,19 +497,19 @@ namespace HsailISA
|
||||||
const VectorMask &mask = w->get_pred();
|
const VectorMask &mask = w->get_pred();
|
||||||
int res = 0;
|
int res = 0;
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
int src_val1 = src1.get<int>(w, lane, 1);
|
int src_val1 = src1.get<int>(w, lane, 1);
|
||||||
|
|
||||||
if (src_val1) {
|
if (src_val1) {
|
||||||
if (lane < (VSZ/2)) {
|
if (lane < (w->computeUnit->wfSize()/2)) {
|
||||||
res = res | ((uint32_t)(1) << lane);
|
res = res | ((uint32_t)(1) << lane);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
dest.set<int>(w, lane, res);
|
dest.set<int>(w, lane, res);
|
||||||
}
|
}
|
||||||
|
@ -521,19 +521,20 @@ namespace HsailISA
|
||||||
{
|
{
|
||||||
const VectorMask &mask = w->get_pred();
|
const VectorMask &mask = w->get_pred();
|
||||||
int res = 0;
|
int res = 0;
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
int src_val1 = src1.get<int>(w, lane, 1);
|
int src_val1 = src1.get<int>(w, lane, 1);
|
||||||
|
|
||||||
if (src_val1) {
|
if (src_val1) {
|
||||||
if (lane >= (VSZ/2)) {
|
if (lane >= (w->computeUnit->wfSize()/2)) {
|
||||||
res = res | ((uint32_t)(1) << (lane - (VSZ/2)));
|
res = res | ((uint32_t)(1) <<
|
||||||
|
(lane - (w->computeUnit->wfSize()/2)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
dest.set<int>(w, lane, res);
|
dest.set<int>(w, lane, res);
|
||||||
}
|
}
|
||||||
|
@ -546,7 +547,7 @@ namespace HsailISA
|
||||||
const VectorMask &mask = w->get_pred();
|
const VectorMask &mask = w->get_pred();
|
||||||
int max_cnt = 0;
|
int max_cnt = 0;
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
w->bar_cnt[lane]++;
|
w->bar_cnt[lane]++;
|
||||||
|
|
||||||
|
@ -567,7 +568,7 @@ namespace HsailISA
|
||||||
const VectorMask &mask = w->get_pred();
|
const VectorMask &mask = w->get_pred();
|
||||||
int max_cnt = 0;
|
int max_cnt = 0;
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
w->bar_cnt[lane]--;
|
w->bar_cnt[lane]--;
|
||||||
}
|
}
|
||||||
|
@ -592,7 +593,7 @@ namespace HsailISA
|
||||||
{
|
{
|
||||||
const VectorMask &mask = w->get_pred();
|
const VectorMask &mask = w->get_pred();
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
int src_val1 = src1.get<int>(w, lane, 1);
|
int src_val1 = src1.get<int>(w, lane, 1);
|
||||||
panic("OpenCL Code failed assertion #%d. Triggered by lane %s",
|
panic("OpenCL Code failed assertion #%d. Triggered by lane %s",
|
||||||
|
@ -605,7 +606,7 @@ namespace HsailISA
|
||||||
Call::calcAddr(Wavefront *w, GPUDynInstPtr m)
|
Call::calcAddr(Wavefront *w, GPUDynInstPtr m)
|
||||||
{
|
{
|
||||||
// the address is in src1 | src2
|
// the address is in src1 | src2
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
int src_val1 = src1.get<int>(w, lane, 1);
|
int src_val1 = src1.get<int>(w, lane, 1);
|
||||||
int src_val2 = src1.get<int>(w, lane, 2);
|
int src_val2 = src1.get<int>(w, lane, 2);
|
||||||
Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2);
|
Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2);
|
||||||
|
@ -622,7 +623,7 @@ namespace HsailISA
|
||||||
|
|
||||||
calcAddr(w, m);
|
calcAddr(w, m);
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3);
|
((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -661,7 +662,7 @@ namespace HsailISA
|
||||||
GPUDynInstPtr m = gpuDynInst;
|
GPUDynInstPtr m = gpuDynInst;
|
||||||
calcAddr(w, m);
|
calcAddr(w, m);
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1);
|
((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -736,7 +737,7 @@ namespace HsailISA
|
||||||
const VectorMask &mask = w->get_pred();
|
const VectorMask &mask = w->get_pred();
|
||||||
int src_val1 = 0;
|
int src_val1 = 0;
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
src_val1 = src1.get<int>(w, lane, 1);
|
src_val1 = src1.get<int>(w, lane, 1);
|
||||||
break;
|
break;
|
||||||
|
@ -758,7 +759,7 @@ namespace HsailISA
|
||||||
const VectorMask &mask = w->get_pred();
|
const VectorMask &mask = w->get_pred();
|
||||||
unsigned mst = true;
|
unsigned mst = true;
|
||||||
|
|
||||||
for (int lane = VSZ - 1; lane >= 0; --lane) {
|
for (int lane = w->computeUnit->wfSize() - 1; lane >= 0; --lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
dest.set<int>(w, lane, mst);
|
dest.set<int>(w, lane, mst);
|
||||||
mst = false;
|
mst = false;
|
||||||
|
@ -773,7 +774,7 @@ namespace HsailISA
|
||||||
int res = 0;
|
int res = 0;
|
||||||
bool got_res = false;
|
bool got_res = false;
|
||||||
|
|
||||||
for (int lane = VSZ - 1; lane >= 0; --lane) {
|
for (int lane = w->computeUnit->wfSize() - 1; lane >= 0; --lane) {
|
||||||
if (mask[lane]) {
|
if (mask[lane]) {
|
||||||
if (!got_res) {
|
if (!got_res) {
|
||||||
res = src1.get<int>(w, lane, 1);
|
res = src1.get<int>(w, lane, 1);
|
||||||
|
|
|
@ -42,6 +42,7 @@
|
||||||
* Defines classes encapsulating HSAIL instruction operands.
|
* Defines classes encapsulating HSAIL instruction operands.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include <limits>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "arch/hsail/Brig.h"
|
#include "arch/hsail/Brig.h"
|
||||||
|
@ -346,6 +347,8 @@ class CRegOperand : public BaseRegOperand
|
||||||
template<typename T>
|
template<typename T>
|
||||||
class ImmOperand : public BaseOperand
|
class ImmOperand : public BaseOperand
|
||||||
{
|
{
|
||||||
|
private:
|
||||||
|
uint16_t kind;
|
||||||
public:
|
public:
|
||||||
T bits;
|
T bits;
|
||||||
|
|
||||||
|
@ -355,11 +358,21 @@ class ImmOperand : public BaseOperand
|
||||||
|
|
||||||
template<typename OperandType>
|
template<typename OperandType>
|
||||||
OperandType
|
OperandType
|
||||||
get()
|
get(Wavefront *w)
|
||||||
{
|
{
|
||||||
assert(sizeof(OperandType) <= sizeof(T));
|
assert(sizeof(OperandType) <= sizeof(T));
|
||||||
|
panic_if(w == nullptr, "WF pointer needs to be set");
|
||||||
|
|
||||||
return *(OperandType*)&bits;
|
switch (kind) {
|
||||||
|
// immediate operand is WF size
|
||||||
|
case Brig::BRIG_KIND_OPERAND_WAVESIZE:
|
||||||
|
return (OperandType)w->computeUnit->wfSize();
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
return *(OperandType*)&bits;
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// This version of get() takes a WF* and a lane id for
|
// This version of get() takes a WF* and a lane id for
|
||||||
|
@ -368,7 +381,7 @@ class ImmOperand : public BaseOperand
|
||||||
OperandType
|
OperandType
|
||||||
get(Wavefront *w, int lane)
|
get(Wavefront *w, int lane)
|
||||||
{
|
{
|
||||||
return get<OperandType>();
|
return get<OperandType>(w);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -388,16 +401,18 @@ ImmOperand<T>::init(unsigned opOffset, const BrigObject *obj)
|
||||||
auto cbptr = (Brig::BrigOperandConstantBytes*)brigOp;
|
auto cbptr = (Brig::BrigOperandConstantBytes*)brigOp;
|
||||||
|
|
||||||
bits = *((T*)(obj->getData(cbptr->bytes + 4)));
|
bits = *((T*)(obj->getData(cbptr->bytes + 4)));
|
||||||
|
kind = brigOp->kind;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case Brig::BRIG_KIND_OPERAND_WAVESIZE:
|
case Brig::BRIG_KIND_OPERAND_WAVESIZE:
|
||||||
bits = VSZ;
|
kind = brigOp->kind;
|
||||||
|
bits = std::numeric_limits<unsigned long long>::digits;
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
|
kind = Brig::BRIG_KIND_NONE;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -409,6 +424,7 @@ ImmOperand<T>::init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
|
||||||
const Brig::BrigOperand *brigOp = obj->getOperand(opOffset);
|
const Brig::BrigOperand *brigOp = obj->getOperand(opOffset);
|
||||||
|
|
||||||
if (brigOp->kind != Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
|
if (brigOp->kind != Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
|
||||||
|
kind = Brig::BRIG_KIND_NONE;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -423,6 +439,7 @@ ImmOperand<T>::init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
|
||||||
(const Brig::BrigOperand *)obj->getOperand(*data_offset);
|
(const Brig::BrigOperand *)obj->getOperand(*data_offset);
|
||||||
|
|
||||||
if (p->kind != Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
|
if (p->kind != Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
|
||||||
|
kind = Brig::BRIG_KIND_NONE;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -456,7 +473,7 @@ class RegOrImmOperand : public BaseOperand
|
||||||
OperandType
|
OperandType
|
||||||
get(Wavefront *w, int lane)
|
get(Wavefront *w, int lane)
|
||||||
{
|
{
|
||||||
return is_imm ? imm_op.template get<OperandType>() :
|
return is_imm ? imm_op.template get<OperandType>(w) :
|
||||||
reg_op.template get<OperandType>(w, lane);
|
reg_op.template get<OperandType>(w, lane);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -571,7 +588,7 @@ class AddrOperandBase : public BaseOperand
|
||||||
uint64_t calcUniformBase();
|
uint64_t calcUniformBase();
|
||||||
|
|
||||||
public:
|
public:
|
||||||
virtual void calcVector(Wavefront *w, uint64_t *addrVec) = 0;
|
virtual void calcVector(Wavefront *w, std::vector<Addr> &addrVec) = 0;
|
||||||
virtual uint64_t calcLane(Wavefront *w, int lane=0) = 0;
|
virtual uint64_t calcLane(Wavefront *w, int lane=0) = 0;
|
||||||
|
|
||||||
uint64_t offset;
|
uint64_t offset;
|
||||||
|
@ -586,7 +603,7 @@ class RegAddrOperand : public AddrOperandBase
|
||||||
RegOperandType reg;
|
RegOperandType reg;
|
||||||
void init(unsigned opOffset, const BrigObject *obj);
|
void init(unsigned opOffset, const BrigObject *obj);
|
||||||
uint64_t calcUniform();
|
uint64_t calcUniform();
|
||||||
void calcVector(Wavefront *w, uint64_t *addrVec);
|
void calcVector(Wavefront *w, std::vector<Addr> &addrVec);
|
||||||
uint64_t calcLane(Wavefront *w, int lane=0);
|
uint64_t calcLane(Wavefront *w, int lane=0);
|
||||||
uint32_t opSize() { return reg.opSize(); }
|
uint32_t opSize() { return reg.opSize(); }
|
||||||
bool isVectorRegister() { return reg.registerType == Enums::RT_VECTOR; }
|
bool isVectorRegister() { return reg.registerType == Enums::RT_VECTOR; }
|
||||||
|
@ -641,11 +658,12 @@ RegAddrOperand<RegOperandType>::calcUniform()
|
||||||
|
|
||||||
template<typename RegOperandType>
|
template<typename RegOperandType>
|
||||||
void
|
void
|
||||||
RegAddrOperand<RegOperandType>::calcVector(Wavefront *w, uint64_t *addrVec)
|
RegAddrOperand<RegOperandType>::calcVector(Wavefront *w,
|
||||||
|
std::vector<Addr> &addrVec)
|
||||||
{
|
{
|
||||||
Addr address = calcUniformBase();
|
Addr address = calcUniformBase();
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane) {
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
|
||||||
if (w->execMask(lane)) {
|
if (w->execMask(lane)) {
|
||||||
if (reg.regFileChar == 's') {
|
if (reg.regFileChar == 's') {
|
||||||
addrVec[lane] = address + reg.template get<uint32_t>(w, lane);
|
addrVec[lane] = address + reg.template get<uint32_t>(w, lane);
|
||||||
|
@ -680,7 +698,7 @@ class NoRegAddrOperand : public AddrOperandBase
|
||||||
public:
|
public:
|
||||||
void init(unsigned opOffset, const BrigObject *obj);
|
void init(unsigned opOffset, const BrigObject *obj);
|
||||||
uint64_t calcUniform();
|
uint64_t calcUniform();
|
||||||
void calcVector(Wavefront *w, uint64_t *addrVec);
|
void calcVector(Wavefront *w, std::vector<Addr> &addrVec);
|
||||||
uint64_t calcLane(Wavefront *w, int lane=0);
|
uint64_t calcLane(Wavefront *w, int lane=0);
|
||||||
std::string disassemble();
|
std::string disassemble();
|
||||||
};
|
};
|
||||||
|
@ -698,11 +716,11 @@ NoRegAddrOperand::calcLane(Wavefront *w, int lane)
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void
|
inline void
|
||||||
NoRegAddrOperand::calcVector(Wavefront *w, uint64_t *addrVec)
|
NoRegAddrOperand::calcVector(Wavefront *w, std::vector<Addr> &addrVec)
|
||||||
{
|
{
|
||||||
uint64_t address = calcUniformBase();
|
uint64_t address = calcUniformBase();
|
||||||
|
|
||||||
for (int lane = 0; lane < VSZ; ++lane)
|
for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane)
|
||||||
addrVec[lane] = address;
|
addrVec[lane] = address;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -59,6 +59,7 @@ class VectorRegisterFile(SimObject):
|
||||||
|
|
||||||
simd_id = Param.Int(0, 'SIMD ID associated with this VRF')
|
simd_id = Param.Int(0, 'SIMD ID associated with this VRF')
|
||||||
num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD')
|
num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD')
|
||||||
|
wfSize = Param.Int(64, 'Wavefront size (in work items)')
|
||||||
min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
|
min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
|
||||||
|
|
||||||
class Wavefront(SimObject):
|
class Wavefront(SimObject):
|
||||||
|
@ -68,6 +69,7 @@ class Wavefront(SimObject):
|
||||||
|
|
||||||
simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
|
simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
|
||||||
wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')
|
wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')
|
||||||
|
wfSize = Param.Int(64, 'Wavefront size (in work items)')
|
||||||
|
|
||||||
class ComputeUnit(MemObject):
|
class ComputeUnit(MemObject):
|
||||||
type = 'ComputeUnit'
|
type = 'ComputeUnit'
|
||||||
|
|
|
@ -238,7 +238,7 @@ ClDriver::ioctl(LiveProcess *process, ThreadContext *tc, unsigned req)
|
||||||
case HSA_GET_VSZ:
|
case HSA_GET_VSZ:
|
||||||
{
|
{
|
||||||
BufferArg buf(buf_addr, sizeof(uint32_t));
|
BufferArg buf(buf_addr, sizeof(uint32_t));
|
||||||
*((uint32_t*)buf.bufferPtr()) = VSZ;
|
*((uint32_t*)buf.bufferPtr()) = dispatcher->wfSize();
|
||||||
buf.copyOut(tc->getMemProxy());
|
buf.copyOut(tc->getMemProxy());
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -32,9 +32,10 @@
|
||||||
*
|
*
|
||||||
* Author: John Kalamatianos, Anthony Gutierrez
|
* Author: John Kalamatianos, Anthony Gutierrez
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "gpu-compute/compute_unit.hh"
|
#include "gpu-compute/compute_unit.hh"
|
||||||
|
|
||||||
|
#include <limits>
|
||||||
|
|
||||||
#include "base/output.hh"
|
#include "base/output.hh"
|
||||||
#include "debug/GPUDisp.hh"
|
#include "debug/GPUDisp.hh"
|
||||||
#include "debug/GPUExec.hh"
|
#include "debug/GPUExec.hh"
|
||||||
|
@ -76,14 +77,27 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
|
||||||
_masterId(p->system->getMasterId(name() + ".ComputeUnit")),
|
_masterId(p->system->getMasterId(name() + ".ComputeUnit")),
|
||||||
lds(*p->localDataStore), globalSeqNum(0), wavefrontSize(p->wfSize)
|
lds(*p->localDataStore), globalSeqNum(0), wavefrontSize(p->wfSize)
|
||||||
{
|
{
|
||||||
// this check will be eliminated once we have wavefront size support added
|
/**
|
||||||
fatal_if(p->wfSize != VSZ, "Wavefront size parameter does not match VSZ");
|
* This check is necessary because std::bitset only provides conversion
|
||||||
|
* to unsigned long or unsigned long long via to_ulong() or to_ullong().
|
||||||
|
* there are a few places in the code where to_ullong() is used, however
|
||||||
|
* if the wavefront size is larger than the host can support then bitset will
|
||||||
|
* throw a runtime exception. we should remove all use of to_ulong() or
|
||||||
|
* to_ullong() so we can have a wavefront size greater than 64, however until that is
|
||||||
|
* done this check is required.
|
||||||
|
*/
|
||||||
|
fatal_if(p->wfSize > std::numeric_limits<unsigned long long>::digits ||
|
||||||
|
p->wfSize <= 0,
|
||||||
|
"WF size is larger than the host can support");
|
||||||
|
fatal_if(!isPowerOf2(wavefrontSize),
|
||||||
|
"Wavefront size should be a power of 2");
|
||||||
// calculate how many cycles a vector load or store will need to transfer
|
// calculate how many cycles a vector load or store will need to transfer
|
||||||
// its data over the corresponding buses
|
// its data over the corresponding buses
|
||||||
numCyclesPerStoreTransfer = (uint32_t)ceil((double)(VSZ * sizeof(uint32_t))
|
numCyclesPerStoreTransfer =
|
||||||
/ (double)vrfToCoalescerBusWidth);
|
(uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
|
||||||
|
(double)vrfToCoalescerBusWidth);
|
||||||
|
|
||||||
numCyclesPerLoadTransfer = (VSZ * sizeof(uint32_t))
|
numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
|
||||||
/ coalescerToVrfBusWidth;
|
/ coalescerToVrfBusWidth;
|
||||||
|
|
||||||
lastVaddrWF.resize(numSIMDs);
|
lastVaddrWF.resize(numSIMDs);
|
||||||
|
@ -93,24 +107,24 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
|
||||||
lastVaddrWF[j].resize(p->n_wf);
|
lastVaddrWF[j].resize(p->n_wf);
|
||||||
|
|
||||||
for (int i = 0; i < p->n_wf; ++i) {
|
for (int i = 0; i < p->n_wf; ++i) {
|
||||||
lastVaddrWF[j][i].resize(VSZ);
|
lastVaddrWF[j][i].resize(wfSize());
|
||||||
|
|
||||||
wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
|
wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
|
||||||
wfList[j][i]->setParent(this);
|
wfList[j][i]->setParent(this);
|
||||||
|
|
||||||
for (int k = 0; k < VSZ; ++k) {
|
for (int k = 0; k < wfSize(); ++k) {
|
||||||
lastVaddrWF[j][i][k] = 0;
|
lastVaddrWF[j][i][k] = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
lastVaddrPhase.resize(numSIMDs);
|
lastVaddrSimd.resize(numSIMDs);
|
||||||
|
|
||||||
for (int i = 0; i < numSIMDs; ++i) {
|
for (int i = 0; i < numSIMDs; ++i) {
|
||||||
lastVaddrPhase[i] = LastVaddrWave();
|
lastVaddrSimd[i].resize(wfSize(), 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
lastVaddrCU = LastVaddrWave();
|
lastVaddrCU.resize(wfSize());
|
||||||
|
|
||||||
lds.setParent(this);
|
lds.setParent(this);
|
||||||
|
|
||||||
|
@ -122,10 +136,10 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
|
||||||
fatal("Invalid WF execution policy (CU)\n");
|
fatal("Invalid WF execution policy (CU)\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
memPort.resize(VSZ);
|
memPort.resize(wfSize());
|
||||||
|
|
||||||
// resize the tlbPort vectorArray
|
// resize the tlbPort vectorArray
|
||||||
int tlbPort_width = perLaneTLB ? VSZ : 1;
|
int tlbPort_width = perLaneTLB ? wfSize() : 1;
|
||||||
tlbPort.resize(tlbPort_width);
|
tlbPort.resize(tlbPort_width);
|
||||||
|
|
||||||
cuExitCallback = new CUExitCallback(this);
|
cuExitCallback = new CUExitCallback(this);
|
||||||
|
@ -144,12 +158,13 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
|
||||||
ComputeUnit::~ComputeUnit()
|
ComputeUnit::~ComputeUnit()
|
||||||
{
|
{
|
||||||
// Delete wavefront slots
|
// Delete wavefront slots
|
||||||
|
for (int j = 0; j < numSIMDs; ++j) {
|
||||||
for (int j = 0; j < numSIMDs; ++j)
|
|
||||||
for (int i = 0; i < shader->n_wf; ++i) {
|
for (int i = 0; i < shader->n_wf; ++i) {
|
||||||
delete wfList[j][i];
|
delete wfList[j][i];
|
||||||
}
|
}
|
||||||
|
lastVaddrSimd[j].clear();
|
||||||
|
}
|
||||||
|
lastVaddrCU.clear();
|
||||||
readyList.clear();
|
readyList.clear();
|
||||||
waveStatusList.clear();
|
waveStatusList.clear();
|
||||||
dispatchList.clear();
|
dispatchList.clear();
|
||||||
|
@ -187,27 +202,25 @@ ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
|
||||||
VectorMask init_mask;
|
VectorMask init_mask;
|
||||||
init_mask.reset();
|
init_mask.reset();
|
||||||
|
|
||||||
for (int k = 0; k < VSZ; ++k) {
|
for (int k = 0; k < wfSize(); ++k) {
|
||||||
if (k + cnt * VSZ < trueWgSizeTotal)
|
if (k + cnt * wfSize() < trueWgSizeTotal)
|
||||||
init_mask[k] = 1;
|
init_mask[k] = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
wfCtx->init_mask = init_mask.to_ullong();
|
wfCtx->init_mask = init_mask.to_ullong();
|
||||||
wfCtx->exec_mask = init_mask.to_ullong();
|
wfCtx->exec_mask = init_mask.to_ullong();
|
||||||
|
|
||||||
for (int i = 0; i < VSZ; ++i) {
|
wfCtx->bar_cnt.resize(wfSize(), 0);
|
||||||
wfCtx->bar_cnt[i] = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
wfCtx->max_bar_cnt = 0;
|
wfCtx->max_bar_cnt = 0;
|
||||||
wfCtx->old_barrier_cnt = 0;
|
wfCtx->old_barrier_cnt = 0;
|
||||||
wfCtx->barrier_cnt = 0;
|
wfCtx->barrier_cnt = 0;
|
||||||
|
|
||||||
wfCtx->privBase = ndr->q.privMemStart;
|
wfCtx->privBase = ndr->q.privMemStart;
|
||||||
ndr->q.privMemStart += ndr->q.privMemPerItem * VSZ;
|
ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
|
||||||
|
|
||||||
wfCtx->spillBase = ndr->q.spillMemStart;
|
wfCtx->spillBase = ndr->q.spillMemStart;
|
||||||
ndr->q.spillMemStart += ndr->q.spillMemPerItem * VSZ;
|
ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
|
||||||
|
|
||||||
wfCtx->pc = 0;
|
wfCtx->pc = 0;
|
||||||
wfCtx->rpc = UINT32_MAX;
|
wfCtx->rpc = UINT32_MAX;
|
||||||
|
@ -265,10 +278,12 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
|
||||||
w->dynwaveid = cnt;
|
w->dynwaveid = cnt;
|
||||||
w->init_mask = wfCtx->init_mask;
|
w->init_mask = wfCtx->init_mask;
|
||||||
|
|
||||||
for (int k = 0; k < VSZ; ++k) {
|
for (int k = 0; k < wfSize(); ++k) {
|
||||||
w->workitemid[0][k] = (k+cnt*VSZ) % trueWgSize[0];
|
w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0];
|
||||||
w->workitemid[1][k] = ((k + cnt * VSZ) / trueWgSize[0]) % trueWgSize[1];
|
w->workitemid[1][k] =
|
||||||
w->workitemid[2][k] = (k + cnt * VSZ) / (trueWgSize[0] * trueWgSize[1]);
|
((k + cnt * wfSize()) / trueWgSize[0]) % trueWgSize[1];
|
||||||
|
w->workitemid[2][k] =
|
||||||
|
(k + cnt * wfSize()) / (trueWgSize[0] * trueWgSize[1]);
|
||||||
|
|
||||||
w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] *
|
w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] *
|
||||||
trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] +
|
trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] +
|
||||||
|
@ -277,9 +292,9 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
|
||||||
|
|
||||||
w->old_barrier_cnt = wfCtx->old_barrier_cnt;
|
w->old_barrier_cnt = wfCtx->old_barrier_cnt;
|
||||||
w->barrier_cnt = wfCtx->barrier_cnt;
|
w->barrier_cnt = wfCtx->barrier_cnt;
|
||||||
w->barrier_slots = divCeil(trueWgSizeTotal, VSZ);
|
w->barrier_slots = divCeil(trueWgSizeTotal, wfSize());
|
||||||
|
|
||||||
for (int i = 0; i < VSZ; ++i) {
|
for (int i = 0; i < wfSize(); ++i) {
|
||||||
w->bar_cnt[i] = wfCtx->bar_cnt[i];
|
w->bar_cnt[i] = wfCtx->bar_cnt[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -315,16 +330,17 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
|
||||||
// is this the last wavefront in the workgroup
|
// is this the last wavefront in the workgroup
|
||||||
// if set the spillWidth to be the remaining work-items
|
// if set the spillWidth to be the remaining work-items
|
||||||
// so that the vector access is correct
|
// so that the vector access is correct
|
||||||
if ((cnt + 1) * VSZ >= trueWgSizeTotal) {
|
if ((cnt + 1) * wfSize() >= trueWgSizeTotal) {
|
||||||
w->spillWidth = trueWgSizeTotal - (cnt * VSZ);
|
w->spillWidth = trueWgSizeTotal - (cnt * wfSize());
|
||||||
} else {
|
} else {
|
||||||
w->spillWidth = VSZ;
|
w->spillWidth = wfSize();
|
||||||
}
|
}
|
||||||
|
|
||||||
DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
|
DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
|
||||||
"WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
|
"WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
|
||||||
|
|
||||||
w->start(++_n_wave, ndr->q.code_ptr);
|
w->start(++_n_wave, ndr->q.code_ptr);
|
||||||
|
wfCtx->bar_cnt.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
@ -339,7 +355,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
|
||||||
// Send L1 cache acquire
|
// Send L1 cache acquire
|
||||||
// isKernel + isAcquire = Kernel Begin
|
// isKernel + isAcquire = Kernel Begin
|
||||||
if (shader->impl_kern_boundary_sync) {
|
if (shader->impl_kern_boundary_sync) {
|
||||||
GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(nullptr,
|
GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(this,
|
||||||
nullptr,
|
nullptr,
|
||||||
nullptr, 0);
|
nullptr, 0);
|
||||||
|
|
||||||
|
@ -374,7 +390,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
|
||||||
if (w->status == Wavefront::S_STOPPED) {
|
if (w->status == Wavefront::S_STOPPED) {
|
||||||
// if we have scheduled all work items then stop
|
// if we have scheduled all work items then stop
|
||||||
// scheduling wavefronts
|
// scheduling wavefronts
|
||||||
if (cnt * VSZ >= trueWgSizeTotal)
|
if (cnt * wfSize() >= trueWgSizeTotal)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
// reserve vector registers for the scheduled wavefront
|
// reserve vector registers for the scheduled wavefront
|
||||||
|
@ -420,7 +436,7 @@ ComputeUnit::ReadyWorkgroup(NDRange *ndr)
|
||||||
// work item of the work group
|
// work item of the work group
|
||||||
int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
|
int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
|
||||||
bool vregAvail = true;
|
bool vregAvail = true;
|
||||||
int numWfs = (trueWgSizeTotal + VSZ - 1) / VSZ;
|
int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
|
||||||
int freeWfSlots = 0;
|
int freeWfSlots = 0;
|
||||||
// check if the total number of VGPRs required by all WFs of the WG
|
// check if the total number of VGPRs required by all WFs of the WG
|
||||||
// fit in the VRFs of all SIMD units
|
// fit in the VRFs of all SIMD units
|
||||||
|
@ -623,7 +639,7 @@ ComputeUnit::init()
|
||||||
// Setup space for call args
|
// Setup space for call args
|
||||||
for (int j = 0; j < numSIMDs; ++j) {
|
for (int j = 0; j < numSIMDs; ++j) {
|
||||||
for (int i = 0; i < shader->n_wf; ++i) {
|
for (int i = 0; i < shader->n_wf; ++i) {
|
||||||
wfList[j][i]->initCallArgMem(shader->funcargs_size);
|
wfList[j][i]->initCallArgMem(shader->funcargs_size, wavefrontSize);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1193,15 +1209,15 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
|
||||||
Addr last = 0;
|
Addr last = 0;
|
||||||
|
|
||||||
switch(computeUnit->prefetchType) {
|
switch(computeUnit->prefetchType) {
|
||||||
case Enums::PF_CU:
|
case Enums::PF_CU:
|
||||||
last = computeUnit->lastVaddrCU[mp_index];
|
last = computeUnit->lastVaddrCU[mp_index];
|
||||||
break;
|
break;
|
||||||
case Enums::PF_PHASE:
|
case Enums::PF_PHASE:
|
||||||
last = computeUnit->lastVaddrPhase[simdId][mp_index];
|
last = computeUnit->lastVaddrSimd[simdId][mp_index];
|
||||||
break;
|
break;
|
||||||
case Enums::PF_WF:
|
case Enums::PF_WF:
|
||||||
last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
|
last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1215,7 +1231,7 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
|
||||||
DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
|
DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
|
||||||
|
|
||||||
computeUnit->lastVaddrCU[mp_index] = vaddr;
|
computeUnit->lastVaddrCU[mp_index] = vaddr;
|
||||||
computeUnit->lastVaddrPhase[simdId][mp_index] = vaddr;
|
computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
|
||||||
computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
|
computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
|
||||||
|
|
||||||
stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
|
stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
|
||||||
|
@ -1488,7 +1504,7 @@ ComputeUnit::regStats()
|
||||||
;
|
;
|
||||||
|
|
||||||
ldsBankConflictDist
|
ldsBankConflictDist
|
||||||
.init(0, VSZ, 2)
|
.init(0, wfSize(), 2)
|
||||||
.name(name() + ".lds_bank_conflicts")
|
.name(name() + ".lds_bank_conflicts")
|
||||||
.desc("Number of bank conflicts per LDS memory packet")
|
.desc("Number of bank conflicts per LDS memory packet")
|
||||||
;
|
;
|
||||||
|
@ -1499,27 +1515,28 @@ ComputeUnit::regStats()
|
||||||
;
|
;
|
||||||
|
|
||||||
pageDivergenceDist
|
pageDivergenceDist
|
||||||
// A wavefront can touch 1 to VSZ pages per memory instruction.
|
// A wavefront can touch up to N pages per memory instruction where
|
||||||
// The number of pages per bin can be configured (here it's 4).
|
// N is equal to the wavefront size
|
||||||
.init(1, VSZ, 4)
|
// The number of pages per bin can be configured (here it's 4).
|
||||||
|
.init(1, wfSize(), 4)
|
||||||
.name(name() + ".page_divergence_dist")
|
.name(name() + ".page_divergence_dist")
|
||||||
.desc("pages touched per wf (over all mem. instr.)")
|
.desc("pages touched per wf (over all mem. instr.)")
|
||||||
;
|
;
|
||||||
|
|
||||||
controlFlowDivergenceDist
|
controlFlowDivergenceDist
|
||||||
.init(1, VSZ, 4)
|
.init(1, wfSize(), 4)
|
||||||
.name(name() + ".warp_execution_dist")
|
.name(name() + ".warp_execution_dist")
|
||||||
.desc("number of lanes active per instruction (oval all instructions)")
|
.desc("number of lanes active per instruction (oval all instructions)")
|
||||||
;
|
;
|
||||||
|
|
||||||
activeLanesPerGMemInstrDist
|
activeLanesPerGMemInstrDist
|
||||||
.init(1, VSZ, 4)
|
.init(1, wfSize(), 4)
|
||||||
.name(name() + ".gmem_lanes_execution_dist")
|
.name(name() + ".gmem_lanes_execution_dist")
|
||||||
.desc("number of active lanes per global memory instruction")
|
.desc("number of active lanes per global memory instruction")
|
||||||
;
|
;
|
||||||
|
|
||||||
activeLanesPerLMemInstrDist
|
activeLanesPerLMemInstrDist
|
||||||
.init(1, VSZ, 4)
|
.init(1, wfSize(), 4)
|
||||||
.name(name() + ".lmem_lanes_execution_dist")
|
.name(name() + ".lmem_lanes_execution_dist")
|
||||||
.desc("number of active lanes per local memory instruction")
|
.desc("number of active lanes per local memory instruction")
|
||||||
;
|
;
|
||||||
|
@ -1531,7 +1548,7 @@ ComputeUnit::regStats()
|
||||||
|
|
||||||
numVecOpsExecuted
|
numVecOpsExecuted
|
||||||
.name(name() + ".num_vec_ops_executed")
|
.name(name() + ".num_vec_ops_executed")
|
||||||
.desc("number of vec ops executed (e.g. VSZ/inst)")
|
.desc("number of vec ops executed (e.g. WF size/inst)")
|
||||||
;
|
;
|
||||||
|
|
||||||
totalCycles
|
totalCycles
|
||||||
|
|
|
@ -161,22 +161,8 @@ class ComputeUnit : public MemObject
|
||||||
// if fixed-stride prefetching, this is the stride.
|
// if fixed-stride prefetching, this is the stride.
|
||||||
int prefetchStride;
|
int prefetchStride;
|
||||||
|
|
||||||
class LastVaddrWave
|
std::vector<Addr> lastVaddrCU;
|
||||||
{
|
std::vector<std::vector<Addr>> lastVaddrSimd;
|
||||||
public:
|
|
||||||
Addr vaddrs[VSZ];
|
|
||||||
Addr& operator[](int idx) {
|
|
||||||
return vaddrs[idx];
|
|
||||||
}
|
|
||||||
|
|
||||||
LastVaddrWave() {
|
|
||||||
for (int i = 0; i < VSZ; ++i)
|
|
||||||
vaddrs[i] = 0;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
LastVaddrWave lastVaddrCU;
|
|
||||||
std::vector<LastVaddrWave> lastVaddrPhase;
|
|
||||||
std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
|
std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
|
||||||
Enums::PrefetchType prefetchType;
|
Enums::PrefetchType prefetchType;
|
||||||
EXEC_POLICY exec_policy;
|
EXEC_POLICY exec_policy;
|
||||||
|
|
|
@ -387,6 +387,12 @@ GpuDispatcher::getNumCUs()
|
||||||
return shader->cuList.size();
|
return shader->cuList.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
GpuDispatcher::wfSize() const
|
||||||
|
{
|
||||||
|
return shader->cuList[0]->wfSize();
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
GpuDispatcher::setFuncargsSize(int funcargs_size)
|
GpuDispatcher::setFuncargsSize(int funcargs_size)
|
||||||
{
|
{
|
||||||
|
|
|
@ -157,6 +157,7 @@ class GpuDispatcher : public DmaDevice
|
||||||
|
|
||||||
// helper functions to retrieve/set GPU attributes
|
// helper functions to retrieve/set GPU attributes
|
||||||
int getNumCUs();
|
int getNumCUs();
|
||||||
|
int wfSize() const;
|
||||||
void setFuncargsSize(int funcargs_size);
|
void setFuncargsSize(int funcargs_size);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -179,9 +179,9 @@ GlobalMemPipeline::doGmReturn(GPUDynInstPtr m)
|
||||||
int physVgpr = w->remap(dst, sizeof(c0), 1);
|
int physVgpr = w->remap(dst, sizeof(c0), 1);
|
||||||
// save the physical VGPR index
|
// save the physical VGPR index
|
||||||
regVec.push_back(physVgpr);
|
regVec.push_back(physVgpr);
|
||||||
c1 *p1 = &((c1*)m->d_data)[k * VSZ];
|
c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
|
||||||
|
|
||||||
for (int i = 0; i < VSZ; ++i) {
|
for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
|
||||||
if (m->exec_mask[i]) {
|
if (m->exec_mask[i]) {
|
||||||
DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
|
DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
|
||||||
"$%s%d <- %d global ld done (src = wavefront "
|
"$%s%d <- %d global ld done (src = wavefront "
|
||||||
|
|
|
@ -42,11 +42,29 @@
|
||||||
|
|
||||||
GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
|
GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
|
||||||
GPUStaticInst *_staticInst, uint64_t instSeqNum)
|
GPUStaticInst *_staticInst, uint64_t instSeqNum)
|
||||||
: GPUExecContext(_cu, _wf), m_op(Enums::MO_UNDEF),
|
: GPUExecContext(_cu, _wf), addr(computeUnit()->wfSize(), (Addr)0),
|
||||||
|
m_op(Enums::MO_UNDEF),
|
||||||
memoryOrder(Enums::MEMORY_ORDER_NONE), n_reg(0), useContinuation(false),
|
memoryOrder(Enums::MEMORY_ORDER_NONE), n_reg(0), useContinuation(false),
|
||||||
statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum)
|
statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum)
|
||||||
{
|
{
|
||||||
tlbHitLevel.assign(VSZ, -1);
|
tlbHitLevel.assign(computeUnit()->wfSize(), -1);
|
||||||
|
d_data = new uint8_t[computeUnit()->wfSize() * 16];
|
||||||
|
a_data = new uint8_t[computeUnit()->wfSize() * 8];
|
||||||
|
x_data = new uint8_t[computeUnit()->wfSize() * 8];
|
||||||
|
for (int i = 0; i < (computeUnit()->wfSize() * 8); ++i) {
|
||||||
|
a_data[i] = 0;
|
||||||
|
x_data[i] = 0;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < (computeUnit()->wfSize() * 16); ++i) {
|
||||||
|
d_data[i] = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
GPUDynInst::~GPUDynInst()
|
||||||
|
{
|
||||||
|
delete[] d_data;
|
||||||
|
delete[] a_data;
|
||||||
|
delete[] x_data;
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
|
|
@ -205,7 +205,7 @@ class GPUDynInst : public GPUExecContext
|
||||||
public:
|
public:
|
||||||
GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst,
|
GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst,
|
||||||
uint64_t instSeqNum);
|
uint64_t instSeqNum);
|
||||||
|
~GPUDynInst();
|
||||||
void execute();
|
void execute();
|
||||||
int numSrcRegOperands();
|
int numSrcRegOperands();
|
||||||
int numDstRegOperands();
|
int numDstRegOperands();
|
||||||
|
@ -226,15 +226,15 @@ class GPUDynInst : public GPUExecContext
|
||||||
Enums::StorageClassType executedAs();
|
Enums::StorageClassType executedAs();
|
||||||
|
|
||||||
// The address of the memory operation
|
// The address of the memory operation
|
||||||
Addr addr[VSZ];
|
std::vector<Addr> addr;
|
||||||
Addr pAddr;
|
Addr pAddr;
|
||||||
|
|
||||||
// The data to get written
|
// The data to get written
|
||||||
uint8_t d_data[VSZ * 16];
|
uint8_t *d_data;
|
||||||
// Additional data (for atomics)
|
// Additional data (for atomics)
|
||||||
uint8_t a_data[VSZ * 8];
|
uint8_t *a_data;
|
||||||
// Additional data (for atomics)
|
// Additional data (for atomics)
|
||||||
uint8_t x_data[VSZ * 8];
|
uint8_t *x_data;
|
||||||
// The execution mask
|
// The execution mask
|
||||||
VectorMask exec_mask;
|
VectorMask exec_mask;
|
||||||
|
|
||||||
|
|
|
@ -148,9 +148,9 @@ LocalMemPipeline::doSmReturn(GPUDynInstPtr m)
|
||||||
int physVgpr = w->remap(dst,sizeof(c0),1);
|
int physVgpr = w->remap(dst,sizeof(c0),1);
|
||||||
// save the physical VGPR index
|
// save the physical VGPR index
|
||||||
regVec.push_back(physVgpr);
|
regVec.push_back(physVgpr);
|
||||||
c1 *p1 = &((c1*)m->d_data)[k * VSZ];
|
c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
|
||||||
|
|
||||||
for (int i = 0; i < VSZ; ++i) {
|
for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
|
||||||
if (m->exec_mask[i]) {
|
if (m->exec_mask[i]) {
|
||||||
// write the value into the physical VGPR. This is a purely
|
// write the value into the physical VGPR. This is a purely
|
||||||
// functional operation. No timing is modeled.
|
// functional operation. No timing is modeled.
|
||||||
|
|
|
@ -37,28 +37,14 @@
|
||||||
#define __MISC_HH__
|
#define __MISC_HH__
|
||||||
|
|
||||||
#include <bitset>
|
#include <bitset>
|
||||||
|
#include <limits>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
#include "base/misc.hh"
|
#include "base/misc.hh"
|
||||||
|
|
||||||
class GPUDynInst;
|
class GPUDynInst;
|
||||||
|
|
||||||
// wavefront size of the machine
|
typedef std::bitset<std::numeric_limits<unsigned long long>::digits> VectorMask;
|
||||||
static const int VSZ = 64;
|
|
||||||
|
|
||||||
/*
|
|
||||||
This check is necessary because std::bitset only provides conversion to
|
|
||||||
unsigned long or unsigned long long via to_ulong() or to_ullong(). there are
|
|
||||||
a few places in the code where to_ullong() is used, however if VSZ is larger
|
|
||||||
than a value the host can support then bitset will throw a runtime exception.
|
|
||||||
|
|
||||||
we should remove all use of to_long() or to_ullong() so we can have VSZ
|
|
||||||
greater than 64b, however until that is done this assert is required.
|
|
||||||
*/
|
|
||||||
static_assert(VSZ <= sizeof(unsigned long long) * 8,
|
|
||||||
"VSZ is larger than the host can support");
|
|
||||||
|
|
||||||
typedef std::bitset<VSZ> VectorMask;
|
|
||||||
typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr;
|
typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr;
|
||||||
|
|
||||||
class WaitClass
|
class WaitClass
|
||||||
|
|
|
@ -100,7 +100,7 @@ struct WFContext
|
||||||
{
|
{
|
||||||
// 32 bit values
|
// 32 bit values
|
||||||
// barrier state
|
// barrier state
|
||||||
int bar_cnt[VSZ];
|
std::vector<int> bar_cnt;
|
||||||
|
|
||||||
// id (which WF in the WG)
|
// id (which WF in the WG)
|
||||||
int cnt;
|
int cnt;
|
||||||
|
|
|
@ -63,7 +63,7 @@ VectorRegisterFile::VectorRegisterFile(const VectorRegisterFileParams *p)
|
||||||
nxtBusy.clear();
|
nxtBusy.clear();
|
||||||
nxtBusy.resize(numRegsPerSimd, 0);
|
nxtBusy.resize(numRegsPerSimd, 0);
|
||||||
|
|
||||||
vgprState->init(numRegsPerSimd);
|
vgprState->init(numRegsPerSimd, p->wfSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
|
|
@ -35,6 +35,8 @@
|
||||||
|
|
||||||
#include "gpu-compute/vector_register_state.hh"
|
#include "gpu-compute/vector_register_state.hh"
|
||||||
|
|
||||||
|
#include <limits>
|
||||||
|
|
||||||
#include "gpu-compute/compute_unit.hh"
|
#include "gpu-compute/compute_unit.hh"
|
||||||
|
|
||||||
VecRegisterState::VecRegisterState() : computeUnit(nullptr)
|
VecRegisterState::VecRegisterState() : computeUnit(nullptr)
|
||||||
|
@ -51,8 +53,19 @@ VecRegisterState::setParent(ComputeUnit *_computeUnit)
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
VecRegisterState::init(uint32_t _size)
|
VecRegisterState::init(uint32_t _size, uint32_t wf_size)
|
||||||
{
|
{
|
||||||
s_reg.resize(_size);
|
s_reg.resize(_size);
|
||||||
|
fatal_if(wf_size > std::numeric_limits<unsigned long long>::digits ||
|
||||||
|
wf_size <= 0,
|
||||||
|
"WF size is larger than the host can support or is zero");
|
||||||
|
fatal_if((wf_size & (wf_size - 1)) != 0,
|
||||||
|
"Wavefront size should be a power of 2");
|
||||||
|
for (int i = 0; i < s_reg.size(); ++i) {
|
||||||
|
s_reg[i].resize(wf_size, 0);
|
||||||
|
}
|
||||||
d_reg.resize(_size);
|
d_reg.resize(_size);
|
||||||
|
for (int i = 0; i < d_reg.size(); ++i) {
|
||||||
|
d_reg[i].resize(wf_size, 0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -51,7 +51,7 @@ class VecRegisterState
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
VecRegisterState();
|
VecRegisterState();
|
||||||
void init(uint32_t _size);
|
void init(uint32_t _size, uint32_t wf_size);
|
||||||
|
|
||||||
const std::string& name() const { return _name; }
|
const std::string& name() const { return _name; }
|
||||||
void setParent(ComputeUnit *_computeUnit);
|
void setParent(ComputeUnit *_computeUnit);
|
||||||
|
@ -93,9 +93,9 @@ class VecRegisterState
|
||||||
ComputeUnit *computeUnit;
|
ComputeUnit *computeUnit;
|
||||||
std::string _name;
|
std::string _name;
|
||||||
// 32-bit Single Precision Vector Register State
|
// 32-bit Single Precision Vector Register State
|
||||||
std::vector<std::array<uint32_t, VSZ>> s_reg;
|
std::vector<std::vector<uint32_t>> s_reg;
|
||||||
// 64-bit Double Precision Vector Register State
|
// 64-bit Double Precision Vector Register State
|
||||||
std::vector<std::array<uint64_t, VSZ>> d_reg;
|
std::vector<std::vector<uint64_t>> d_reg;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif // __VECTOR_REGISTER_STATE_HH__
|
#endif // __VECTOR_REGISTER_STATE_HH__
|
||||||
|
|
|
@ -55,7 +55,6 @@ Wavefront::Wavefront(const Params *p)
|
||||||
last_trace = 0;
|
last_trace = 0;
|
||||||
simdId = p->simdId;
|
simdId = p->simdId;
|
||||||
wfSlotId = p->wf_slot_id;
|
wfSlotId = p->wf_slot_id;
|
||||||
|
|
||||||
status = S_STOPPED;
|
status = S_STOPPED;
|
||||||
reservedVectorRegs = 0;
|
reservedVectorRegs = 0;
|
||||||
startVgprIndex = 0;
|
startVgprIndex = 0;
|
||||||
|
@ -77,12 +76,20 @@ Wavefront::Wavefront(const Params *p)
|
||||||
mem_trace_busy = 0;
|
mem_trace_busy = 0;
|
||||||
old_vgpr_tcnt = 0xffffffffffffffffll;
|
old_vgpr_tcnt = 0xffffffffffffffffll;
|
||||||
old_dgpr_tcnt = 0xffffffffffffffffll;
|
old_dgpr_tcnt = 0xffffffffffffffffll;
|
||||||
|
old_vgpr.resize(p->wfSize);
|
||||||
|
|
||||||
pendingFetch = false;
|
pendingFetch = false;
|
||||||
dropFetch = false;
|
dropFetch = false;
|
||||||
condRegState = new ConditionRegisterState();
|
condRegState = new ConditionRegisterState();
|
||||||
maxSpVgprs = 0;
|
maxSpVgprs = 0;
|
||||||
maxDpVgprs = 0;
|
maxDpVgprs = 0;
|
||||||
|
last_addr.resize(p->wfSize);
|
||||||
|
workitemFlatId.resize(p->wfSize);
|
||||||
|
old_dgpr.resize(p->wfSize);
|
||||||
|
bar_cnt.resize(p->wfSize);
|
||||||
|
for (int i = 0; i < 3; ++i) {
|
||||||
|
workitemid[i].resize(p->wfSize);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
@ -144,6 +151,7 @@ Wavefront::~Wavefront()
|
||||||
{
|
{
|
||||||
if (callArgMem)
|
if (callArgMem)
|
||||||
delete callArgMem;
|
delete callArgMem;
|
||||||
|
delete condRegState;
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
|
|
@ -83,6 +83,7 @@ class CallArgMem
|
||||||
public:
|
public:
|
||||||
// pointer to buffer for storing function arguments
|
// pointer to buffer for storing function arguments
|
||||||
uint8_t *mem;
|
uint8_t *mem;
|
||||||
|
int wfSize;
|
||||||
// size of function args
|
// size of function args
|
||||||
int funcArgsSizePerItem;
|
int funcArgsSizePerItem;
|
||||||
|
|
||||||
|
@ -90,13 +91,13 @@ class CallArgMem
|
||||||
int
|
int
|
||||||
getLaneOffset(int lane, int addr)
|
getLaneOffset(int lane, int addr)
|
||||||
{
|
{
|
||||||
return addr * VSZ + sizeof(CType) * lane;
|
return addr * wfSize + sizeof(CType) * lane;
|
||||||
}
|
}
|
||||||
|
|
||||||
CallArgMem(int func_args_size_per_item)
|
CallArgMem(int func_args_size_per_item, int wf_size)
|
||||||
: funcArgsSizePerItem(func_args_size_per_item)
|
: wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
|
||||||
{
|
{
|
||||||
mem = (uint8_t*)malloc(funcArgsSizePerItem * VSZ);
|
mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
~CallArgMem()
|
~CallArgMem()
|
||||||
|
@ -192,9 +193,9 @@ class Wavefront : public SimObject
|
||||||
bool isOldestInstALU();
|
bool isOldestInstALU();
|
||||||
bool isOldestInstBarrier();
|
bool isOldestInstBarrier();
|
||||||
// used for passing spill address to DDInstGPU
|
// used for passing spill address to DDInstGPU
|
||||||
uint64_t last_addr[VSZ];
|
std::vector<Addr> last_addr;
|
||||||
uint32_t workitemid[3][VSZ];
|
std::vector<uint32_t> workitemid[3];
|
||||||
uint32_t workitemFlatId[VSZ];
|
std::vector<uint32_t> workitemFlatId;
|
||||||
uint32_t workgroupid[3];
|
uint32_t workgroupid[3];
|
||||||
uint32_t workgroupsz[3];
|
uint32_t workgroupsz[3];
|
||||||
uint32_t gridsz[3];
|
uint32_t gridsz[3];
|
||||||
|
@ -230,14 +231,14 @@ class Wavefront : public SimObject
|
||||||
uint32_t startVgprIndex;
|
uint32_t startVgprIndex;
|
||||||
|
|
||||||
// Old value of destination gpr (for trace)
|
// Old value of destination gpr (for trace)
|
||||||
uint32_t old_vgpr[VSZ];
|
std::vector<uint32_t> old_vgpr;
|
||||||
// Id of destination gpr (for trace)
|
// Id of destination gpr (for trace)
|
||||||
uint32_t old_vgpr_id;
|
uint32_t old_vgpr_id;
|
||||||
// Tick count of last old_vgpr copy
|
// Tick count of last old_vgpr copy
|
||||||
uint64_t old_vgpr_tcnt;
|
uint64_t old_vgpr_tcnt;
|
||||||
|
|
||||||
// Old value of destination gpr (for trace)
|
// Old value of destination gpr (for trace)
|
||||||
uint64_t old_dgpr[VSZ];
|
std::vector<uint64_t> old_dgpr;
|
||||||
// Id of destination gpr (for trace)
|
// Id of destination gpr (for trace)
|
||||||
uint32_t old_dgpr_id;
|
uint32_t old_dgpr_id;
|
||||||
// Tick count of last old_vgpr copy
|
// Tick count of last old_vgpr copy
|
||||||
|
@ -247,7 +248,7 @@ class Wavefront : public SimObject
|
||||||
VectorMask init_mask;
|
VectorMask init_mask;
|
||||||
|
|
||||||
// number of barriers this WF has joined
|
// number of barriers this WF has joined
|
||||||
int bar_cnt[VSZ];
|
std::vector<int> bar_cnt;
|
||||||
int max_bar_cnt;
|
int max_bar_cnt;
|
||||||
// Flag to stall a wave on barrier
|
// Flag to stall a wave on barrier
|
||||||
bool stalledAtBarrier;
|
bool stalledAtBarrier;
|
||||||
|
@ -296,9 +297,9 @@ class Wavefront : public SimObject
|
||||||
// argument memory for hsail call instruction
|
// argument memory for hsail call instruction
|
||||||
CallArgMem *callArgMem;
|
CallArgMem *callArgMem;
|
||||||
void
|
void
|
||||||
initCallArgMem(int func_args_size_per_item)
|
initCallArgMem(int func_args_size_per_item, int wf_size)
|
||||||
{
|
{
|
||||||
callArgMem = new CallArgMem(func_args_size_per_item);
|
callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename CType>
|
template<typename CType>
|
||||||
|
@ -327,7 +328,6 @@ class Wavefront : public SimObject
|
||||||
}
|
}
|
||||||
|
|
||||||
void start(uint64_t _wfDynId, uint64_t _base_ptr);
|
void start(uint64_t _wfDynId, uint64_t _base_ptr);
|
||||||
|
|
||||||
void exec();
|
void exec();
|
||||||
void updateResources();
|
void updateResources();
|
||||||
int ready(itype_e type);
|
int ready(itype_e type);
|
||||||
|
|
Loading…
Reference in a new issue