* * *
mem: support for gpu-style RMWs in ruby This patch adds support for GPU-style read-modify-write (RMW) operations in ruby. Such atomic operations are traditionally executed at the memory controller (instead of through an L1 cache using cache-line locking). Currently, this patch works by propagating operation functors through the memory system.
This commit is contained in:
parent
34fb6b5e35
commit
d658b6e1cc
6 changed files with 122 additions and 39 deletions
|
@ -200,6 +200,19 @@ typedef std::shared_ptr<FaultBase> Fault;
|
||||||
constexpr decltype(nullptr) NoFault = nullptr;
|
constexpr decltype(nullptr) NoFault = nullptr;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
struct AtomicOpFunctor
|
||||||
|
{
|
||||||
|
virtual void operator()(uint8_t *p) = 0;
|
||||||
|
virtual ~AtomicOpFunctor() {}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
struct TypedAtomicOpFunctor : public AtomicOpFunctor
|
||||||
|
{
|
||||||
|
void operator()(uint8_t *p) { execute((T *)p); }
|
||||||
|
virtual void execute(T * p) = 0;
|
||||||
|
};
|
||||||
|
|
||||||
enum ByteOrder {
|
enum ByteOrder {
|
||||||
BigEndianByteOrder,
|
BigEndianByteOrder,
|
||||||
LittleEndianByteOrder
|
LittleEndianByteOrder
|
||||||
|
|
|
@ -341,39 +341,46 @@ AbstractMemory::access(PacketPtr pkt)
|
||||||
uint8_t *hostAddr = pmemAddr + pkt->getAddr() - range.start();
|
uint8_t *hostAddr = pmemAddr + pkt->getAddr() - range.start();
|
||||||
|
|
||||||
if (pkt->cmd == MemCmd::SwapReq) {
|
if (pkt->cmd == MemCmd::SwapReq) {
|
||||||
std::vector<uint8_t> overwrite_val(pkt->getSize());
|
if (pkt->isAtomicOp()) {
|
||||||
uint64_t condition_val64;
|
if (pmemAddr) {
|
||||||
uint32_t condition_val32;
|
memcpy(pkt->getPtr<uint8_t>(), hostAddr, pkt->getSize());
|
||||||
|
(*(pkt->getAtomicOp()))(hostAddr);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
std::vector<uint8_t> overwrite_val(pkt->getSize());
|
||||||
|
uint64_t condition_val64;
|
||||||
|
uint32_t condition_val32;
|
||||||
|
|
||||||
if (!pmemAddr)
|
if (!pmemAddr)
|
||||||
panic("Swap only works if there is real memory (i.e. null=False)");
|
panic("Swap only works if there is real memory (i.e. null=False)");
|
||||||
|
|
||||||
bool overwrite_mem = true;
|
bool overwrite_mem = true;
|
||||||
// keep a copy of our possible write value, and copy what is at the
|
// keep a copy of our possible write value, and copy what is at the
|
||||||
// memory address into the packet
|
// memory address into the packet
|
||||||
std::memcpy(&overwrite_val[0], pkt->getConstPtr<uint8_t>(),
|
std::memcpy(&overwrite_val[0], pkt->getConstPtr<uint8_t>(),
|
||||||
pkt->getSize());
|
pkt->getSize());
|
||||||
std::memcpy(pkt->getPtr<uint8_t>(), hostAddr, pkt->getSize());
|
std::memcpy(pkt->getPtr<uint8_t>(), hostAddr, pkt->getSize());
|
||||||
|
|
||||||
if (pkt->req->isCondSwap()) {
|
if (pkt->req->isCondSwap()) {
|
||||||
if (pkt->getSize() == sizeof(uint64_t)) {
|
if (pkt->getSize() == sizeof(uint64_t)) {
|
||||||
condition_val64 = pkt->req->getExtraData();
|
condition_val64 = pkt->req->getExtraData();
|
||||||
overwrite_mem = !std::memcmp(&condition_val64, hostAddr,
|
overwrite_mem = !std::memcmp(&condition_val64, hostAddr,
|
||||||
sizeof(uint64_t));
|
sizeof(uint64_t));
|
||||||
} else if (pkt->getSize() == sizeof(uint32_t)) {
|
} else if (pkt->getSize() == sizeof(uint32_t)) {
|
||||||
condition_val32 = (uint32_t)pkt->req->getExtraData();
|
condition_val32 = (uint32_t)pkt->req->getExtraData();
|
||||||
overwrite_mem = !std::memcmp(&condition_val32, hostAddr,
|
overwrite_mem = !std::memcmp(&condition_val32, hostAddr,
|
||||||
sizeof(uint32_t));
|
sizeof(uint32_t));
|
||||||
} else
|
} else
|
||||||
panic("Invalid size for conditional read/write\n");
|
panic("Invalid size for conditional read/write\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (overwrite_mem)
|
||||||
|
std::memcpy(hostAddr, &overwrite_val[0], pkt->getSize());
|
||||||
|
|
||||||
|
assert(!pkt->req->isInstFetch());
|
||||||
|
TRACE_PACKET("Read/Write");
|
||||||
|
numOther[pkt->req->masterId()]++;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (overwrite_mem)
|
|
||||||
std::memcpy(hostAddr, &overwrite_val[0], pkt->getSize());
|
|
||||||
|
|
||||||
assert(!pkt->req->isInstFetch());
|
|
||||||
TRACE_PACKET("Read/Write");
|
|
||||||
numOther[pkt->req->masterId()]++;
|
|
||||||
} else if (pkt->isRead()) {
|
} else if (pkt->isRead()) {
|
||||||
assert(!pkt->isWrite());
|
assert(!pkt->isWrite());
|
||||||
if (pkt->isLLSC()) {
|
if (pkt->isLLSC()) {
|
||||||
|
|
|
@ -660,6 +660,12 @@ class Packet : public Printable
|
||||||
return _isSecure;
|
return _isSecure;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Accessor function to atomic op.
|
||||||
|
*/
|
||||||
|
AtomicOpFunctor *getAtomicOp() const { return req->getAtomicOpFunctor(); }
|
||||||
|
bool isAtomicOp() const { return req->isAtomic(); }
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* It has been determined that the SC packet should successfully update
|
* It has been determined that the SC packet should successfully update
|
||||||
* memory. Therefore, convert this SC packet to a normal write.
|
* memory. Therefore, convert this SC packet to a normal write.
|
||||||
|
|
|
@ -56,6 +56,7 @@ bool testAndWrite(Addr addr, DataBlock datablk, Packet *pkt);
|
||||||
enumeration(AccessPermission, desc="...", default="AccessPermission_NotPresent") {
|
enumeration(AccessPermission, desc="...", default="AccessPermission_NotPresent") {
|
||||||
// Valid data
|
// Valid data
|
||||||
Read_Only, desc="block is Read Only (modulo functional writes)";
|
Read_Only, desc="block is Read Only (modulo functional writes)";
|
||||||
|
Write_Only, desc="block is Write Only";
|
||||||
Read_Write, desc="block is Read/Write";
|
Read_Write, desc="block is Read/Write";
|
||||||
|
|
||||||
// Possibly Invalid data
|
// Possibly Invalid data
|
||||||
|
@ -144,7 +145,9 @@ enumeration(TransitionResult, desc="...") {
|
||||||
enumeration(RubyRequestType, desc="...", default="RubyRequestType_NULL") {
|
enumeration(RubyRequestType, desc="...", default="RubyRequestType_NULL") {
|
||||||
LD, desc="Load";
|
LD, desc="Load";
|
||||||
ST, desc="Store";
|
ST, desc="Store";
|
||||||
ATOMIC, desc="Atomic Load/Store";
|
ATOMIC, desc="Atomic Load/Store -- deprecated. Use ATOMIC_RETURN or ATOMIC_NO_RETURN";
|
||||||
|
ATOMIC_RETURN, desc="Atomic Load/Store, return data";
|
||||||
|
ATOMIC_NO_RETURN, desc="Atomic Load/Store, do not return data";
|
||||||
IFETCH, desc="Instruction fetch";
|
IFETCH, desc="Instruction fetch";
|
||||||
IO, desc="I/O";
|
IO, desc="I/O";
|
||||||
REPLACEMENT, desc="Replacement";
|
REPLACEMENT, desc="Replacement";
|
||||||
|
@ -166,6 +169,8 @@ enumeration(SequencerRequestType, desc="...", default="SequencerRequestType_NULL
|
||||||
Default, desc="Replace this with access_types passed to the DMA Ruby object";
|
Default, desc="Replace this with access_types passed to the DMA Ruby object";
|
||||||
LD, desc="Load";
|
LD, desc="Load";
|
||||||
ST, desc="Store";
|
ST, desc="Store";
|
||||||
|
ATOMIC, desc="Atomic Load/Store";
|
||||||
|
REPLACEMENT, desc="Replacement";
|
||||||
FLUSH, desc="Flush request type";
|
FLUSH, desc="Flush request type";
|
||||||
NULL, desc="Invalid request type";
|
NULL, desc="Invalid request type";
|
||||||
}
|
}
|
||||||
|
|
|
@ -126,6 +126,7 @@ structure(RubyRequest, desc="...", interface="Message", external="yes") {
|
||||||
int Size, desc="size in bytes of access";
|
int Size, desc="size in bytes of access";
|
||||||
PrefetchBit Prefetch, desc="Is this a prefetch request";
|
PrefetchBit Prefetch, desc="Is this a prefetch request";
|
||||||
int contextId, desc="this goes away but must be replaced with Nilay";
|
int contextId, desc="this goes away but must be replaced with Nilay";
|
||||||
|
int wfid, desc="Writethrough wavefront";
|
||||||
HSAScope scope, desc="HSA scope";
|
HSAScope scope, desc="HSA scope";
|
||||||
HSASegment segment, desc="HSA segment";
|
HSASegment segment, desc="HSA segment";
|
||||||
}
|
}
|
||||||
|
|
|
@ -160,6 +160,11 @@ class Request
|
||||||
/** The request should be marked with RELEASE. */
|
/** The request should be marked with RELEASE. */
|
||||||
RELEASE = 0x00040000,
|
RELEASE = 0x00040000,
|
||||||
|
|
||||||
|
/** The request is an atomic that returns data. */
|
||||||
|
ATOMIC_RETURN_OP = 0x40000000,
|
||||||
|
/** The request is an atomic that does not return data. */
|
||||||
|
ATOMIC_NO_RETURN_OP = 0x80000000,
|
||||||
|
|
||||||
/** The request should be marked with KERNEL.
|
/** The request should be marked with KERNEL.
|
||||||
* Used to indicate the synchronization associated with a GPU kernel
|
* Used to indicate the synchronization associated with a GPU kernel
|
||||||
* launch or completion.
|
* launch or completion.
|
||||||
|
@ -345,6 +350,9 @@ class Request
|
||||||
/** Sequence number of the instruction that creates the request */
|
/** Sequence number of the instruction that creates the request */
|
||||||
InstSeqNum _reqInstSeqNum;
|
InstSeqNum _reqInstSeqNum;
|
||||||
|
|
||||||
|
/** A pointer to an atomic operation */
|
||||||
|
AtomicOpFunctor *atomicOpFunctor;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -356,7 +364,8 @@ class Request
|
||||||
: _paddr(0), _size(0), _masterId(invldMasterId), _time(0),
|
: _paddr(0), _size(0), _masterId(invldMasterId), _time(0),
|
||||||
_taskId(ContextSwitchTaskId::Unknown), _asid(0), _vaddr(0),
|
_taskId(ContextSwitchTaskId::Unknown), _asid(0), _vaddr(0),
|
||||||
_extraData(0), _contextId(0), _threadId(0), _pc(0),
|
_extraData(0), _contextId(0), _threadId(0), _pc(0),
|
||||||
_reqInstSeqNum(0), translateDelta(0), accessDelta(0), depth(0)
|
_reqInstSeqNum(0), atomicOpFunctor(nullptr), translateDelta(0),
|
||||||
|
accessDelta(0), depth(0)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
Request(Addr paddr, unsigned size, Flags flags, MasterID mid,
|
Request(Addr paddr, unsigned size, Flags flags, MasterID mid,
|
||||||
|
@ -364,7 +373,8 @@ class Request
|
||||||
: _paddr(0), _size(0), _masterId(invldMasterId), _time(0),
|
: _paddr(0), _size(0), _masterId(invldMasterId), _time(0),
|
||||||
_taskId(ContextSwitchTaskId::Unknown), _asid(0), _vaddr(0),
|
_taskId(ContextSwitchTaskId::Unknown), _asid(0), _vaddr(0),
|
||||||
_extraData(0), _contextId(0), _threadId(0), _pc(0),
|
_extraData(0), _contextId(0), _threadId(0), _pc(0),
|
||||||
_reqInstSeqNum(seq_num), translateDelta(0), accessDelta(0), depth(0)
|
_reqInstSeqNum(seq_num), atomicOpFunctor(nullptr), translateDelta(0),
|
||||||
|
accessDelta(0), depth(0)
|
||||||
{
|
{
|
||||||
setPhys(paddr, size, flags, mid, curTick());
|
setPhys(paddr, size, flags, mid, curTick());
|
||||||
setThreadContext(cid, tid);
|
setThreadContext(cid, tid);
|
||||||
|
@ -380,7 +390,8 @@ class Request
|
||||||
: _paddr(0), _size(0), _masterId(invldMasterId), _time(0),
|
: _paddr(0), _size(0), _masterId(invldMasterId), _time(0),
|
||||||
_taskId(ContextSwitchTaskId::Unknown), _asid(0), _vaddr(0),
|
_taskId(ContextSwitchTaskId::Unknown), _asid(0), _vaddr(0),
|
||||||
_extraData(0), _contextId(0), _threadId(0), _pc(0),
|
_extraData(0), _contextId(0), _threadId(0), _pc(0),
|
||||||
_reqInstSeqNum(0), translateDelta(0), accessDelta(0), depth(0)
|
_reqInstSeqNum(0), atomicOpFunctor(nullptr), translateDelta(0),
|
||||||
|
accessDelta(0), depth(0)
|
||||||
{
|
{
|
||||||
setPhys(paddr, size, flags, mid, curTick());
|
setPhys(paddr, size, flags, mid, curTick());
|
||||||
}
|
}
|
||||||
|
@ -389,7 +400,8 @@ class Request
|
||||||
: _paddr(0), _size(0), _masterId(invldMasterId), _time(0),
|
: _paddr(0), _size(0), _masterId(invldMasterId), _time(0),
|
||||||
_taskId(ContextSwitchTaskId::Unknown), _asid(0), _vaddr(0),
|
_taskId(ContextSwitchTaskId::Unknown), _asid(0), _vaddr(0),
|
||||||
_extraData(0), _contextId(0), _threadId(0), _pc(0),
|
_extraData(0), _contextId(0), _threadId(0), _pc(0),
|
||||||
_reqInstSeqNum(0), translateDelta(0), accessDelta(0), depth(0)
|
_reqInstSeqNum(0), atomicOpFunctor(nullptr), translateDelta(0),
|
||||||
|
accessDelta(0), depth(0)
|
||||||
{
|
{
|
||||||
setPhys(paddr, size, flags, mid, time);
|
setPhys(paddr, size, flags, mid, time);
|
||||||
}
|
}
|
||||||
|
@ -398,12 +410,12 @@ class Request
|
||||||
Addr pc)
|
Addr pc)
|
||||||
: _paddr(0), _size(0), _masterId(invldMasterId), _time(0),
|
: _paddr(0), _size(0), _masterId(invldMasterId), _time(0),
|
||||||
_taskId(ContextSwitchTaskId::Unknown), _asid(0), _vaddr(0),
|
_taskId(ContextSwitchTaskId::Unknown), _asid(0), _vaddr(0),
|
||||||
_extraData(0), _contextId(0), _threadId(0), _pc(0),
|
_extraData(0), _contextId(0), _threadId(0), _pc(pc),
|
||||||
_reqInstSeqNum(0), translateDelta(0), accessDelta(0), depth(0)
|
_reqInstSeqNum(0), atomicOpFunctor(nullptr), translateDelta(0),
|
||||||
|
accessDelta(0), depth(0)
|
||||||
{
|
{
|
||||||
setPhys(paddr, size, flags, mid, time);
|
setPhys(paddr, size, flags, mid, time);
|
||||||
privateFlags.set(VALID_PC);
|
privateFlags.set(VALID_PC);
|
||||||
_pc = pc;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Request(int asid, Addr vaddr, unsigned size, Flags flags, MasterID mid,
|
Request(int asid, Addr vaddr, unsigned size, Flags flags, MasterID mid,
|
||||||
|
@ -411,13 +423,27 @@ class Request
|
||||||
: _paddr(0), _size(0), _masterId(invldMasterId), _time(0),
|
: _paddr(0), _size(0), _masterId(invldMasterId), _time(0),
|
||||||
_taskId(ContextSwitchTaskId::Unknown), _asid(0), _vaddr(0),
|
_taskId(ContextSwitchTaskId::Unknown), _asid(0), _vaddr(0),
|
||||||
_extraData(0), _contextId(0), _threadId(0), _pc(0),
|
_extraData(0), _contextId(0), _threadId(0), _pc(0),
|
||||||
_reqInstSeqNum(0), translateDelta(0), accessDelta(0), depth(0)
|
_reqInstSeqNum(0), atomicOpFunctor(nullptr), translateDelta(0),
|
||||||
|
accessDelta(0), depth(0)
|
||||||
{
|
{
|
||||||
setVirt(asid, vaddr, size, flags, mid, pc);
|
setVirt(asid, vaddr, size, flags, mid, pc);
|
||||||
setThreadContext(cid, tid);
|
setThreadContext(cid, tid);
|
||||||
}
|
}
|
||||||
|
|
||||||
~Request() {}
|
Request(int asid, Addr vaddr, int size, Flags flags, MasterID mid, Addr pc,
|
||||||
|
int cid, ThreadID tid, AtomicOpFunctor *atomic_op)
|
||||||
|
: atomicOpFunctor(atomic_op)
|
||||||
|
{
|
||||||
|
setVirt(asid, vaddr, size, flags, mid, pc);
|
||||||
|
setThreadContext(cid, tid);
|
||||||
|
}
|
||||||
|
|
||||||
|
~Request()
|
||||||
|
{
|
||||||
|
if (hasAtomicOpFunctor()) {
|
||||||
|
delete atomicOpFunctor;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set up CPU and thread numbers.
|
* Set up CPU and thread numbers.
|
||||||
|
@ -541,6 +567,22 @@ class Request
|
||||||
return _time;
|
return _time;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Accessor for atomic-op functor.
|
||||||
|
*/
|
||||||
|
bool
|
||||||
|
hasAtomicOpFunctor()
|
||||||
|
{
|
||||||
|
return atomicOpFunctor != NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
AtomicOpFunctor *
|
||||||
|
getAtomicOpFunctor()
|
||||||
|
{
|
||||||
|
assert(atomicOpFunctor != NULL);
|
||||||
|
return atomicOpFunctor;
|
||||||
|
}
|
||||||
|
|
||||||
/** Accessor for flags. */
|
/** Accessor for flags. */
|
||||||
Flags
|
Flags
|
||||||
getFlags()
|
getFlags()
|
||||||
|
@ -749,6 +791,15 @@ class Request
|
||||||
bool isAcquire() const { return _flags.isSet(ACQUIRE); }
|
bool isAcquire() const { return _flags.isSet(ACQUIRE); }
|
||||||
bool isRelease() const { return _flags.isSet(RELEASE); }
|
bool isRelease() const { return _flags.isSet(RELEASE); }
|
||||||
bool isKernel() const { return _flags.isSet(KERNEL); }
|
bool isKernel() const { return _flags.isSet(KERNEL); }
|
||||||
|
bool isAtomicReturn() const { return _flags.isSet(ATOMIC_RETURN_OP); }
|
||||||
|
bool isAtomicNoReturn() const { return _flags.isSet(ATOMIC_NO_RETURN_OP); }
|
||||||
|
|
||||||
|
bool
|
||||||
|
isAtomic() const
|
||||||
|
{
|
||||||
|
return _flags.isSet(ATOMIC_RETURN_OP) ||
|
||||||
|
_flags.isSet(ATOMIC_NO_RETURN_OP);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Accessor functions for the memory space configuration flags and used by
|
* Accessor functions for the memory space configuration flags and used by
|
||||||
|
|
Loading…
Reference in a new issue