diff --git a/src/arch/alpha/isa_traits.hh b/src/arch/alpha/isa_traits.hh index 66c240ef3..a5a8bf5a0 100644 --- a/src/arch/alpha/isa_traits.hh +++ b/src/arch/alpha/isa_traits.hh @@ -131,6 +131,9 @@ enum { // Alpha UNOP (ldq_u r31,0(r0)) const ExtMachInst NoopMachInst = 0x2ffe0000; +// Memory accesses cannot be unaligned +const bool HasUnalignedMemAcc = false; + } // namespace AlphaISA #endif // __ARCH_ALPHA_ISA_TRAITS_HH__ diff --git a/src/arch/arm/isa_traits.hh b/src/arch/arm/isa_traits.hh index 91c51c46b..59eaeaa5c 100644 --- a/src/arch/arm/isa_traits.hh +++ b/src/arch/arm/isa_traits.hh @@ -106,6 +106,9 @@ namespace ArmISA const int ByteBytes = 1; const uint32_t HighVecs = 0xFFFF0000; + + // Memory accesses cannot be unaligned + const bool HasUnalignedMemAcc = false; }; using namespace ArmISA; diff --git a/src/arch/mips/isa_traits.hh b/src/arch/mips/isa_traits.hh index 38b43af9d..aa64be71d 100644 --- a/src/arch/mips/isa_traits.hh +++ b/src/arch/mips/isa_traits.hh @@ -164,6 +164,9 @@ const int ByteBytes = 1; const int ANNOTE_NONE = 0; const uint32_t ITOUCH_ANNOTE = 0xffffffff; +// Memory accesses cannot be unaligned +const bool HasUnalignedMemAcc = false; + }; #endif // __ARCH_MIPS_ISA_TRAITS_HH__ diff --git a/src/arch/power/isa_traits.hh b/src/arch/power/isa_traits.hh index 886c2cb0b..ab6a56760 100644 --- a/src/arch/power/isa_traits.hh +++ b/src/arch/power/isa_traits.hh @@ -70,6 +70,9 @@ const int MachineBytes = 4; // This is ori 0, 0, 0 const ExtMachInst NoopMachInst = 0x60000000; +// Memory accesses can be unaligned +const bool HasUnalignedMemAcc = true; + } // PowerISA namespace #endif // __ARCH_POWER_ISA_TRAITS_HH__ diff --git a/src/arch/sparc/isa_traits.hh b/src/arch/sparc/isa_traits.hh index 2af624d39..a4dc7322d 100644 --- a/src/arch/sparc/isa_traits.hh +++ b/src/arch/sparc/isa_traits.hh @@ -98,6 +98,9 @@ namespace SparcISA }; #endif + +// Memory accesses cannot be unaligned +const bool HasUnalignedMemAcc = false; } #endif // __ARCH_SPARC_ISA_TRAITS_HH__ 
diff --git a/src/arch/x86/isa_traits.hh b/src/arch/x86/isa_traits.hh index 9f1b7b7c4..80af12c91 100644 --- a/src/arch/x86/isa_traits.hh +++ b/src/arch/x86/isa_traits.hh @@ -91,6 +91,9 @@ namespace X86ISA StaticInstPtr decodeInst(ExtMachInst); const Addr LoadAddrMask = ULL(-1); + + // Memory accesses can be unaligned + const bool HasUnalignedMemAcc = true; }; #endif // __ARCH_X86_ISATRAITS_HH__ diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh index 7732b71f8..65578379b 100644 --- a/src/cpu/base_dyn_inst.hh +++ b/src/cpu/base_dyn_inst.hh @@ -131,8 +131,13 @@ class BaseDynInst : public FastAlloc, public RefCounted template Fault write(T data, Addr addr, unsigned flags, uint64_t *res); + /** Splits a request in two if it crosses a dcache block. */ + void splitRequest(RequestPtr req, RequestPtr &sreqLow, + RequestPtr &sreqHigh); + /** Initiate a DTB address translation. */ - void initiateTranslation(RequestPtr req, uint64_t *res, + void initiateTranslation(RequestPtr req, RequestPtr sreqLow, + RequestPtr sreqHigh, uint64_t *res, BaseTLB::Mode mode); /** Finish a DTB address translation. */ @@ -870,12 +875,19 @@ BaseDynInst::read(Addr addr, T &data, unsigned flags) Request *req = new Request(asid, addr, sizeof(T), flags, this->PC, thread->contextId(), threadNumber); - initiateTranslation(req, NULL, BaseTLB::Read); + Request *sreqLow = NULL; + Request *sreqHigh = NULL; + + // Only split the request if the ISA supports unaligned accesses. 
+ if (TheISA::HasUnalignedMemAcc) { + splitRequest(req, sreqLow, sreqHigh); + } + initiateTranslation(req, sreqLow, sreqHigh, NULL, BaseTLB::Read); if (fault == NoFault) { effAddr = req->getVaddr(); effAddrValid = true; - cpu->read(req, data, lqIdx); + cpu->read(req, sreqLow, sreqHigh, data, lqIdx); } else { // Return a fixed value to keep simulation deterministic even @@ -909,12 +921,19 @@ BaseDynInst::write(T data, Addr addr, unsigned flags, uint64_t *res) Request *req = new Request(asid, addr, sizeof(T), flags, this->PC, thread->contextId(), threadNumber); - initiateTranslation(req, res, BaseTLB::Write); + Request *sreqLow = NULL; + Request *sreqHigh = NULL; + + // Only split the request if the ISA supports unaligned accesses. + if (TheISA::HasUnalignedMemAcc) { + splitRequest(req, sreqLow, sreqHigh); + } + initiateTranslation(req, sreqLow, sreqHigh, res, BaseTLB::Write); if (fault == NoFault) { effAddr = req->getVaddr(); effAddrValid = true; - cpu->write(req, data, sqIdx); + cpu->write(req, sreqLow, sreqHigh, data, sqIdx); } return fault; @@ -922,14 +941,48 @@ BaseDynInst::write(T data, Addr addr, unsigned flags, uint64_t *res) template inline void -BaseDynInst::initiateTranslation(RequestPtr req, uint64_t *res, +BaseDynInst::splitRequest(RequestPtr req, RequestPtr &sreqLow, + RequestPtr &sreqHigh) +{ + // Check to see if the request crosses the next level block boundary. + unsigned block_size = cpu->getDcachePort()->peerBlockSize(); + Addr addr = req->getVaddr(); + Addr split_addr = roundDown(addr + req->getSize() - 1, block_size); + assert(split_addr <= addr || split_addr - addr < block_size); + + // Spans two blocks. 
+ if (split_addr > addr) { + req->splitOnVaddr(split_addr, sreqLow, sreqHigh); + } +} + +template +inline void +BaseDynInst::initiateTranslation(RequestPtr req, RequestPtr sreqLow, + RequestPtr sreqHigh, uint64_t *res, BaseTLB::Mode mode) { - WholeTranslationState *state = - new WholeTranslationState(req, NULL, res, mode); - DataTranslation > *trans = - new DataTranslation >(this, state); - cpu->dtb->translateTiming(req, thread->getTC(), trans, mode); + if (!TheISA::HasUnalignedMemAcc || sreqLow == NULL) { + WholeTranslationState *state = + new WholeTranslationState(req, NULL, res, mode); + + // One translation if the request isn't split. + DataTranslation > *trans = + new DataTranslation >(this, state); + cpu->dtb->translateTiming(req, thread->getTC(), trans, mode); + } else { + WholeTranslationState *state = + new WholeTranslationState(req, sreqLow, sreqHigh, NULL, res, mode); + + // Two translations when the request is split. + DataTranslation > *stransLow = + new DataTranslation >(this, state, 0); + DataTranslation > *stransHigh = + new DataTranslation >(this, state, 1); + + cpu->dtb->translateTiming(sreqLow, thread->getTC(), stransLow, mode); + cpu->dtb->translateTiming(sreqHigh, thread->getTC(), stransHigh, mode); + } } template diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh index 2ea918983..82d4ca25b 100644 --- a/src/cpu/o3/cpu.hh +++ b/src/cpu/o3/cpu.hh @@ -703,18 +703,25 @@ class FullO3CPU : public BaseO3CPU /** CPU read function, forwards read to LSQ. */ template - Fault read(RequestPtr &req, T &data, int load_idx) + Fault read(RequestPtr &req, RequestPtr &sreqLow, RequestPtr &sreqHigh, + T &data, int load_idx) { - return this->iew.ldstQueue.read(req, data, load_idx); + return this->iew.ldstQueue.read(req, sreqLow, sreqHigh, + data, load_idx); } /** CPU write function, forwards write to LSQ. 
*/ template - Fault write(RequestPtr &req, T &data, int store_idx) + Fault write(RequestPtr &req, RequestPtr &sreqLow, RequestPtr &sreqHigh, + T &data, int store_idx) { - return this->iew.ldstQueue.write(req, data, store_idx); + return this->iew.ldstQueue.write(req, sreqLow, sreqHigh, + data, store_idx); } + /** Get the dcache port (used to find block size for translations). */ + Port *getDcachePort() { return this->iew.ldstQueue.getDcachePort(); } + Addr lockAddr; /** Temporary fix for the lock flag, works in the UP case. */ diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index a0bae058c..7a7ea917f 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -270,15 +270,19 @@ class LSQ { void dumpInsts(ThreadID tid) { thread[tid].dumpInsts(); } - /** Executes a read operation, using the load specified at the load index. */ - template - Fault read(RequestPtr req, T &data, int load_idx); - - /** Executes a store operation, using the store specified at the store - * index. + /** Executes a read operation, using the load specified at the load + * index. */ template - Fault write(RequestPtr req, T &data, int store_idx); + Fault read(RequestPtr req, RequestPtr sreqLow, RequestPtr sreqHigh, + T &data, int load_idx); + + /** Executes a store operation, using the store specified at the store + * index. + */ + template + Fault write(RequestPtr req, RequestPtr sreqLow, RequestPtr sreqHigh, + T &data, int store_idx); /** The CPU pointer. 
*/ O3CPU *cpu; @@ -369,21 +373,23 @@ class LSQ { template template Fault -LSQ::read(RequestPtr req, T &data, int load_idx) +LSQ::read(RequestPtr req, RequestPtr sreqLow, RequestPtr sreqHigh, + T &data, int load_idx) { ThreadID tid = req->threadId(); - return thread[tid].read(req, data, load_idx); + return thread[tid].read(req, sreqLow, sreqHigh, data, load_idx); } template template Fault -LSQ::write(RequestPtr req, T &data, int store_idx) +LSQ::write(RequestPtr req, RequestPtr sreqLow, RequestPtr sreqHigh, + T &data, int store_idx) { ThreadID tid = req->threadId(); - return thread[tid].write(req, data, store_idx); + return thread[tid].write(req, sreqLow, sreqHigh, data, store_idx); } #endif // __CPU_O3_LSQ_HH__ diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index 6ff36d929..cf51f8eab 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -216,12 +216,18 @@ class LSQUnit { /** Writes back the instruction, sending it to IEW. */ void writeback(DynInstPtr &inst, PacketPtr pkt); + /** Writes back a store that couldn't be completed the previous cycle. */ + void writebackPendingStore(); + /** Handles completing the send of a store to memory. */ void storePostSend(PacketPtr pkt); /** Completes the store at the specified index. */ void completeStore(int store_idx); + /** Attempts to send a store to the cache. */ + bool sendStore(PacketPtr data_pkt); + /** Increments the given store index (circular queue). */ inline void incrStIdx(int &store_idx); /** Decrements the given store index (circular queue). */ @@ -254,7 +260,8 @@ class LSQUnit { public: /** Default constructor. */ LSQSenderState() - : noWB(false) + : noWB(false), isSplit(false), pktToSend(false), outstanding(1), + mainPkt(NULL), pendingPacket(NULL) { } /** Instruction who initiated the access to memory. */ @@ -265,6 +272,19 @@ class LSQUnit { int idx; /** Whether or not the instruction will need to writeback. */ bool noWB; + /** Whether or not this access is split in two. 
*/ + bool isSplit; + /** Whether or not there is a packet that needs sending. */ + bool pktToSend; + /** Number of outstanding packets to complete. */ + int outstanding; + /** The main packet from a split load, used during writeback. */ + PacketPtr mainPkt; + /** A second packet from a split store that needs sending. */ + PacketPtr pendingPacket; + + /** Completes a packet and returns whether the access is finished. */ + inline bool complete() { return --outstanding == 0; } }; /** Writeback event, specifically for when stores forward data to loads. */ @@ -302,8 +322,8 @@ class LSQUnit { /** Constructs a store queue entry for a given instruction. */ SQEntry(DynInstPtr &_inst) - : inst(_inst), req(NULL), size(0), - canWB(0), committed(0), completed(0) + : inst(_inst), req(NULL), sreqLow(NULL), sreqHigh(NULL), size(0), + isSplit(0), canWB(0), committed(0), completed(0) { std::memset(data, 0, sizeof(data)); } @@ -312,10 +332,15 @@ class LSQUnit { DynInstPtr inst; /** The request for the store. */ RequestPtr req; + /** The split requests for the store. */ + RequestPtr sreqLow; + RequestPtr sreqHigh; /** The size of the store. */ int size; /** The store data. */ char data[sizeof(IntReg)]; + /** Whether or not the store is split into two requests. */ + bool isSplit; /** Whether or not the store can writeback. */ bool canWB; /** Whether or not the store is committed. */ @@ -406,6 +431,13 @@ class LSQUnit { /** The oldest load that caused a memory ordering violation. */ DynInstPtr memDepViolator; + /** Whether or not there is a packet that couldn't be sent because of + * a lack of cache ports. */ + bool hasPendingPkt; + + /** The packet that is pending free cache ports. */ + PacketPtr pendingPkt; + // Will also need how many read/write ports the Dcache has. Or keep track // of that in stage that is one level up, and only call executeLoad/Store // the appropriate number of times. @@ -443,11 +475,13 @@ class LSQUnit { public: /** Executes the load at the given index. 
*/ template - Fault read(Request *req, T &data, int load_idx); + Fault read(Request *req, Request *sreqLow, Request *sreqHigh, T &data, + int load_idx); /** Executes the store at the given index. */ template - Fault write(Request *req, T &data, int store_idx); + Fault write(Request *req, Request *sreqLow, Request *sreqHigh, T &data, + int store_idx); /** Returns the index of the head load instruction. */ int getLoadHead() { return loadHead; } @@ -482,7 +516,8 @@ class LSQUnit { template template Fault -LSQUnit::read(Request *req, T &data, int load_idx) +LSQUnit::read(Request *req, Request *sreqLow, Request *sreqHigh, + T &data, int load_idx) { DynInstPtr load_inst = loadQueue[load_idx]; @@ -503,6 +538,10 @@ LSQUnit::read(Request *req, T &data, int load_idx) // memory. This is quite ugly. @todo: Figure out the proper // place to really handle request deletes. delete req; + if (TheISA::HasUnalignedMemAcc && sreqLow) { + delete sreqLow; + delete sreqHigh; + } return TheISA::genMachineCheckFault(); } @@ -512,10 +551,12 @@ LSQUnit::read(Request *req, T &data, int load_idx) int store_size = 0; DPRINTF(LSQUnit, "Read called, load idx: %i, store idx: %i, " - "storeHead: %i addr: %#x\n", - load_idx, store_idx, storeHead, req->getPaddr()); + "storeHead: %i addr: %#x%s\n", + load_idx, store_idx, storeHead, req->getPaddr(), + sreqLow ? " split" : ""); if (req->isLLSC()) { + assert(!sreqLow); // Disable recording the result temporarily. Writing to misc // regs normally updates the result, but this is not the // desired behavior when handling store conditionals. @@ -587,6 +628,12 @@ LSQUnit::read(Request *req, T &data, int load_idx) // @todo: Need to make this a parameter. cpu->schedule(wb, curTick); + // Don't need to do anything special for split loads. 
+ if (TheISA::HasUnalignedMemAcc && sreqLow) { + delete sreqLow; + delete sreqHigh; + } + ++lsqForwLoads; return NoFault; } else if ((store_has_lower_limit && lower_load_has_store_part) || @@ -630,6 +677,10 @@ LSQUnit::read(Request *req, T &data, int load_idx) // memory. This is quite ugly. @todo: Figure out the // proper place to really handle request deletes. delete req; + if (TheISA::HasUnalignedMemAcc && sreqLow) { + delete sreqLow; + delete sreqHigh; + } return NoFault; } @@ -645,12 +696,14 @@ LSQUnit::read(Request *req, T &data, int load_idx) ++usedPorts; // if we the cache is not blocked, do cache access + bool completedFirst = false; if (!lsq->cacheBlocked()) { - PacketPtr data_pkt = - new Packet(req, - (req->isLLSC() ? - MemCmd::LoadLockedReq : MemCmd::ReadReq), - Packet::Broadcast); + MemCmd command = + req->isLLSC() ? MemCmd::LoadLockedReq : MemCmd::ReadReq; + PacketPtr data_pkt = new Packet(req, command, Packet::Broadcast); + PacketPtr fst_data_pkt = NULL; + PacketPtr snd_data_pkt = NULL; + data_pkt->dataStatic(load_inst->memData); LSQSenderState *state = new LSQSenderState; @@ -659,18 +712,66 @@ LSQUnit::read(Request *req, T &data, int load_idx) state->inst = load_inst; data_pkt->senderState = state; - if (!dcachePort->sendTiming(data_pkt)) { + if (!TheISA::HasUnalignedMemAcc || !sreqLow) { + + // Point the first packet at the main data packet. + fst_data_pkt = data_pkt; + } else { + + // Create the split packets. 
+ fst_data_pkt = new Packet(sreqLow, command, Packet::Broadcast); + snd_data_pkt = new Packet(sreqHigh, command, Packet::Broadcast); + + fst_data_pkt->dataStatic(load_inst->memData); + snd_data_pkt->dataStatic(load_inst->memData + sreqLow->getSize()); + + fst_data_pkt->senderState = state; + snd_data_pkt->senderState = state; + + state->isSplit = true; + state->outstanding = 2; + state->mainPkt = data_pkt; + } + + if (!dcachePort->sendTiming(fst_data_pkt)) { // Delete state and data packet because a load retry // initiates a pipeline restart; it does not retry. delete state; delete data_pkt->req; delete data_pkt; + if (TheISA::HasUnalignedMemAcc && sreqLow) { + delete fst_data_pkt->req; + delete fst_data_pkt; + delete snd_data_pkt->req; + delete snd_data_pkt; + } req = NULL; // If the access didn't succeed, tell the LSQ by setting // the retry thread id. lsq->setRetryTid(lsqID); + } else if (TheISA::HasUnalignedMemAcc && sreqLow) { + completedFirst = true; + + // The first packet was sent without problems, so send this one + // too. If there is a problem with this packet then the whole + // load will be squashed, so indicate this to the state object. + // The first packet will return in completeDataAccess and be + // handled there. + ++usedPorts; + if (!dcachePort->sendTiming(snd_data_pkt)) { + + // The main packet will be deleted in completeDataAccess. 
+ delete snd_data_pkt->req; + delete snd_data_pkt; + + state->complete(); + + req = NULL; + + lsq->setRetryTid(lsqID); + } } } @@ -679,6 +780,10 @@ LSQUnit::read(Request *req, T &data, int load_idx) if (lsq->cacheBlocked()) { if (req) delete req; + if (TheISA::HasUnalignedMemAcc && sreqLow && !completedFirst) { + delete sreqLow; + delete sreqHigh; + } ++lsqCacheBlocked; @@ -703,7 +808,8 @@ LSQUnit::read(Request *req, T &data, int load_idx) template template Fault -LSQUnit::write(Request *req, T &data, int store_idx) +LSQUnit::write(Request *req, Request *sreqLow, Request *sreqHigh, + T &data, int store_idx) { assert(storeQueue[store_idx].inst); @@ -713,6 +819,8 @@ LSQUnit::write(Request *req, T &data, int store_idx) storeQueue[store_idx].inst->seqNum); storeQueue[store_idx].req = req; + storeQueue[store_idx].sreqLow = sreqLow; + storeQueue[store_idx].sreqHigh = sreqHigh; storeQueue[store_idx].size = sizeof(T); assert(sizeof(T) <= sizeof(storeQueue[store_idx].data)); diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh index 9ee1de45a..fcc57ab09 100644 --- a/src/cpu/o3/lsq_unit_impl.hh +++ b/src/cpu/o3/lsq_unit_impl.hh @@ -85,11 +85,23 @@ LSQUnit::completeDataAccess(PacketPtr pkt) assert(!pkt->wasNacked()); + // If this is a split access, wait until all packets are received. 
+ if (TheISA::HasUnalignedMemAcc && !state->complete()) { + delete pkt->req; + delete pkt; + return; + } + if (isSwitchedOut() || inst->isSquashed()) { iewStage->decrWb(inst->seqNum); } else { if (!state->noWB) { - writeback(inst, pkt); + if (!TheISA::HasUnalignedMemAcc || !state->isSplit || + !state->isLoad) { + writeback(inst, pkt); + } else { + writeback(inst, state->mainPkt); + } } if (inst->isStore()) { @@ -97,6 +109,10 @@ LSQUnit::completeDataAccess(PacketPtr pkt) } } + if (TheISA::HasUnalignedMemAcc && state->isSplit && state->isLoad) { + delete state->mainPkt->req; + delete state->mainPkt; + } delete state; delete pkt->req; delete pkt; @@ -106,7 +122,7 @@ template LSQUnit::LSQUnit() : loads(0), stores(0), storesToWB(0), stalled(false), isStoreBlocked(false), isLoadBlocked(false), - loadBlockedHandled(false) + loadBlockedHandled(false), hasPendingPkt(false) { } @@ -603,10 +619,32 @@ LSQUnit::commitStores(InstSeqNum &youngest_inst) } } +template +void +LSQUnit::writebackPendingStore() +{ + if (hasPendingPkt) { + assert(pendingPkt != NULL); + + // If the cache is blocked, this will store the packet for retry. + if (sendStore(pendingPkt)) { + storePostSend(pendingPkt); + } + pendingPkt = NULL; + hasPendingPkt = false; + } +} + template void LSQUnit::writebackStores() { + // First writeback the second packet from any split store that didn't + // complete last cycle because there weren't enough cache ports available. 
+ if (TheISA::HasUnalignedMemAcc) { + writebackPendingStore(); + } + while (storesToWB > 0 && storeWBIdx != storeTail && storeQueue[storeWBIdx].inst && @@ -640,6 +678,11 @@ LSQUnit::writebackStores() assert(storeQueue[storeWBIdx].req); assert(!storeQueue[storeWBIdx].committed); + if (TheISA::HasUnalignedMemAcc && storeQueue[storeWBIdx].isSplit) { + assert(storeQueue[storeWBIdx].sreqLow); + assert(storeQueue[storeWBIdx].sreqHigh); + } + DynInstPtr inst = storeQueue[storeWBIdx].inst; Request *req = storeQueue[storeWBIdx].req; @@ -653,15 +696,41 @@ LSQUnit::writebackStores() MemCmd command = req->isSwap() ? MemCmd::SwapReq : (req->isLLSC() ? MemCmd::StoreCondReq : MemCmd::WriteReq); - PacketPtr data_pkt = new Packet(req, command, - Packet::Broadcast); - data_pkt->dataStatic(inst->memData); + PacketPtr data_pkt; + PacketPtr snd_data_pkt = NULL; LSQSenderState *state = new LSQSenderState; state->isLoad = false; state->idx = storeWBIdx; state->inst = inst; - data_pkt->senderState = state; + + if (!TheISA::HasUnalignedMemAcc || !storeQueue[storeWBIdx].isSplit) { + + // Build a single data packet if the store isn't split. + data_pkt = new Packet(req, command, Packet::Broadcast); + data_pkt->dataStatic(inst->memData); + data_pkt->senderState = state; + } else { + RequestPtr sreqLow = storeQueue[storeWBIdx].sreqLow; + RequestPtr sreqHigh = storeQueue[storeWBIdx].sreqHigh; + + // Create two packets if the store is split in two. + data_pkt = new Packet(sreqLow, command, Packet::Broadcast); + snd_data_pkt = new Packet(sreqHigh, command, Packet::Broadcast); + + data_pkt->dataStatic(inst->memData); + snd_data_pkt->dataStatic(inst->memData + sreqLow->getSize()); + + data_pkt->senderState = state; + snd_data_pkt->senderState = state; + + state->isSplit = true; + state->outstanding = 2; + + // Can delete the main request now. 
+ delete req; + req = sreqLow; + } DPRINTF(LSQUnit, "D-Cache: Writing back store idx:%i PC:%#x " "to Addr:%#x, data:%#x [sn:%lli]\n", @@ -671,6 +740,7 @@ // @todo: Remove this SC hack once the memory system handles it. if (inst->isStoreConditional()) { + assert(!storeQueue[storeWBIdx].isSplit); // Disable recording the result temporarily. Writing to // misc regs normally updates the result, but this is not // the desired behavior when handling store conditionals. @@ -694,18 +764,44 @@ state->noWB = true; } - if (!dcachePort->sendTiming(data_pkt)) { - // Need to handle becoming blocked on a store. DPRINTF(IEW, "D-Cache became blocked when writing [sn:%lli], will" "retry later\n", inst->seqNum); - isStoreBlocked = true; - ++lsqCacheBlocked; - assert(retryPkt == NULL); - retryPkt = data_pkt; - lsq->setRetryTid(lsqID); + + // Need to store the second packet, if split. + if (TheISA::HasUnalignedMemAcc && storeQueue[storeWBIdx].isSplit) { + state->pktToSend = true; + state->pendingPacket = snd_data_pkt; + } } else { - storePostSend(data_pkt); + + // If split, try to send the second packet too + if (TheISA::HasUnalignedMemAcc && storeQueue[storeWBIdx].isSplit) { + assert(snd_data_pkt); + + // Ensure there are enough ports to use. + if (usedPorts < cachePorts) { + ++usedPorts; + if (sendStore(snd_data_pkt)) { + storePostSend(snd_data_pkt); + } else { + DPRINTF(IEW, "D-Cache became blocked when writing" + " [sn:%lli] second packet, will retry later\n", + inst->seqNum); + } + } else { + + // Store the packet for when there are free ports. + assert(pendingPkt == NULL); + pendingPkt = snd_data_pkt; + hasPendingPkt = true; + } + } else { + + // Not a split store. + storePostSend(data_pkt); + } } } @@ -808,6 +904,13 @@ LSQUnit::squash(const InstSeqNum &squashed_num) // memory. This is quite ugly. @todo: Figure out the proper // place to really handle request deletes.
delete storeQueue[store_idx].req; + if (TheISA::HasUnalignedMemAcc && storeQueue[store_idx].isSplit) { + delete storeQueue[store_idx].sreqLow; + delete storeQueue[store_idx].sreqHigh; + + storeQueue[store_idx].sreqLow = NULL; + storeQueue[store_idx].sreqHigh = NULL; + } storeQueue[store_idx].req = NULL; --stores; @@ -926,6 +1029,22 @@ LSQUnit::completeStore(int store_idx) #endif } +template +bool +LSQUnit::sendStore(PacketPtr data_pkt) +{ + if (!dcachePort->sendTiming(data_pkt)) { + // Need to handle becoming blocked on a store. + isStoreBlocked = true; + ++lsqCacheBlocked; + assert(retryPkt == NULL); + retryPkt = data_pkt; + lsq->setRetryTid(lsqID); + return false; + } + return true; +} + template void LSQUnit::recvRetry() @@ -935,10 +1054,24 @@ LSQUnit::recvRetry() assert(retryPkt != NULL); if (dcachePort->sendTiming(retryPkt)) { - storePostSend(retryPkt); + LSQSenderState *state = + dynamic_cast(retryPkt->senderState); + + // Don't finish the store unless this is the last packet. + if (!TheISA::HasUnalignedMemAcc || !state->pktToSend) { + storePostSend(retryPkt); + } retryPkt = NULL; isStoreBlocked = false; lsq->setRetryTid(InvalidThreadID); + + // Send any outstanding packet. + if (TheISA::HasUnalignedMemAcc && state->pktToSend) { + assert(state->pendingPacket); + if (sendStore(state->pendingPacket)) { + storePostSend(state->pendingPacket); + } + } } else { // Still blocked! ++lsqCacheBlocked;