2006-04-23 00:26:48 +02:00
|
|
|
/*
|
2012-01-31 16:46:03 +01:00
|
|
|
* Copyright (c) 2010-2011 ARM Limited
|
2010-08-23 18:18:40 +02:00
|
|
|
* All rights reserved
|
|
|
|
*
|
|
|
|
* The license below extends only to copyright in the software and shall
|
|
|
|
* not be construed as granting a license to any other intellectual
|
|
|
|
* property including but not limited to intellectual property relating
|
|
|
|
* to a hardware implementation of the functionality of the software
|
|
|
|
* licensed hereunder. You may use the software subject to the license
|
|
|
|
* terms below provided that you ensure that this notice is replicated
|
|
|
|
* unmodified and in its entirety in all distributions of the software,
|
|
|
|
* modified or unmodified, in source code or in binary form.
|
|
|
|
*
|
2006-04-23 00:26:48 +02:00
|
|
|
* Copyright (c) 2004-2005 The Regents of The University of Michigan
|
|
|
|
* All rights reserved.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions are
|
|
|
|
* met: redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer;
|
|
|
|
* redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution;
|
|
|
|
* neither the name of the copyright holders nor the names of its
|
|
|
|
* contributors may be used to endorse or promote products derived from
|
|
|
|
* this software without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
|
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
|
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
|
|
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
|
|
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
|
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
|
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
2006-06-07 22:02:55 +02:00
|
|
|
*
|
|
|
|
* Authors: Kevin Lim
|
|
|
|
* Korey Sewell
|
2006-04-23 00:26:48 +02:00
|
|
|
*/
|
|
|
|
|
2011-09-27 09:24:43 +02:00
|
|
|
#include "arch/generic/debugfaults.hh"
|
2006-10-23 20:00:07 +02:00
|
|
|
#include "arch/locked_mem.hh"
|
2011-04-15 19:44:06 +02:00
|
|
|
#include "base/str.hh"
|
2009-09-23 17:34:21 +02:00
|
|
|
#include "config/the_isa.hh"
|
2012-03-09 15:59:27 +01:00
|
|
|
#include "cpu/checker/cpu.hh"
|
2006-07-13 19:12:51 +02:00
|
|
|
#include "cpu/o3/lsq.hh"
|
2006-04-23 00:26:48 +02:00
|
|
|
#include "cpu/o3/lsq_unit.hh"
|
2011-04-15 19:44:32 +02:00
|
|
|
#include "debug/Activity.hh"
|
|
|
|
#include "debug/IEW.hh"
|
|
|
|
#include "debug/LSQUnit.hh"
|
2006-06-13 17:38:16 +02:00
|
|
|
#include "mem/packet.hh"
|
2006-06-03 00:15:20 +02:00
|
|
|
#include "mem/request.hh"
|
2006-04-23 00:26:48 +02:00
|
|
|
|
2006-06-03 00:15:20 +02:00
|
|
|
template<class Impl>
|
2006-06-06 00:14:39 +02:00
|
|
|
LSQUnit<Impl>::WritebackEvent::WritebackEvent(DynInstPtr &_inst, PacketPtr _pkt,
|
|
|
|
LSQUnit *lsq_ptr)
|
2011-09-23 03:59:55 +02:00
|
|
|
: Event(Default_Pri, AutoDelete),
|
|
|
|
inst(_inst), pkt(_pkt), lsqPtr(lsq_ptr)
|
2006-06-03 00:15:20 +02:00
|
|
|
{
|
2006-06-06 00:14:39 +02:00
|
|
|
}
|
2006-06-03 00:15:20 +02:00
|
|
|
|
2006-06-06 00:14:39 +02:00
|
|
|
template<class Impl>
|
|
|
|
void
|
|
|
|
LSQUnit<Impl>::WritebackEvent::process()
|
|
|
|
{
|
|
|
|
if (!lsqPtr->isSwitchedOut()) {
|
|
|
|
lsqPtr->writeback(inst, pkt);
|
2006-06-03 00:15:20 +02:00
|
|
|
}
|
2007-04-03 20:25:24 +02:00
|
|
|
|
|
|
|
if (pkt->senderState)
|
|
|
|
delete pkt->senderState;
|
|
|
|
|
|
|
|
delete pkt->req;
|
2006-06-06 00:14:39 +02:00
|
|
|
delete pkt;
|
|
|
|
}
|
2006-06-03 00:15:20 +02:00
|
|
|
|
2006-06-06 00:14:39 +02:00
|
|
|
template<class Impl>
|
|
|
|
const char *
|
2008-02-06 22:32:40 +01:00
|
|
|
LSQUnit<Impl>::WritebackEvent::description() const
|
2006-06-06 00:14:39 +02:00
|
|
|
{
|
2007-07-01 02:45:58 +02:00
|
|
|
return "Store writeback";
|
2006-04-23 00:26:48 +02:00
|
|
|
}
|
|
|
|
|
2006-06-03 00:15:20 +02:00
|
|
|
template<class Impl>
|
2006-04-23 00:26:48 +02:00
|
|
|
void
|
2006-06-06 00:14:39 +02:00
|
|
|
LSQUnit<Impl>::completeDataAccess(PacketPtr pkt)
|
2006-04-23 00:26:48 +02:00
|
|
|
{
|
2006-06-06 00:14:39 +02:00
|
|
|
LSQSenderState *state = dynamic_cast<LSQSenderState *>(pkt->senderState);
|
|
|
|
DynInstPtr inst = state->inst;
|
2011-01-18 23:30:05 +01:00
|
|
|
DPRINTF(IEW, "Writeback event [sn:%lli].\n", inst->seqNum);
|
|
|
|
DPRINTF(Activity, "Activity: Writeback event [sn:%lli].\n", inst->seqNum);
|
2006-04-23 00:26:48 +02:00
|
|
|
|
2006-06-06 00:14:39 +02:00
|
|
|
//iewStage->ldstQueue.removeMSHR(inst->threadNumber,inst->seqNum);
|
2006-05-25 20:41:36 +02:00
|
|
|
|
2010-02-12 20:53:20 +01:00
|
|
|
// If this is a split access, wait until all packets are received.
|
|
|
|
if (TheISA::HasUnalignedMemAcc && !state->complete()) {
|
|
|
|
delete pkt->req;
|
|
|
|
delete pkt;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2006-06-06 00:14:39 +02:00
|
|
|
if (isSwitchedOut() || inst->isSquashed()) {
|
2006-07-05 21:51:36 +02:00
|
|
|
iewStage->decrWb(inst->seqNum);
|
2006-06-06 00:14:39 +02:00
|
|
|
} else {
|
|
|
|
if (!state->noWB) {
|
2010-02-12 20:53:20 +01:00
|
|
|
if (!TheISA::HasUnalignedMemAcc || !state->isSplit ||
|
|
|
|
!state->isLoad) {
|
|
|
|
writeback(inst, pkt);
|
|
|
|
} else {
|
|
|
|
writeback(inst, state->mainPkt);
|
|
|
|
}
|
2006-06-06 00:14:39 +02:00
|
|
|
}
|
2006-05-04 17:36:20 +02:00
|
|
|
|
2006-06-06 00:14:39 +02:00
|
|
|
if (inst->isStore()) {
|
|
|
|
completeStore(state->idx);
|
|
|
|
}
|
|
|
|
}
|
2006-06-03 00:15:20 +02:00
|
|
|
|
2010-02-12 20:53:20 +01:00
|
|
|
if (TheISA::HasUnalignedMemAcc && state->isSplit && state->isLoad) {
|
|
|
|
delete state->mainPkt->req;
|
|
|
|
delete state->mainPkt;
|
|
|
|
}
|
2006-06-06 00:14:39 +02:00
|
|
|
delete state;
|
2007-03-23 16:33:08 +01:00
|
|
|
delete pkt->req;
|
2006-06-06 00:14:39 +02:00
|
|
|
delete pkt;
|
2006-06-03 00:15:20 +02:00
|
|
|
}
|
|
|
|
|
2006-04-23 00:26:48 +02:00
|
|
|
template <class Impl>
|
|
|
|
LSQUnit<Impl>::LSQUnit()
|
2011-09-13 18:58:08 +02:00
|
|
|
: loads(0), stores(0), storesToWB(0), cacheBlockMask(0), stalled(false),
|
2006-06-06 00:14:39 +02:00
|
|
|
isStoreBlocked(false), isLoadBlocked(false),
|
2012-01-29 02:09:04 +01:00
|
|
|
loadBlockedHandled(false), storeInFlight(false), hasPendingPkt(false)
|
2006-04-23 00:26:48 +02:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
template<class Impl>
|
|
|
|
void
|
2008-08-11 21:22:16 +02:00
|
|
|
LSQUnit<Impl>::init(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params,
|
|
|
|
LSQ *lsq_ptr, unsigned maxLQEntries, unsigned maxSQEntries,
|
|
|
|
unsigned id)
|
2006-04-23 00:26:48 +02:00
|
|
|
{
|
2007-04-04 21:38:59 +02:00
|
|
|
cpu = cpu_ptr;
|
|
|
|
iewStage = iew_ptr;
|
|
|
|
|
|
|
|
DPRINTF(LSQUnit, "Creating LSQUnit%i object.\n",id);
|
2006-04-23 00:26:48 +02:00
|
|
|
|
2006-05-04 17:36:20 +02:00
|
|
|
switchedOut = false;
|
|
|
|
|
2011-09-13 18:58:08 +02:00
|
|
|
cacheBlockMask = 0;
|
|
|
|
|
2006-07-13 19:12:51 +02:00
|
|
|
lsq = lsq_ptr;
|
|
|
|
|
2006-04-23 00:26:48 +02:00
|
|
|
lsqID = id;
|
|
|
|
|
2006-05-19 21:53:17 +02:00
|
|
|
// Add 1 for the sentinel entry (they are circular queues).
|
|
|
|
LQEntries = maxLQEntries + 1;
|
|
|
|
SQEntries = maxSQEntries + 1;
|
2006-04-23 00:26:48 +02:00
|
|
|
|
|
|
|
loadQueue.resize(LQEntries);
|
|
|
|
storeQueue.resize(SQEntries);
|
|
|
|
|
2011-04-04 18:42:23 +02:00
|
|
|
depCheckShift = params->LSQDepCheckShift;
|
|
|
|
checkLoads = params->LSQCheckLoads;
|
|
|
|
|
2006-04-23 00:26:48 +02:00
|
|
|
loadHead = loadTail = 0;
|
|
|
|
|
|
|
|
storeHead = storeWBIdx = storeTail = 0;
|
|
|
|
|
|
|
|
usedPorts = 0;
|
|
|
|
cachePorts = params->cachePorts;
|
|
|
|
|
2006-11-06 02:29:38 +01:00
|
|
|
retryPkt = NULL;
|
2006-05-19 21:53:17 +02:00
|
|
|
memDepViolator = NULL;
|
2006-04-23 00:26:48 +02:00
|
|
|
|
|
|
|
blockedLoadSeqNum = 0;
|
2012-01-29 02:09:04 +01:00
|
|
|
needsTSO = params->needsTSO;
|
2006-04-23 00:26:48 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
template<class Impl>
|
|
|
|
std::string
|
|
|
|
LSQUnit<Impl>::name() const
|
|
|
|
{
|
|
|
|
if (Impl::MaxThreads == 1) {
|
|
|
|
return iewStage->name() + ".lsq";
|
|
|
|
} else {
|
2011-04-21 04:07:45 +02:00
|
|
|
return iewStage->name() + ".lsq.thread" + to_string(lsqID);
|
2006-04-23 00:26:48 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-06-14 04:35:05 +02:00
|
|
|
template<class Impl>
|
|
|
|
void
|
|
|
|
LSQUnit<Impl>::regStats()
|
|
|
|
{
|
|
|
|
lsqForwLoads
|
|
|
|
.name(name() + ".forwLoads")
|
|
|
|
.desc("Number of loads that had data forwarded from stores");
|
|
|
|
|
|
|
|
invAddrLoads
|
|
|
|
.name(name() + ".invAddrLoads")
|
|
|
|
.desc("Number of loads ignored due to an invalid address");
|
|
|
|
|
|
|
|
lsqSquashedLoads
|
|
|
|
.name(name() + ".squashedLoads")
|
|
|
|
.desc("Number of loads squashed");
|
|
|
|
|
|
|
|
lsqIgnoredResponses
|
|
|
|
.name(name() + ".ignoredResponses")
|
|
|
|
.desc("Number of memory responses ignored because the instruction is squashed");
|
|
|
|
|
2006-08-24 23:29:34 +02:00
|
|
|
lsqMemOrderViolation
|
|
|
|
.name(name() + ".memOrderViolation")
|
|
|
|
.desc("Number of memory ordering violations");
|
|
|
|
|
2006-06-14 04:35:05 +02:00
|
|
|
lsqSquashedStores
|
|
|
|
.name(name() + ".squashedStores")
|
|
|
|
.desc("Number of stores squashed");
|
|
|
|
|
|
|
|
invAddrSwpfs
|
|
|
|
.name(name() + ".invAddrSwpfs")
|
|
|
|
.desc("Number of software prefetches ignored due to an invalid address");
|
|
|
|
|
|
|
|
lsqBlockedLoads
|
|
|
|
.name(name() + ".blockedLoads")
|
|
|
|
.desc("Number of blocked loads due to partial load-store forwarding");
|
|
|
|
|
|
|
|
lsqRescheduledLoads
|
|
|
|
.name(name() + ".rescheduledLoads")
|
|
|
|
.desc("Number of loads that were rescheduled");
|
|
|
|
|
|
|
|
lsqCacheBlocked
|
|
|
|
.name(name() + ".cacheBlocked")
|
|
|
|
.desc("Number of times an access to memory failed due to the cache being blocked");
|
|
|
|
}
|
|
|
|
|
2007-04-04 21:38:59 +02:00
|
|
|
template<class Impl>
|
|
|
|
void
|
MEM: Introduce the master/slave port sub-classes in C++
This patch introduces the notion of a master and slave port in the C++
code, thus bringing the previous classification from the Python
classes into the corresponding simulation objects and memory objects.
The patch enables us to classify behaviours into the two bins and add
assumptions and enfore compliance, also simplifying the two
interfaces. As a starting point, isSnooping is confined to a master
port, and getAddrRanges to slave ports. More of these specilisations
are to come in later patches.
The getPort function is not getMasterPort and getSlavePort, and
returns a port reference rather than a pointer as NULL would never be
a valid return value. The default implementation of these two
functions is placed in MemObject, and calls fatal.
The one drawback with this specific patch is that it requires some
code duplication, e.g. QueuedPort becomes QueuedMasterPort and
QueuedSlavePort, and BusPort becomes BusMasterPort and BusSlavePort
(avoiding multiple inheritance). With the later introduction of the
port interfaces, moving the functionality outside the port itself, a
lot of the duplicated code will disappear again.
2012-03-30 15:40:11 +02:00
|
|
|
LSQUnit<Impl>::setDcachePort(MasterPort *dcache_port)
|
2007-04-04 21:38:59 +02:00
|
|
|
{
|
|
|
|
dcachePort = dcache_port;
|
|
|
|
}
|
|
|
|
|
2006-04-23 00:26:48 +02:00
|
|
|
template<class Impl>
|
|
|
|
void
|
|
|
|
LSQUnit<Impl>::clearLQ()
|
|
|
|
{
|
|
|
|
loadQueue.clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
template<class Impl>
|
|
|
|
void
|
|
|
|
LSQUnit<Impl>::clearSQ()
|
|
|
|
{
|
|
|
|
storeQueue.clear();
|
|
|
|
}
|
|
|
|
|
2006-05-04 17:36:20 +02:00
|
|
|
template<class Impl>
|
|
|
|
void
|
|
|
|
LSQUnit<Impl>::switchOut()
|
|
|
|
{
|
|
|
|
switchedOut = true;
|
2006-09-28 06:09:27 +02:00
|
|
|
for (int i = 0; i < loadQueue.size(); ++i) {
|
|
|
|
assert(!loadQueue[i]);
|
2006-05-04 17:36:20 +02:00
|
|
|
loadQueue[i] = NULL;
|
2006-09-28 06:09:27 +02:00
|
|
|
}
|
2006-05-04 17:36:20 +02:00
|
|
|
|
2006-05-19 21:53:17 +02:00
|
|
|
assert(storesToWB == 0);
|
2006-05-04 17:36:20 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
template<class Impl>
|
|
|
|
void
|
|
|
|
LSQUnit<Impl>::takeOverFrom()
|
|
|
|
{
|
|
|
|
switchedOut = false;
|
|
|
|
loads = stores = storesToWB = 0;
|
|
|
|
|
|
|
|
loadHead = loadTail = 0;
|
|
|
|
|
|
|
|
storeHead = storeWBIdx = storeTail = 0;
|
|
|
|
|
|
|
|
usedPorts = 0;
|
|
|
|
|
2006-05-19 21:53:17 +02:00
|
|
|
memDepViolator = NULL;
|
2006-05-04 17:36:20 +02:00
|
|
|
|
|
|
|
blockedLoadSeqNum = 0;
|
|
|
|
|
|
|
|
stalled = false;
|
|
|
|
isLoadBlocked = false;
|
|
|
|
loadBlockedHandled = false;
|
2011-09-13 18:58:08 +02:00
|
|
|
|
|
|
|
// Just incase the memory system changed out from under us
|
|
|
|
cacheBlockMask = 0;
|
2006-05-04 17:36:20 +02:00
|
|
|
}
|
|
|
|
|
2006-04-23 00:26:48 +02:00
|
|
|
template<class Impl>
|
|
|
|
void
|
|
|
|
LSQUnit<Impl>::resizeLQ(unsigned size)
|
|
|
|
{
|
2006-05-19 21:53:17 +02:00
|
|
|
unsigned size_plus_sentinel = size + 1;
|
|
|
|
assert(size_plus_sentinel >= LQEntries);
|
2006-04-23 00:26:48 +02:00
|
|
|
|
2006-05-19 21:53:17 +02:00
|
|
|
if (size_plus_sentinel > LQEntries) {
|
|
|
|
while (size_plus_sentinel > loadQueue.size()) {
|
2006-04-23 00:26:48 +02:00
|
|
|
DynInstPtr dummy;
|
|
|
|
loadQueue.push_back(dummy);
|
|
|
|
LQEntries++;
|
|
|
|
}
|
|
|
|
} else {
|
2006-05-19 21:53:17 +02:00
|
|
|
LQEntries = size_plus_sentinel;
|
2006-04-23 00:26:48 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
template<class Impl>
|
|
|
|
void
|
|
|
|
LSQUnit<Impl>::resizeSQ(unsigned size)
|
|
|
|
{
|
2006-05-19 21:53:17 +02:00
|
|
|
unsigned size_plus_sentinel = size + 1;
|
|
|
|
if (size_plus_sentinel > SQEntries) {
|
|
|
|
while (size_plus_sentinel > storeQueue.size()) {
|
2006-04-23 00:26:48 +02:00
|
|
|
SQEntry dummy;
|
|
|
|
storeQueue.push_back(dummy);
|
|
|
|
SQEntries++;
|
|
|
|
}
|
|
|
|
} else {
|
2006-05-19 21:53:17 +02:00
|
|
|
SQEntries = size_plus_sentinel;
|
2006-04-23 00:26:48 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
template <class Impl>
|
|
|
|
void
|
|
|
|
LSQUnit<Impl>::insert(DynInstPtr &inst)
|
|
|
|
{
|
|
|
|
assert(inst->isMemRef());
|
|
|
|
|
|
|
|
assert(inst->isLoad() || inst->isStore());
|
|
|
|
|
|
|
|
if (inst->isLoad()) {
|
|
|
|
insertLoad(inst);
|
|
|
|
} else {
|
|
|
|
insertStore(inst);
|
|
|
|
}
|
|
|
|
|
|
|
|
inst->setInLSQ();
|
|
|
|
}
|
|
|
|
|
|
|
|
template <class Impl>
|
|
|
|
void
|
|
|
|
LSQUnit<Impl>::insertLoad(DynInstPtr &load_inst)
|
|
|
|
{
|
2006-05-19 21:53:17 +02:00
|
|
|
assert((loadTail + 1) % LQEntries != loadHead);
|
|
|
|
assert(loads < LQEntries);
|
2006-04-23 00:26:48 +02:00
|
|
|
|
ISA,CPU,etc: Create an ISA defined PC type that abstracts out ISA behaviors.
This change is a low level and pervasive reorganization of how PCs are managed
in M5. Back when Alpha was the only ISA, there were only 2 PCs to worry about,
the PC and the NPC, and the lsb of the PC signaled whether or not you were in
PAL mode. As other ISAs were added, we had to add an NNPC, micro PC and next
micropc, x86 and ARM introduced variable length instruction sets, and ARM
started to keep track of mode bits in the PC. Each CPU model handled PCs in
its own custom way that needed to be updated individually to handle the new
dimensions of variability, or, in the case of ARMs mode-bit-in-the-pc hack,
the complexity could be hidden in the ISA at the ISA implementation's expense.
Areas like the branch predictor hadn't been updated to handle branch delay
slots or micropcs, and it turns out that had introduced a significant (10s of
percent) performance bug in SPARC and to a lesser extend MIPS. Rather than
perpetuate the problem by reworking O3 again to handle the PC features needed
by x86, this change was introduced to rework PC handling in a more modular,
transparent, and hopefully efficient way.
PC type:
Rather than having the superset of all possible elements of PC state declared
in each of the CPU models, each ISA defines its own PCState type which has
exactly the elements it needs. A cross product of canned PCState classes are
defined in the new "generic" ISA directory for ISAs with/without delay slots
and microcode. These are either typedef-ed or subclassed by each ISA. To read
or write this structure through a *Context, you use the new pcState() accessor
which reads or writes depending on whether it has an argument. If you just
want the address of the current or next instruction or the current micro PC,
you can get those through read-only accessors on either the PCState type or
the *Contexts. These are instAddr(), nextInstAddr(), and microPC(). Note the
move away from readPC. That name is ambiguous since it's not clear whether or
not it should be the actual address to fetch from, or if it should have extra
bits in it like the PAL mode bit. Each class is free to define its own
functions to get at whatever values it needs however it needs to to be used in
ISA specific code. Eventually Alpha's PAL mode bit could be moved out of the
PC and into a separate field like ARM.
These types can be reset to a particular pc (where npc = pc +
sizeof(MachInst), nnpc = npc + sizeof(MachInst), upc = 0, nupc = 1 as
appropriate), printed, serialized, and compared. There is a branching()
function which encapsulates code in the CPU models that checked if an
instruction branched or not. Exactly what that means in the context of branch
delay slots which can skip an instruction when not taken is ambiguous, and
ideally this function and its uses can be eliminated. PCStates also generally
know how to advance themselves in various ways depending on if they point at
an instruction, a microop, or the last microop of a macroop. More on that
later.
Ideally, accessing all the PCs at once when setting them will improve
performance of M5 even though more data needs to be moved around. This is
because often all the PCs need to be manipulated together, and by getting them
all at once you avoid multiple function calls. Also, the PCs of a particular
thread will have spatial locality in the cache. Previously they were grouped
by element in arrays which spread out accesses.
Advancing the PC:
The PCs were previously managed entirely by the CPU which had to know about PC
semantics, try to figure out which dimension to increment the PC in, what to
set NPC/NNPC, etc. These decisions are best left to the ISA in conjunction
with the PC type itself. Because most of the information about how to
increment the PC (mainly what type of instruction it refers to) is contained
in the instruction object, a new advancePC virtual function was added to the
StaticInst class. Subclasses provide an implementation that moves around the
right element of the PC with a minimal amount of decision making. In ISAs like
Alpha, the instructions always simply assign NPC to PC without having to worry
about micropcs, nnpcs, etc. The added cost of a virtual function call should
be outweighed by not having to figure out as much about what to do with the
PCs and mucking around with the extra elements.
One drawback of making the StaticInsts advance the PC is that you have to
actually have one to advance the PC. This would, superficially, seem to
require decoding an instruction before fetch could advance. This is, as far as
I can tell, realistic. fetch would advance through memory addresses, not PCs,
perhaps predicting new memory addresses using existing ones. More
sophisticated decisions about control flow would be made later on, after the
instruction was decoded, and handed back to fetch. If branching needs to
happen, some amount of decoding needs to happen to see that it's a branch,
what the target is, etc. This could get a little more complicated if that gets
done by the predecoder, but I'm choosing to ignore that for now.
Variable length instructions:
To handle variable length instructions in x86 and ARM, the predecoder now
takes in the current PC by reference to the getExtMachInst function. It can
modify the PC however it needs to (by setting NPC to be the PC + instruction
length, for instance). This could be improved since the CPU doesn't know if
the PC was modified and always has to write it back.
ISA parser:
To support the new API, all PC related operand types were removed from the
parser and replaced with a PCState type. There are two warts on this
implementation. First, as with all the other operand types, the PCState still
has to have a valid operand type even though it doesn't use it. Second, using
syntax like PCS.npc(target) doesn't work for two reasons, this looks like the
syntax for operand type overriding, and the parser can't figure out if you're
reading or writing. Instructions that use the PCS operand (which I've
consistently called it) need to first read it into a local variable,
manipulate it, and then write it back out.
Return address stack:
The return address stack needed a little extra help because, in the presence
of branch delay slots, it has to merge together elements of the return PC and
the call PC. To handle that, a buildRetPC utility function was added. There
are basically only two versions in all the ISAs, but it didn't seem short
enough to put into the generic ISA directory. Also, the branch predictor code
in O3 and InOrder were adjusted so that they always store the PC of the actual
call instruction in the RAS, not the next PC. If the call instruction is a
microop, the next PC refers to the next microop in the same macroop which is
probably not desirable. The buildRetPC function advances the PC intelligently
to the next macroop (in an ISA specific way) so that that case works.
Change in stats:
There were no change in stats except in MIPS and SPARC in the O3 model. MIPS
runs in about 9% fewer ticks. SPARC runs with 30%-50% fewer ticks, which could
likely be improved further by setting call/return instruction flags and taking
advantage of the RAS.
TODO:
Add != operators to the PCState classes, defined trivially to be !(a==b).
Smooth out places where PCs are split apart, passed around, and put back
together later. I think this might happen in SPARC's fault code. Add ISA
specific constructors that allow setting PC elements without calling a bunch
of accessors. Try to eliminate the need for the branching() function. Factor
out Alpha's PAL mode pc bit into a separate flag field, and eliminate places
where it's blindly masked out or tested in the PC.
2010-10-31 08:07:20 +01:00
|
|
|
DPRINTF(LSQUnit, "Inserting load PC %s, idx:%i [sn:%lli]\n",
|
|
|
|
load_inst->pcState(), loadTail, load_inst->seqNum);
|
2006-04-23 00:26:48 +02:00
|
|
|
|
|
|
|
load_inst->lqIdx = loadTail;
|
|
|
|
|
|
|
|
if (stores == 0) {
|
|
|
|
load_inst->sqIdx = -1;
|
|
|
|
} else {
|
|
|
|
load_inst->sqIdx = storeTail;
|
|
|
|
}
|
|
|
|
|
|
|
|
loadQueue[loadTail] = load_inst;
|
|
|
|
|
|
|
|
incrLdIdx(loadTail);
|
|
|
|
|
|
|
|
++loads;
|
|
|
|
}
|
|
|
|
|
|
|
|
template <class Impl>
|
|
|
|
void
|
|
|
|
LSQUnit<Impl>::insertStore(DynInstPtr &store_inst)
|
|
|
|
{
|
|
|
|
// Make sure it is not full before inserting an instruction.
|
|
|
|
assert((storeTail + 1) % SQEntries != storeHead);
|
|
|
|
assert(stores < SQEntries);
|
|
|
|
|
ISA,CPU,etc: Create an ISA defined PC type that abstracts out ISA behaviors.
This change is a low level and pervasive reorganization of how PCs are managed
in M5. Back when Alpha was the only ISA, there were only 2 PCs to worry about,
the PC and the NPC, and the lsb of the PC signaled whether or not you were in
PAL mode. As other ISAs were added, we had to add an NNPC, micro PC and next
micropc, x86 and ARM introduced variable length instruction sets, and ARM
started to keep track of mode bits in the PC. Each CPU model handled PCs in
its own custom way that needed to be updated individually to handle the new
dimensions of variability, or, in the case of ARMs mode-bit-in-the-pc hack,
the complexity could be hidden in the ISA at the ISA implementation's expense.
Areas like the branch predictor hadn't been updated to handle branch delay
slots or micropcs, and it turns out that had introduced a significant (10s of
percent) performance bug in SPARC and to a lesser extend MIPS. Rather than
perpetuate the problem by reworking O3 again to handle the PC features needed
by x86, this change was introduced to rework PC handling in a more modular,
transparent, and hopefully efficient way.
PC type:
Rather than having the superset of all possible elements of PC state declared
in each of the CPU models, each ISA defines its own PCState type which has
exactly the elements it needs. A cross product of canned PCState classes are
defined in the new "generic" ISA directory for ISAs with/without delay slots
and microcode. These are either typedef-ed or subclassed by each ISA. To read
or write this structure through a *Context, you use the new pcState() accessor
which reads or writes depending on whether it has an argument. If you just
want the address of the current or next instruction or the current micro PC,
you can get those through read-only accessors on either the PCState type or
the *Contexts. These are instAddr(), nextInstAddr(), and microPC(). Note the
move away from readPC. That name is ambiguous since it's not clear whether or
not it should be the actual address to fetch from, or if it should have extra
bits in it like the PAL mode bit. Each class is free to define its own
functions to get at whatever values it needs however it needs to to be used in
ISA specific code. Eventually Alpha's PAL mode bit could be moved out of the
PC and into a separate field like ARM.
These types can be reset to a particular pc (where npc = pc +
sizeof(MachInst), nnpc = npc + sizeof(MachInst), upc = 0, nupc = 1 as
appropriate), printed, serialized, and compared. There is a branching()
function which encapsulates code in the CPU models that checked if an
instruction branched or not. Exactly what that means in the context of branch
delay slots which can skip an instruction when not taken is ambiguous, and
ideally this function and its uses can be eliminated. PCStates also generally
know how to advance themselves in various ways depending on if they point at
an instruction, a microop, or the last microop of a macroop. More on that
later.
Ideally, accessing all the PCs at once when setting them will improve
performance of M5 even though more data needs to be moved around. This is
because often all the PCs need to be manipulated together, and by getting them
all at once you avoid multiple function calls. Also, the PCs of a particular
thread will have spatial locality in the cache. Previously they were grouped
by element in arrays which spread out accesses.
Advancing the PC:
The PCs were previously managed entirely by the CPU which had to know about PC
semantics, try to figure out which dimension to increment the PC in, what to
set NPC/NNPC, etc. These decisions are best left to the ISA in conjunction
with the PC type itself. Because most of the information about how to
increment the PC (mainly what type of instruction it refers to) is contained
in the instruction object, a new advancePC virtual function was added to the
StaticInst class. Subclasses provide an implementation that moves around the
right element of the PC with a minimal amount of decision making. In ISAs like
Alpha, the instructions always simply assign NPC to PC without having to worry
about micropcs, nnpcs, etc. The added cost of a virtual function call should
be outweighed by not having to figure out as much about what to do with the
PCs and mucking around with the extra elements.
One drawback of making the StaticInsts advance the PC is that you have to
actually have one to advance the PC. This would, superficially, seem to
require decoding an instruction before fetch could advance. This is, as far as
I can tell, realistic. fetch would advance through memory addresses, not PCs,
perhaps predicting new memory addresses using existing ones. More
sophisticated decisions about control flow would be made later on, after the
instruction was decoded, and handed back to fetch. If branching needs to
happen, some amount of decoding needs to happen to see that it's a branch,
what the target is, etc. This could get a little more complicated if that gets
done by the predecoder, but I'm choosing to ignore that for now.
Variable length instructions:
To handle variable length instructions in x86 and ARM, the predecoder now
takes in the current PC by reference to the getExtMachInst function. It can
modify the PC however it needs to (by setting NPC to be the PC + instruction
length, for instance). This could be improved since the CPU doesn't know if
the PC was modified and always has to write it back.
ISA parser:
To support the new API, all PC related operand types were removed from the
parser and replaced with a PCState type. There are two warts on this
implementation. First, as with all the other operand types, the PCState still
has to have a valid operand type even though it doesn't use it. Second, using
syntax like PCS.npc(target) doesn't work for two reasons, this looks like the
syntax for operand type overriding, and the parser can't figure out if you're
reading or writing. Instructions that use the PCS operand (which I've
consistently called it) need to first read it into a local variable,
manipulate it, and then write it back out.
Return address stack:
The return address stack needed a little extra help because, in the presence
of branch delay slots, it has to merge together elements of the return PC and
the call PC. To handle that, a buildRetPC utility function was added. There
are basically only two versions in all the ISAs, but it didn't seem short
enough to put into the generic ISA directory. Also, the branch predictor code
in O3 and InOrder were adjusted so that they always store the PC of the actual
call instruction in the RAS, not the next PC. If the call instruction is a
microop, the next PC refers to the next microop in the same macroop which is
probably not desirable. The buildRetPC function advances the PC intelligently
to the next macroop (in an ISA specific way) so that that case works.
Change in stats:
There were no change in stats except in MIPS and SPARC in the O3 model. MIPS
runs in about 9% fewer ticks. SPARC runs with 30%-50% fewer ticks, which could
likely be improved further by setting call/return instruction flags and taking
advantage of the RAS.
TODO:
Add != operators to the PCState classes, defined trivially to be !(a==b).
Smooth out places where PCs are split apart, passed around, and put back
together later. I think this might happen in SPARC's fault code. Add ISA
specific constructors that allow setting PC elements without calling a bunch
of accessors. Try to eliminate the need for the branching() function. Factor
out Alpha's PAL mode pc bit into a separate flag field, and eliminate places
where it's blindly masked out or tested in the PC.
2010-10-31 08:07:20 +01:00
|
|
|
DPRINTF(LSQUnit, "Inserting store PC %s, idx:%i [sn:%lli]\n",
|
|
|
|
store_inst->pcState(), storeTail, store_inst->seqNum);
|
2006-04-23 00:26:48 +02:00
|
|
|
|
|
|
|
store_inst->sqIdx = storeTail;
|
|
|
|
store_inst->lqIdx = loadTail;
|
|
|
|
|
|
|
|
storeQueue[storeTail] = SQEntry(store_inst);
|
|
|
|
|
|
|
|
incrStIdx(storeTail);
|
|
|
|
|
|
|
|
++stores;
|
|
|
|
}
|
|
|
|
|
|
|
|
template <class Impl>
|
|
|
|
typename Impl::DynInstPtr
|
|
|
|
LSQUnit<Impl>::getMemDepViolator()
|
|
|
|
{
|
|
|
|
DynInstPtr temp = memDepViolator;
|
|
|
|
|
|
|
|
memDepViolator = NULL;
|
|
|
|
|
|
|
|
return temp;
|
|
|
|
}
|
|
|
|
|
|
|
|
template <class Impl>
|
|
|
|
unsigned
|
|
|
|
LSQUnit<Impl>::numFreeEntries()
|
|
|
|
{
|
|
|
|
unsigned free_lq_entries = LQEntries - loads;
|
|
|
|
unsigned free_sq_entries = SQEntries - stores;
|
|
|
|
|
|
|
|
// Both the LQ and SQ entries have an extra dummy entry to differentiate
|
|
|
|
// empty/full conditions. Subtract 1 from the free entries.
|
|
|
|
if (free_lq_entries < free_sq_entries) {
|
|
|
|
return free_lq_entries - 1;
|
|
|
|
} else {
|
|
|
|
return free_sq_entries - 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-09-13 18:58:08 +02:00
|
|
|
template <class Impl>
|
|
|
|
void
|
|
|
|
LSQUnit<Impl>::checkSnoop(PacketPtr pkt)
|
|
|
|
{
|
|
|
|
int load_idx = loadHead;
|
|
|
|
|
|
|
|
if (!cacheBlockMask) {
|
|
|
|
assert(dcachePort);
|
|
|
|
Addr bs = dcachePort->peerBlockSize();
|
|
|
|
|
|
|
|
// Make sure we actually got a size
|
|
|
|
assert(bs != 0);
|
|
|
|
|
|
|
|
cacheBlockMask = ~(bs - 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
// If this is the only load in the LSQ we don't care
|
|
|
|
if (load_idx == loadTail)
|
|
|
|
return;
|
|
|
|
incrLdIdx(load_idx);
|
|
|
|
|
|
|
|
DPRINTF(LSQUnit, "Got snoop for address %#x\n", pkt->getAddr());
|
|
|
|
Addr invalidate_addr = pkt->getAddr() & cacheBlockMask;
|
|
|
|
while (load_idx != loadTail) {
|
|
|
|
DynInstPtr ld_inst = loadQueue[load_idx];
|
|
|
|
|
2012-06-05 07:23:09 +02:00
|
|
|
if (!ld_inst->effAddrValid() || ld_inst->uncacheable()) {
|
2011-09-13 18:58:08 +02:00
|
|
|
incrLdIdx(load_idx);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
Addr load_addr = ld_inst->physEffAddr & cacheBlockMask;
|
|
|
|
DPRINTF(LSQUnit, "-- inst [sn:%lli] load_addr: %#x to pktAddr:%#x\n",
|
|
|
|
ld_inst->seqNum, load_addr, invalidate_addr);
|
|
|
|
|
|
|
|
if (load_addr == invalidate_addr) {
|
2012-06-05 07:23:09 +02:00
|
|
|
if (ld_inst->possibleLoadViolation()) {
|
2011-09-13 18:58:08 +02:00
|
|
|
DPRINTF(LSQUnit, "Conflicting load at addr %#x [sn:%lli]\n",
|
|
|
|
ld_inst->physEffAddr, pkt->getAddr(), ld_inst->seqNum);
|
|
|
|
|
|
|
|
// Mark the load for re-execution
|
|
|
|
ld_inst->fault = new ReExec;
|
|
|
|
} else {
|
|
|
|
// If a older load checks this and it's true
|
|
|
|
// then we might have missed the snoop
|
|
|
|
// in which case we need to invalidate to be sure
|
2012-06-05 07:23:09 +02:00
|
|
|
ld_inst->hitExternalSnoop(true);
|
2011-09-13 18:58:08 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
incrLdIdx(load_idx);
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2011-04-04 18:42:23 +02:00
|
|
|
template <class Impl>
|
|
|
|
Fault
|
|
|
|
LSQUnit<Impl>::checkViolations(int load_idx, DynInstPtr &inst)
|
|
|
|
{
|
|
|
|
Addr inst_eff_addr1 = inst->effAddr >> depCheckShift;
|
|
|
|
Addr inst_eff_addr2 = (inst->effAddr + inst->effSize - 1) >> depCheckShift;
|
|
|
|
|
|
|
|
/** @todo in theory you only need to check an instruction that has executed
|
|
|
|
* however, there isn't a good way in the pipeline at the moment to check
|
|
|
|
* all instructions that will execute before the store writes back. Thus,
|
|
|
|
* like the implementation that came before it, we're overly conservative.
|
|
|
|
*/
|
|
|
|
while (load_idx != loadTail) {
|
|
|
|
DynInstPtr ld_inst = loadQueue[load_idx];
|
2012-06-05 07:23:09 +02:00
|
|
|
if (!ld_inst->effAddrValid() || ld_inst->uncacheable()) {
|
2011-04-04 18:42:23 +02:00
|
|
|
incrLdIdx(load_idx);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
Addr ld_eff_addr1 = ld_inst->effAddr >> depCheckShift;
|
|
|
|
Addr ld_eff_addr2 =
|
|
|
|
(ld_inst->effAddr + ld_inst->effSize - 1) >> depCheckShift;
|
|
|
|
|
2011-05-05 03:38:26 +02:00
|
|
|
if (inst_eff_addr2 >= ld_eff_addr1 && inst_eff_addr1 <= ld_eff_addr2) {
|
2011-09-13 18:58:08 +02:00
|
|
|
if (inst->isLoad()) {
|
|
|
|
// If this load is to the same block as an external snoop
|
|
|
|
// invalidate that we've observed then the load needs to be
|
|
|
|
// squashed as it could have newer data
|
2012-06-05 07:23:09 +02:00
|
|
|
if (ld_inst->hitExternalSnoop()) {
|
2011-09-13 18:58:08 +02:00
|
|
|
if (!memDepViolator ||
|
|
|
|
ld_inst->seqNum < memDepViolator->seqNum) {
|
|
|
|
DPRINTF(LSQUnit, "Detected fault with inst [sn:%lli] "
|
2011-09-27 09:25:26 +02:00
|
|
|
"and [sn:%lli] at address %#x\n",
|
|
|
|
inst->seqNum, ld_inst->seqNum, ld_eff_addr1);
|
2011-09-13 18:58:08 +02:00
|
|
|
memDepViolator = ld_inst;
|
|
|
|
|
|
|
|
++lsqMemOrderViolation;
|
|
|
|
|
2011-09-27 09:24:43 +02:00
|
|
|
return new GenericISA::M5PanicFault(
|
|
|
|
"Detected fault with inst [sn:%lli] and "
|
|
|
|
"[sn:%lli] at address %#x\n",
|
|
|
|
inst->seqNum, ld_inst->seqNum, ld_eff_addr1);
|
2011-09-13 18:58:08 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Otherwise, mark the load has a possible load violation
|
|
|
|
// and if we see a snoop before it's commited, we need to squash
|
2012-06-05 07:23:09 +02:00
|
|
|
ld_inst->possibleLoadViolation(true);
|
2011-09-13 18:58:08 +02:00
|
|
|
DPRINTF(LSQUnit, "Found possible load violaiton at addr: %#x"
|
|
|
|
" between instructions [sn:%lli] and [sn:%lli]\n",
|
|
|
|
inst_eff_addr1, inst->seqNum, ld_inst->seqNum);
|
|
|
|
} else {
|
|
|
|
// A load/store incorrectly passed this store.
|
|
|
|
// Check if we already have a violator, or if it's newer
|
|
|
|
// squash and refetch.
|
|
|
|
if (memDepViolator && ld_inst->seqNum > memDepViolator->seqNum)
|
|
|
|
break;
|
2011-04-04 18:42:23 +02:00
|
|
|
|
2011-09-27 09:25:26 +02:00
|
|
|
DPRINTF(LSQUnit, "Detected fault with inst [sn:%lli] and "
|
|
|
|
"[sn:%lli] at address %#x\n",
|
|
|
|
inst->seqNum, ld_inst->seqNum, ld_eff_addr1);
|
2011-09-13 18:58:08 +02:00
|
|
|
memDepViolator = ld_inst;
|
2011-04-04 18:42:23 +02:00
|
|
|
|
2011-09-13 18:58:08 +02:00
|
|
|
++lsqMemOrderViolation;
|
2011-04-04 18:42:23 +02:00
|
|
|
|
2011-09-27 09:24:43 +02:00
|
|
|
return new GenericISA::M5PanicFault("Detected fault with "
|
|
|
|
"inst [sn:%lli] and [sn:%lli] at address %#x\n",
|
|
|
|
inst->seqNum, ld_inst->seqNum, ld_eff_addr1);
|
2011-09-13 18:58:08 +02:00
|
|
|
}
|
2011-04-04 18:42:23 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
incrLdIdx(load_idx);
|
|
|
|
}
|
|
|
|
return NoFault;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2006-04-23 00:26:48 +02:00
|
|
|
template <class Impl>
|
|
|
|
Fault
|
|
|
|
LSQUnit<Impl>::executeLoad(DynInstPtr &inst)
|
|
|
|
{
|
2007-03-23 16:33:08 +01:00
|
|
|
using namespace TheISA;
|
2006-04-23 00:26:48 +02:00
|
|
|
// Execute a specific load.
|
|
|
|
Fault load_fault = NoFault;
|
|
|
|
|
ISA,CPU,etc: Create an ISA defined PC type that abstracts out ISA behaviors.
This change is a low level and pervasive reorganization of how PCs are managed
in M5. Back when Alpha was the only ISA, there were only 2 PCs to worry about,
the PC and the NPC, and the lsb of the PC signaled whether or not you were in
PAL mode. As other ISAs were added, we had to add an NNPC, micro PC and next
micropc, x86 and ARM introduced variable length instruction sets, and ARM
started to keep track of mode bits in the PC. Each CPU model handled PCs in
its own custom way that needed to be updated individually to handle the new
dimensions of variability, or, in the case of ARMs mode-bit-in-the-pc hack,
the complexity could be hidden in the ISA at the ISA implementation's expense.
Areas like the branch predictor hadn't been updated to handle branch delay
slots or micropcs, and it turns out that had introduced a significant (10s of
percent) performance bug in SPARC and to a lesser extend MIPS. Rather than
perpetuate the problem by reworking O3 again to handle the PC features needed
by x86, this change was introduced to rework PC handling in a more modular,
transparent, and hopefully efficient way.
PC type:
Rather than having the superset of all possible elements of PC state declared
in each of the CPU models, each ISA defines its own PCState type which has
exactly the elements it needs. A cross product of canned PCState classes are
defined in the new "generic" ISA directory for ISAs with/without delay slots
and microcode. These are either typedef-ed or subclassed by each ISA. To read
or write this structure through a *Context, you use the new pcState() accessor
which reads or writes depending on whether it has an argument. If you just
want the address of the current or next instruction or the current micro PC,
you can get those through read-only accessors on either the PCState type or
the *Contexts. These are instAddr(), nextInstAddr(), and microPC(). Note the
move away from readPC. That name is ambiguous since it's not clear whether or
not it should be the actual address to fetch from, or if it should have extra
bits in it like the PAL mode bit. Each class is free to define its own
functions to get at whatever values it needs however it needs to to be used in
ISA specific code. Eventually Alpha's PAL mode bit could be moved out of the
PC and into a separate field like ARM.
These types can be reset to a particular pc (where npc = pc +
sizeof(MachInst), nnpc = npc + sizeof(MachInst), upc = 0, nupc = 1 as
appropriate), printed, serialized, and compared. There is a branching()
function which encapsulates code in the CPU models that checked if an
instruction branched or not. Exactly what that means in the context of branch
delay slots which can skip an instruction when not taken is ambiguous, and
ideally this function and its uses can be eliminated. PCStates also generally
know how to advance themselves in various ways depending on if they point at
an instruction, a microop, or the last microop of a macroop. More on that
later.
Ideally, accessing all the PCs at once when setting them will improve
performance of M5 even though more data needs to be moved around. This is
because often all the PCs need to be manipulated together, and by getting them
all at once you avoid multiple function calls. Also, the PCs of a particular
thread will have spatial locality in the cache. Previously they were grouped
by element in arrays which spread out accesses.
Advancing the PC:
The PCs were previously managed entirely by the CPU which had to know about PC
semantics, try to figure out which dimension to increment the PC in, what to
set NPC/NNPC, etc. These decisions are best left to the ISA in conjunction
with the PC type itself. Because most of the information about how to
increment the PC (mainly what type of instruction it refers to) is contained
in the instruction object, a new advancePC virtual function was added to the
StaticInst class. Subclasses provide an implementation that moves around the
right element of the PC with a minimal amount of decision making. In ISAs like
Alpha, the instructions always simply assign NPC to PC without having to worry
about micropcs, nnpcs, etc. The added cost of a virtual function call should
be outweighed by not having to figure out as much about what to do with the
PCs and mucking around with the extra elements.
One drawback of making the StaticInsts advance the PC is that you have to
actually have one to advance the PC. This would, superficially, seem to
require decoding an instruction before fetch could advance. This is, as far as
I can tell, realistic. fetch would advance through memory addresses, not PCs,
perhaps predicting new memory addresses using existing ones. More
sophisticated decisions about control flow would be made later on, after the
instruction was decoded, and handed back to fetch. If branching needs to
happen, some amount of decoding needs to happen to see that it's a branch,
what the target is, etc. This could get a little more complicated if that gets
done by the predecoder, but I'm choosing to ignore that for now.
Variable length instructions:
To handle variable length instructions in x86 and ARM, the predecoder now
takes in the current PC by reference to the getExtMachInst function. It can
modify the PC however it needs to (by setting NPC to be the PC + instruction
length, for instance). This could be improved since the CPU doesn't know if
the PC was modified and always has to write it back.
ISA parser:
To support the new API, all PC related operand types were removed from the
parser and replaced with a PCState type. There are two warts on this
implementation. First, as with all the other operand types, the PCState still
has to have a valid operand type even though it doesn't use it. Second, using
syntax like PCS.npc(target) doesn't work for two reasons, this looks like the
syntax for operand type overriding, and the parser can't figure out if you're
reading or writing. Instructions that use the PCS operand (which I've
consistently called it) need to first read it into a local variable,
manipulate it, and then write it back out.
Return address stack:
The return address stack needed a little extra help because, in the presence
of branch delay slots, it has to merge together elements of the return PC and
the call PC. To handle that, a buildRetPC utility function was added. There
are basically only two versions in all the ISAs, but it didn't seem short
enough to put into the generic ISA directory. Also, the branch predictor code
in O3 and InOrder were adjusted so that they always store the PC of the actual
call instruction in the RAS, not the next PC. If the call instruction is a
microop, the next PC refers to the next microop in the same macroop which is
probably not desirable. The buildRetPC function advances the PC intelligently
to the next macroop (in an ISA specific way) so that that case works.
Change in stats:
There were no change in stats except in MIPS and SPARC in the O3 model. MIPS
runs in about 9% fewer ticks. SPARC runs with 30%-50% fewer ticks, which could
likely be improved further by setting call/return instruction flags and taking
advantage of the RAS.
TODO:
Add != operators to the PCState classes, defined trivially to be !(a==b).
Smooth out places where PCs are split apart, passed around, and put back
together later. I think this might happen in SPARC's fault code. Add ISA
specific constructors that allow setting PC elements without calling a bunch
of accessors. Try to eliminate the need for the branching() function. Factor
out Alpha's PAL mode pc bit into a separate flag field, and eliminate places
where it's blindly masked out or tested in the PC.
2010-10-31 08:07:20 +01:00
|
|
|
DPRINTF(LSQUnit, "Executing load PC %s, [sn:%lli]\n",
|
2011-02-12 01:29:35 +01:00
|
|
|
inst->pcState(), inst->seqNum);
|
2006-04-23 00:26:48 +02:00
|
|
|
|
2007-03-23 16:33:08 +01:00
|
|
|
assert(!inst->isSquashed());
|
|
|
|
|
2006-06-03 00:15:20 +02:00
|
|
|
load_fault = inst->initiateAcc();
|
2006-04-23 00:26:48 +02:00
|
|
|
|
2011-02-12 01:29:35 +01:00
|
|
|
if (inst->isTranslationDelayed() &&
|
|
|
|
load_fault == NoFault)
|
|
|
|
return load_fault;
|
|
|
|
|
2010-08-23 18:18:40 +02:00
|
|
|
// If the instruction faulted or predicated false, then we need to send it
|
|
|
|
// along to commit without the instruction completing.
|
|
|
|
if (load_fault != NoFault || inst->readPredicate() == false) {
|
2006-05-19 21:53:17 +02:00
|
|
|
// Send this instruction to commit, also make sure iew stage
|
|
|
|
// realizes there is activity.
|
2006-09-28 06:09:27 +02:00
|
|
|
// Mark it as executed unless it is an uncached load that
|
|
|
|
// needs to hit the head of commit.
|
2011-01-18 23:30:02 +01:00
|
|
|
if (inst->readPredicate() == false)
|
|
|
|
inst->forwardOldRegs();
|
2010-08-23 18:18:41 +02:00
|
|
|
DPRINTF(LSQUnit, "Load [sn:%lli] not executed from %s\n",
|
|
|
|
inst->seqNum,
|
|
|
|
(load_fault != NoFault ? "fault" : "predication"));
|
2007-03-23 16:33:08 +01:00
|
|
|
if (!(inst->hasRequest() && inst->uncacheable()) ||
|
2006-12-12 05:51:21 +01:00
|
|
|
inst->isAtCommit()) {
|
2006-09-28 06:09:27 +02:00
|
|
|
inst->setExecuted();
|
|
|
|
}
|
2006-04-23 00:26:48 +02:00
|
|
|
iewStage->instToCommit(inst);
|
|
|
|
iewStage->activityThisCycle();
|
2007-03-23 16:33:08 +01:00
|
|
|
} else if (!loadBlocked()) {
|
2012-06-05 07:23:09 +02:00
|
|
|
assert(inst->effAddrValid());
|
2007-03-23 16:33:08 +01:00
|
|
|
int load_idx = inst->lqIdx;
|
|
|
|
incrLdIdx(load_idx);
|
|
|
|
|
2011-04-04 18:42:23 +02:00
|
|
|
if (checkLoads)
|
|
|
|
return checkViolations(load_idx, inst);
|
2006-04-23 00:26:48 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
return load_fault;
|
|
|
|
}
|
|
|
|
|
|
|
|
template <class Impl>
|
|
|
|
Fault
|
|
|
|
LSQUnit<Impl>::executeStore(DynInstPtr &store_inst)
|
|
|
|
{
|
|
|
|
using namespace TheISA;
|
|
|
|
// Make sure that a store exists.
|
|
|
|
assert(stores != 0);
|
|
|
|
|
|
|
|
int store_idx = store_inst->sqIdx;
|
|
|
|
|
ISA,CPU,etc: Create an ISA defined PC type that abstracts out ISA behaviors.
This change is a low level and pervasive reorganization of how PCs are managed
in M5. Back when Alpha was the only ISA, there were only 2 PCs to worry about,
the PC and the NPC, and the lsb of the PC signaled whether or not you were in
PAL mode. As other ISAs were added, we had to add an NNPC, micro PC and next
micropc, x86 and ARM introduced variable length instruction sets, and ARM
started to keep track of mode bits in the PC. Each CPU model handled PCs in
its own custom way that needed to be updated individually to handle the new
dimensions of variability, or, in the case of ARMs mode-bit-in-the-pc hack,
the complexity could be hidden in the ISA at the ISA implementation's expense.
Areas like the branch predictor hadn't been updated to handle branch delay
slots or micropcs, and it turns out that had introduced a significant (10s of
percent) performance bug in SPARC and to a lesser extend MIPS. Rather than
perpetuate the problem by reworking O3 again to handle the PC features needed
by x86, this change was introduced to rework PC handling in a more modular,
transparent, and hopefully efficient way.
PC type:
Rather than having the superset of all possible elements of PC state declared
in each of the CPU models, each ISA defines its own PCState type which has
exactly the elements it needs. A cross product of canned PCState classes are
defined in the new "generic" ISA directory for ISAs with/without delay slots
and microcode. These are either typedef-ed or subclassed by each ISA. To read
or write this structure through a *Context, you use the new pcState() accessor
which reads or writes depending on whether it has an argument. If you just
want the address of the current or next instruction or the current micro PC,
you can get those through read-only accessors on either the PCState type or
the *Contexts. These are instAddr(), nextInstAddr(), and microPC(). Note the
move away from readPC. That name is ambiguous since it's not clear whether or
not it should be the actual address to fetch from, or if it should have extra
bits in it like the PAL mode bit. Each class is free to define its own
functions to get at whatever values it needs however it needs to to be used in
ISA specific code. Eventually Alpha's PAL mode bit could be moved out of the
PC and into a separate field like ARM.
These types can be reset to a particular pc (where npc = pc +
sizeof(MachInst), nnpc = npc + sizeof(MachInst), upc = 0, nupc = 1 as
appropriate), printed, serialized, and compared. There is a branching()
function which encapsulates code in the CPU models that checked if an
instruction branched or not. Exactly what that means in the context of branch
delay slots which can skip an instruction when not taken is ambiguous, and
ideally this function and its uses can be eliminated. PCStates also generally
know how to advance themselves in various ways depending on if they point at
an instruction, a microop, or the last microop of a macroop. More on that
later.
Ideally, accessing all the PCs at once when setting them will improve
performance of M5 even though more data needs to be moved around. This is
because often all the PCs need to be manipulated together, and by getting them
all at once you avoid multiple function calls. Also, the PCs of a particular
thread will have spatial locality in the cache. Previously they were grouped
by element in arrays which spread out accesses.
Advancing the PC:
The PCs were previously managed entirely by the CPU which had to know about PC
semantics, try to figure out which dimension to increment the PC in, what to
set NPC/NNPC, etc. These decisions are best left to the ISA in conjunction
with the PC type itself. Because most of the information about how to
increment the PC (mainly what type of instruction it refers to) is contained
in the instruction object, a new advancePC virtual function was added to the
StaticInst class. Subclasses provide an implementation that moves around the
right element of the PC with a minimal amount of decision making. In ISAs like
Alpha, the instructions always simply assign NPC to PC without having to worry
about micropcs, nnpcs, etc. The added cost of a virtual function call should
be outweighed by not having to figure out as much about what to do with the
PCs and mucking around with the extra elements.
One drawback of making the StaticInsts advance the PC is that you have to
actually have one to advance the PC. This would, superficially, seem to
require decoding an instruction before fetch could advance. This is, as far as
I can tell, realistic. fetch would advance through memory addresses, not PCs,
perhaps predicting new memory addresses using existing ones. More
sophisticated decisions about control flow would be made later on, after the
instruction was decoded, and handed back to fetch. If branching needs to
happen, some amount of decoding needs to happen to see that it's a branch,
what the target is, etc. This could get a little more complicated if that gets
done by the predecoder, but I'm choosing to ignore that for now.
Variable length instructions:
To handle variable length instructions in x86 and ARM, the predecoder now
takes in the current PC by reference to the getExtMachInst function. It can
modify the PC however it needs to (by setting NPC to be the PC + instruction
length, for instance). This could be improved since the CPU doesn't know if
the PC was modified and always has to write it back.
ISA parser:
To support the new API, all PC related operand types were removed from the
parser and replaced with a PCState type. There are two warts on this
implementation. First, as with all the other operand types, the PCState still
has to have a valid operand type even though it doesn't use it. Second, using
syntax like PCS.npc(target) doesn't work for two reasons, this looks like the
syntax for operand type overriding, and the parser can't figure out if you're
reading or writing. Instructions that use the PCS operand (which I've
consistently called it) need to first read it into a local variable,
manipulate it, and then write it back out.
Return address stack:
The return address stack needed a little extra help because, in the presence
of branch delay slots, it has to merge together elements of the return PC and
the call PC. To handle that, a buildRetPC utility function was added. There
are basically only two versions in all the ISAs, but it didn't seem short
enough to put into the generic ISA directory. Also, the branch predictor code
in O3 and InOrder were adjusted so that they always store the PC of the actual
call instruction in the RAS, not the next PC. If the call instruction is a
microop, the next PC refers to the next microop in the same macroop which is
probably not desirable. The buildRetPC function advances the PC intelligently
to the next macroop (in an ISA specific way) so that that case works.
Change in stats:
There were no change in stats except in MIPS and SPARC in the O3 model. MIPS
runs in about 9% fewer ticks. SPARC runs with 30%-50% fewer ticks, which could
likely be improved further by setting call/return instruction flags and taking
advantage of the RAS.
TODO:
Add != operators to the PCState classes, defined trivially to be !(a==b).
Smooth out places where PCs are split apart, passed around, and put back
together later. I think this might happen in SPARC's fault code. Add ISA
specific constructors that allow setting PC elements without calling a bunch
of accessors. Try to eliminate the need for the branching() function. Factor
out Alpha's PAL mode pc bit into a separate flag field, and eliminate places
where it's blindly masked out or tested in the PC.
2010-10-31 08:07:20 +01:00
|
|
|
DPRINTF(LSQUnit, "Executing store PC %s [sn:%lli]\n",
|
|
|
|
store_inst->pcState(), store_inst->seqNum);
|
2006-04-23 00:26:48 +02:00
|
|
|
|
2007-03-23 16:33:08 +01:00
|
|
|
assert(!store_inst->isSquashed());
|
|
|
|
|
2006-04-23 00:26:48 +02:00
|
|
|
// Check the recently completed loads to see if any match this store's
|
|
|
|
// address. If so, then we have a memory ordering violation.
|
|
|
|
int load_idx = store_inst->lqIdx;
|
|
|
|
|
|
|
|
Fault store_fault = store_inst->initiateAcc();
|
|
|
|
|
2011-02-12 01:29:35 +01:00
|
|
|
if (store_inst->isTranslationDelayed() &&
|
|
|
|
store_fault == NoFault)
|
|
|
|
return store_fault;
|
|
|
|
|
2011-01-18 23:30:02 +01:00
|
|
|
if (store_inst->readPredicate() == false)
|
|
|
|
store_inst->forwardOldRegs();
|
|
|
|
|
2006-05-19 21:53:17 +02:00
|
|
|
if (storeQueue[store_idx].size == 0) {
|
2010-12-08 01:19:57 +01:00
|
|
|
DPRINTF(LSQUnit,"Fault on Store PC %s, [sn:%lli], Size = 0\n",
|
ISA,CPU,etc: Create an ISA defined PC type that abstracts out ISA behaviors.
This change is a low level and pervasive reorganization of how PCs are managed
in M5. Back when Alpha was the only ISA, there were only 2 PCs to worry about,
the PC and the NPC, and the lsb of the PC signaled whether or not you were in
PAL mode. As other ISAs were added, we had to add an NNPC, micro PC and next
micropc, x86 and ARM introduced variable length instruction sets, and ARM
started to keep track of mode bits in the PC. Each CPU model handled PCs in
its own custom way that needed to be updated individually to handle the new
dimensions of variability, or, in the case of ARMs mode-bit-in-the-pc hack,
the complexity could be hidden in the ISA at the ISA implementation's expense.
Areas like the branch predictor hadn't been updated to handle branch delay
slots or micropcs, and it turns out that had introduced a significant (10s of
percent) performance bug in SPARC and to a lesser extend MIPS. Rather than
perpetuate the problem by reworking O3 again to handle the PC features needed
by x86, this change was introduced to rework PC handling in a more modular,
transparent, and hopefully efficient way.
PC type:
Rather than having the superset of all possible elements of PC state declared
in each of the CPU models, each ISA defines its own PCState type which has
exactly the elements it needs. A cross product of canned PCState classes are
defined in the new "generic" ISA directory for ISAs with/without delay slots
and microcode. These are either typedef-ed or subclassed by each ISA. To read
or write this structure through a *Context, you use the new pcState() accessor
which reads or writes depending on whether it has an argument. If you just
want the address of the current or next instruction or the current micro PC,
you can get those through read-only accessors on either the PCState type or
the *Contexts. These are instAddr(), nextInstAddr(), and microPC(). Note the
move away from readPC. That name is ambiguous since it's not clear whether or
not it should be the actual address to fetch from, or if it should have extra
bits in it like the PAL mode bit. Each class is free to define its own
functions to get at whatever values it needs however it needs to to be used in
ISA specific code. Eventually Alpha's PAL mode bit could be moved out of the
PC and into a separate field like ARM.
These types can be reset to a particular pc (where npc = pc +
sizeof(MachInst), nnpc = npc + sizeof(MachInst), upc = 0, nupc = 1 as
appropriate), printed, serialized, and compared. There is a branching()
function which encapsulates code in the CPU models that checked if an
instruction branched or not. Exactly what that means in the context of branch
delay slots which can skip an instruction when not taken is ambiguous, and
ideally this function and its uses can be eliminated. PCStates also generally
know how to advance themselves in various ways depending on if they point at
an instruction, a microop, or the last microop of a macroop. More on that
later.
Ideally, accessing all the PCs at once when setting them will improve
performance of M5 even though more data needs to be moved around. This is
because often all the PCs need to be manipulated together, and by getting them
all at once you avoid multiple function calls. Also, the PCs of a particular
thread will have spatial locality in the cache. Previously they were grouped
by element in arrays which spread out accesses.
Advancing the PC:
The PCs were previously managed entirely by the CPU which had to know about PC
semantics, try to figure out which dimension to increment the PC in, what to
set NPC/NNPC, etc. These decisions are best left to the ISA in conjunction
with the PC type itself. Because most of the information about how to
increment the PC (mainly what type of instruction it refers to) is contained
in the instruction object, a new advancePC virtual function was added to the
StaticInst class. Subclasses provide an implementation that moves around the
right element of the PC with a minimal amount of decision making. In ISAs like
Alpha, the instructions always simply assign NPC to PC without having to worry
about micropcs, nnpcs, etc. The added cost of a virtual function call should
be outweighed by not having to figure out as much about what to do with the
PCs and mucking around with the extra elements.
One drawback of making the StaticInsts advance the PC is that you have to
actually have one to advance the PC. This would, superficially, seem to
require decoding an instruction before fetch could advance. This is, as far as
I can tell, realistic. fetch would advance through memory addresses, not PCs,
perhaps predicting new memory addresses using existing ones. More
sophisticated decisions about control flow would be made later on, after the
instruction was decoded, and handed back to fetch. If branching needs to
happen, some amount of decoding needs to happen to see that it's a branch,
what the target is, etc. This could get a little more complicated if that gets
done by the predecoder, but I'm choosing to ignore that for now.
Variable length instructions:
To handle variable length instructions in x86 and ARM, the predecoder now
takes in the current PC by reference to the getExtMachInst function. It can
modify the PC however it needs to (by setting NPC to be the PC + instruction
length, for instance). This could be improved since the CPU doesn't know if
the PC was modified and always has to write it back.
ISA parser:
To support the new API, all PC related operand types were removed from the
parser and replaced with a PCState type. There are two warts on this
implementation. First, as with all the other operand types, the PCState still
has to have a valid operand type even though it doesn't use it. Second, using
syntax like PCS.npc(target) doesn't work for two reasons, this looks like the
syntax for operand type overriding, and the parser can't figure out if you're
reading or writing. Instructions that use the PCS operand (which I've
consistently called it) need to first read it into a local variable,
manipulate it, and then write it back out.
Return address stack:
The return address stack needed a little extra help because, in the presence
of branch delay slots, it has to merge together elements of the return PC and
the call PC. To handle that, a buildRetPC utility function was added. There
are basically only two versions in all the ISAs, but it didn't seem short
enough to put into the generic ISA directory. Also, the branch predictor code
in O3 and InOrder were adjusted so that they always store the PC of the actual
call instruction in the RAS, not the next PC. If the call instruction is a
microop, the next PC refers to the next microop in the same macroop which is
probably not desirable. The buildRetPC function advances the PC intelligently
to the next macroop (in an ISA specific way) so that that case works.
Change in stats:
There was no change in stats except in MIPS and SPARC in the O3 model. MIPS
runs in about 9% fewer ticks. SPARC runs with 30%-50% fewer ticks, which could
likely be improved further by setting call/return instruction flags and taking
advantage of the RAS.
TODO:
Add != operators to the PCState classes, defined trivially to be !(a==b).
Smooth out places where PCs are split apart, passed around, and put back
together later. I think this might happen in SPARC's fault code. Add ISA
specific constructors that allow setting PC elements without calling a bunch
of accessors. Try to eliminate the need for the branching() function. Factor
out Alpha's PAL mode pc bit into a separate flag field, and eliminate places
where it's blindly masked out or tested in the PC.
2010-10-31 08:07:20 +01:00
|
|
|
store_inst->pcState(), store_inst->seqNum);
|
2006-04-23 00:26:48 +02:00
|
|
|
|
|
|
|
return store_fault;
|
2010-12-08 01:19:57 +01:00
|
|
|
} else if (store_inst->readPredicate() == false) {
|
|
|
|
DPRINTF(LSQUnit, "Store [sn:%lli] not executed from predication\n",
|
|
|
|
store_inst->seqNum);
|
|
|
|
return store_fault;
|
2006-04-23 00:26:48 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
assert(store_fault == NoFault);
|
|
|
|
|
2006-05-23 20:38:16 +02:00
|
|
|
if (store_inst->isStoreConditional()) {
|
|
|
|
// Store conditionals need to set themselves as able to
|
|
|
|
// writeback if we haven't had a fault by here.
|
2006-05-19 21:53:17 +02:00
|
|
|
storeQueue[store_idx].canWB = true;
|
2006-04-23 00:26:48 +02:00
|
|
|
|
2006-05-19 21:53:17 +02:00
|
|
|
++storesToWB;
|
2006-04-23 00:26:48 +02:00
|
|
|
}
|
|
|
|
|
2011-04-04 18:42:23 +02:00
|
|
|
return checkViolations(load_idx, store_inst);
|
2006-04-23 00:26:48 +02:00
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
template <class Impl>
|
|
|
|
void
|
|
|
|
LSQUnit<Impl>::commitLoad()
|
|
|
|
{
|
|
|
|
assert(loadQueue[loadHead]);
|
|
|
|
|
ISA,CPU,etc: Create an ISA defined PC type that abstracts out ISA behaviors.
This change is a low level and pervasive reorganization of how PCs are managed
in M5. Back when Alpha was the only ISA, there were only 2 PCs to worry about,
the PC and the NPC, and the lsb of the PC signaled whether or not you were in
PAL mode. As other ISAs were added, we had to add an NNPC, micro PC and next
micropc, x86 and ARM introduced variable length instruction sets, and ARM
started to keep track of mode bits in the PC. Each CPU model handled PCs in
its own custom way that needed to be updated individually to handle the new
dimensions of variability, or, in the case of ARMs mode-bit-in-the-pc hack,
the complexity could be hidden in the ISA at the ISA implementation's expense.
Areas like the branch predictor hadn't been updated to handle branch delay
slots or micropcs, and it turns out that had introduced a significant (10s of
percent) performance bug in SPARC and to a lesser extend MIPS. Rather than
perpetuate the problem by reworking O3 again to handle the PC features needed
by x86, this change was introduced to rework PC handling in a more modular,
transparent, and hopefully efficient way.
PC type:
Rather than having the superset of all possible elements of PC state declared
in each of the CPU models, each ISA defines its own PCState type which has
exactly the elements it needs. A cross product of canned PCState classes are
defined in the new "generic" ISA directory for ISAs with/without delay slots
and microcode. These are either typedef-ed or subclassed by each ISA. To read
or write this structure through a *Context, you use the new pcState() accessor
which reads or writes depending on whether it has an argument. If you just
want the address of the current or next instruction or the current micro PC,
you can get those through read-only accessors on either the PCState type or
the *Contexts. These are instAddr(), nextInstAddr(), and microPC(). Note the
move away from readPC. That name is ambiguous since it's not clear whether or
not it should be the actual address to fetch from, or if it should have extra
bits in it like the PAL mode bit. Each class is free to define its own
functions to get at whatever values it needs however it needs to to be used in
ISA specific code. Eventually Alpha's PAL mode bit could be moved out of the
PC and into a separate field like ARM.
These types can be reset to a particular pc (where npc = pc +
sizeof(MachInst), nnpc = npc + sizeof(MachInst), upc = 0, nupc = 1 as
appropriate), printed, serialized, and compared. There is a branching()
function which encapsulates code in the CPU models that checked if an
instruction branched or not. Exactly what that means in the context of branch
delay slots which can skip an instruction when not taken is ambiguous, and
ideally this function and its uses can be eliminated. PCStates also generally
know how to advance themselves in various ways depending on if they point at
an instruction, a microop, or the last microop of a macroop. More on that
later.
Ideally, accessing all the PCs at once when setting them will improve
performance of M5 even though more data needs to be moved around. This is
because often all the PCs need to be manipulated together, and by getting them
all at once you avoid multiple function calls. Also, the PCs of a particular
thread will have spatial locality in the cache. Previously they were grouped
by element in arrays which spread out accesses.
Advancing the PC:
The PCs were previously managed entirely by the CPU which had to know about PC
semantics, try to figure out which dimension to increment the PC in, what to
set NPC/NNPC, etc. These decisions are best left to the ISA in conjunction
with the PC type itself. Because most of the information about how to
increment the PC (mainly what type of instruction it refers to) is contained
in the instruction object, a new advancePC virtual function was added to the
StaticInst class. Subclasses provide an implementation that moves around the
right element of the PC with a minimal amount of decision making. In ISAs like
Alpha, the instructions always simply assign NPC to PC without having to worry
about micropcs, nnpcs, etc. The added cost of a virtual function call should
be outweighed by not having to figure out as much about what to do with the
PCs and mucking around with the extra elements.
One drawback of making the StaticInsts advance the PC is that you have to
actually have one to advance the PC. This would, superficially, seem to
require decoding an instruction before fetch could advance. This is, as far as
I can tell, realistic. fetch would advance through memory addresses, not PCs,
perhaps predicting new memory addresses using existing ones. More
sophisticated decisions about control flow would be made later on, after the
instruction was decoded, and handed back to fetch. If branching needs to
happen, some amount of decoding needs to happen to see that it's a branch,
what the target is, etc. This could get a little more complicated if that gets
done by the predecoder, but I'm choosing to ignore that for now.
Variable length instructions:
To handle variable length instructions in x86 and ARM, the predecoder now
takes in the current PC by reference to the getExtMachInst function. It can
modify the PC however it needs to (by setting NPC to be the PC + instruction
length, for instance). This could be improved since the CPU doesn't know if
the PC was modified and always has to write it back.
ISA parser:
To support the new API, all PC related operand types were removed from the
parser and replaced with a PCState type. There are two warts on this
implementation. First, as with all the other operand types, the PCState still
has to have a valid operand type even though it doesn't use it. Second, using
syntax like PCS.npc(target) doesn't work for two reasons, this looks like the
syntax for operand type overriding, and the parser can't figure out if you're
reading or writing. Instructions that use the PCS operand (which I've
consistently called it) need to first read it into a local variable,
manipulate it, and then write it back out.
Return address stack:
The return address stack needed a little extra help because, in the presence
of branch delay slots, it has to merge together elements of the return PC and
the call PC. To handle that, a buildRetPC utility function was added. There
are basically only two versions in all the ISAs, but it didn't seem short
enough to put into the generic ISA directory. Also, the branch predictor code
in O3 and InOrder were adjusted so that they always store the PC of the actual
call instruction in the RAS, not the next PC. If the call instruction is a
microop, the next PC refers to the next microop in the same macroop which is
probably not desirable. The buildRetPC function advances the PC intelligently
to the next macroop (in an ISA specific way) so that that case works.
Change in stats:
There were no change in stats except in MIPS and SPARC in the O3 model. MIPS
runs in about 9% fewer ticks. SPARC runs with 30%-50% fewer ticks, which could
likely be improved further by setting call/return instruction flags and taking
advantage of the RAS.
TODO:
Add != operators to the PCState classes, defined trivially to be !(a==b).
Smooth out places where PCs are split apart, passed around, and put back
together later. I think this might happen in SPARC's fault code. Add ISA
specific constructors that allow setting PC elements without calling a bunch
of accessors. Try to eliminate the need for the branching() function. Factor
out Alpha's PAL mode pc bit into a separate flag field, and eliminate places
where it's blindly masked out or tested in the PC.
2010-10-31 08:07:20 +01:00
|
|
|
DPRINTF(LSQUnit, "Committing head load instruction, PC %s\n",
|
|
|
|
loadQueue[loadHead]->pcState());
|
2006-04-23 00:26:48 +02:00
|
|
|
|
|
|
|
loadQueue[loadHead] = NULL;
|
|
|
|
|
|
|
|
incrLdIdx(loadHead);
|
|
|
|
|
|
|
|
--loads;
|
|
|
|
}
|
|
|
|
|
|
|
|
template <class Impl>
|
|
|
|
void
|
|
|
|
LSQUnit<Impl>::commitLoads(InstSeqNum &youngest_inst)
|
|
|
|
{
|
|
|
|
assert(loads == 0 || loadQueue[loadHead]);
|
|
|
|
|
|
|
|
while (loads != 0 && loadQueue[loadHead]->seqNum <= youngest_inst) {
|
|
|
|
commitLoad();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
template <class Impl>
|
|
|
|
void
|
|
|
|
LSQUnit<Impl>::commitStores(InstSeqNum &youngest_inst)
|
|
|
|
{
|
|
|
|
assert(stores == 0 || storeQueue[storeHead].inst);
|
|
|
|
|
|
|
|
int store_idx = storeHead;
|
|
|
|
|
|
|
|
while (store_idx != storeTail) {
|
|
|
|
assert(storeQueue[store_idx].inst);
|
2006-05-19 21:53:17 +02:00
|
|
|
// Mark any stores that are now committed and have not yet
|
|
|
|
// been marked as able to write back.
|
2006-04-23 00:26:48 +02:00
|
|
|
if (!storeQueue[store_idx].canWB) {
|
|
|
|
if (storeQueue[store_idx].inst->seqNum > youngest_inst) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
DPRINTF(LSQUnit, "Marking store as able to write back, PC "
|
ISA,CPU,etc: Create an ISA defined PC type that abstracts out ISA behaviors.
This change is a low level and pervasive reorganization of how PCs are managed
in M5. Back when Alpha was the only ISA, there were only 2 PCs to worry about,
the PC and the NPC, and the lsb of the PC signaled whether or not you were in
PAL mode. As other ISAs were added, we had to add an NNPC, micro PC and next
micropc, x86 and ARM introduced variable length instruction sets, and ARM
started to keep track of mode bits in the PC. Each CPU model handled PCs in
its own custom way that needed to be updated individually to handle the new
dimensions of variability, or, in the case of ARMs mode-bit-in-the-pc hack,
the complexity could be hidden in the ISA at the ISA implementation's expense.
Areas like the branch predictor hadn't been updated to handle branch delay
slots or micropcs, and it turns out that had introduced a significant (10s of
percent) performance bug in SPARC and to a lesser extend MIPS. Rather than
perpetuate the problem by reworking O3 again to handle the PC features needed
by x86, this change was introduced to rework PC handling in a more modular,
transparent, and hopefully efficient way.
PC type:
Rather than having the superset of all possible elements of PC state declared
in each of the CPU models, each ISA defines its own PCState type which has
exactly the elements it needs. A cross product of canned PCState classes are
defined in the new "generic" ISA directory for ISAs with/without delay slots
and microcode. These are either typedef-ed or subclassed by each ISA. To read
or write this structure through a *Context, you use the new pcState() accessor
which reads or writes depending on whether it has an argument. If you just
want the address of the current or next instruction or the current micro PC,
you can get those through read-only accessors on either the PCState type or
the *Contexts. These are instAddr(), nextInstAddr(), and microPC(). Note the
move away from readPC. That name is ambiguous since it's not clear whether or
not it should be the actual address to fetch from, or if it should have extra
bits in it like the PAL mode bit. Each class is free to define its own
functions to get at whatever values it needs however it needs to to be used in
ISA specific code. Eventually Alpha's PAL mode bit could be moved out of the
PC and into a separate field like ARM.
These types can be reset to a particular pc (where npc = pc +
sizeof(MachInst), nnpc = npc + sizeof(MachInst), upc = 0, nupc = 1 as
appropriate), printed, serialized, and compared. There is a branching()
function which encapsulates code in the CPU models that checked if an
instruction branched or not. Exactly what that means in the context of branch
delay slots which can skip an instruction when not taken is ambiguous, and
ideally this function and its uses can be eliminated. PCStates also generally
know how to advance themselves in various ways depending on if they point at
an instruction, a microop, or the last microop of a macroop. More on that
later.
Ideally, accessing all the PCs at once when setting them will improve
performance of M5 even though more data needs to be moved around. This is
because often all the PCs need to be manipulated together, and by getting them
all at once you avoid multiple function calls. Also, the PCs of a particular
thread will have spatial locality in the cache. Previously they were grouped
by element in arrays which spread out accesses.
Advancing the PC:
The PCs were previously managed entirely by the CPU which had to know about PC
semantics, try to figure out which dimension to increment the PC in, what to
set NPC/NNPC, etc. These decisions are best left to the ISA in conjunction
with the PC type itself. Because most of the information about how to
increment the PC (mainly what type of instruction it refers to) is contained
in the instruction object, a new advancePC virtual function was added to the
StaticInst class. Subclasses provide an implementation that moves around the
right element of the PC with a minimal amount of decision making. In ISAs like
Alpha, the instructions always simply assign NPC to PC without having to worry
about micropcs, nnpcs, etc. The added cost of a virtual function call should
be outweighed by not having to figure out as much about what to do with the
PCs and mucking around with the extra elements.
One drawback of making the StaticInsts advance the PC is that you have to
actually have one to advance the PC. This would, superficially, seem to
require decoding an instruction before fetch could advance. This is, as far as
I can tell, realistic. fetch would advance through memory addresses, not PCs,
perhaps predicting new memory addresses using existing ones. More
sophisticated decisions about control flow would be made later on, after the
instruction was decoded, and handed back to fetch. If branching needs to
happen, some amount of decoding needs to happen to see that it's a branch,
what the target is, etc. This could get a little more complicated if that gets
done by the predecoder, but I'm choosing to ignore that for now.
Variable length instructions:
To handle variable length instructions in x86 and ARM, the predecoder now
takes in the current PC by reference to the getExtMachInst function. It can
modify the PC however it needs to (by setting NPC to be the PC + instruction
length, for instance). This could be improved since the CPU doesn't know if
the PC was modified and always has to write it back.
ISA parser:
To support the new API, all PC related operand types were removed from the
parser and replaced with a PCState type. There are two warts on this
implementation. First, as with all the other operand types, the PCState still
has to have a valid operand type even though it doesn't use it. Second, using
syntax like PCS.npc(target) doesn't work for two reasons, this looks like the
syntax for operand type overriding, and the parser can't figure out if you're
reading or writing. Instructions that use the PCS operand (which I've
consistently called it) need to first read it into a local variable,
manipulate it, and then write it back out.
Return address stack:
The return address stack needed a little extra help because, in the presence
of branch delay slots, it has to merge together elements of the return PC and
the call PC. To handle that, a buildRetPC utility function was added. There
are basically only two versions in all the ISAs, but it didn't seem short
enough to put into the generic ISA directory. Also, the branch predictor code
in O3 and InOrder were adjusted so that they always store the PC of the actual
call instruction in the RAS, not the next PC. If the call instruction is a
microop, the next PC refers to the next microop in the same macroop which is
probably not desirable. The buildRetPC function advances the PC intelligently
to the next macroop (in an ISA specific way) so that that case works.
Change in stats:
There were no change in stats except in MIPS and SPARC in the O3 model. MIPS
runs in about 9% fewer ticks. SPARC runs with 30%-50% fewer ticks, which could
likely be improved further by setting call/return instruction flags and taking
advantage of the RAS.
TODO:
Add != operators to the PCState classes, defined trivially to be !(a==b).
Smooth out places where PCs are split apart, passed around, and put back
together later. I think this might happen in SPARC's fault code. Add ISA
specific constructors that allow setting PC elements without calling a bunch
of accessors. Try to eliminate the need for the branching() function. Factor
out Alpha's PAL mode pc bit into a separate flag field, and eliminate places
where it's blindly masked out or tested in the PC.
2010-10-31 08:07:20 +01:00
|
|
|
"%s [sn:%lli]\n",
|
|
|
|
storeQueue[store_idx].inst->pcState(),
|
2006-04-23 00:26:48 +02:00
|
|
|
storeQueue[store_idx].inst->seqNum);
|
|
|
|
|
|
|
|
storeQueue[store_idx].canWB = true;
|
|
|
|
|
|
|
|
++storesToWB;
|
|
|
|
}
|
|
|
|
|
|
|
|
incrStIdx(store_idx);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-02-12 20:53:20 +01:00
|
|
|
template <class Impl>
|
|
|
|
void
|
|
|
|
LSQUnit<Impl>::writebackPendingStore()
|
|
|
|
{
|
|
|
|
if (hasPendingPkt) {
|
|
|
|
assert(pendingPkt != NULL);
|
|
|
|
|
|
|
|
// If the cache is blocked, this will store the packet for retry.
|
|
|
|
if (sendStore(pendingPkt)) {
|
|
|
|
storePostSend(pendingPkt);
|
|
|
|
}
|
|
|
|
pendingPkt = NULL;
|
|
|
|
hasPendingPkt = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-04-23 00:26:48 +02:00
|
|
|
template <class Impl>
|
|
|
|
void
|
|
|
|
LSQUnit<Impl>::writebackStores()
|
|
|
|
{
|
2010-02-12 20:53:20 +01:00
|
|
|
// First writeback the second packet from any split store that didn't
|
|
|
|
// complete last cycle because there weren't enough cache ports available.
|
|
|
|
if (TheISA::HasUnalignedMemAcc) {
|
|
|
|
writebackPendingStore();
|
|
|
|
}
|
|
|
|
|
2006-04-23 00:26:48 +02:00
|
|
|
while (storesToWB > 0 &&
|
|
|
|
storeWBIdx != storeTail &&
|
|
|
|
storeQueue[storeWBIdx].inst &&
|
|
|
|
storeQueue[storeWBIdx].canWB &&
|
2012-01-29 02:09:04 +01:00
|
|
|
((!needsTSO) || (!storeInFlight)) &&
|
2006-04-23 00:26:48 +02:00
|
|
|
usedPorts < cachePorts) {
|
|
|
|
|
2006-07-13 19:12:51 +02:00
|
|
|
if (isStoreBlocked || lsq->cacheBlocked()) {
|
2006-06-06 00:14:39 +02:00
|
|
|
DPRINTF(LSQUnit, "Unable to write back any more stores, cache"
|
|
|
|
" is blocked!\n");
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2006-05-19 21:53:17 +02:00
|
|
|
// Store didn't write any data so no need to write it back to
|
|
|
|
// memory.
|
2006-04-23 00:26:48 +02:00
|
|
|
if (storeQueue[storeWBIdx].size == 0) {
|
|
|
|
completeStore(storeWBIdx);
|
|
|
|
|
|
|
|
incrStIdx(storeWBIdx);
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
2006-06-06 00:14:39 +02:00
|
|
|
|
2006-04-23 00:26:48 +02:00
|
|
|
++usedPorts;
|
|
|
|
|
|
|
|
if (storeQueue[storeWBIdx].inst->isDataPrefetch()) {
|
|
|
|
incrStIdx(storeWBIdx);
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(storeQueue[storeWBIdx].req);
|
|
|
|
assert(!storeQueue[storeWBIdx].committed);
|
|
|
|
|
2010-02-12 20:53:20 +01:00
|
|
|
if (TheISA::HasUnalignedMemAcc && storeQueue[storeWBIdx].isSplit) {
|
|
|
|
assert(storeQueue[storeWBIdx].sreqLow);
|
|
|
|
assert(storeQueue[storeWBIdx].sreqHigh);
|
|
|
|
}
|
|
|
|
|
2006-06-03 00:15:20 +02:00
|
|
|
DynInstPtr inst = storeQueue[storeWBIdx].inst;
|
|
|
|
|
|
|
|
Request *req = storeQueue[storeWBIdx].req;
|
2011-08-01 04:21:17 +02:00
|
|
|
RequestPtr sreqLow = storeQueue[storeWBIdx].sreqLow;
|
|
|
|
RequestPtr sreqHigh = storeQueue[storeWBIdx].sreqHigh;
|
|
|
|
|
2006-04-23 00:26:48 +02:00
|
|
|
storeQueue[storeWBIdx].committed = true;
|
|
|
|
|
2006-06-03 00:15:20 +02:00
|
|
|
assert(!inst->memData);
|
|
|
|
inst->memData = new uint8_t[64];
|
2006-12-06 11:54:16 +01:00
|
|
|
|
2007-04-04 00:53:26 +02:00
|
|
|
memcpy(inst->memData, storeQueue[storeWBIdx].data, req->getSize());
|
2006-06-03 00:15:20 +02:00
|
|
|
|
2007-07-01 05:35:42 +02:00
|
|
|
MemCmd command =
|
|
|
|
req->isSwap() ? MemCmd::SwapReq :
|
2009-04-20 06:44:15 +02:00
|
|
|
(req->isLLSC() ? MemCmd::StoreCondReq : MemCmd::WriteReq);
|
2010-02-12 20:53:20 +01:00
|
|
|
PacketPtr data_pkt;
|
|
|
|
PacketPtr snd_data_pkt = NULL;
|
2006-04-23 00:26:48 +02:00
|
|
|
|
2006-06-06 00:14:39 +02:00
|
|
|
LSQSenderState *state = new LSQSenderState;
|
|
|
|
state->isLoad = false;
|
|
|
|
state->idx = storeWBIdx;
|
|
|
|
state->inst = inst;
|
2010-02-12 20:53:20 +01:00
|
|
|
|
|
|
|
if (!TheISA::HasUnalignedMemAcc || !storeQueue[storeWBIdx].isSplit) {
|
|
|
|
|
|
|
|
// Build a single data packet if the store isn't split.
|
MEM: Remove the Broadcast destination from the packet
This patch simplifies the packet by removing the broadcast flag and
instead more firmly relying on (and enforcing) the semantics of
transactions in the classic memory system, i.e. request packets are
routed from a master to a slave based on the address, and when they
are created they have neither a valid source, nor destination. On
their way to the slave, the request packet is updated with a source
field for all modules that multiplex packets from multiple master
(e.g. a bus). When a request packet is turned into a response packet
(at the final slave), it moves the potentially populated source field
to the destination field, and the response packet is routed through
any multiplexing components back to the master based on the
destination field.
Modules that connect multiplexing components, such as caches and
bridges store any existing source and destination field in the sender
state as a stack (just as before).
The packet constructor is simplified in that there is no longer a need
to pass the Packet::Broadcast as the destination (this was always the
case for the classic memory system). In the case of Ruby, rather than
using the parameter to the constructor we now rely on setDest, as
there is already another three-argument constructor in the packet
class.
In many places where the packet information was printed as part of
DPRINTFs, request packets would be printed with a numeric "dest" that
would always be -1 (Broadcast) and that field is now removed from the
printing.
2012-04-14 11:45:55 +02:00
|
|
|
data_pkt = new Packet(req, command);
|
2010-02-12 20:53:20 +01:00
|
|
|
data_pkt->dataStatic(inst->memData);
|
|
|
|
data_pkt->senderState = state;
|
|
|
|
} else {
|
|
|
|
// Create two packets if the store is split in two.
|
MEM: Remove the Broadcast destination from the packet
This patch simplifies the packet by removing the broadcast flag and
instead more firmly relying on (and enforcing) the semantics of
transactions in the classic memory system, i.e. request packets are
routed from a master to a slave based on the address, and when they
are created they have neither a valid source, nor destination. On
their way to the slave, the request packet is updated with a source
field for all modules that multiplex packets from multiple master
(e.g. a bus). When a request packet is turned into a response packet
(at the final slave), it moves the potentially populated source field
to the destination field, and the response packet is routed through
any multiplexing components back to the master based on the
destination field.
Modules that connect multiplexing components, such as caches and
bridges store any existing source and destination field in the sender
state as a stack (just as before).
The packet constructor is simplified in that there is no longer a need
to pass the Packet::Broadcast as the destination (this was always the
case for the classic memory system). In the case of Ruby, rather than
using the parameter to the constructor we now rely on setDest, as
there is already another three-argument constructor in the packet
class.
In many places where the packet information was printed as part of
DPRINTFs, request packets would be printed with a numeric "dest" that
would always be -1 (Broadcast) and that field is now removed from the
printing.
2012-04-14 11:45:55 +02:00
|
|
|
data_pkt = new Packet(sreqLow, command);
|
|
|
|
snd_data_pkt = new Packet(sreqHigh, command);
|
2010-02-12 20:53:20 +01:00
|
|
|
|
|
|
|
data_pkt->dataStatic(inst->memData);
|
|
|
|
snd_data_pkt->dataStatic(inst->memData + sreqLow->getSize());
|
|
|
|
|
|
|
|
data_pkt->senderState = state;
|
|
|
|
snd_data_pkt->senderState = state;
|
|
|
|
|
|
|
|
state->isSplit = true;
|
|
|
|
state->outstanding = 2;
|
|
|
|
|
|
|
|
// Can delete the main request now.
|
|
|
|
delete req;
|
|
|
|
req = sreqLow;
|
|
|
|
}
|
2006-06-06 00:14:39 +02:00
|
|
|
|
ISA,CPU,etc: Create an ISA defined PC type that abstracts out ISA behaviors.
This change is a low level and pervasive reorganization of how PCs are managed
in M5. Back when Alpha was the only ISA, there were only 2 PCs to worry about,
the PC and the NPC, and the lsb of the PC signaled whether or not you were in
PAL mode. As other ISAs were added, we had to add an NNPC, micro PC and next
micropc, x86 and ARM introduced variable length instruction sets, and ARM
started to keep track of mode bits in the PC. Each CPU model handled PCs in
its own custom way that needed to be updated individually to handle the new
dimensions of variability, or, in the case of ARMs mode-bit-in-the-pc hack,
the complexity could be hidden in the ISA at the ISA implementation's expense.
Areas like the branch predictor hadn't been updated to handle branch delay
slots or micropcs, and it turns out that had introduced a significant (10s of
percent) performance bug in SPARC and to a lesser extent MIPS. Rather than
perpetuate the problem by reworking O3 again to handle the PC features needed
by x86, this change was introduced to rework PC handling in a more modular,
transparent, and hopefully efficient way.
PC type:
Rather than having the superset of all possible elements of PC state declared
in each of the CPU models, each ISA defines its own PCState type which has
exactly the elements it needs. A cross product of canned PCState classes are
defined in the new "generic" ISA directory for ISAs with/without delay slots
and microcode. These are either typedef-ed or subclassed by each ISA. To read
or write this structure through a *Context, you use the new pcState() accessor
which reads or writes depending on whether it has an argument. If you just
want the address of the current or next instruction or the current micro PC,
you can get those through read-only accessors on either the PCState type or
the *Contexts. These are instAddr(), nextInstAddr(), and microPC(). Note the
move away from readPC. That name is ambiguous since it's not clear whether or
not it should be the actual address to fetch from, or if it should have extra
bits in it like the PAL mode bit. Each class is free to define its own
functions to get at whatever values it needs however it needs to to be used in
ISA specific code. Eventually Alpha's PAL mode bit could be moved out of the
PC and into a separate field like ARM.
These types can be reset to a particular pc (where npc = pc +
sizeof(MachInst), nnpc = npc + sizeof(MachInst), upc = 0, nupc = 1 as
appropriate), printed, serialized, and compared. There is a branching()
function which encapsulates code in the CPU models that checked if an
instruction branched or not. Exactly what that means in the context of branch
delay slots which can skip an instruction when not taken is ambiguous, and
ideally this function and its uses can be eliminated. PCStates also generally
know how to advance themselves in various ways depending on if they point at
an instruction, a microop, or the last microop of a macroop. More on that
later.
Ideally, accessing all the PCs at once when setting them will improve
performance of M5 even though more data needs to be moved around. This is
because often all the PCs need to be manipulated together, and by getting them
all at once you avoid multiple function calls. Also, the PCs of a particular
thread will have spatial locality in the cache. Previously they were grouped
by element in arrays which spread out accesses.
Advancing the PC:
The PCs were previously managed entirely by the CPU which had to know about PC
semantics, try to figure out which dimension to increment the PC in, what to
set NPC/NNPC, etc. These decisions are best left to the ISA in conjunction
with the PC type itself. Because most of the information about how to
increment the PC (mainly what type of instruction it refers to) is contained
in the instruction object, a new advancePC virtual function was added to the
StaticInst class. Subclasses provide an implementation that moves around the
right element of the PC with a minimal amount of decision making. In ISAs like
Alpha, the instructions always simply assign NPC to PC without having to worry
about micropcs, nnpcs, etc. The added cost of a virtual function call should
be outweighed by not having to figure out as much about what to do with the
PCs and mucking around with the extra elements.
One drawback of making the StaticInsts advance the PC is that you have to
actually have one to advance the PC. This would, superficially, seem to
require decoding an instruction before fetch could advance. This is, as far as
I can tell, realistic. fetch would advance through memory addresses, not PCs,
perhaps predicting new memory addresses using existing ones. More
sophisticated decisions about control flow would be made later on, after the
instruction was decoded, and handed back to fetch. If branching needs to
happen, some amount of decoding needs to happen to see that it's a branch,
what the target is, etc. This could get a little more complicated if that gets
done by the predecoder, but I'm choosing to ignore that for now.
Variable length instructions:
To handle variable length instructions in x86 and ARM, the predecoder now
takes in the current PC by reference to the getExtMachInst function. It can
modify the PC however it needs to (by setting NPC to be the PC + instruction
length, for instance). This could be improved since the CPU doesn't know if
the PC was modified and always has to write it back.
ISA parser:
To support the new API, all PC related operand types were removed from the
parser and replaced with a PCState type. There are two warts on this
implementation. First, as with all the other operand types, the PCState still
has to have a valid operand type even though it doesn't use it. Second, using
syntax like PCS.npc(target) doesn't work for two reasons, this looks like the
syntax for operand type overriding, and the parser can't figure out if you're
reading or writing. Instructions that use the PCS operand (which I've
consistently called it) need to first read it into a local variable,
manipulate it, and then write it back out.
Return address stack:
The return address stack needed a little extra help because, in the presence
of branch delay slots, it has to merge together elements of the return PC and
the call PC. To handle that, a buildRetPC utility function was added. There
are basically only two versions in all the ISAs, but it didn't seem short
enough to put into the generic ISA directory. Also, the branch predictor code
in O3 and InOrder were adjusted so that they always store the PC of the actual
call instruction in the RAS, not the next PC. If the call instruction is a
microop, the next PC refers to the next microop in the same macroop which is
probably not desirable. The buildRetPC function advances the PC intelligently
to the next macroop (in an ISA specific way) so that that case works.
Change in stats:
There was no change in stats except in MIPS and SPARC in the O3 model. MIPS
runs in about 9% fewer ticks. SPARC runs with 30%-50% fewer ticks, which could
likely be improved further by setting call/return instruction flags and taking
advantage of the RAS.
TODO:
Add != operators to the PCState classes, defined trivially to be !(a==b).
Smooth out places where PCs are split apart, passed around, and put back
together later. I think this might happen in SPARC's fault code. Add ISA
specific constructors that allow setting PC elements without calling a bunch
of accessors. Try to eliminate the need for the branching() function. Factor
out Alpha's PAL mode pc bit into a separate flag field, and eliminate places
where it's blindly masked out or tested in the PC.
2010-10-31 08:07:20 +01:00
|
|
|
DPRINTF(LSQUnit, "D-Cache: Writing back store idx:%i PC:%s "
|
2006-04-23 00:26:48 +02:00
|
|
|
"to Addr:%#x, data:%#x [sn:%lli]\n",
|
ISA,CPU,etc: Create an ISA defined PC type that abstracts out ISA behaviors.
This change is a low level and pervasive reorganization of how PCs are managed
in M5. Back when Alpha was the only ISA, there were only 2 PCs to worry about,
the PC and the NPC, and the lsb of the PC signaled whether or not you were in
PAL mode. As other ISAs were added, we had to add an NNPC, micro PC and next
micropc, x86 and ARM introduced variable length instruction sets, and ARM
started to keep track of mode bits in the PC. Each CPU model handled PCs in
its own custom way that needed to be updated individually to handle the new
dimensions of variability, or, in the case of ARM's mode-bit-in-the-PC hack,
the complexity could be hidden in the ISA at the ISA implementation's expense.
Areas like the branch predictor hadn't been updated to handle branch delay
slots or micropcs, and it turns out that had introduced a significant (10s of
percent) performance bug in SPARC and to a lesser extent MIPS. Rather than
perpetuate the problem by reworking O3 again to handle the PC features needed
by x86, this change was introduced to rework PC handling in a more modular,
transparent, and hopefully efficient way.
PC type:
Rather than having the superset of all possible elements of PC state declared
in each of the CPU models, each ISA defines its own PCState type which has
exactly the elements it needs. A cross product of canned PCState classes are
defined in the new "generic" ISA directory for ISAs with/without delay slots
and microcode. These are either typedef-ed or subclassed by each ISA. To read
or write this structure through a *Context, you use the new pcState() accessor
which reads or writes depending on whether it has an argument. If you just
want the address of the current or next instruction or the current micro PC,
you can get those through read-only accessors on either the PCState type or
the *Contexts. These are instAddr(), nextInstAddr(), and microPC(). Note the
move away from readPC. That name is ambiguous since it's not clear whether or
not it should be the actual address to fetch from, or if it should have extra
bits in it like the PAL mode bit. Each class is free to define its own
functions to get at whatever values it needs however it needs to to be used in
ISA specific code. Eventually Alpha's PAL mode bit could be moved out of the
PC and into a separate field like ARM.
These types can be reset to a particular pc (where npc = pc +
sizeof(MachInst), nnpc = npc + sizeof(MachInst), upc = 0, nupc = 1 as
appropriate), printed, serialized, and compared. There is a branching()
function which encapsulates code in the CPU models that checked if an
instruction branched or not. Exactly what that means in the context of branch
delay slots which can skip an instruction when not taken is ambiguous, and
ideally this function and its uses can be eliminated. PCStates also generally
know how to advance themselves in various ways depending on if they point at
an instruction, a microop, or the last microop of a macroop. More on that
later.
Ideally, accessing all the PCs at once when setting them will improve
performance of M5 even though more data needs to be moved around. This is
because often all the PCs need to be manipulated together, and by getting them
all at once you avoid multiple function calls. Also, the PCs of a particular
thread will have spatial locality in the cache. Previously they were grouped
by element in arrays which spread out accesses.
Advancing the PC:
The PCs were previously managed entirely by the CPU which had to know about PC
semantics, try to figure out which dimension to increment the PC in, what to
set NPC/NNPC, etc. These decisions are best left to the ISA in conjunction
with the PC type itself. Because most of the information about how to
increment the PC (mainly what type of instruction it refers to) is contained
in the instruction object, a new advancePC virtual function was added to the
StaticInst class. Subclasses provide an implementation that moves around the
right element of the PC with a minimal amount of decision making. In ISAs like
Alpha, the instructions always simply assign NPC to PC without having to worry
about micropcs, nnpcs, etc. The added cost of a virtual function call should
be outweighed by not having to figure out as much about what to do with the
PCs and mucking around with the extra elements.
One drawback of making the StaticInsts advance the PC is that you have to
actually have one to advance the PC. This would, superficially, seem to
require decoding an instruction before fetch could advance. This is, as far as
I can tell, realistic. fetch would advance through memory addresses, not PCs,
perhaps predicting new memory addresses using existing ones. More
sophisticated decisions about control flow would be made later on, after the
instruction was decoded, and handed back to fetch. If branching needs to
happen, some amount of decoding needs to happen to see that it's a branch,
what the target is, etc. This could get a little more complicated if that gets
done by the predecoder, but I'm choosing to ignore that for now.
Variable length instructions:
To handle variable length instructions in x86 and ARM, the predecoder now
takes in the current PC by reference to the getExtMachInst function. It can
modify the PC however it needs to (by setting NPC to be the PC + instruction
length, for instance). This could be improved since the CPU doesn't know if
the PC was modified and always has to write it back.
ISA parser:
To support the new API, all PC related operand types were removed from the
parser and replaced with a PCState type. There are two warts on this
implementation. First, as with all the other operand types, the PCState still
has to have a valid operand type even though it doesn't use it. Second, using
syntax like PCS.npc(target) doesn't work for two reasons, this looks like the
syntax for operand type overriding, and the parser can't figure out if you're
reading or writing. Instructions that use the PCS operand (which I've
consistently called it) need to first read it into a local variable,
manipulate it, and then write it back out.
Return address stack:
The return address stack needed a little extra help because, in the presence
of branch delay slots, it has to merge together elements of the return PC and
the call PC. To handle that, a buildRetPC utility function was added. There
are basically only two versions in all the ISAs, but it didn't seem short
enough to put into the generic ISA directory. Also, the branch predictor code
in O3 and InOrder were adjusted so that they always store the PC of the actual
call instruction in the RAS, not the next PC. If the call instruction is a
microop, the next PC refers to the next microop in the same macroop which is
probably not desirable. The buildRetPC function advances the PC intelligently
to the next macroop (in an ISA specific way) so that that case works.
Change in stats:
There was no change in stats except in MIPS and SPARC in the O3 model. MIPS
runs in about 9% fewer ticks. SPARC runs with 30%-50% fewer ticks, which could
likely be improved further by setting call/return instruction flags and taking
advantage of the RAS.
TODO:
Add != operators to the PCState classes, defined trivially to be !(a==b).
Smooth out places where PCs are split apart, passed around, and put back
together later. I think this might happen in SPARC's fault code. Add ISA
specific constructors that allow setting PC elements without calling a bunch
of accessors. Try to eliminate the need for the branching() function. Factor
out Alpha's PAL mode pc bit into a separate flag field, and eliminate places
where it's blindly masked out or tested in the PC.
2010-10-31 08:07:20 +01:00
|
|
|
storeWBIdx, inst->pcState(),
|
2006-12-16 13:34:34 +01:00
|
|
|
req->getPaddr(), (int)*(inst->memData),
|
2006-10-08 06:53:41 +02:00
|
|
|
inst->seqNum);
|
2006-04-23 00:26:48 +02:00
|
|
|
|
2006-06-09 17:46:35 +02:00
|
|
|
// @todo: Remove this SC hack once the memory system handles it.
|
2007-04-08 03:42:42 +02:00
|
|
|
if (inst->isStoreConditional()) {
|
2010-02-12 20:53:20 +01:00
|
|
|
assert(!storeQueue[storeWBIdx].isSplit);
|
2006-10-23 20:00:07 +02:00
|
|
|
// Disable recording the result temporarily. Writing to
|
|
|
|
// misc regs normally updates the result, but this is not
|
|
|
|
// the desired behavior when handling store conditionals.
|
2012-06-05 07:23:09 +02:00
|
|
|
inst->recordResult(false);
|
2006-10-23 20:00:07 +02:00
|
|
|
bool success = TheISA::handleLockedWrite(inst.get(), req);
|
2012-06-05 07:23:09 +02:00
|
|
|
inst->recordResult(true);
|
2006-10-23 20:00:07 +02:00
|
|
|
|
|
|
|
if (!success) {
|
|
|
|
// Instantly complete this store.
|
|
|
|
DPRINTF(LSQUnit, "Store conditional [sn:%lli] failed. "
|
|
|
|
"Instantly completing it.\n",
|
|
|
|
inst->seqNum);
|
|
|
|
WritebackEvent *wb = new WritebackEvent(inst, data_pkt, this);
|
2011-01-08 06:50:29 +01:00
|
|
|
cpu->schedule(wb, curTick() + 1);
|
2012-03-09 15:59:27 +01:00
|
|
|
if (cpu->checker) {
|
|
|
|
// Make sure to set the LLSC data for verification
|
|
|
|
// if checker is loaded
|
|
|
|
inst->reqToVerify->setExtraData(0);
|
|
|
|
inst->completeAcc(data_pkt);
|
|
|
|
}
|
2006-10-23 20:00:07 +02:00
|
|
|
completeStore(storeWBIdx);
|
|
|
|
incrStIdx(storeWBIdx);
|
|
|
|
continue;
|
2006-05-19 21:53:17 +02:00
|
|
|
}
|
2006-06-09 17:46:35 +02:00
|
|
|
} else {
|
|
|
|
// Non-store conditionals do not need a writeback.
|
|
|
|
state->noWB = true;
|
|
|
|
}
|
2006-06-06 00:14:39 +02:00
|
|
|
|
2011-08-01 04:21:17 +02:00
|
|
|
bool split =
|
|
|
|
TheISA::HasUnalignedMemAcc && storeQueue[storeWBIdx].isSplit;
|
|
|
|
|
|
|
|
ThreadContext *thread = cpu->tcBase(lsqID);
|
|
|
|
|
|
|
|
if (req->isMmappedIpr()) {
|
|
|
|
assert(!inst->isStoreConditional());
|
|
|
|
TheISA::handleIprWrite(thread, data_pkt);
|
|
|
|
delete data_pkt;
|
|
|
|
if (split) {
|
|
|
|
assert(snd_data_pkt->req->isMmappedIpr());
|
|
|
|
TheISA::handleIprWrite(thread, snd_data_pkt);
|
|
|
|
delete snd_data_pkt;
|
|
|
|
delete sreqLow;
|
|
|
|
delete sreqHigh;
|
|
|
|
}
|
|
|
|
delete state;
|
|
|
|
delete req;
|
|
|
|
completeStore(storeWBIdx);
|
|
|
|
incrStIdx(storeWBIdx);
|
|
|
|
} else if (!sendStore(data_pkt)) {
|
2007-03-23 16:33:08 +01:00
|
|
|
DPRINTF(IEW, "D-Cache became blocked when writing [sn:%lli], will"
|
2006-10-08 06:53:41 +02:00
|
|
|
"retry later\n",
|
|
|
|
inst->seqNum);
|
2010-02-12 20:53:20 +01:00
|
|
|
|
|
|
|
// Need to store the second packet, if split.
|
2011-08-01 04:21:17 +02:00
|
|
|
if (split) {
|
2010-02-12 20:53:20 +01:00
|
|
|
state->pktToSend = true;
|
|
|
|
state->pendingPacket = snd_data_pkt;
|
|
|
|
}
|
2006-06-09 17:46:35 +02:00
|
|
|
} else {
|
2010-02-12 20:53:20 +01:00
|
|
|
|
|
|
|
// If split, try to send the second packet too
|
2011-08-01 04:21:17 +02:00
|
|
|
if (split) {
|
2010-02-12 20:53:20 +01:00
|
|
|
assert(snd_data_pkt);
|
|
|
|
|
|
|
|
// Ensure there are enough ports to use.
|
|
|
|
if (usedPorts < cachePorts) {
|
|
|
|
++usedPorts;
|
|
|
|
if (sendStore(snd_data_pkt)) {
|
|
|
|
storePostSend(snd_data_pkt);
|
|
|
|
} else {
|
|
|
|
DPRINTF(IEW, "D-Cache became blocked when writing"
|
|
|
|
" [sn:%lli] second packet, will retry later\n",
|
|
|
|
inst->seqNum);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
|
|
|
|
// Store the packet for when there's free ports.
|
|
|
|
assert(pendingPkt == NULL);
|
|
|
|
pendingPkt = snd_data_pkt;
|
|
|
|
hasPendingPkt = true;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
|
|
|
|
// Not a split store.
|
|
|
|
storePostSend(data_pkt);
|
|
|
|
}
|
2006-04-23 00:26:48 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Not sure this should set it to 0.
|
|
|
|
usedPorts = 0;
|
|
|
|
|
|
|
|
assert(stores >= 0 && storesToWB >= 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*template <class Impl>
|
|
|
|
void
|
|
|
|
LSQUnit<Impl>::removeMSHR(InstSeqNum seqNum)
|
|
|
|
{
|
|
|
|
list<InstSeqNum>::iterator mshr_it = find(mshrSeqNums.begin(),
|
|
|
|
mshrSeqNums.end(),
|
|
|
|
seqNum);
|
|
|
|
|
|
|
|
if (mshr_it != mshrSeqNums.end()) {
|
|
|
|
mshrSeqNums.erase(mshr_it);
|
|
|
|
DPRINTF(LSQUnit, "Removing MSHR. count = %i\n",mshrSeqNums.size());
|
|
|
|
}
|
|
|
|
}*/
|
|
|
|
|
|
|
|
template <class Impl>
|
|
|
|
void
|
|
|
|
LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
|
|
|
|
{
|
|
|
|
DPRINTF(LSQUnit, "Squashing until [sn:%lli]!"
|
2006-05-19 21:53:17 +02:00
|
|
|
"(Loads:%i Stores:%i)\n", squashed_num, loads, stores);
|
2006-04-23 00:26:48 +02:00
|
|
|
|
|
|
|
int load_idx = loadTail;
|
|
|
|
decrLdIdx(load_idx);
|
|
|
|
|
|
|
|
while (loads != 0 && loadQueue[load_idx]->seqNum > squashed_num) {
|
ISA,CPU,etc: Create an ISA defined PC type that abstracts out ISA behaviors.
This change is a low level and pervasive reorganization of how PCs are managed
in M5. Back when Alpha was the only ISA, there were only 2 PCs to worry about,
the PC and the NPC, and the lsb of the PC signaled whether or not you were in
PAL mode. As other ISAs were added, we had to add an NNPC, micro PC and next
micropc, x86 and ARM introduced variable length instruction sets, and ARM
started to keep track of mode bits in the PC. Each CPU model handled PCs in
its own custom way that needed to be updated individually to handle the new
dimensions of variability, or, in the case of ARM's mode-bit-in-the-PC hack,
the complexity could be hidden in the ISA at the ISA implementation's expense.
Areas like the branch predictor hadn't been updated to handle branch delay
slots or micropcs, and it turns out that had introduced a significant (10s of
percent) performance bug in SPARC and to a lesser extent MIPS. Rather than
perpetuate the problem by reworking O3 again to handle the PC features needed
by x86, this change was introduced to rework PC handling in a more modular,
transparent, and hopefully efficient way.
PC type:
Rather than having the superset of all possible elements of PC state declared
in each of the CPU models, each ISA defines its own PCState type which has
exactly the elements it needs. A cross product of canned PCState classes are
defined in the new "generic" ISA directory for ISAs with/without delay slots
and microcode. These are either typedef-ed or subclassed by each ISA. To read
or write this structure through a *Context, you use the new pcState() accessor
which reads or writes depending on whether it has an argument. If you just
want the address of the current or next instruction or the current micro PC,
you can get those through read-only accessors on either the PCState type or
the *Contexts. These are instAddr(), nextInstAddr(), and microPC(). Note the
move away from readPC. That name is ambiguous since it's not clear whether or
not it should be the actual address to fetch from, or if it should have extra
bits in it like the PAL mode bit. Each class is free to define its own
functions to get at whatever values it needs however it needs to to be used in
ISA specific code. Eventually Alpha's PAL mode bit could be moved out of the
PC and into a separate field like ARM.
These types can be reset to a particular pc (where npc = pc +
sizeof(MachInst), nnpc = npc + sizeof(MachInst), upc = 0, nupc = 1 as
appropriate), printed, serialized, and compared. There is a branching()
function which encapsulates code in the CPU models that checked if an
instruction branched or not. Exactly what that means in the context of branch
delay slots which can skip an instruction when not taken is ambiguous, and
ideally this function and its uses can be eliminated. PCStates also generally
know how to advance themselves in various ways depending on if they point at
an instruction, a microop, or the last microop of a macroop. More on that
later.
Ideally, accessing all the PCs at once when setting them will improve
performance of M5 even though more data needs to be moved around. This is
because often all the PCs need to be manipulated together, and by getting them
all at once you avoid multiple function calls. Also, the PCs of a particular
thread will have spatial locality in the cache. Previously they were grouped
by element in arrays which spread out accesses.
Advancing the PC:
The PCs were previously managed entirely by the CPU which had to know about PC
semantics, try to figure out which dimension to increment the PC in, what to
set NPC/NNPC, etc. These decisions are best left to the ISA in conjunction
with the PC type itself. Because most of the information about how to
increment the PC (mainly what type of instruction it refers to) is contained
in the instruction object, a new advancePC virtual function was added to the
StaticInst class. Subclasses provide an implementation that moves around the
right element of the PC with a minimal amount of decision making. In ISAs like
Alpha, the instructions always simply assign NPC to PC without having to worry
about micropcs, nnpcs, etc. The added cost of a virtual function call should
be outweighed by not having to figure out as much about what to do with the
PCs and mucking around with the extra elements.
One drawback of making the StaticInsts advance the PC is that you have to
actually have one to advance the PC. This would, superficially, seem to
require decoding an instruction before fetch could advance. This is, as far as
I can tell, realistic. fetch would advance through memory addresses, not PCs,
perhaps predicting new memory addresses using existing ones. More
sophisticated decisions about control flow would be made later on, after the
instruction was decoded, and handed back to fetch. If branching needs to
happen, some amount of decoding needs to happen to see that it's a branch,
what the target is, etc. This could get a little more complicated if that gets
done by the predecoder, but I'm choosing to ignore that for now.
Variable length instructions:
To handle variable length instructions in x86 and ARM, the predecoder now
takes in the current PC by reference to the getExtMachInst function. It can
modify the PC however it needs to (by setting NPC to be the PC + instruction
length, for instance). This could be improved since the CPU doesn't know if
the PC was modified and always has to write it back.
ISA parser:
To support the new API, all PC related operand types were removed from the
parser and replaced with a PCState type. There are two warts on this
implementation. First, as with all the other operand types, the PCState still
has to have a valid operand type even though it doesn't use it. Second, using
syntax like PCS.npc(target) doesn't work for two reasons, this looks like the
syntax for operand type overriding, and the parser can't figure out if you're
reading or writing. Instructions that use the PCS operand (which I've
consistently called it) need to first read it into a local variable,
manipulate it, and then write it back out.
Return address stack:
The return address stack needed a little extra help because, in the presence
of branch delay slots, it has to merge together elements of the return PC and
the call PC. To handle that, a buildRetPC utility function was added. There
are basically only two versions in all the ISAs, but it didn't seem short
enough to put into the generic ISA directory. Also, the branch predictor code
in O3 and InOrder were adjusted so that they always store the PC of the actual
call instruction in the RAS, not the next PC. If the call instruction is a
microop, the next PC refers to the next microop in the same macroop which is
probably not desirable. The buildRetPC function advances the PC intelligently
to the next macroop (in an ISA specific way) so that that case works.
Change in stats:
There was no change in stats except in MIPS and SPARC in the O3 model. MIPS
runs in about 9% fewer ticks. SPARC runs with 30%-50% fewer ticks, which could
likely be improved further by setting call/return instruction flags and taking
advantage of the RAS.
TODO:
Add != operators to the PCState classes, defined trivially to be !(a==b).
Smooth out places where PCs are split apart, passed around, and put back
together later. I think this might happen in SPARC's fault code. Add ISA
specific constructors that allow setting PC elements without calling a bunch
of accessors. Try to eliminate the need for the branching() function. Factor
out Alpha's PAL mode pc bit into a separate flag field, and eliminate places
where it's blindly masked out or tested in the PC.
2010-10-31 08:07:20 +01:00
|
|
|
DPRINTF(LSQUnit,"Load Instruction PC %s squashed, "
|
2006-04-23 00:26:48 +02:00
|
|
|
"[sn:%lli]\n",
|
ISA,CPU,etc: Create an ISA defined PC type that abstracts out ISA behaviors.
This change is a low level and pervasive reorganization of how PCs are managed
in M5. Back when Alpha was the only ISA, there were only 2 PCs to worry about,
the PC and the NPC, and the lsb of the PC signaled whether or not you were in
PAL mode. As other ISAs were added, we had to add an NNPC, micro PC and next
micropc, x86 and ARM introduced variable length instruction sets, and ARM
started to keep track of mode bits in the PC. Each CPU model handled PCs in
its own custom way that needed to be updated individually to handle the new
dimensions of variability, or, in the case of ARM's mode-bit-in-the-PC hack,
the complexity could be hidden in the ISA at the ISA implementation's expense.
Areas like the branch predictor hadn't been updated to handle branch delay
slots or micropcs, and it turns out that had introduced a significant (10s of
percent) performance bug in SPARC and to a lesser extent MIPS. Rather than
perpetuate the problem by reworking O3 again to handle the PC features needed
by x86, this change was introduced to rework PC handling in a more modular,
transparent, and hopefully efficient way.
PC type:
Rather than having the superset of all possible elements of PC state declared
in each of the CPU models, each ISA defines its own PCState type which has
exactly the elements it needs. A cross product of canned PCState classes are
defined in the new "generic" ISA directory for ISAs with/without delay slots
and microcode. These are either typedef-ed or subclassed by each ISA. To read
or write this structure through a *Context, you use the new pcState() accessor
which reads or writes depending on whether it has an argument. If you just
want the address of the current or next instruction or the current micro PC,
you can get those through read-only accessors on either the PCState type or
the *Contexts. These are instAddr(), nextInstAddr(), and microPC(). Note the
move away from readPC. That name is ambiguous since it's not clear whether or
not it should be the actual address to fetch from, or if it should have extra
bits in it like the PAL mode bit. Each class is free to define its own
functions to get at whatever values it needs however it needs to to be used in
ISA specific code. Eventually Alpha's PAL mode bit could be moved out of the
PC and into a separate field like ARM.
These types can be reset to a particular pc (where npc = pc +
sizeof(MachInst), nnpc = npc + sizeof(MachInst), upc = 0, nupc = 1 as
appropriate), printed, serialized, and compared. There is a branching()
function which encapsulates code in the CPU models that checked if an
instruction branched or not. Exactly what that means in the context of branch
delay slots which can skip an instruction when not taken is ambiguous, and
ideally this function and its uses can be eliminated. PCStates also generally
know how to advance themselves in various ways depending on if they point at
an instruction, a microop, or the last microop of a macroop. More on that
later.
Ideally, accessing all the PCs at once when setting them will improve
performance of M5 even though more data needs to be moved around. This is
because often all the PCs need to be manipulated together, and by getting them
all at once you avoid multiple function calls. Also, the PCs of a particular
thread will have spatial locality in the cache. Previously they were grouped
by element in arrays which spread out accesses.
Advancing the PC:
The PCs were previously managed entirely by the CPU which had to know about PC
semantics, try to figure out which dimension to increment the PC in, what to
set NPC/NNPC, etc. These decisions are best left to the ISA in conjunction
with the PC type itself. Because most of the information about how to
increment the PC (mainly what type of instruction it refers to) is contained
in the instruction object, a new advancePC virtual function was added to the
StaticInst class. Subclasses provide an implementation that moves around the
right element of the PC with a minimal amount of decision making. In ISAs like
Alpha, the instructions always simply assign NPC to PC without having to worry
about micropcs, nnpcs, etc. The added cost of a virtual function call should
be outweighed by not having to figure out as much about what to do with the
PCs and mucking around with the extra elements.
One drawback of making the StaticInsts advance the PC is that you have to
actually have one to advance the PC. This would, superficially, seem to
require decoding an instruction before fetch could advance. This is, as far as
I can tell, realistic. fetch would advance through memory addresses, not PCs,
perhaps predicting new memory addresses using existing ones. More
sophisticated decisions about control flow would be made later on, after the
instruction was decoded, and handed back to fetch. If branching needs to
happen, some amount of decoding needs to happen to see that it's a branch,
what the target is, etc. This could get a little more complicated if that gets
done by the predecoder, but I'm choosing to ignore that for now.
Variable length instructions:
To handle variable length instructions in x86 and ARM, the predecoder now
takes in the current PC by reference to the getExtMachInst function. It can
modify the PC however it needs to (by setting NPC to be the PC + instruction
length, for instance). This could be improved since the CPU doesn't know if
the PC was modified and always has to write it back.
ISA parser:
To support the new API, all PC related operand types were removed from the
parser and replaced with a PCState type. There are two warts on this
implementation. First, as with all the other operand types, the PCState still
has to have a valid operand type even though it doesn't use it. Second, using
syntax like PCS.npc(target) doesn't work for two reasons, this looks like the
syntax for operand type overriding, and the parser can't figure out if you're
reading or writing. Instructions that use the PCS operand (which I've
consistently called it) need to first read it into a local variable,
manipulate it, and then write it back out.
Return address stack:
The return address stack needed a little extra help because, in the presence
of branch delay slots, it has to merge together elements of the return PC and
the call PC. To handle that, a buildRetPC utility function was added. There
are basically only two versions in all the ISAs, but it didn't seem short
enough to put into the generic ISA directory. Also, the branch predictor code
in O3 and InOrder were adjusted so that they always store the PC of the actual
call instruction in the RAS, not the next PC. If the call instruction is a
microop, the next PC refers to the next microop in the same macroop which is
probably not desirable. The buildRetPC function advances the PC intelligently
to the next macroop (in an ISA specific way) so that that case works.
Change in stats:
There were no changes in stats except in MIPS and SPARC in the O3 model. MIPS
runs in about 9% fewer ticks. SPARC runs with 30%-50% fewer ticks, which could
likely be improved further by setting call/return instruction flags and taking
advantage of the RAS.
TODO:
Add != operators to the PCState classes, defined trivially to be !(a==b).
Smooth out places where PCs are split apart, passed around, and put back
together later. I think this might happen in SPARC's fault code. Add ISA
specific constructors that allow setting PC elements without calling a bunch
of accessors. Try to eliminate the need for the branching() function. Factor
out Alpha's PAL mode pc bit into a separate flag field, and eliminate places
where it's blindly masked out or tested in the PC.
2010-10-31 08:07:20 +01:00
|
|
|
loadQueue[load_idx]->pcState(),
|
2006-04-23 00:26:48 +02:00
|
|
|
loadQueue[load_idx]->seqNum);
|
|
|
|
|
|
|
|
if (isStalled() && load_idx == stallingLoadIdx) {
|
|
|
|
stalled = false;
|
|
|
|
stallingStoreIsn = 0;
|
|
|
|
stallingLoadIdx = 0;
|
|
|
|
}
|
|
|
|
|
2006-05-19 21:53:17 +02:00
|
|
|
// Clear the smart pointer to make sure it is decremented.
|
2006-06-14 19:12:41 +02:00
|
|
|
loadQueue[load_idx]->setSquashed();
|
2006-04-23 00:26:48 +02:00
|
|
|
loadQueue[load_idx] = NULL;
|
|
|
|
--loads;
|
|
|
|
|
|
|
|
// Inefficient!
|
|
|
|
loadTail = load_idx;
|
|
|
|
|
|
|
|
decrLdIdx(load_idx);
|
2006-06-14 04:35:05 +02:00
|
|
|
++lsqSquashedLoads;
|
2006-04-23 00:26:48 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
if (isLoadBlocked) {
|
|
|
|
if (squashed_num < blockedLoadSeqNum) {
|
|
|
|
isLoadBlocked = false;
|
|
|
|
loadBlockedHandled = false;
|
|
|
|
blockedLoadSeqNum = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2007-03-23 16:33:08 +01:00
|
|
|
if (memDepViolator && squashed_num < memDepViolator->seqNum) {
|
|
|
|
memDepViolator = NULL;
|
|
|
|
}
|
|
|
|
|
2006-04-23 00:26:48 +02:00
|
|
|
int store_idx = storeTail;
|
|
|
|
decrStIdx(store_idx);
|
|
|
|
|
|
|
|
while (stores != 0 &&
|
|
|
|
storeQueue[store_idx].inst->seqNum > squashed_num) {
|
2006-05-19 21:53:17 +02:00
|
|
|
// Instructions marked as can WB are already committed.
|
2006-04-23 00:26:48 +02:00
|
|
|
if (storeQueue[store_idx].canWB) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
ISA,CPU,etc: Create an ISA defined PC type that abstracts out ISA behaviors.
This change is a low level and pervasive reorganization of how PCs are managed
in M5. Back when Alpha was the only ISA, there were only 2 PCs to worry about,
the PC and the NPC, and the lsb of the PC signaled whether or not you were in
PAL mode. As other ISAs were added, we had to add an NNPC, micro PC and next
micropc, x86 and ARM introduced variable length instruction sets, and ARM
started to keep track of mode bits in the PC. Each CPU model handled PCs in
its own custom way that needed to be updated individually to handle the new
dimensions of variability, or, in the case of ARMs mode-bit-in-the-pc hack,
the complexity could be hidden in the ISA at the ISA implementation's expense.
Areas like the branch predictor hadn't been updated to handle branch delay
slots or micropcs, and it turns out that had introduced a significant (10s of
percent) performance bug in SPARC and to a lesser extent MIPS. Rather than
perpetuate the problem by reworking O3 again to handle the PC features needed
by x86, this change was introduced to rework PC handling in a more modular,
transparent, and hopefully efficient way.
PC type:
Rather than having the superset of all possible elements of PC state declared
in each of the CPU models, each ISA defines its own PCState type which has
exactly the elements it needs. A cross product of canned PCState classes are
defined in the new "generic" ISA directory for ISAs with/without delay slots
and microcode. These are either typedef-ed or subclassed by each ISA. To read
or write this structure through a *Context, you use the new pcState() accessor
which reads or writes depending on whether it has an argument. If you just
want the address of the current or next instruction or the current micro PC,
you can get those through read-only accessors on either the PCState type or
the *Contexts. These are instAddr(), nextInstAddr(), and microPC(). Note the
move away from readPC. That name is ambiguous since it's not clear whether or
not it should be the actual address to fetch from, or if it should have extra
bits in it like the PAL mode bit. Each class is free to define its own
functions to get at whatever values it needs however it needs to to be used in
ISA specific code. Eventually Alpha's PAL mode bit could be moved out of the
PC and into a separate field like ARM.
These types can be reset to a particular pc (where npc = pc +
sizeof(MachInst), nnpc = npc + sizeof(MachInst), upc = 0, nupc = 1 as
appropriate), printed, serialized, and compared. There is a branching()
function which encapsulates code in the CPU models that checked if an
instruction branched or not. Exactly what that means in the context of branch
delay slots which can skip an instruction when not taken is ambiguous, and
ideally this function and its uses can be eliminated. PCStates also generally
know how to advance themselves in various ways depending on if they point at
an instruction, a microop, or the last microop of a macroop. More on that
later.
Ideally, accessing all the PCs at once when setting them will improve
performance of M5 even though more data needs to be moved around. This is
because often all the PCs need to be manipulated together, and by getting them
all at once you avoid multiple function calls. Also, the PCs of a particular
thread will have spatial locality in the cache. Previously they were grouped
by element in arrays which spread out accesses.
Advancing the PC:
The PCs were previously managed entirely by the CPU which had to know about PC
semantics, try to figure out which dimension to increment the PC in, what to
set NPC/NNPC, etc. These decisions are best left to the ISA in conjunction
with the PC type itself. Because most of the information about how to
increment the PC (mainly what type of instruction it refers to) is contained
in the instruction object, a new advancePC virtual function was added to the
StaticInst class. Subclasses provide an implementation that moves around the
right element of the PC with a minimal amount of decision making. In ISAs like
Alpha, the instructions always simply assign NPC to PC without having to worry
about micropcs, nnpcs, etc. The added cost of a virtual function call should
be outweighed by not having to figure out as much about what to do with the
PCs and mucking around with the extra elements.
One drawback of making the StaticInsts advance the PC is that you have to
actually have one to advance the PC. This would, superficially, seem to
require decoding an instruction before fetch could advance. This is, as far as
I can tell, realistic. fetch would advance through memory addresses, not PCs,
perhaps predicting new memory addresses using existing ones. More
sophisticated decisions about control flow would be made later on, after the
instruction was decoded, and handed back to fetch. If branching needs to
happen, some amount of decoding needs to happen to see that it's a branch,
what the target is, etc. This could get a little more complicated if that gets
done by the predecoder, but I'm choosing to ignore that for now.
Variable length instructions:
To handle variable length instructions in x86 and ARM, the predecoder now
takes in the current PC by reference to the getExtMachInst function. It can
modify the PC however it needs to (by setting NPC to be the PC + instruction
length, for instance). This could be improved since the CPU doesn't know if
the PC was modified and always has to write it back.
ISA parser:
To support the new API, all PC related operand types were removed from the
parser and replaced with a PCState type. There are two warts on this
implementation. First, as with all the other operand types, the PCState still
has to have a valid operand type even though it doesn't use it. Second, using
syntax like PCS.npc(target) doesn't work for two reasons, this looks like the
syntax for operand type overriding, and the parser can't figure out if you're
reading or writing. Instructions that use the PCS operand (which I've
consistently called it) need to first read it into a local variable,
manipulate it, and then write it back out.
Return address stack:
The return address stack needed a little extra help because, in the presence
of branch delay slots, it has to merge together elements of the return PC and
the call PC. To handle that, a buildRetPC utility function was added. There
are basically only two versions in all the ISAs, but it didn't seem short
enough to put into the generic ISA directory. Also, the branch predictor code
in O3 and InOrder were adjusted so that they always store the PC of the actual
call instruction in the RAS, not the next PC. If the call instruction is a
microop, the next PC refers to the next microop in the same macroop which is
probably not desirable. The buildRetPC function advances the PC intelligently
to the next macroop (in an ISA specific way) so that that case works.
Change in stats:
There were no changes in stats except in MIPS and SPARC in the O3 model. MIPS
runs in about 9% fewer ticks. SPARC runs with 30%-50% fewer ticks, which could
likely be improved further by setting call/return instruction flags and taking
advantage of the RAS.
TODO:
Add != operators to the PCState classes, defined trivially to be !(a==b).
Smooth out places where PCs are split apart, passed around, and put back
together later. I think this might happen in SPARC's fault code. Add ISA
specific constructors that allow setting PC elements without calling a bunch
of accessors. Try to eliminate the need for the branching() function. Factor
out Alpha's PAL mode pc bit into a separate flag field, and eliminate places
where it's blindly masked out or tested in the PC.
2010-10-31 08:07:20 +01:00
|
|
|
DPRINTF(LSQUnit,"Store Instruction PC %s squashed, "
|
2006-04-23 00:26:48 +02:00
|
|
|
"idx:%i [sn:%lli]\n",
|
ISA,CPU,etc: Create an ISA defined PC type that abstracts out ISA behaviors.
This change is a low level and pervasive reorganization of how PCs are managed
in M5. Back when Alpha was the only ISA, there were only 2 PCs to worry about,
the PC and the NPC, and the lsb of the PC signaled whether or not you were in
PAL mode. As other ISAs were added, we had to add an NNPC, micro PC and next
micropc, x86 and ARM introduced variable length instruction sets, and ARM
started to keep track of mode bits in the PC. Each CPU model handled PCs in
its own custom way that needed to be updated individually to handle the new
dimensions of variability, or, in the case of ARMs mode-bit-in-the-pc hack,
the complexity could be hidden in the ISA at the ISA implementation's expense.
Areas like the branch predictor hadn't been updated to handle branch delay
slots or micropcs, and it turns out that had introduced a significant (10s of
percent) performance bug in SPARC and to a lesser extent MIPS. Rather than
perpetuate the problem by reworking O3 again to handle the PC features needed
by x86, this change was introduced to rework PC handling in a more modular,
transparent, and hopefully efficient way.
PC type:
Rather than having the superset of all possible elements of PC state declared
in each of the CPU models, each ISA defines its own PCState type which has
exactly the elements it needs. A cross product of canned PCState classes are
defined in the new "generic" ISA directory for ISAs with/without delay slots
and microcode. These are either typedef-ed or subclassed by each ISA. To read
or write this structure through a *Context, you use the new pcState() accessor
which reads or writes depending on whether it has an argument. If you just
want the address of the current or next instruction or the current micro PC,
you can get those through read-only accessors on either the PCState type or
the *Contexts. These are instAddr(), nextInstAddr(), and microPC(). Note the
move away from readPC. That name is ambiguous since it's not clear whether or
not it should be the actual address to fetch from, or if it should have extra
bits in it like the PAL mode bit. Each class is free to define its own
functions to get at whatever values it needs however it needs to to be used in
ISA specific code. Eventually Alpha's PAL mode bit could be moved out of the
PC and into a separate field like ARM.
These types can be reset to a particular pc (where npc = pc +
sizeof(MachInst), nnpc = npc + sizeof(MachInst), upc = 0, nupc = 1 as
appropriate), printed, serialized, and compared. There is a branching()
function which encapsulates code in the CPU models that checked if an
instruction branched or not. Exactly what that means in the context of branch
delay slots which can skip an instruction when not taken is ambiguous, and
ideally this function and its uses can be eliminated. PCStates also generally
know how to advance themselves in various ways depending on if they point at
an instruction, a microop, or the last microop of a macroop. More on that
later.
Ideally, accessing all the PCs at once when setting them will improve
performance of M5 even though more data needs to be moved around. This is
because often all the PCs need to be manipulated together, and by getting them
all at once you avoid multiple function calls. Also, the PCs of a particular
thread will have spatial locality in the cache. Previously they were grouped
by element in arrays which spread out accesses.
Advancing the PC:
The PCs were previously managed entirely by the CPU which had to know about PC
semantics, try to figure out which dimension to increment the PC in, what to
set NPC/NNPC, etc. These decisions are best left to the ISA in conjunction
with the PC type itself. Because most of the information about how to
increment the PC (mainly what type of instruction it refers to) is contained
in the instruction object, a new advancePC virtual function was added to the
StaticInst class. Subclasses provide an implementation that moves around the
right element of the PC with a minimal amount of decision making. In ISAs like
Alpha, the instructions always simply assign NPC to PC without having to worry
about micropcs, nnpcs, etc. The added cost of a virtual function call should
be outweighed by not having to figure out as much about what to do with the
PCs and mucking around with the extra elements.
One drawback of making the StaticInsts advance the PC is that you have to
actually have one to advance the PC. This would, superficially, seem to
require decoding an instruction before fetch could advance. This is, as far as
I can tell, realistic. fetch would advance through memory addresses, not PCs,
perhaps predicting new memory addresses using existing ones. More
sophisticated decisions about control flow would be made later on, after the
instruction was decoded, and handed back to fetch. If branching needs to
happen, some amount of decoding needs to happen to see that it's a branch,
what the target is, etc. This could get a little more complicated if that gets
done by the predecoder, but I'm choosing to ignore that for now.
Variable length instructions:
To handle variable length instructions in x86 and ARM, the predecoder now
takes in the current PC by reference to the getExtMachInst function. It can
modify the PC however it needs to (by setting NPC to be the PC + instruction
length, for instance). This could be improved since the CPU doesn't know if
the PC was modified and always has to write it back.
ISA parser:
To support the new API, all PC related operand types were removed from the
parser and replaced with a PCState type. There are two warts on this
implementation. First, as with all the other operand types, the PCState still
has to have a valid operand type even though it doesn't use it. Second, using
syntax like PCS.npc(target) doesn't work for two reasons, this looks like the
syntax for operand type overriding, and the parser can't figure out if you're
reading or writing. Instructions that use the PCS operand (which I've
consistently called it) need to first read it into a local variable,
manipulate it, and then write it back out.
Return address stack:
The return address stack needed a little extra help because, in the presence
of branch delay slots, it has to merge together elements of the return PC and
the call PC. To handle that, a buildRetPC utility function was added. There
are basically only two versions in all the ISAs, but it didn't seem short
enough to put into the generic ISA directory. Also, the branch predictor code
in O3 and InOrder were adjusted so that they always store the PC of the actual
call instruction in the RAS, not the next PC. If the call instruction is a
microop, the next PC refers to the next microop in the same macroop which is
probably not desirable. The buildRetPC function advances the PC intelligently
to the next macroop (in an ISA specific way) so that that case works.
Change in stats:
There were no changes in stats except in MIPS and SPARC in the O3 model. MIPS
runs in about 9% fewer ticks. SPARC runs with 30%-50% fewer ticks, which could
likely be improved further by setting call/return instruction flags and taking
advantage of the RAS.
TODO:
Add != operators to the PCState classes, defined trivially to be !(a==b).
Smooth out places where PCs are split apart, passed around, and put back
together later. I think this might happen in SPARC's fault code. Add ISA
specific constructors that allow setting PC elements without calling a bunch
of accessors. Try to eliminate the need for the branching() function. Factor
out Alpha's PAL mode pc bit into a separate flag field, and eliminate places
where it's blindly masked out or tested in the PC.
2010-10-31 08:07:20 +01:00
|
|
|
storeQueue[store_idx].inst->pcState(),
|
2006-04-23 00:26:48 +02:00
|
|
|
store_idx, storeQueue[store_idx].inst->seqNum);
|
|
|
|
|
2006-05-19 21:53:17 +02:00
|
|
|
// I don't think this can happen. It should have been cleared
|
|
|
|
// by the stalling load.
|
2006-04-23 00:26:48 +02:00
|
|
|
if (isStalled() &&
|
|
|
|
storeQueue[store_idx].inst->seqNum == stallingStoreIsn) {
|
|
|
|
panic("Is stalled should have been cleared by stalling load!\n");
|
|
|
|
stalled = false;
|
|
|
|
stallingStoreIsn = 0;
|
|
|
|
}
|
|
|
|
|
2006-05-19 21:53:17 +02:00
|
|
|
// Clear the smart pointer to make sure it is decremented.
|
2006-06-14 19:12:41 +02:00
|
|
|
storeQueue[store_idx].inst->setSquashed();
|
2006-04-23 00:26:48 +02:00
|
|
|
storeQueue[store_idx].inst = NULL;
|
|
|
|
storeQueue[store_idx].canWB = 0;
|
|
|
|
|
2007-03-23 16:33:08 +01:00
|
|
|
// Must delete request now that it wasn't handed off to
|
|
|
|
// memory. This is quite ugly. @todo: Figure out the proper
|
|
|
|
// place to really handle request deletes.
|
|
|
|
delete storeQueue[store_idx].req;
|
2010-02-12 20:53:20 +01:00
|
|
|
if (TheISA::HasUnalignedMemAcc && storeQueue[store_idx].isSplit) {
|
|
|
|
delete storeQueue[store_idx].sreqLow;
|
|
|
|
delete storeQueue[store_idx].sreqHigh;
|
|
|
|
|
|
|
|
storeQueue[store_idx].sreqLow = NULL;
|
|
|
|
storeQueue[store_idx].sreqHigh = NULL;
|
|
|
|
}
|
2007-03-23 16:33:08 +01:00
|
|
|
|
2006-04-23 00:26:48 +02:00
|
|
|
storeQueue[store_idx].req = NULL;
|
|
|
|
--stores;
|
|
|
|
|
|
|
|
// Inefficient!
|
|
|
|
storeTail = store_idx;
|
|
|
|
|
|
|
|
decrStIdx(store_idx);
|
2006-06-14 04:35:05 +02:00
|
|
|
++lsqSquashedStores;
|
2006-04-23 00:26:48 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-06-09 17:46:35 +02:00
|
|
|
template <class Impl>
|
|
|
|
void
|
2006-10-20 09:10:12 +02:00
|
|
|
LSQUnit<Impl>::storePostSend(PacketPtr pkt)
|
2006-06-09 17:46:35 +02:00
|
|
|
{
|
|
|
|
if (isStalled() &&
|
|
|
|
storeQueue[storeWBIdx].inst->seqNum == stallingStoreIsn) {
|
|
|
|
DPRINTF(LSQUnit, "Unstalling, stalling store [sn:%lli] "
|
|
|
|
"load idx:%i\n",
|
|
|
|
stallingStoreIsn, stallingLoadIdx);
|
|
|
|
stalled = false;
|
|
|
|
stallingStoreIsn = 0;
|
|
|
|
iewStage->replayMemInst(loadQueue[stallingLoadIdx]);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!storeQueue[storeWBIdx].inst->isStoreConditional()) {
|
|
|
|
// The store is basically completed at this time. This
|
|
|
|
// only works so long as the checker doesn't try to
|
|
|
|
// verify the value in memory for stores.
|
|
|
|
storeQueue[storeWBIdx].inst->setCompleted();
|
2012-03-09 15:59:27 +01:00
|
|
|
|
2006-06-09 17:46:35 +02:00
|
|
|
if (cpu->checker) {
|
2006-06-16 19:10:47 +02:00
|
|
|
cpu->checker->verify(storeQueue[storeWBIdx].inst);
|
2006-06-09 17:46:35 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-01-29 02:09:04 +01:00
|
|
|
if (needsTSO) {
|
|
|
|
storeInFlight = true;
|
|
|
|
}
|
|
|
|
|
2006-06-09 17:46:35 +02:00
|
|
|
incrStIdx(storeWBIdx);
|
|
|
|
}
|
|
|
|
|
2006-06-06 00:14:39 +02:00
|
|
|
template <class Impl>
|
|
|
|
void
|
|
|
|
LSQUnit<Impl>::writeback(DynInstPtr &inst, PacketPtr pkt)
|
|
|
|
{
|
|
|
|
iewStage->wakeCPU();
|
|
|
|
|
|
|
|
// Squashed instructions do not need to complete their access.
|
|
|
|
if (inst->isSquashed()) {
|
2006-07-19 21:28:02 +02:00
|
|
|
iewStage->decrWb(inst->seqNum);
|
2006-06-06 00:14:39 +02:00
|
|
|
assert(!inst->isStore());
|
2006-06-14 04:35:05 +02:00
|
|
|
++lsqIgnoredResponses;
|
2006-06-06 00:14:39 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!inst->isExecuted()) {
|
|
|
|
inst->setExecuted();
|
|
|
|
|
|
|
|
// Complete access to copy data to proper place.
|
|
|
|
inst->completeAcc(pkt);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Need to insert instruction into queue to commit
|
|
|
|
iewStage->instToCommit(inst);
|
|
|
|
|
|
|
|
iewStage->activityThisCycle();
|
2010-08-23 18:18:40 +02:00
|
|
|
|
|
|
|
// see if this load changed the PC
|
|
|
|
iewStage->checkMisprediction(inst);
|
2006-06-06 00:14:39 +02:00
|
|
|
}
|
|
|
|
|
2006-04-23 00:26:48 +02:00
|
|
|
template <class Impl>
|
|
|
|
void
|
|
|
|
LSQUnit<Impl>::completeStore(int store_idx)
|
|
|
|
{
|
|
|
|
assert(storeQueue[store_idx].inst);
|
|
|
|
storeQueue[store_idx].completed = true;
|
|
|
|
--storesToWB;
|
|
|
|
// A bit conservative because a store completion may not free up entries,
|
|
|
|
// but hopefully avoids two store completions in one cycle from making
|
|
|
|
// the CPU tick twice.
|
2006-10-02 17:58:09 +02:00
|
|
|
cpu->wakeCPU();
|
2006-04-23 00:26:48 +02:00
|
|
|
cpu->activityThisCycle();
|
|
|
|
|
|
|
|
if (store_idx == storeHead) {
|
|
|
|
do {
|
|
|
|
incrStIdx(storeHead);
|
|
|
|
|
|
|
|
--stores;
|
|
|
|
} while (storeQueue[storeHead].completed &&
|
|
|
|
storeHead != storeTail);
|
|
|
|
|
|
|
|
iewStage->updateLSQNextCycle = true;
|
|
|
|
}
|
|
|
|
|
2006-05-19 21:53:17 +02:00
|
|
|
DPRINTF(LSQUnit, "Completing store [sn:%lli], idx:%i, store head "
|
|
|
|
"idx:%i\n",
|
|
|
|
storeQueue[store_idx].inst->seqNum, store_idx, storeHead);
|
2006-04-23 00:26:48 +02:00
|
|
|
|
|
|
|
if (isStalled() &&
|
|
|
|
storeQueue[store_idx].inst->seqNum == stallingStoreIsn) {
|
|
|
|
DPRINTF(LSQUnit, "Unstalling, stalling store [sn:%lli] "
|
|
|
|
"load idx:%i\n",
|
|
|
|
stallingStoreIsn, stallingLoadIdx);
|
|
|
|
stalled = false;
|
|
|
|
stallingStoreIsn = 0;
|
|
|
|
iewStage->replayMemInst(loadQueue[stallingLoadIdx]);
|
|
|
|
}
|
2006-05-16 20:06:35 +02:00
|
|
|
|
|
|
|
storeQueue[store_idx].inst->setCompleted();
|
2006-05-19 21:53:17 +02:00
|
|
|
|
2012-01-29 02:09:04 +01:00
|
|
|
if (needsTSO) {
|
|
|
|
storeInFlight = false;
|
|
|
|
}
|
|
|
|
|
2006-05-19 21:53:17 +02:00
|
|
|
// Tell the checker we've completed this instruction. Some stores
|
|
|
|
// may get reported twice to the checker, but the checker can
|
|
|
|
// handle that case.
|
2006-05-16 20:06:35 +02:00
|
|
|
if (cpu->checker) {
|
2006-06-16 19:10:47 +02:00
|
|
|
cpu->checker->verify(storeQueue[store_idx].inst);
|
2006-05-16 20:06:35 +02:00
|
|
|
}
|
2006-04-23 00:26:48 +02:00
|
|
|
}
|
|
|
|
|
2010-02-12 20:53:20 +01:00
|
|
|
template <class Impl>
|
|
|
|
bool
|
|
|
|
LSQUnit<Impl>::sendStore(PacketPtr data_pkt)
|
|
|
|
{
|
MEM: Separate requests and responses for timing accesses
This patch moves send/recvTiming and send/recvTimingSnoop from the
Port base class to the MasterPort and SlavePort, and also splits them
into separate member functions for requests and responses:
send/recvTimingReq, send/recvTimingResp, and send/recvTimingSnoopReq,
send/recvTimingSnoopResp. A master port sends requests and receives
responses, and also receives snoop requests and sends snoop
responses. A slave port has the reciprocal behaviour as it receives
requests and sends responses, and sends snoop requests and receives
snoop responses.
For all MemObjects that have only master ports or slave ports (but not
both), e.g. a CPU, or a PIO device, this patch merely adds more
clarity to what kind of access is taking place. For example, a CPU
port used to call sendTiming, and will now call
sendTimingReq. Similarly, a response previously came back through
recvTiming, which is now recvTimingResp. For the modules that have
both master and slave ports, e.g. the bus, the behaviour was
previously relying on branches based on pkt->isRequest(), and this is
now replaced with a direct call to the apprioriate member function
depending on the type of access. Please note that send/recvRetry is
still shared by all the timing accessors and remains in the Port base
class for now (to maintain the current bus functionality and avoid
changing the statistics of all regressions).
The packet queue is split into a MasterPort and SlavePort version to
facilitate the use of the new timing accessors. All uses of the
PacketQueue are updated accordingly.
With this patch, the type of packet (request or response) is now well
defined for each type of access, and asserts on pkt->isRequest() and
pkt->isResponse() are now moved to the appropriate send member
functions. It is also worth noting that sendTimingSnoopReq no longer
returns a boolean, as the semantics do not alow snoop requests to be
rejected or stalled. All these assumptions are now excplicitly part of
the port interface itself.
2012-05-01 19:40:42 +02:00
|
|
|
if (!dcachePort->sendTimingReq(data_pkt)) {
|
2010-02-12 20:53:20 +01:00
|
|
|
// Need to handle becoming blocked on a store.
|
|
|
|
isStoreBlocked = true;
|
|
|
|
++lsqCacheBlocked;
|
|
|
|
assert(retryPkt == NULL);
|
|
|
|
retryPkt = data_pkt;
|
|
|
|
lsq->setRetryTid(lsqID);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2006-06-09 17:46:35 +02:00
|
|
|
template <class Impl>
|
|
|
|
void
|
|
|
|
LSQUnit<Impl>::recvRetry()
|
|
|
|
{
|
|
|
|
if (isStoreBlocked) {
|
2007-08-22 01:16:56 +02:00
|
|
|
DPRINTF(LSQUnit, "Receiving retry: store blocked\n");
|
2006-06-09 22:28:17 +02:00
|
|
|
assert(retryPkt != NULL);
|
|
|
|
|
2011-09-26 19:18:32 +02:00
|
|
|
LSQSenderState *state =
|
|
|
|
dynamic_cast<LSQSenderState *>(retryPkt->senderState);
|
2010-02-12 20:53:20 +01:00
|
|
|
|
MEM: Separate requests and responses for timing accesses
This patch moves send/recvTiming and send/recvTimingSnoop from the
Port base class to the MasterPort and SlavePort, and also splits them
into separate member functions for requests and responses:
send/recvTimingReq, send/recvTimingResp, and send/recvTimingSnoopReq,
send/recvTimingSnoopResp. A master port sends requests and receives
responses, and also receives snoop requests and sends snoop
responses. A slave port has the reciprocal behaviour as it receives
requests and sends responses, and sends snoop requests and receives
snoop responses.
For all MemObjects that have only master ports or slave ports (but not
both), e.g. a CPU, or a PIO device, this patch merely adds more
clarity to what kind of access is taking place. For example, a CPU
port used to call sendTiming, and will now call
sendTimingReq. Similarly, a response previously came back through
recvTiming, which is now recvTimingResp. For the modules that have
both master and slave ports, e.g. the bus, the behaviour was
previously relying on branches based on pkt->isRequest(), and this is
now replaced with a direct call to the appropriate member function
depending on the type of access. Please note that send/recvRetry is
still shared by all the timing accessors and remains in the Port base
class for now (to maintain the current bus functionality and avoid
changing the statistics of all regressions).
The packet queue is split into a MasterPort and SlavePort version to
facilitate the use of the new timing accessors. All uses of the
PacketQueue are updated accordingly.
With this patch, the type of packet (request or response) is now well
defined for each type of access, and asserts on pkt->isRequest() and
pkt->isResponse() are now moved to the appropriate send member
functions. It is also worth noting that sendTimingSnoopReq no longer
returns a boolean, as the semantics do not alow snoop requests to be
rejected or stalled. All these assumptions are now excplicitly part of
the port interface itself.
2012-05-01 19:40:42 +02:00
|
|
|
if (dcachePort->sendTimingReq(retryPkt)) {
|
2010-02-12 20:53:20 +01:00
|
|
|
// Don't finish the store unless this is the last packet.
|
2011-03-18 01:20:19 +01:00
|
|
|
if (!TheISA::HasUnalignedMemAcc || !state->pktToSend ||
|
|
|
|
state->pendingPacket == retryPkt) {
|
|
|
|
state->pktToSend = false;
|
2010-02-12 20:53:20 +01:00
|
|
|
storePostSend(retryPkt);
|
|
|
|
}
|
2006-06-09 23:21:37 +02:00
|
|
|
retryPkt = NULL;
|
2006-06-09 17:46:35 +02:00
|
|
|
isStoreBlocked = false;
|
2009-05-26 18:23:13 +02:00
|
|
|
lsq->setRetryTid(InvalidThreadID);
|
2010-02-12 20:53:20 +01:00
|
|
|
|
|
|
|
// Send any outstanding packet.
|
|
|
|
if (TheISA::HasUnalignedMemAcc && state->pktToSend) {
|
|
|
|
assert(state->pendingPacket);
|
|
|
|
if (sendStore(state->pendingPacket)) {
|
|
|
|
storePostSend(state->pendingPacket);
|
|
|
|
}
|
|
|
|
}
|
2006-06-09 17:46:35 +02:00
|
|
|
} else {
|
|
|
|
// Still blocked!
|
2006-06-14 04:35:05 +02:00
|
|
|
++lsqCacheBlocked;
|
2006-07-13 19:12:51 +02:00
|
|
|
lsq->setRetryTid(lsqID);
|
2006-06-09 17:46:35 +02:00
|
|
|
}
|
|
|
|
} else if (isLoadBlocked) {
|
|
|
|
DPRINTF(LSQUnit, "Loads squash themselves and all younger insts, "
|
|
|
|
"no need to resend packet.\n");
|
|
|
|
} else {
|
|
|
|
DPRINTF(LSQUnit, "Retry received but LSQ is no longer blocked.\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-04-23 00:26:48 +02:00
|
|
|
template <class Impl>
|
|
|
|
inline void
|
|
|
|
LSQUnit<Impl>::incrStIdx(int &store_idx)
|
|
|
|
{
|
|
|
|
if (++store_idx >= SQEntries)
|
|
|
|
store_idx = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
template <class Impl>
|
|
|
|
inline void
|
|
|
|
LSQUnit<Impl>::decrStIdx(int &store_idx)
|
|
|
|
{
|
|
|
|
if (--store_idx < 0)
|
|
|
|
store_idx += SQEntries;
|
|
|
|
}
|
|
|
|
|
|
|
|
template <class Impl>
|
|
|
|
inline void
|
|
|
|
LSQUnit<Impl>::incrLdIdx(int &load_idx)
|
|
|
|
{
|
|
|
|
if (++load_idx >= LQEntries)
|
|
|
|
load_idx = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
template <class Impl>
|
|
|
|
inline void
|
|
|
|
LSQUnit<Impl>::decrLdIdx(int &load_idx)
|
|
|
|
{
|
|
|
|
if (--load_idx < 0)
|
|
|
|
load_idx += LQEntries;
|
|
|
|
}
|
2006-05-19 21:53:17 +02:00
|
|
|
|
|
|
|
template <class Impl>
|
|
|
|
void
|
|
|
|
LSQUnit<Impl>::dumpInsts()
|
|
|
|
{
|
|
|
|
cprintf("Load store queue: Dumping instructions.\n");
|
|
|
|
cprintf("Load queue size: %i\n", loads);
|
|
|
|
cprintf("Load queue: ");
|
|
|
|
|
|
|
|
int load_idx = loadHead;
|
|
|
|
|
|
|
|
while (load_idx != loadTail && loadQueue[load_idx]) {
|
ISA,CPU,etc: Create an ISA defined PC type that abstracts out ISA behaviors.
This change is a low level and pervasive reorganization of how PCs are managed
in M5. Back when Alpha was the only ISA, there were only 2 PCs to worry about,
the PC and the NPC, and the lsb of the PC signaled whether or not you were in
PAL mode. As other ISAs were added, we had to add an NNPC, micro PC and next
micropc, x86 and ARM introduced variable length instruction sets, and ARM
started to keep track of mode bits in the PC. Each CPU model handled PCs in
its own custom way that needed to be updated individually to handle the new
dimensions of variability, or, in the case of ARMs mode-bit-in-the-pc hack,
the complexity could be hidden in the ISA at the ISA implementation's expense.
Areas like the branch predictor hadn't been updated to handle branch delay
slots or micropcs, and it turns out that had introduced a significant (10s of
percent) performance bug in SPARC and to a lesser extend MIPS. Rather than
perpetuate the problem by reworking O3 again to handle the PC features needed
by x86, this change was introduced to rework PC handling in a more modular,
transparent, and hopefully efficient way.
PC type:
Rather than having the superset of all possible elements of PC state declared
in each of the CPU models, each ISA defines its own PCState type which has
exactly the elements it needs. A cross product of canned PCState classes are
defined in the new "generic" ISA directory for ISAs with/without delay slots
and microcode. These are either typedef-ed or subclassed by each ISA. To read
or write this structure through a *Context, you use the new pcState() accessor
which reads or writes depending on whether it has an argument. If you just
want the address of the current or next instruction or the current micro PC,
you can get those through read-only accessors on either the PCState type or
the *Contexts. These are instAddr(), nextInstAddr(), and microPC(). Note the
move away from readPC. That name is ambiguous since it's not clear whether or
not it should be the actual address to fetch from, or if it should have extra
bits in it like the PAL mode bit. Each class is free to define its own
functions to get at whatever values it needs however it needs to to be used in
ISA specific code. Eventually Alpha's PAL mode bit could be moved out of the
PC and into a separate field like ARM.
These types can be reset to a particular pc (where npc = pc +
sizeof(MachInst), nnpc = npc + sizeof(MachInst), upc = 0, nupc = 1 as
appropriate), printed, serialized, and compared. There is a branching()
function which encapsulates code in the CPU models that checked if an
instruction branched or not. Exactly what that means in the context of branch
delay slots which can skip an instruction when not taken is ambiguous, and
ideally this function and its uses can be eliminated. PCStates also generally
know how to advance themselves in various ways depending on if they point at
an instruction, a microop, or the last microop of a macroop. More on that
later.
Ideally, accessing all the PCs at once when setting them will improve
performance of M5 even though more data needs to be moved around. This is
because often all the PCs need to be manipulated together, and by getting them
all at once you avoid multiple function calls. Also, the PCs of a particular
thread will have spatial locality in the cache. Previously they were grouped
by element in arrays which spread out accesses.
Advancing the PC:
The PCs were previously managed entirely by the CPU which had to know about PC
semantics, try to figure out which dimension to increment the PC in, what to
set NPC/NNPC, etc. These decisions are best left to the ISA in conjunction
with the PC type itself. Because most of the information about how to
increment the PC (mainly what type of instruction it refers to) is contained
in the instruction object, a new advancePC virtual function was added to the
StaticInst class. Subclasses provide an implementation that moves around the
right element of the PC with a minimal amount of decision making. In ISAs like
Alpha, the instructions always simply assign NPC to PC without having to worry
about micropcs, nnpcs, etc. The added cost of a virtual function call should
be outweighed by not having to figure out as much about what to do with the
PCs and mucking around with the extra elements.
One drawback of making the StaticInsts advance the PC is that you have to
actually have one to advance the PC. This would, superficially, seem to
require decoding an instruction before fetch could advance. This is, as far as
I can tell, realistic. fetch would advance through memory addresses, not PCs,
perhaps predicting new memory addresses using existing ones. More
sophisticated decisions about control flow would be made later on, after the
instruction was decoded, and handed back to fetch. If branching needs to
happen, some amount of decoding needs to happen to see that it's a branch,
what the target is, etc. This could get a little more complicated if that gets
done by the predecoder, but I'm choosing to ignore that for now.
Variable length instructions:
To handle variable length instructions in x86 and ARM, the predecoder now
takes in the current PC by reference to the getExtMachInst function. It can
modify the PC however it needs to (by setting NPC to be the PC + instruction
length, for instance). This could be improved since the CPU doesn't know if
the PC was modified and always has to write it back.
ISA parser:
To support the new API, all PC related operand types were removed from the
parser and replaced with a PCState type. There are two warts on this
implementation. First, as with all the other operand types, the PCState still
has to have a valid operand type even though it doesn't use it. Second, using
syntax like PCS.npc(target) doesn't work for two reasons, this looks like the
syntax for operand type overriding, and the parser can't figure out if you're
reading or writing. Instructions that use the PCS operand (which I've
consistently called it) need to first read it into a local variable,
manipulate it, and then write it back out.
Return address stack:
The return address stack needed a little extra help because, in the presence
of branch delay slots, it has to merge together elements of the return PC and
the call PC. To handle that, a buildRetPC utility function was added. There
are basically only two versions in all the ISAs, but it didn't seem short
enough to put into the generic ISA directory. Also, the branch predictor code
in O3 and InOrder were adjusted so that they always store the PC of the actual
call instruction in the RAS, not the next PC. If the call instruction is a
microop, the next PC refers to the next microop in the same macroop which is
probably not desirable. The buildRetPC function advances the PC intelligently
to the next macroop (in an ISA specific way) so that that case works.
Change in stats:
There were no change in stats except in MIPS and SPARC in the O3 model. MIPS
runs in about 9% fewer ticks. SPARC runs with 30%-50% fewer ticks, which could
likely be improved further by setting call/return instruction flags and taking
advantage of the RAS.
TODO:
Add != operators to the PCState classes, defined trivially to be !(a==b).
Smooth out places where PCs are split apart, passed around, and put back
together later. I think this might happen in SPARC's fault code. Add ISA
specific constructors that allow setting PC elements without calling a bunch
of accessors. Try to eliminate the need for the branching() function. Factor
out Alpha's PAL mode pc bit into a separate flag field, and eliminate places
where it's blindly masked out or tested in the PC.
2010-10-31 08:07:20 +01:00
|
|
|
cprintf("%s ", loadQueue[load_idx]->pcState());
|
2006-05-19 21:53:17 +02:00
|
|
|
|
|
|
|
incrLdIdx(load_idx);
|
|
|
|
}
|
|
|
|
|
|
|
|
cprintf("Store queue size: %i\n", stores);
|
|
|
|
cprintf("Store queue: ");
|
|
|
|
|
|
|
|
int store_idx = storeHead;
|
|
|
|
|
|
|
|
while (store_idx != storeTail && storeQueue[store_idx].inst) {
|
ISA,CPU,etc: Create an ISA defined PC type that abstracts out ISA behaviors.
This change is a low level and pervasive reorganization of how PCs are managed
in M5. Back when Alpha was the only ISA, there were only 2 PCs to worry about,
the PC and the NPC, and the lsb of the PC signaled whether or not you were in
PAL mode. As other ISAs were added, we had to add an NNPC, micro PC and next
micropc, x86 and ARM introduced variable length instruction sets, and ARM
started to keep track of mode bits in the PC. Each CPU model handled PCs in
its own custom way that needed to be updated individually to handle the new
dimensions of variability, or, in the case of ARMs mode-bit-in-the-pc hack,
the complexity could be hidden in the ISA at the ISA implementation's expense.
Areas like the branch predictor hadn't been updated to handle branch delay
slots or micropcs, and it turns out that had introduced a significant (10s of
percent) performance bug in SPARC and to a lesser extend MIPS. Rather than
perpetuate the problem by reworking O3 again to handle the PC features needed
by x86, this change was introduced to rework PC handling in a more modular,
transparent, and hopefully efficient way.
PC type:
Rather than having the superset of all possible elements of PC state declared
in each of the CPU models, each ISA defines its own PCState type which has
exactly the elements it needs. A cross product of canned PCState classes are
defined in the new "generic" ISA directory for ISAs with/without delay slots
and microcode. These are either typedef-ed or subclassed by each ISA. To read
or write this structure through a *Context, you use the new pcState() accessor
which reads or writes depending on whether it has an argument. If you just
want the address of the current or next instruction or the current micro PC,
you can get those through read-only accessors on either the PCState type or
the *Contexts. These are instAddr(), nextInstAddr(), and microPC(). Note the
move away from readPC. That name is ambiguous since it's not clear whether or
not it should be the actual address to fetch from, or if it should have extra
bits in it like the PAL mode bit. Each class is free to define its own
functions to get at whatever values it needs however it needs to to be used in
ISA specific code. Eventually Alpha's PAL mode bit could be moved out of the
PC and into a separate field like ARM.
These types can be reset to a particular pc (where npc = pc +
sizeof(MachInst), nnpc = npc + sizeof(MachInst), upc = 0, nupc = 1 as
appropriate), printed, serialized, and compared. There is a branching()
function which encapsulates code in the CPU models that checked if an
instruction branched or not. Exactly what that means in the context of branch
delay slots which can skip an instruction when not taken is ambiguous, and
ideally this function and its uses can be eliminated. PCStates also generally
know how to advance themselves in various ways depending on if they point at
an instruction, a microop, or the last microop of a macroop. More on that
later.
Ideally, accessing all the PCs at once when setting them will improve
performance of M5 even though more data needs to be moved around. This is
because often all the PCs need to be manipulated together, and by getting them
all at once you avoid multiple function calls. Also, the PCs of a particular
thread will have spatial locality in the cache. Previously they were grouped
by element in arrays which spread out accesses.
Advancing the PC:
The PCs were previously managed entirely by the CPU which had to know about PC
semantics, try to figure out which dimension to increment the PC in, what to
set NPC/NNPC, etc. These decisions are best left to the ISA in conjunction
with the PC type itself. Because most of the information about how to
increment the PC (mainly what type of instruction it refers to) is contained
in the instruction object, a new advancePC virtual function was added to the
StaticInst class. Subclasses provide an implementation that moves around the
right element of the PC with a minimal amount of decision making. In ISAs like
Alpha, the instructions always simply assign NPC to PC without having to worry
about micropcs, nnpcs, etc. The added cost of a virtual function call should
be outweighed by not having to figure out as much about what to do with the
PCs and mucking around with the extra elements.
One drawback of making the StaticInsts advance the PC is that you have to
actually have one to advance the PC. This would, superficially, seem to
require decoding an instruction before fetch could advance. This is, as far as
I can tell, realistic. fetch would advance through memory addresses, not PCs,
perhaps predicting new memory addresses using existing ones. More
sophisticated decisions about control flow would be made later on, after the
instruction was decoded, and handed back to fetch. If branching needs to
happen, some amount of decoding needs to happen to see that it's a branch,
what the target is, etc. This could get a little more complicated if that gets
done by the predecoder, but I'm choosing to ignore that for now.
Variable length instructions:
To handle variable length instructions in x86 and ARM, the predecoder now
takes in the current PC by reference to the getExtMachInst function. It can
modify the PC however it needs to (by setting NPC to be the PC + instruction
length, for instance). This could be improved since the CPU doesn't know if
the PC was modified and always has to write it back.
ISA parser:
To support the new API, all PC related operand types were removed from the
parser and replaced with a PCState type. There are two warts on this
implementation. First, as with all the other operand types, the PCState still
has to have a valid operand type even though it doesn't use it. Second, using
syntax like PCS.npc(target) doesn't work for two reasons, this looks like the
syntax for operand type overriding, and the parser can't figure out if you're
reading or writing. Instructions that use the PCS operand (which I've
consistently called it) need to first read it into a local variable,
manipulate it, and then write it back out.
Return address stack:
The return address stack needed a little extra help because, in the presence
of branch delay slots, it has to merge together elements of the return PC and
the call PC. To handle that, a buildRetPC utility function was added. There
are basically only two versions in all the ISAs, but it didn't seem short
enough to put into the generic ISA directory. Also, the branch predictor code
in O3 and InOrder were adjusted so that they always store the PC of the actual
call instruction in the RAS, not the next PC. If the call instruction is a
microop, the next PC refers to the next microop in the same macroop which is
probably not desirable. The buildRetPC function advances the PC intelligently
to the next macroop (in an ISA specific way) so that that case works.
Change in stats:
There were no change in stats except in MIPS and SPARC in the O3 model. MIPS
runs in about 9% fewer ticks. SPARC runs with 30%-50% fewer ticks, which could
likely be improved further by setting call/return instruction flags and taking
advantage of the RAS.
TODO:
Add != operators to the PCState classes, defined trivially to be !(a==b).
Smooth out places where PCs are split apart, passed around, and put back
together later. I think this might happen in SPARC's fault code. Add ISA
specific constructors that allow setting PC elements without calling a bunch
of accessors. Try to eliminate the need for the branching() function. Factor
out Alpha's PAL mode pc bit into a separate flag field, and eliminate places
where it's blindly masked out or tested in the PC.
2010-10-31 08:07:20 +01:00
|
|
|
cprintf("%s ", storeQueue[store_idx].inst->pcState());
|
2006-05-19 21:53:17 +02:00
|
|
|
|
|
|
|
incrStIdx(store_idx);
|
|
|
|
}
|
|
|
|
|
|
|
|
cprintf("\n");
|
|
|
|
}
|