ruby: patch checkpoint restore with garnet

Due to recent changes to the clocking system in Ruby and the way Ruby restores
state from a checkpoint, Garnet was failing to run from a checkpointed state.
The problem is that Ruby resets the time to zero while warming up the caches.
If any component records a local copy of the time (i.e., calls curCycle())
before the simulation has started, then that component will not operate until
that time is reached. In the context of this particular patch, the Garnet
Network class calls curCycle() at multiple places. Any non-operational
component can block requests in the memory system, which the system
interprets as a deadlock. This patch makes changes so that Garnet can
successfully run from a checkpointed state.

It adds a globally visible time at which the actual execution started. This
time is initialized in the RubySystem::startup() function. This variable is
only meant for components within Ruby. It replaces the private variable that
was maintained within Garnet, since it is not possible to figure out the
correct time at which the value of that variable should be set.

The patch also does away with all cases where curCycle() is called within
some Ruby component before the system has actually started executing. This
is required due to the quirky manner in which Ruby restores from a checkpoint.
This commit is contained in:
Nilay Vaish 2013-04-23 00:03:02 -05:00
parent e23e3bea8b
commit aa86800e7a
18 changed files with 44 additions and 50 deletions

View file

@ -32,3 +32,4 @@ using namespace std;
RubySystem* g_system_ptr = 0;
vector<map<uint32_t, AbstractController *> > g_abs_controls;
Cycles g_ruby_start;

View file

@ -33,6 +33,7 @@
#include <vector>
#include "base/str.hh"
#include "base/types.hh"
class RubySystem;
extern RubySystem* g_system_ptr;
@ -40,5 +41,9 @@ extern RubySystem* g_system_ptr;
class AbstractController;
extern std::vector<std::map<uint32_t, AbstractController *> > g_abs_controls;
// A globally visible time at which the actual execution started. Meant only
// for components within Ruby. Initialized in RubySystem::startup().
extern Cycles g_ruby_start;
#endif // __MEM_RUBY_COMMON_GLOBAL_HH__

View file

@ -36,7 +36,7 @@
using namespace std;
BaseGarnetNetwork::BaseGarnetNetwork(const Params *p)
: Network(p), m_ruby_start(0)
: Network(p)
{
m_ni_flit_size = p->ni_flit_size;
m_vcs_per_vnet = p->vcs_per_vnet;
@ -123,13 +123,6 @@ BaseGarnetNetwork::getFromNetQueue(NodeID id, bool ordered, int network_num,
void
BaseGarnetNetwork::clearStats()
{
m_ruby_start = curCycle();
}
Cycles
BaseGarnetNetwork::getRubyStartTime()
{
return m_ruby_start;
}
void

View file

@ -80,7 +80,6 @@ class BaseGarnetNetwork : public Network
virtual void checkNetworkAllocation(NodeID id, bool ordered,
int network_num, std::string vnet_type) = 0;
Cycles getRubyStartTime();
void clearStats();
void printStats(std::ostream& out) const;
void printPerformanceStats(std::ostream& out) const;
@ -102,8 +101,6 @@ class BaseGarnetNetwork : public Network
std::vector<std::vector<MessageBuffer*> > m_toNetQueues;
std::vector<std::vector<MessageBuffer*> > m_fromNetQueues;
Cycles m_ruby_start;
};
#endif // __MEM_RUBY_NETWORK_GARNET_BASEGARNETNETWORK_HH__

View file

@ -270,7 +270,7 @@ GarnetNetwork_d::printLinkStats(ostream& out) const
for (int i = 0; i < m_link_ptr_vector.size(); i++) {
average_link_utilization +=
(double(m_link_ptr_vector[i]->getLinkUtilization())) /
(double(curCycle() - m_ruby_start));
(double(curCycle() - g_ruby_start));
vector<int> vc_load = m_link_ptr_vector[i]->getVcLoad();
for (int j = 0; j < vc_load.size(); j++) {
@ -289,7 +289,7 @@ GarnetNetwork_d::printLinkStats(ostream& out) const
continue;
average_vc_load[i] = (double(average_vc_load[i])) /
(double(curCycle() - m_ruby_start));
(double(curCycle() - g_ruby_start));
out << "Average VC Load [" << i << "] = " << average_vc_load[i]
<< " flits/cycle " << endl;
}

View file

@ -53,7 +53,7 @@ InputUnit_d::InputUnit_d(int id, Router_d *router) : Consumer(router)
// Instantiating the virtual channels
m_vcs.resize(m_num_vcs);
for (int i=0; i < m_num_vcs; i++) {
m_vcs[i] = new VirtualChannel_d(i, m_router->curCycle());
m_vcs[i] = new VirtualChannel_d(i);
}
}

View file

@ -71,7 +71,6 @@ NetworkInterface_d::NetworkInterface_d(int id, int virtual_networks,
for (int i = 0; i < m_num_vcs; i++) {
m_out_vc_state.push_back(new OutVcState_d(i, m_net_ptr));
m_out_vc_state[i]->setState(IDLE_, m_net_ptr->curCycle());
}
}

View file

@ -33,16 +33,15 @@
#include "mem/ruby/system/System.hh"
OutVcState_d::OutVcState_d(int id, GarnetNetwork_d *network_ptr)
: m_time(0)
{
m_network_ptr = network_ptr;
m_id = id;
m_vc_state = IDLE_;
m_time = m_network_ptr->curCycle();
if (m_network_ptr->get_vnet_type(id) == DATA_VNET_)
m_credit_count = m_network_ptr->getBuffersPerDataVC();
if (network_ptr->get_vnet_type(id) == DATA_VNET_)
m_credit_count = network_ptr->getBuffersPerDataVC();
else
m_credit_count = m_network_ptr->getBuffersPerCtrlVC();
m_credit_count = network_ptr->getBuffersPerCtrlVC();
assert(m_credit_count >= 1);
}

View file

@ -61,7 +61,6 @@ class OutVcState_d
inline void decrement_credit() { m_credit_count--; }
private:
GarnetNetwork_d *m_network_ptr;
int m_id ;
Cycles m_time;
VC_state_type m_vc_state;

View file

@ -30,13 +30,13 @@
#include "mem/ruby/network/garnet/fixed-pipeline/VirtualChannel_d.hh"
VirtualChannel_d::VirtualChannel_d(int id, Cycles curTime)
VirtualChannel_d::VirtualChannel_d(int id)
: m_enqueue_time(INFINITE_)
{
m_id = id;
m_input_buffer = new flitBuffer_d();
m_vc_state.first = IDLE_;
m_vc_state.second = curTime;
m_vc_state.second = Cycles(0);
}
VirtualChannel_d::~VirtualChannel_d()

View file

@ -39,7 +39,7 @@
class VirtualChannel_d
{
public:
VirtualChannel_d(int id, Cycles curTime);
VirtualChannel_d(int id);
~VirtualChannel_d();
bool need_stage(VC_state_type state, flit_stage stage, Cycles curTime);

View file

@ -254,7 +254,7 @@ GarnetNetwork::printLinkStats(ostream& out) const
for (int i = 0; i < m_link_ptr_vector.size(); i++) {
average_link_utilization +=
(double(m_link_ptr_vector[i]->getLinkUtilization())) /
(double(curCycle() - m_ruby_start));
(double(curCycle() - g_ruby_start));
vector<int> vc_load = m_link_ptr_vector[i]->getVcLoad();
for (int j = 0; j < vc_load.size(); j++) {
@ -273,7 +273,7 @@ GarnetNetwork::printLinkStats(ostream& out) const
continue;
average_vc_load[i] = double(average_vc_load[i]) /
(double(curCycle() - m_ruby_start));
(double(curCycle() - g_ruby_start));
out << "Average VC Load [" << i << "] = " << average_vc_load[i]
<< " flits/cycle " << endl;
}

View file

@ -68,7 +68,6 @@ NetworkInterface::NetworkInterface(int id, int virtual_networks,
for (int i = 0; i < m_num_vcs; i++) {
m_out_vc_state.push_back(new OutVcState(i));
m_out_vc_state[i]->setState(IDLE_, m_net_ptr->curCycle());
}
}

View file

@ -31,6 +31,7 @@
#include "mem/ruby/network/garnet/flexible-pipeline/OutVcState.hh"
OutVcState::OutVcState(int id)
: m_time(0)
{
m_id = id;
m_vc_state = IDLE_;

View file

@ -39,7 +39,7 @@ Router_d::calculate_power()
{
//Network Activities from garnet
calculate_performance_numbers();
double sim_cycles = curCycle() - m_network_ptr->getRubyStartTime();
double sim_cycles = curCycle() - g_ruby_start;
// Number of virtual networks/message classes declared in Ruby
// maybe greater than active virtual networks.
@ -245,8 +245,7 @@ NetworkLink_d::calculate_power()
channel_width_bits,
orion_cfg_ptr);
double sim_cycles =
(double)(m_net_ptr->curCycle() - m_net_ptr->getRubyStartTime());
double sim_cycles = (double)(m_net_ptr->curCycle() - g_ruby_start);
// Dynamic Power
// Assume half the bits flipped on every link activity

View file

@ -515,11 +515,7 @@ Profiler::clearStats()
m_cycles_executed_at_start.resize(m_num_of_sequencers);
for (int i = 0; i < m_num_of_sequencers; i++) {
if (g_system_ptr == NULL) {
m_cycles_executed_at_start[i] = 0;
} else {
m_cycles_executed_at_start[i] = g_system_ptr->curCycle();
}
m_cycles_executed_at_start[i] = g_system_ptr->curCycle();
}
m_busyBankCount = 0;

View file

@ -93,13 +93,6 @@ RubySystem::RubySystem(const Params *p)
g_abs_controls.resize(MachineType_NUM);
}
void
RubySystem::init()
{
m_profiler_ptr->clearStats();
m_network_ptr->clearStats();
}
void
RubySystem::registerNetwork(Network* network_ptr)
{
@ -311,12 +304,6 @@ RubySystem::readCompressedTrace(string filename, uint8_t *&raw_data,
void
RubySystem::unserialize(Checkpoint *cp, const string &section)
{
//
// The main purpose for clearing stats in the unserialize process is so
// that the profiler can correctly set its start time to the unserialized
// value of curTick()
//
resetStats();
uint8_t *uncompressed_trace = NULL;
if (m_mem_vec_ptr != NULL) {
@ -368,6 +355,23 @@ RubySystem::unserialize(Checkpoint *cp, const string &section)
void
RubySystem::startup()
{
// Ruby restores state from a checkpoint by resetting the clock to 0 and
// playing the requests that can possibly re-generate the cache state.
// The clock value is set to the actual checkpointed value once all the
// requests have been executed.
//
// This way of restoring state is pretty finicky. For example, if a
// Ruby component reads time before the state has been restored, it would
// cache this value and hence its clock would not be reset to 0, when
// Ruby resets the global clock. This can potentially result in a
// deadlock.
//
// The solution is that no Ruby component should read time before the
// simulation starts. And then one also needs to hope that the time
// Ruby finishes restoring the state is less than the time when the
// state was checkpointed.
if (m_warmup_enabled) {
// save the current tick value
Tick curtick_original = curTick();
@ -397,6 +401,8 @@ RubySystem::startup()
setCurTick(curtick_original);
resetClock();
}
resetStats();
}
void
@ -417,6 +423,8 @@ RubySystem::resetStats()
for (uint32_t cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) {
m_abs_cntrl_vec[cntrl]->clearStats();
}
g_ruby_start = curCycle();
}
bool

View file

@ -134,8 +134,6 @@ class RubySystem : public ClockedObject
RubySystem(const RubySystem& obj);
RubySystem& operator=(const RubySystem& obj);
void init();
void readCompressedTrace(std::string filename,
uint8_t *&raw_data,
uint64& uncompressed_trace_size);