ruby: Fix checkpointing and restore

There are two problems with the existing checkpoint and restore code in ruby.
The first is that when ruby alters the event queue during serialization,
some events that are currently scheduled cannot be found (e.g. the event that
stops simulation, which always lives on the queue), causing a panic.
The second is that ruby is sometimes serialized after the memory system,
so the dirty data in its caches is written back to memory too late and
is not included in the checkpoint.

Both problems are fixed by implementing memory writeback in ruby. The
checkpointing code calls memWriteback() on every object before anything is
serialized, so the flush now happens early enough for the dirty data to be
captured. Writeback uses the same technique of hijacking the event queue,
but first deschedules all events that are currently on it. Each event is
saved along with its scheduled tick so that the queue can be faithfully
reconstructed after writeback has finished. Events with the AutoDelete flag
set delete themselves when descheduled, which would make scheduling them
again an error, so they are simply not recorded when they are taken off the
queue.
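
The descheduling dance described above boils down to the following
save-and-restore pattern (a minimal sketch only; the authoritative code is
RubySystem::memWriteback() in the diff below, and flushCaches() here stands
in for the enqueueRubyEvent()/simulate() pair it actually uses):

    // Sketch, assuming the gem5 Event/EventQueue API used in the diff below.
    std::list<std::pair<Event*, Tick> > saved;
    Tick original_tick = curTick();

    while (!eventq->empty()) {
        Event *ev = eventq->getHead();
        // AutoDelete events free themselves when descheduled, so they are
        // deliberately not recorded for rescheduling.
        if (!ev->isAutoDelete())
            saved.push_back(std::make_pair(ev, ev->when()));
        eventq->deschedule(ev);
    }

    flushCaches();               // drive the flush on the now-empty queue

    // Discard anything the flush left behind before restoring the original
    // queue contents.
    while (!eventq->empty())
        eventq->deschedule(eventq->getHead());

    // Rewind time before rescheduling so that none of the restored events
    // appear to have been scheduled in the past.
    setCurTick(original_tick);
    while (!saved.empty()) {
        eventq->schedule(saved.back().first, saved.back().second);
        saved.pop_back();
    }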

Writeback is still implemented by flushing, so the cache recorder object,
which is created to generate the trace and manage the flush, is kept around
and used during serialization to write the trace to disk.
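
The resulting calling order looks roughly as follows (an illustration only;
ruby_system and cp are placeholder names, and only memWriteback() and
serializeOld() come from this patch):

    // Writeback first: flushes dirty ruby data back to memory and leaves
    // m_cache_recorder holding the resulting trace.
    ruby_system->memWriteback();

    // ... the memory system can now be serialized with up-to-date data ...

    // Serialization writes the recorded trace to the checkpoint and then
    // deletes the recorder; if memWriteback() was never called, it fatal()s.
    ruby_system->serializeOld(cp);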

Committed by: Nilay Vaish <nilay@cs.wisc.edu>
Author: Timothy Jones  2015-08-03 23:08:40 -05:00
parent 676ae57827
commit 96091f358b
4 changed files with 129 additions and 64 deletions

@@ -95,6 +95,8 @@ CacheRecorder::enqueueNextFlushRequest()
m_sequencer_ptr->makeRequest(pkt);
DPRINTF(RubyCacheTrace, "Flushing %s\n", *rec);
} else {
DPRINTF(RubyCacheTrace, "Flushed all %d records\n", m_records_flushed);
}
}
@@ -137,6 +139,8 @@ CacheRecorder::enqueueNextFetchRequest()
m_bytes_read += (sizeof(TraceRecord) + m_block_size_bytes);
m_records_read++;
} else {
DPRINTF(RubyCacheTrace, "Fetched all %d records\n", m_records_read);
}
}

@@ -30,6 +30,7 @@
#include <zlib.h>
#include <cstdio>
#include <list>
#include "base/intmath.hh"
#include "base/statistics.hh"
@@ -56,7 +57,8 @@ unsigned RubySystem::m_systems_to_warmup = 0;
bool RubySystem::m_cooldown_enabled = false;
RubySystem::RubySystem(const Params *p)
: ClockedObject(p), m_access_backing_store(p->access_backing_store)
: ClockedObject(p), m_access_backing_store(p->access_backing_store),
m_cache_recorder(NULL)
{
m_random_seed = p->random_seed;
srandom(m_random_seed);
@@ -98,6 +100,111 @@ RubySystem::~RubySystem()
delete m_profiler;
}
void
RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace,
uint64 cache_trace_size,
uint64 block_size_bytes)
{
vector<Sequencer*> sequencer_map;
Sequencer* sequencer_ptr = NULL;
for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) {
sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getSequencer());
if (sequencer_ptr == NULL) {
sequencer_ptr = sequencer_map[cntrl];
}
}
assert(sequencer_ptr != NULL);
for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) {
if (sequencer_map[cntrl] == NULL) {
sequencer_map[cntrl] = sequencer_ptr;
}
}
// Remove the old CacheRecorder if it's still hanging about.
if (m_cache_recorder != NULL) {
delete m_cache_recorder;
}
// Create the CacheRecorder and record the cache trace
m_cache_recorder = new CacheRecorder(uncompressed_trace, cache_trace_size,
sequencer_map, block_size_bytes);
}
void
RubySystem::memWriteback()
{
m_cooldown_enabled = true;
// Make the trace so we know what to write back.
DPRINTF(RubyCacheTrace, "Recording Cache Trace\n");
makeCacheRecorder(NULL, 0, getBlockSizeBytes());
for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) {
m_abs_cntrl_vec[cntrl]->recordCacheTrace(cntrl, m_cache_recorder);
}
DPRINTF(RubyCacheTrace, "Cache Trace Complete\n");
// save the current tick value
Tick curtick_original = curTick();
DPRINTF(RubyCacheTrace, "Recording current tick %ld\n", curtick_original);
// Deschedule all prior events on the event queue, but record the tick they
// were scheduled at so they can be restored correctly later.
list<pair<Event*, Tick> > original_events;
while (!eventq->empty()) {
Event *curr_head = eventq->getHead();
if (curr_head->isAutoDelete()) {
DPRINTF(RubyCacheTrace, "Event %s auto-deletes when descheduled,"
" not recording\n", curr_head->name());
} else {
original_events.push_back(make_pair(curr_head, curr_head->when()));
}
eventq->deschedule(curr_head);
}
// Schedule an event to start cache cooldown
DPRINTF(RubyCacheTrace, "Starting cache flush\n");
enqueueRubyEvent(curTick());
simulate();
DPRINTF(RubyCacheTrace, "Cache flush complete\n");
// Deschedule any events left on the event queue.
while (!eventq->empty()) {
eventq->deschedule(eventq->getHead());
}
// Restore curTick
setCurTick(curtick_original);
// Restore all events that were originally on the event queue. This is
// done after setting curTick back to its original value so that events do
// not seem to be scheduled in the past.
while (!original_events.empty()) {
pair<Event*, Tick> event = original_events.back();
eventq->schedule(event.first, event.second);
original_events.pop_back();
}
// No longer flushing back to memory.
m_cooldown_enabled = false;
// There are several issues with continuing simulation after calling
// memWriteback() at the moment, that stem from taking events off the
// queue, simulating again, and then putting them back on, whilst
// pretending that no time has passed. One is that some events will have
// been deleted, so can't be put back. Another is that any object
// recording the tick something happens may end up storing a tick in the
// future. A simple warning here alerts the user that things may not work
// as expected.
warn_once("Ruby memory writeback is experimental. Continuing simulation "
"afterwards may not always work as intended.");
// Keep the cache recorder around so that we can dump the trace if a
// checkpoint is immediately taken.
}
void
RubySystem::writeCompressedTrace(uint8_t *raw_data, string filename,
uint64 uncompressed_trace_size)
@@ -130,59 +237,19 @@ RubySystem::writeCompressedTrace(uint8_t *raw_data, string filename,
void
RubySystem::serializeOld(CheckpointOut &cp)
{
m_cooldown_enabled = true;
vector<Sequencer*> sequencer_map;
Sequencer* sequencer_ptr = NULL;
for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) {
sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getSequencer());
if (sequencer_ptr == NULL) {
sequencer_ptr = sequencer_map[cntrl];
}
}
assert(sequencer_ptr != NULL);
for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) {
if (sequencer_map[cntrl] == NULL) {
sequencer_map[cntrl] = sequencer_ptr;
}
}
// Store the cache-block size, so we are able to restore on systems with a
// different cache-block size. CacheRecorder depends on the correct
// cache-block size upon unserializing.
uint64 block_size_bytes = getBlockSizeBytes();
SERIALIZE_SCALAR(block_size_bytes);
DPRINTF(RubyCacheTrace, "Recording Cache Trace\n");
// Create the CacheRecorder and record the cache trace
m_cache_recorder = new CacheRecorder(NULL, 0, sequencer_map,
block_size_bytes);
for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) {
m_abs_cntrl_vec[cntrl]->recordCacheTrace(cntrl, m_cache_recorder);
// Check that there's a valid trace to use. If not, then memory won't be
// up-to-date and the simulation will probably fail when restoring from the
// checkpoint.
if (m_cache_recorder == NULL) {
fatal("Call memWriteback() before serialize() to create ruby trace");
}
DPRINTF(RubyCacheTrace, "Cache Trace Complete\n");
// save the current tick value
Tick curtick_original = curTick();
// save the event queue head
Event* eventq_head = eventq->replaceHead(NULL);
DPRINTF(RubyCacheTrace, "Recording current tick %ld and event queue\n",
curtick_original);
// Schedule an event to start cache cooldown
DPRINTF(RubyCacheTrace, "Starting cache flush\n");
enqueueRubyEvent(curTick());
simulate();
DPRINTF(RubyCacheTrace, "Cache flush complete\n");
// Restore eventq head
eventq_head = eventq->replaceHead(eventq_head);
// Restore curTick
setCurTick(curtick_original);
// Aggregate the trace entries together into a single array
uint8_t *raw_data = new uint8_t[4096];
uint64 cache_trace_size = m_cache_recorder->aggregateRecords(&raw_data,
@@ -193,7 +260,9 @@ RubySystem::serializeOld(CheckpointOut &cp)
SERIALIZE_SCALAR(cache_trace_file);
SERIALIZE_SCALAR(cache_trace_size);
m_cooldown_enabled = false;
// Now finished with the cache recorder.
delete m_cache_recorder;
m_cache_recorder = NULL;
}
void
@@ -250,23 +319,8 @@ RubySystem::unserialize(CheckpointIn &cp)
m_warmup_enabled = true;
m_systems_to_warmup++;
vector<Sequencer*> sequencer_map;
Sequencer* t = NULL;
for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) {
sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getSequencer());
if (t == NULL) t = sequencer_map[cntrl];
}
assert(t != NULL);
for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) {
if (sequencer_map[cntrl] == NULL) {
sequencer_map[cntrl] = t;
}
}
m_cache_recorder = new CacheRecorder(uncompressed_trace, cache_trace_size,
sequencer_map, block_size_bytes);
// Create the cache recorder that will hang around until startup.
makeCacheRecorder(uncompressed_trace, cache_trace_size, block_size_bytes);
}
void
@@ -290,6 +344,7 @@ RubySystem::startup()
// state was checkpointed.
if (m_warmup_enabled) {
DPRINTF(RubyCacheTrace, "Starting ruby cache warmup\n");
// save the current tick value
Tick curtick_original = curTick();
// save the event queue head

@@ -94,6 +94,7 @@ class RubySystem : public ClockedObject
void collateStats() { m_profiler->collateStats(); }
void resetStats();
void memWriteback();
void serializeOld(CheckpointOut &cp) M5_ATTR_OVERRIDE;
void unserialize(CheckpointIn &cp) M5_ATTR_OVERRIDE;
void process();
@@ -116,6 +117,10 @@ class RubySystem : public ClockedObject
RubySystem(const RubySystem& obj);
RubySystem& operator=(const RubySystem& obj);
void makeCacheRecorder(uint8_t *uncompressed_trace,
uint64 cache_trace_size,
uint64 block_size_bytes);
void readCompressedTrace(std::string filename,
uint8_t *&raw_data,
uint64& uncompressed_trace_size);

@@ -564,6 +564,7 @@ class EventQueue : public Serializable
Tick nextTick() const { return head->when(); }
void setCurTick(Tick newVal) { _curTick = newVal; }
Tick getCurTick() { return _curTick; }
Event *getHead() const { return head; }
Event *serviceOne();