cpu: Add SMT support to MinorCPU

This patch adds SMT support to the MinorCPU. Currently, the RoundRobin and Random thread scheduling policies are supported. Change-Id: I91faf39ff881af5918cca05051829fc6261f20e3
2016-07-21 17:19:16 +01:00 · 2016-07-21 17:19:16 +01:00 · ff4009ac00
commit ff4009ac00
parent 8a476d387c
21 changed files with 1247 additions and 693 deletions
--- a/src/cpu/minor/MinorCPU.py
+++ b/src/cpu/minor/MinorCPU.py
@ -169,6 +169,8 @@ class MinorDefaultFUPool(MinorFUPool):
        MinorDefaultFloatSimdFU(), MinorDefaultMemFU(),
        MinorDefaultMiscFU()]
 class ThreadPolicy(Enum): vals = ['SingleThreaded', 'RoundRobin', 'Random']
 class MinorCPU(BaseCPU):
    type = 'MinorCPU'
    cxx_header = "cpu/minor/cpu.hh"
@ -185,6 +187,8 @@ class MinorCPU(BaseCPU):
    def support_take_over(cls):
        return True
    threadPolicy = Param.ThreadPolicy('RoundRobin',
            "Thread scheduling policy")
    fetch1FetchLimit = Param.Unsigned(1,
        "Number of line fetches allowable in flight at once")
    fetch1LineSnapWidth = Param.Unsigned(0,
--- a/src/cpu/minor/cpu.cc
+++ b/src/cpu/minor/cpu.cc
@ -47,32 +47,33 @@
 #include "debug/Quiesce.hh"
 MinorCPU::MinorCPU(MinorCPUParams *params) :
-    BaseCPU(params)
+    BaseCPU(params),
    threadPolicy(params->threadPolicy)
 {
    /* Create one MinorThread per hardware thread and register its
     * ThreadContext with the CPU */
    Minor::MinorThread *thread;
-    if (FullSystem) {
+    for (ThreadID i = 0; i < numThreads; i++) {
-        thread = new Minor::MinorThread(this, 0, params->system, params->itb,
+        if (FullSystem) {
-            params->dtb, params->isa[0]);
+            thread = new Minor::MinorThread(this, i, params->system,
-    } else {
+                    params->itb, params->dtb, params->isa[i]);
-        /* thread_id 0 */
+            thread->setStatus(ThreadContext::Halted);
-        thread = new Minor::MinorThread(this, 0, params->system,
+        } else {
-            params->workload[0], params->itb, params->dtb, params->isa[0]);
+            thread = new Minor::MinorThread(this, i, params->system,
                    params->workload[i], params->itb, params->dtb,
                    params->isa[i]);
        }
        threads.push_back(thread);
        ThreadContext *tc = thread->getTC();
        threadContexts.push_back(tc);
    }
    threads.push_back(thread);
    thread->setStatus(ThreadContext::Halted);
    ThreadContext *tc = thread->getTC();
    if (params->checker) {
        fatal("The Minor model doesn't support checking (yet)\n");
    }
    threadContexts.push_back(tc);
    Minor::MinorDynInst::init();
    pipeline = new Minor::Pipeline(*this, *params);
@ -137,9 +138,6 @@ MinorCPU::serializeThread(CheckpointOut &cp, ThreadID thread_id) const
 void
 MinorCPU::unserializeThread(CheckpointIn &cp, ThreadID thread_id)
 {
    if (thread_id != 0)
        fatal("Trying to load more than one thread into a MinorCPU\n");
    threads[thread_id]->unserialize(cp);
 }
@ -170,11 +168,11 @@ void
 MinorCPU::wakeup(ThreadID tid)
 {
    DPRINTF(Drain, "[tid:%d] MinorCPU wakeup\n", tid);
    assert(tid < numThreads);
-    if (threads[tid]->status() == ThreadContext::Suspended)
+    if (threads[tid]->status() == ThreadContext::Suspended) {
        threads[tid]->activate();
-
+    }
    DPRINTF(Drain,"Suspended Processor awoke\n");
 }
 void
@ -187,13 +185,10 @@ MinorCPU::startup()
    for (auto i = threads.begin(); i != threads.end(); i ++)
        (*i)->startup();
-    /* Workaround cases in SE mode where a thread is activated with an
+    for (ThreadID tid = 0; tid < numThreads; tid++) {
-     * incorrect PC that is updated after the call to activate. This
+        threads[tid]->startup();
-     * causes problems for Minor since it instantiates a virtual
+        pipeline->wakeupFetch(tid);
-     * branch instruction when activateContext() is called which ends
+    }
     * up pointing to an illegal address.  */
    if (threads[0]->status() == ThreadContext::Active)
        activateContext(0);
 }
 DrainState
@ -246,6 +241,7 @@ MinorCPU::drainResume()
    for (ThreadID tid = 0; tid < numThreads; tid++)
        wakeup(tid);
    pipeline->drainResume();
 }
@ -278,7 +274,7 @@ MinorCPU::takeOverFrom(BaseCPU *old_cpu)
 void
 MinorCPU::activateContext(ThreadID thread_id)
 {
-    DPRINTF(MinorCPU, "ActivateContext thread: %d", thread_id);
+    DPRINTF(MinorCPU, "ActivateContext thread: %d\n", thread_id);
    /* Do some cycle accounting.  lastStopped is reset to stop the
     *  wakeup call on the pipeline from adding the quiesce period
@ -289,7 +285,7 @@ MinorCPU::activateContext(ThreadID thread_id)
    /* Wake up the thread, wakeup the pipeline tick */
    threads[thread_id]->activate();
    wakeupOnEvent(Minor::Pipeline::CPUStageId);
-    pipeline->wakeupFetch();
+    pipeline->wakeupFetch(thread_id);
    BaseCPU::activateContext(thread_id);
 }
@ -317,9 +313,6 @@ MinorCPU::wakeupOnEvent(unsigned int stage_id)
 MinorCPU *
 MinorCPUParams::create()
 {
    numThreads = 1;
    if (!FullSystem && workload.size() != 1)
        panic("only one workload allowed");
    return new MinorCPU(this);
 }
--- a/src/cpu/minor/cpu.hh
+++ b/src/cpu/minor/cpu.hh
@ -50,6 +50,7 @@
 #include "cpu/minor/stats.hh"
 #include "cpu/base.hh"
 #include "cpu/simple_thread.hh"
 #include "enums/ThreadPolicy.hh"
 #include "params/MinorCPU.hh"
 namespace Minor
@ -109,6 +110,8 @@ class MinorCPU : public BaseCPU
    };
    /** Thread Scheduling Policy (RoundRobin, Random, etc) */
    Enums::ThreadPolicy threadPolicy;
  protected:
     /** Return a reference to the data port. */
    MasterPort &getDataPort() override;
@ -162,6 +165,26 @@ class MinorCPU : public BaseCPU
    void activateContext(ThreadID thread_id) override;
    void suspendContext(ThreadID thread_id) override;
    /** Thread scheduling utility functions */
    /** Build a round-robin thread order relative to 'priority'.
     *  The returned list holds numThreads entries: it starts at
     *  (priority + 1) % numThreads, advances by one thread id per entry
     *  (wrapping at numThreads) and therefore ends with
     *  priority % numThreads. */
    std::vector<ThreadID> roundRobinPriority(ThreadID priority)
    {
        std::vector<ThreadID> order;
        ThreadID offset = 1;
        while (offset <= numThreads) {
            order.push_back((priority + offset) % numThreads);
            offset++;
        }
        return order;
    }
    std::vector<ThreadID> randomPriority()
    {
        std::vector<ThreadID> prio_list;
        for (ThreadID i = 0; i < numThreads; i++) {
            prio_list.push_back(i);
        }
        std::random_shuffle(prio_list.begin(), prio_list.end());
        return prio_list;
    }
    /** Interface for stages to signal that they have become active after
     *  a callback or eventq event where the pipeline itself may have
     *  already been idled.  The stage argument should be from the
--- a/src/cpu/minor/decode.cc
+++ b/src/cpu/minor/decode.cc
@ -49,7 +49,7 @@ Decode::Decode(const std::string &name,
    MinorCPUParams &params,
    Latch<ForwardInstData>::Output inp_,
    Latch<ForwardInstData>::Input out_,
-    Reservable &next_stage_input_buffer) :
+    std::vector<InputBuffer<ForwardInstData>> &next_stage_input_buffer) :
    Named(name),
    cpu(cpu_),
    inp(inp_),
@ -57,11 +57,8 @@ Decode::Decode(const std::string &name,
    nextStageReserve(next_stage_input_buffer),
    outputWidth(params.executeInputWidth),
    processMoreThanOneInput(params.decodeCycleInput),
-    inputBuffer(name + ".inputBuffer", "insts", params.decodeInputBufferSize),
+    decodeInfo(params.numThreads),
-    inputIndex(0),
+    threadPriority(0)
    inMacroop(false),
    execSeqNum(InstId::firstExecSeqNum),
    blocked(false)
 {
    if (outputWidth < 1)
        fatal("%s: executeInputWidth must be >= 1 (%d)\n", name, outputWidth);
@ -70,29 +67,37 @@ Decode::Decode(const std::string &name,
        fatal("%s: decodeInputBufferSize must be >= 1 (%d)\n", name,
        params.decodeInputBufferSize);
    }
    /* Per-thread input buffers */
    for (ThreadID tid = 0; tid < params.numThreads; tid++) {
        inputBuffer.push_back(
            InputBuffer<ForwardInstData>(
                name + ".inputBuffer" + std::to_string(tid), "insts",
                params.decodeInputBufferSize));
    }
 }
 const ForwardInstData *
-Decode::getInput()
+Decode::getInput(ThreadID tid)
 {
    /* Get insts from the inputBuffer to work with */
-    if (!inputBuffer.empty()) {
+    if (!inputBuffer[tid].empty()) {
-        const ForwardInstData &head = inputBuffer.front();
+        const ForwardInstData &head = inputBuffer[tid].front();
-        return (head.isBubble() ? NULL : &(inputBuffer.front()));
+        return (head.isBubble() ? NULL : &(inputBuffer[tid].front()));
    } else {
        return NULL;
    }
 }
 void
-Decode::popInput()
+Decode::popInput(ThreadID tid)
 {
-    if (!inputBuffer.empty())
+    if (!inputBuffer[tid].empty())
-        inputBuffer.pop();
+        inputBuffer[tid].pop();
-    inputIndex = 0;
+    decodeInfo[tid].inputIndex = 0;
-    inMacroop = false;
+    decodeInfo[tid].inMacroop = false;
 }
 #if TRACING_ON
@ -117,32 +122,37 @@ dynInstAddTracing(MinorDynInstPtr inst, StaticInstPtr static_inst,
 void
 Decode::evaluate()
 {
-    inputBuffer.setTail(*inp.outputWire);
+    /* Push input onto appropriate input buffer */
    if (!inp.outputWire->isBubble())
        inputBuffer[inp.outputWire->threadId].setTail(*inp.outputWire);
    ForwardInstData &insts_out = *out.inputWire;
    assert(insts_out.isBubble());
-    blocked = false;
+    for (ThreadID tid = 0; tid < cpu.numThreads; tid++)
        decodeInfo[tid].blocked = !nextStageReserve[tid].canReserve();
-    if (!nextStageReserve.canReserve()) {
+    ThreadID tid = getScheduledThread();
-        blocked = true;
+
-    } else {
+    if (tid != InvalidThreadID) {
-        const ForwardInstData *insts_in = getInput();
+        DecodeThreadInfo &decode_info = decodeInfo[tid];
        const ForwardInstData *insts_in = getInput(tid);
        unsigned int output_index = 0;
        /* Pack instructions into the output while we can.  This may involve
         * using more than one input line */
        while (insts_in &&
-           inputIndex < insts_in->width() && /* Still more input */
+           decode_info.inputIndex < insts_in->width() && /* Still more input */
           output_index < outputWidth /* Still more output to fill */)
        {
-            MinorDynInstPtr inst = insts_in->insts[inputIndex];
+            MinorDynInstPtr inst = insts_in->insts[decode_info.inputIndex];
            if (inst->isBubble()) {
                /* Skip */
-                inputIndex++;
+                decode_info.inputIndex++;
-                inMacroop = false;
+                decode_info.inMacroop = false;
            } else {
                StaticInstPtr static_inst = inst->staticInst;
                /* Static inst of a macro-op above the output_inst */
@ -153,25 +163,26 @@ Decode::evaluate()
                    DPRINTF(Decode, "Fault being passed: %d\n",
                        inst->fault->name());
-                    inputIndex++;
+                    decode_info.inputIndex++;
-                    inMacroop = false;
+                    decode_info.inMacroop = false;
                } else if (static_inst->isMacroop()) {
                    /* Generate a new micro-op */
                    StaticInstPtr static_micro_inst;
                    /* Set up PC for the next micro-op emitted */
-                    if (!inMacroop) {
+                    if (!decode_info.inMacroop) {
-                        microopPC = inst->pc;
+                        decode_info.microopPC = inst->pc;
-                        inMacroop = true;
+                        decode_info.inMacroop = true;
                    }
                    /* Get the micro-op static instruction from the
                     * static_inst. */
                    static_micro_inst =
-                        static_inst->fetchMicroop(microopPC.microPC());
+                        static_inst->fetchMicroop(
                                decode_info.microopPC.microPC());
                    output_inst = new MinorDynInst(inst->id);
-                    output_inst->pc = microopPC;
+                    output_inst->pc = decode_info.microopPC;
                    output_inst->staticInst = static_micro_inst;
                    output_inst->fault = NoFault;
@ -185,45 +196,46 @@ Decode::evaluate()
                    DPRINTF(Decode, "Microop decomposition inputIndex:"
                        " %d output_index: %d lastMicroop: %s microopPC:"
                        " %d.%d inst: %d\n",
-                        inputIndex, output_index,
+                        decode_info.inputIndex, output_index,
                        (static_micro_inst->isLastMicroop() ?
                            "true" : "false"),
-                        microopPC.instAddr(), microopPC.microPC(),
+                        decode_info.microopPC.instAddr(),
                        decode_info.microopPC.microPC(),
                        *output_inst);
                    /* Acknowledge that the static_inst isn't mine, it's my
                     * parent macro-op's */
                    parent_static_inst = static_inst;
-                    static_micro_inst->advancePC(microopPC);
+                    static_micro_inst->advancePC(decode_info.microopPC);
                    /* Step input if this is the last micro-op */
                    if (static_micro_inst->isLastMicroop()) {
-                        inputIndex++;
+                        decode_info.inputIndex++;
-                        inMacroop = false;
+                        decode_info.inMacroop = false;
                    }
                } else {
                    /* Doesn't need decomposing, pass on instruction */
                    DPRINTF(Decode, "Passing on inst: %s inputIndex:"
                        " %d output_index: %d\n",
-                        *output_inst, inputIndex, output_index);
+                        *output_inst, decode_info.inputIndex, output_index);
                    parent_static_inst = static_inst;
                    /* Step input */
-                    inputIndex++;
+                    decode_info.inputIndex++;
-                    inMacroop = false;
+                    decode_info.inMacroop = false;
                }
                /* Set execSeqNum of output_inst */
-                output_inst->id.execSeqNum = execSeqNum;
+                output_inst->id.execSeqNum = decode_info.execSeqNum;
                /* Add tracing */
 #if TRACING_ON
                dynInstAddTracing(output_inst, parent_static_inst, cpu);
 #endif
                /* Step to next sequence number */
-                execSeqNum++;
+                decode_info.execSeqNum++;
                /* Correctly size the output before writing */
                if (output_index == 0) insts_out.resize(outputWidth);
@ -233,17 +245,17 @@ Decode::evaluate()
            }
            /* Have we finished with the input? */
-            if (inputIndex == insts_in->width()) {
+            if (decode_info.inputIndex == insts_in->width()) {
                /* If we have just been producing micro-ops, we *must* have
                 * got to the end of that for inputIndex to be pushed past
                 * insts_in->width() */
-                assert(!inMacroop);
+                assert(!decode_info.inMacroop);
-                popInput();
+                popInput(tid);
                insts_in = NULL;
                if (processMoreThanOneInput) {
                    DPRINTF(Decode, "Wrapping\n");
-                    insts_in = getInput();
+                    insts_in = getInput(tid);
                }
            }
        }
@ -261,22 +273,65 @@ Decode::evaluate()
    if (!insts_out.isBubble()) {
        /* Note activity of following buffer */
        cpu.activityRecorder->activity();
-        nextStageReserve.reserve();
+        insts_out.threadId = tid;
        nextStageReserve[tid].reserve();
    }
    /* If we still have input to process and somewhere to put it,
     *  mark stage as active */
-    if (getInput() && nextStageReserve.canReserve())
+    for (ThreadID i = 0; i < cpu.numThreads; i++)
-        cpu.activityRecorder->activateStage(Pipeline::DecodeStageId);
+    {
        if (getInput(i) && nextStageReserve[i].canReserve()) {
            cpu.activityRecorder->activateStage(Pipeline::DecodeStageId);
            break;
        }
    }
    /* Make sure the input (if any left) is pushed */
-    inputBuffer.pushTail();
+    if (!inp.outputWire->isBubble())
        inputBuffer[inp.outputWire->threadId].pushTail();
 }
 inline ThreadID
 Decode::getScheduledThread()
 {
    /* Ask the CPU's thread policy for the order in which threads should
     * be considered this time round */
    std::vector<ThreadID> candidates;
    if (cpu.threadPolicy == Enums::SingleThreaded) {
        candidates.push_back(0);
    } else if (cpu.threadPolicy == Enums::RoundRobin) {
        candidates = cpu.roundRobinPriority(threadPriority);
    } else if (cpu.threadPolicy == Enums::Random) {
        candidates = cpu.randomPriority();
    } else {
        panic("Unknown fetch policy");
    }

    /* Pick the first candidate whose context is Active, that has input
     * available and that is not marked blocked; remember it in
     * threadPriority, which seeds the next round-robin ordering */
    for (const ThreadID candidate : candidates) {
        if (cpu.getContext(candidate)->status() != ThreadContext::Active)
            continue;
        if (!getInput(candidate))
            continue;
        if (decodeInfo[candidate].blocked)
            continue;

        threadPriority = candidate;
        return candidate;
    }

    /* No thread is eligible */
    return InvalidThreadID;
 }
 bool
 Decode::isDrained()
 {
-    return inputBuffer.empty() && (*inp.outputWire).isBubble();
+    for (const auto &buffer : inputBuffer) {
        if (!buffer.empty())
            return false;
    }
    return (*inp.outputWire).isBubble();
 }
 void
@ -284,13 +339,13 @@ Decode::minorTrace() const
 {
    std::ostringstream data;
-    if (blocked)
+    if (decodeInfo[0].blocked)
        data << 'B';
    else
        (*out.inputWire).reportData(data);
    MINORTRACE("insts=%s\n", data.str());
-    inputBuffer.minorTrace();
+    inputBuffer[0].minorTrace();
 }
 }
--- a/src/cpu/minor/decode.hh
+++ b/src/cpu/minor/decode.hh
@ -71,7 +71,7 @@ class Decode : public Named
    Latch<ForwardInstData>::Input out;
    /** Interface to reserve space in the next stage */
-    Reservable &nextStageReserve;
+    std::vector<InputBuffer<ForwardInstData>> &nextStageReserve;
    /** Width of output of this stage/input of next in instructions */
    unsigned int outputWidth;
@ -82,43 +82,68 @@ class Decode : public Named
  public:
    /* Public for Pipeline to be able to pass it to Fetch2 */
-    InputBuffer<ForwardInstData> inputBuffer;
+    std::vector<InputBuffer<ForwardInstData>> inputBuffer;
  protected:
    /** Data members after this line are cycle-to-cycle state */
-    /** Index into the inputBuffer's head marking the start of unhandled
+    struct DecodeThreadInfo {
     *  instructions */
    unsigned int inputIndex;
-    /** True when we're in the process of decomposing a micro-op and
+        /** Default Constructor */
-     *  microopPC will be valid.  This is only the case when there isn't
+        DecodeThreadInfo() :
-     *  sufficient space in Executes input buffer to take the whole of a
+            inputIndex(0),
-     *  decomposed instruction and some of that instructions micro-ops must
+            inMacroop(false),
-     *  be generated in a later cycle */
+            execSeqNum(InstId::firstExecSeqNum),
-    bool inMacroop;
+            blocked(false)
-    TheISA::PCState microopPC;
+        { }
-    /** Source of execSeqNums to number instructions. */
+        DecodeThreadInfo(const DecodeThreadInfo& other) :
-    InstSeqNum execSeqNum;
+            inputIndex(other.inputIndex),
            inMacroop(other.inMacroop),
            execSeqNum(other.execSeqNum),
            blocked(other.blocked)
        { }
-    /** Blocked indication for report */
+
-    bool blocked;
+        /** Index into the inputBuffer's head marking the start of unhandled
         *  instructions */
        unsigned int inputIndex;
        /** True when we're in the process of decomposing a micro-op and
         *  microopPC will be valid.  This is only the case when there isn't
         *  sufficient space in Executes input buffer to take the whole of a
         *  decomposed instruction and some of that instructions micro-ops must
         *  be generated in a later cycle */
        bool inMacroop;
        TheISA::PCState microopPC;
        /** Source of execSeqNums to number instructions. */
        InstSeqNum execSeqNum;
        /** Blocked indication for report */
        bool blocked;
    };
    std::vector<DecodeThreadInfo> decodeInfo;
    ThreadID threadPriority;
  protected:
    /** Get a piece of data to work on, or 0 if there is no data. */
-    const ForwardInstData *getInput();
+    const ForwardInstData *getInput(ThreadID tid);
    /** Pop an element off the input buffer, if there are any */
-    void popInput();
+    void popInput(ThreadID tid);
    /** Use the current threading policy to determine the next thread to
     *  decode from. */
    ThreadID getScheduledThread();
  public:
    Decode(const std::string &name,
        MinorCPU &cpu_,
        MinorCPUParams &params,
        Latch<ForwardInstData>::Output inp_,
        Latch<ForwardInstData>::Input out_,
-        Reservable &next_stage_input_buffer);
+        std::vector<InputBuffer<ForwardInstData>> &next_stage_input_buffer);
  public:
    /** Pass on input/buffer data to the output if you can */
--- a/src/cpu/minor/dyn_inst.cc
+++ b/src/cpu/minor/dyn_inst.cc
@ -52,6 +52,12 @@
 namespace Minor
 {
 const InstSeqNum InstId::firstStreamSeqNum;
 const InstSeqNum InstId::firstPredictionSeqNum;
 const InstSeqNum InstId::firstLineSeqNum;
 const InstSeqNum InstId::firstFetchSeqNum;
 const InstSeqNum InstId::firstExecSeqNum;
 std::ostream &
 operator <<(std::ostream &os, const InstId &id)
 {
--- a/src/cpu/minor/exec_context.hh
+++ b/src/cpu/minor/exec_context.hh
@ -342,12 +342,17 @@ class ExecContext : public ::ExecContext
  public:
    // monitor/mwait functions
-    void armMonitor(Addr address) { getCpuPtr()->armMonitor(0, address); }
+    void armMonitor(Addr address)
-    bool mwait(PacketPtr pkt) { return getCpuPtr()->mwait(0, pkt); }
+    { getCpuPtr()->armMonitor(inst->id.threadId, address); }
    bool mwait(PacketPtr pkt)
    { return getCpuPtr()->mwait(inst->id.threadId, pkt); }
    void mwaitAtomic(ThreadContext *tc)
-    { return getCpuPtr()->mwaitAtomic(0, tc, thread.dtb); }
+    { return getCpuPtr()->mwaitAtomic(inst->id.threadId, tc, thread.dtb); }
    AddressMonitor *getAddrMonitor()
-    { return getCpuPtr()->getCpuAddrMonitor(0); }
+    { return getCpuPtr()->getCpuAddrMonitor(inst->id.threadId); }
 };
 }
--- a/src/cpu/minor/execute.cc
+++ b/src/cpu/minor/execute.cc
--- a/src/cpu/minor/execute.hh
+++ b/src/cpu/minor/execute.hh
@ -116,13 +116,13 @@ class Execute : public Named
    LSQ lsq;
    /** Scoreboard of instruction dependencies */
-    Scoreboard scoreboard;
+    std::vector<Scoreboard> scoreboard;
    /** The execution functional units */
    std::vector<FUPipeline *> funcUnits;
  public: /* Public for Pipeline to be able to pass it to Decode */
-    InputBuffer<ForwardInstData> inputBuffer;
+    std::vector<InputBuffer<ForwardInstData>> inputBuffer;
  protected:
    /** Stage cycle-by-cycle state */
@ -143,48 +143,75 @@ class Execute : public Named
        DrainAllInsts /* Discarding all remaining insts */
    };
-    /** In-order instructions either in FUs or the LSQ */
+    struct ExecuteThreadInfo {
-    Queue<QueuedInst, ReportTraitsAdaptor<QueuedInst> > *inFlightInsts;
+        /** Constructor */
        ExecuteThreadInfo(unsigned int insts_committed) :
            inputIndex(0),
            lastCommitWasEndOfMacroop(true),
            instsBeingCommitted(insts_committed),
            streamSeqNum(InstId::firstStreamSeqNum),
            lastPredictionSeqNum(InstId::firstPredictionSeqNum),
            drainState(NotDraining)
        { }
-    /** Memory ref instructions still in the FUs */
+        ExecuteThreadInfo(const ExecuteThreadInfo& other) :
-    Queue<QueuedInst, ReportTraitsAdaptor<QueuedInst> > *inFUMemInsts;
+            inputIndex(other.inputIndex),
            lastCommitWasEndOfMacroop(other.lastCommitWasEndOfMacroop),
            instsBeingCommitted(other.instsBeingCommitted),
            streamSeqNum(other.streamSeqNum),
            lastPredictionSeqNum(other.lastPredictionSeqNum),
            drainState(other.drainState)
        { }
-    /** Index that we've completed upto in getInput data.  We can say we're
+        /** In-order instructions either in FUs or the LSQ */
-     *  popInput when this equals getInput()->width() */
+        Queue<QueuedInst, ReportTraitsAdaptor<QueuedInst> > *inFlightInsts;
    unsigned int inputIndex;
-    /** The last commit was the end of a full instruction so an interrupt
+        /** Memory ref instructions still in the FUs */
-     *  can safely happen */
+        Queue<QueuedInst, ReportTraitsAdaptor<QueuedInst> > *inFUMemInsts;
    bool lastCommitWasEndOfMacroop;
-    /** Structure for reporting insts currently being processed/retired
+        /** Index that we've completed up to in getInput data.  We can say we're
-     *  for MinorTrace */
+         *  popInput when this equals getInput()->width() */
-    ForwardInstData instsBeingCommitted;
+        unsigned int inputIndex;
-    /** Source of sequence number for instuction streams.  Increment this and
+        /** The last commit was the end of a full instruction so an interrupt
-     *  pass to fetch whenever an instruction stream needs to be changed.
+         *  can safely happen */
-     *  For any more complicated behaviour (e.g. speculation) there'll need
+        bool lastCommitWasEndOfMacroop;
     *  to be another plan. THREAD, need one for each thread */
    InstSeqNum streamSeqNum;
-    /** A prediction number for use where one isn't available from an
+        /** Structure for reporting insts currently being processed/retired
-     *  instruction.  This is harvested from committed instructions.
+         *  for MinorTrace */
-     *  This isn't really needed as the streamSeqNum will change on
+        ForwardInstData instsBeingCommitted;
     *  a branch, but it minimises disruption in stream identification */
    InstSeqNum lastPredictionSeqNum;
-    /** State progression for draining NotDraining -> ... -> DrainAllInsts */
+        /** Source of sequence number for instruction streams.  Increment this and
-    DrainState drainState;
+         *  pass to fetch whenever an instruction stream needs to be changed.
         *  For any more complicated behaviour (e.g. speculation) there'll need
         *  to be another plan. */
        InstSeqNum streamSeqNum;
        /** A prediction number for use where one isn't available from an
         *  instruction.  This is harvested from committed instructions.
         *  This isn't really needed as the streamSeqNum will change on
         *  a branch, but it minimises disruption in stream identification */
        InstSeqNum lastPredictionSeqNum;
        /** State progression for draining NotDraining -> ... -> DrainAllInsts */
        DrainState drainState;
    };
    std::vector<ExecuteThreadInfo> executeInfo;
    ThreadID interruptPriority;
    ThreadID issuePriority;
    ThreadID commitPriority;
  protected:
    friend std::ostream &operator <<(std::ostream &os, DrainState state);
    /** Get a piece of data to work on from the inputBuffer, or 0 if there
     *  is no data. */
-    const ForwardInstData *getInput();
+    const ForwardInstData *getInput(ThreadID tid);
    /** Pop an element off the input buffer, if there are any */
-    void popInput();
+    void popInput(ThreadID tid);
    /** Generate Branch data based (into branch) on an observed (or not)
     *  change in PC while executing an instruction.
@ -193,7 +220,7 @@ class Execute : public Named
    /** Actually create a branch to communicate to Fetch1/Fetch2 and,
     *  if that is a stream-changing branch update the streamSeqNum */
-    void updateBranchData(BranchData::Reason reason,
+    void updateBranchData(ThreadID tid, BranchData::Reason reason,
        MinorDynInstPtr inst, const TheISA::PCState &target,
        BranchData &branch);
@ -224,23 +251,32 @@ class Execute : public Named
    bool isInterrupted(ThreadID thread_id) const;
    /** Are we between instructions?  Can we be interrupted? */
-    bool isInbetweenInsts() const;
+    bool isInbetweenInsts(ThreadID thread_id) const;
    /** Act on an interrupt.  Returns true if an interrupt was actually
     *  signalled and invoked */
    bool takeInterrupt(ThreadID thread_id, BranchData &branch);
    /** Try and issue instructions from the inputBuffer */
-    unsigned int issue(bool only_issue_microops);
+    unsigned int issue(ThreadID thread_id);
    /** Try to act on PC-related events.  Returns true if any were
     *  executed */
-    bool tryPCEvents();
+    bool tryPCEvents(ThreadID thread_id);
    /** Do the stats handling and instruction count and PC event events
     *  related to the new instruction/op counts */
    void doInstCommitAccounting(MinorDynInstPtr inst);
    /** Check all threads for possible interrupts. If interrupt is taken,
     *  returns the tid of the thread.  interrupted is set if any thread
     *  has an interrupt, irrespective of if it is taken */
    ThreadID checkInterrupts(BranchData& branch, bool& interrupted);
    /** Checks if a specific thread has an interrupt.  No action is taken.
     *  this is used for determining if a thread should only commit microops */
    bool hasInterrupt(ThreadID thread_id);
    /** Commit a single instruction.  Returns true if the instruction being
     *  examined was completed (fully executed, discarded, or initiated a
     *  memory access), false if there is still some processing to do.
@ -266,10 +302,16 @@ class Execute : public Named
     *  If discard is true then discard all instructions rather than
     *  committing.
     *  branch is set to any branch raised during commit. */
-    void commit(bool only_commit_microops, bool discard, BranchData &branch);
+    void commit(ThreadID thread_id, bool only_commit_microops, bool discard,
        BranchData &branch);
    /** Set the drain state (with useful debugging messages) */
-    void setDrainState(DrainState state);
+    void setDrainState(ThreadID thread_id, DrainState state);
    /** Use the current threading policy to determine the next thread to
     *  commit from and the next thread to issue from, respectively. */
    ThreadID getCommittingThread();
    ThreadID getIssuingThread();
  public:
    Execute(const std::string &name_,
@ -282,12 +324,6 @@ class Execute : public Named
  public:
    /** Cause Execute to issue an UnpredictedBranch (or WakeupFetch if
     *  that was passed as the reason) to Fetch1 to wake the
     *  system up (using the PC from the thread context). */
    void wakeupFetch(BranchData::Reason reason =
        BranchData::UnpredictedBranch);
    /** Returns the DcachePort owned by this Execute to pass upwards */
    MinorCPU::MinorCPUPort &getDcachePort();
--- a/src/cpu/minor/fetch1.cc
+++ b/src/cpu/minor/fetch1.cc
@ -57,7 +57,7 @@ Fetch1::Fetch1(const std::string &name_,
    Latch<BranchData>::Output inp_,
    Latch<ForwardLineData>::Input out_,
    Latch<BranchData>::Output prediction_,
-    Reservable &next_stage_input_buffer) :
+    std::vector<InputBuffer<ForwardLineData>> &next_stage_input_buffer) :
    Named(name_),
    cpu(cpu_),
    inp(inp_),
@ -68,11 +68,8 @@ Fetch1::Fetch1(const std::string &name_,
    lineSnap(params.fetch1LineSnapWidth),
    maxLineWidth(params.fetch1LineWidth),
    fetchLimit(params.fetch1FetchLimit),
-    state(FetchWaitingForPC),
+    fetchInfo(params.numThreads),
-    pc(0),
+    threadPriority(0),
    streamSeqNum(InstId::firstStreamSeqNum),
    predictionSeqNum(InstId::firstPredictionSeqNum),
    blocked(false),
    requests(name_ + ".requests", "lines", params.fetch1FetchLimit),
    transfers(name_ + ".transfers", "lines", params.fetch1FetchLimit),
    icacheState(IcacheRunning),
@ -114,32 +111,67 @@ Fetch1::Fetch1(const std::string &name_,
    }
 }
/** Choose the thread that Fetch1 should fetch for next.  A priority-ordered
 *  list of thread ids is built according to cpu.threadPolicy and the first
 *  thread in that list whose context is Active, which is not blocked and
 *  whose fetch state is FetchRunning is selected.  The selected id is also
 *  stored in threadPriority.
 *  @return the selected thread id, or InvalidThreadID when no thread in
 *  the list qualifies */
-void
+inline ThreadID
-Fetch1::fetchLine()
+Fetch1::getScheduledThread()
 {
    /* Select thread via policy. */
    std::vector<ThreadID> priority_list;
    switch (cpu.threadPolicy) {
      case Enums::SingleThreaded:
        /* Only thread 0 is ever considered */
        priority_list.push_back(0);
        break;
      case Enums::RoundRobin:
        /* Ordering is derived from the last selected thread; the exact
         * rotation is defined by cpu.roundRobinPriority (not shown here) */
        priority_list = cpu.roundRobinPriority(threadPriority);
        break;
      case Enums::Random:
        priority_list = cpu.randomPriority();
        break;
      default:
        panic("Unknown fetch policy");
    }
    /* Walk the candidates in priority order and take the first one that
     * is able to fetch */
    for (auto tid : priority_list) {
        if (cpu.getContext(tid)->status() == ThreadContext::Active &&
            !fetchInfo[tid].blocked &&
            fetchInfo[tid].state == FetchRunning) {
            /* Remember the choice; it is the argument given to
             * roundRobinPriority on the next call */
            threadPriority = tid;
            return tid;
        }
    }
   /* No thread can fetch */
   return InvalidThreadID;
 }
 void
 Fetch1::fetchLine(ThreadID tid)
 {
    /* Reference the currently used thread state. */
    Fetch1ThreadInfo &thread = fetchInfo[tid];
    /* If line_offset != 0, a request is pushed for the remainder of the
     * line. */
    /* Use a lower, sizeof(MachInst) aligned address for the fetch */
-    Addr aligned_pc = pc.instAddr() & ~((Addr) lineSnap - 1);
+    Addr aligned_pc = thread.pc.instAddr() & ~((Addr) lineSnap - 1);
    unsigned int line_offset = aligned_pc % lineSnap;
    unsigned int request_size = maxLineWidth - line_offset;
    /* Fill in the line's id */
-    InstId request_id(0 /* thread */,
+    InstId request_id(tid,
-        streamSeqNum, predictionSeqNum,
+        thread.streamSeqNum, thread.predictionSeqNum,
        lineSeqNum);
-    FetchRequestPtr request = new FetchRequest(*this, request_id, pc);
+    FetchRequestPtr request = new FetchRequest(*this, request_id, thread.pc);
    DPRINTF(Fetch, "Inserting fetch into the fetch queue "
        "%s addr: 0x%x pc: %s line_offset: %d request_size: %d\n",
-        request_id, aligned_pc, pc, line_offset, request_size);
+        request_id, aligned_pc, thread.pc, line_offset, request_size);
-    request->request.setContext(cpu.threads[0]->getTC()->contextId());
+    request->request.setContext(cpu.threads[tid]->getTC()->contextId());
    request->request.setVirt(0 /* asid */,
        aligned_pc, request_size, Request::INST_FETCH, cpu.instMasterId(),
        /* I've no idea why we need the PC, but give it */
-        pc.instAddr());
+        thread.pc.instAddr());
    DPRINTF(Fetch, "Submitting ITLB request\n");
    numFetchesInITLB++;
@ -165,12 +197,12 @@ Fetch1::fetchLine()
     * reliable 'new' PC if the next line has a new stream sequence number. */
 #if THE_ISA == ALPHA_ISA
    /* Restore the low bits of the PC used as address space flags */
-    Addr pc_low_bits = pc.instAddr() &
+    Addr pc_low_bits = thread.pc.instAddr() &
        ((Addr) (1 << sizeof(TheISA::MachInst)) - 1);
-    pc.set(aligned_pc + request_size + pc_low_bits);
+    thread.pc.set(aligned_pc + request_size + pc_low_bits);
 #else
-    pc.set(aligned_pc + request_size);
+    thread.pc.set(aligned_pc + request_size);
 #endif
 }
@ -454,46 +486,58 @@ operator <<(std::ostream &os, Fetch1::FetchState state)
 /** Apply a stream-changing branch to the fetch state of the thread named
  *  by branch.threadId.  The thread's expected sequence numbers are updated
  *  first, then its fetch state is set according to branch.reason, and
  *  finally its fetch PC is set to branch.target.
  *  @param branch the branch (from Execute or Fetch2) to act on */
 void
 Fetch1::changeStream(const BranchData &branch)
 {
    Fetch1ThreadInfo &thread = fetchInfo[branch.threadId];
    updateExpectedSeqNums(branch);
    /* Start fetching again if we were stopped */
    switch (branch.reason) {
      case BranchData::SuspendThread:
-        DPRINTF(Fetch, "Suspending fetch: %s\n", branch);
+        {
-        state = FetchWaitingForPC;
+            if (thread.wakeupGuard) {
                /* wakeupGuard is set by wakeupFetch; while it is set the
                 * suspend request leaves the fetch state untouched */
                DPRINTF(Fetch, "Not suspending fetch due to guard: %s\n",
                                branch);
            } else {
                DPRINTF(Fetch, "Suspending fetch: %s\n", branch);
                thread.state = FetchWaitingForPC;
            }
        }
        break;
      case BranchData::HaltFetch:
        DPRINTF(Fetch, "Halting fetch\n");
-        state = FetchHalted;
+        thread.state = FetchHalted;
        break;
      default:
        /* Every other reason (re)starts fetching */
        DPRINTF(Fetch, "Changing stream on branch: %s\n", branch);
-        state = FetchRunning;
+        thread.state = FetchRunning;
        break;
    }
    /* The PC is replaced whatever the reason was, including the guarded
     * suspend case above */
-    pc = branch.target;
+    thread.pc = branch.target;
 }
 /** Copy the new stream and prediction sequence numbers carried by a branch
  *  into the fetch state of the thread named by branch.threadId.
  *  @param branch source of newStreamSeqNum and newPredictionSeqNum */
 void
 Fetch1::updateExpectedSeqNums(const BranchData &branch)
 {
    Fetch1ThreadInfo &thread = fetchInfo[branch.threadId];
    /* Trace the old and new values before they are overwritten */
    DPRINTF(Fetch, "Updating streamSeqNum from: %d to %d,"
        " predictionSeqNum from: %d to %d\n",
-        streamSeqNum, branch.newStreamSeqNum,
+        thread.streamSeqNum, branch.newStreamSeqNum,
-        predictionSeqNum, branch.newPredictionSeqNum);
+        thread.predictionSeqNum, branch.newPredictionSeqNum);
    /* Change the stream */
-    streamSeqNum = branch.newStreamSeqNum;
+    thread.streamSeqNum = branch.newStreamSeqNum;
    /* Update the prediction.  Note that it's possible for this to
     *  actually set the prediction to an *older* value if new
     *  predictions have been discarded by execute */
-    predictionSeqNum = branch.newPredictionSeqNum;
+    thread.predictionSeqNum = branch.newPredictionSeqNum;
 }
 void
 Fetch1::processResponse(Fetch1::FetchRequestPtr response,
    ForwardLineData &line)
 {
    Fetch1ThreadInfo &thread = fetchInfo[response->id.threadId];
    PacketPtr packet = response->packet;
    /* Pass the prefetch abort (if any) on to Fetch2 in a ForwardLineData
@ -514,7 +558,7 @@ Fetch1::processResponse(Fetch1::FetchRequestPtr response,
         * can't (currently) selectively remove this stream from the queues */
        DPRINTF(Fetch, "Stopping line fetch because of fault: %s\n",
            response->fault->name());
-        state = Fetch1::FetchWaitingForPC;
+        thread.state = Fetch1::FetchWaitingForPC;
    } else {
        line.adoptPacketData(packet);
        /* Null the response's packet to prevent the response from trying to
@ -532,61 +576,86 @@ Fetch1::evaluate()
    assert(line_out.isBubble());
-    blocked = !nextStageReserve.canReserve();
+    for (ThreadID tid = 0; tid < cpu.numThreads; tid++)
        fetchInfo[tid].blocked = !nextStageReserve[tid].canReserve();
-    /* Are we changing stream?  Look to the Execute branches first, then
+    /** Are both branches from later stages valid and for the same thread? */
-     * to predicted changes of stream from Fetch2 */
+    if (execute_branch.threadId != InvalidThreadID &&
-    /* @todo, find better way to express ignoring branch predictions */
+        execute_branch.threadId == fetch2_branch.threadId) {
-    if (execute_branch.isStreamChange() &&
+
-        execute_branch.reason != BranchData::BranchPrediction)
+        Fetch1ThreadInfo &thread = fetchInfo[execute_branch.threadId];
-    {
+
-        if (state == FetchHalted) {
+        /* Are we changing stream?  Look to the Execute branches first, then
-            if (execute_branch.reason == BranchData::WakeupFetch) {
+         * to predicted changes of stream from Fetch2 */
-                DPRINTF(Fetch, "Waking up fetch: %s\n", execute_branch);
+        if (execute_branch.isStreamChange()) {
-                changeStream(execute_branch);
+            if (thread.state == FetchHalted) {
                DPRINTF(Fetch, "Halted, ignoring branch: %s\n", execute_branch);
            } else {
-                DPRINTF(Fetch, "Halted, ignoring branch: %s\n",
+                changeStream(execute_branch);
-                    execute_branch);
+            }
            if (!fetch2_branch.isBubble()) {
                DPRINTF(Fetch, "Ignoring simultaneous prediction: %s\n",
                    fetch2_branch);
            }
            /* The streamSeqNum tagging in request/response ->req should handle
             *  discarding those requests when we get to them. */
        } else if (thread.state != FetchHalted && fetch2_branch.isStreamChange()) {
            /* Handle branch predictions by changing the instruction source
             * if we're still processing the same stream (as set by streamSeqNum)
             * as the one of the prediction.
             */
            if (fetch2_branch.newStreamSeqNum != thread.streamSeqNum) {
                DPRINTF(Fetch, "Not changing stream on prediction: %s,"
                    " streamSeqNum mismatch\n",
                    fetch2_branch);
            } else {
                changeStream(fetch2_branch);
            }
        }
    } else {
        /* Fetch2 and Execute branches are for different threads */
        if (execute_branch.threadId != InvalidThreadID &&
            execute_branch.isStreamChange()) {
            if (fetchInfo[execute_branch.threadId].state == FetchHalted) {
                DPRINTF(Fetch, "Halted, ignoring branch: %s\n", execute_branch);
            } else {
                changeStream(execute_branch);
            }
        } else {
            changeStream(execute_branch);
        }
-        if (!fetch2_branch.isBubble()) {
+        if (fetch2_branch.threadId != InvalidThreadID &&
-            DPRINTF(Fetch, "Ignoring simultaneous prediction: %s\n",
+            fetch2_branch.isStreamChange()) {
                fetch2_branch);
        }
-        /* The streamSeqNum tagging in request/response ->req should handle
+            if (fetchInfo[fetch2_branch.threadId].state == FetchHalted) {
-         *  discarding those requests when we get to them. */
+                DPRINTF(Fetch, "Halted, ignoring branch: %s\n", fetch2_branch);
-    } else if (state != FetchHalted && fetch2_branch.isStreamChange()) {
+            } else if (fetch2_branch.newStreamSeqNum != fetchInfo[fetch2_branch.threadId].streamSeqNum) {
-        /* Handle branch predictions by changing the instruction source
+                DPRINTF(Fetch, "Not changing stream on prediction: %s,"
-         * if we're still processing the same stream (as set by streamSeqNum)
+                    " streamSeqNum mismatch\n", fetch2_branch);
-         * as the one of the prediction.
+            } else {
-         */
+                changeStream(fetch2_branch);
-        if (fetch2_branch.newStreamSeqNum != streamSeqNum) {
+            }
            DPRINTF(Fetch, "Not changing stream on prediction: %s,"
                " streamSeqNum mismatch\n",
                fetch2_branch);
        } else {
            changeStream(fetch2_branch);
        }
    }
-    /* Can we fetch? */
+    if (numInFlightFetches() < fetchLimit) {
-    /* The bare minimum requirements for initiating a fetch */
+        ThreadID fetch_tid = getScheduledThread();
-    /* THREAD need to handle multiple threads */
+
-    if (state == FetchRunning && /* We are actually fetching */
+        if (fetch_tid != InvalidThreadID) {
-        !blocked && /* Space in the Fetch2 inputBuffer */
+            DPRINTF(Fetch, "Fetching from thread %d\n", fetch_tid);
-        /* The thread we're going to fetch for (thread 0), is active */
+
-        cpu.getContext(0)->status() == ThreadContext::Active &&
+            /* Generate fetch to selected thread */
-        numInFlightFetches() < fetchLimit)
+            fetchLine(fetch_tid);
-    {
+            /* Take up a slot in the fetch queue */
-        fetchLine();
+            nextStageReserve[fetch_tid].reserve();
-        /* Take up a slot in the fetch queue */
+        } else {
-        nextStageReserve.reserve();
+            DPRINTF(Fetch, "No active threads available to fetch from\n");
        }
    }
    /* Halting shouldn't prevent fetches in flight from being processed */
    /* Step fetches through the icachePort queues and memory system */
    stepQueues();
@ -599,9 +668,9 @@ Fetch1::evaluate()
        Fetch1::FetchRequestPtr response = transfers.front();
        if (response->isDiscardable()) {
-            nextStageReserve.freeReservation();
+            nextStageReserve[response->id.threadId].freeReservation();
-            DPRINTF(Fetch, "Discarding translated fetch at it's for"
+            DPRINTF(Fetch, "Discarding translated fetch as it's for"
                " an old stream\n");
            /* Wake up next cycle just in case there was some other
@ -626,19 +695,49 @@ Fetch1::evaluate()
     *  generate a line output (tested just above) or to initiate a memory
     *  fetch which will signal activity when it returns/needs stepping
     *  between queues */
    /* This looks hackish.  And it is, but there doesn't seem to be a better
     * way to do this.  The signal from commit to suspend fetch takes 1
     * clock cycle to propagate to fetch.  However, a legitimate wakeup
     * may occur between cycles from the memory system.  Thus wakeup guard
     * prevents us from suspending in that case. */
    for (auto& thread : fetchInfo) {
        thread.wakeupGuard = false;
    }
 }
 /** Restart fetching for a thread.  The thread's fetch PC is reloaded from
  *  its thread context, its fetch state is set to FetchRunning, its
  *  wakeupGuard is set and the CPU is told that the Fetch1 stage has work
  *  to do.
  *  @param tid id of the thread to wake up */
 void
 Fetch1::wakeupFetch(ThreadID tid)
 {
    ThreadContext *thread_ctx = cpu.getContext(tid);
    Fetch1ThreadInfo &thread = fetchInfo[tid];
    /* Resume from the PC currently held by the thread context */
    thread.pc = thread_ctx->pcState();
    thread.state = FetchRunning;
    /* While set, changeStream will not act on a SuspendThread branch for
     * this thread; evaluate clears the flag again for every thread */
    thread.wakeupGuard = true;
    DPRINTF(Fetch, "[tid:%d]: Changing stream wakeup %s\n",
            tid, thread_ctx->pcState());
    cpu.wakeupOnEvent(Pipeline::Fetch1StageId);
 }
 /** Report whether Fetch1 has drained.
  *  @return true only if there are no fetches in flight, the output wire
  *  holds a bubble and every thread's fetch state is FetchHalted */
 bool
 Fetch1::isDrained()
 {
    /* Stage-wide conditions first; the per-thread condition is folded in
     * by the loop below */
-    DPRINTF(Drain, "isDrained %s %s%s\n",
+    bool drained = numInFlightFetches() == 0 && (*out.inputWire).isBubble();
-        state,
+    for (ThreadID tid = 0; tid < cpu.numThreads; tid++) {
-        (numInFlightFetches() == 0 ? "" : "inFlightFetches "),
+        Fetch1ThreadInfo &thread = fetchInfo[tid];
-        ((*out.inputWire).isBubble() ? "" : "outputtingLine"));
+        DPRINTF(Drain, "isDrained[tid:%d]: %s %s%s\n",
                tid,
                thread.state == FetchHalted,
                (numInFlightFetches() == 0 ? "" : "inFlightFetches "),
                ((*out.inputWire).isBubble() ? "" : "outputtingLine"));
-    return state == FetchHalted &&
+        drained = drained && thread.state == FetchHalted;
-        numInFlightFetches() == 0 &&
+    }
-        (*out.inputWire).isBubble();
+
    return drained;
 }
 void
@ -649,26 +748,32 @@ Fetch1::FetchRequest::reportData(std::ostream &os) const
 /** Decide whether this fetch request can be thrown away.
  *  @return true if the request is neither InTranslation nor RequestIssuing
  *  and its stream or prediction sequence number differs from the one
  *  currently held for its thread in fetch.fetchInfo */
 bool Fetch1::FetchRequest::isDiscardable() const
 {
    /* Compare against the state of the thread this request belongs to */
    Fetch1ThreadInfo &thread = fetch.fetchInfo[id.threadId];
    /* Can't discard lines in TLB/memory */
    return state != InTranslation && state != RequestIssuing &&
-        (id.streamSeqNum != fetch.streamSeqNum ||
+        (id.streamSeqNum != thread.streamSeqNum ||
-        id.predictionSeqNum != fetch.predictionSeqNum);
+        id.predictionSeqNum != thread.predictionSeqNum);
 }
 /** Emit a MINORTRACE line describing Fetch1, followed by the traces of
  *  the requests and transfers queues.  The per-thread values reported
  *  (blocked, state, streamSeqNum) are taken from thread 0 only. */
 void
 Fetch1::minorTrace() const
 {
    // TODO: only fetchInfo[0] is reported here; extend minorTrace to
    // cover every thread
    const Fetch1ThreadInfo &thread = fetchInfo[0];
    std::ostringstream data;
    /* 'B' stands in for the line data when the thread is blocked,
     * otherwise the content of the output wire is reported */
-    if (blocked)
+    if (thread.blocked)
        data << 'B';
    else
        (*out.inputWire).reportData(data);
    MINORTRACE("state=%s icacheState=%s in_tlb_mem=%s/%s"
-        " streamSeqNum=%d lines=%s\n", state, icacheState,
+        " streamSeqNum=%d lines=%s\n", thread.state, icacheState,
        numFetchesInITLB, numFetchesInMemorySystem,
-        streamSeqNum, data.str());
+        thread.streamSeqNum, data.str());
    requests.minorTrace();
    transfers.minorTrace();
 }
--- a/src/cpu/minor/fetch1.hh
+++ b/src/cpu/minor/fetch1.hh
@ -197,7 +197,7 @@ class Fetch1 : public Named
    Latch<BranchData>::Output prediction;
    /** Interface to reserve space in the next stage */
-    Reservable &nextStageReserve;
+    std::vector<InputBuffer<ForwardLineData>> &nextStageReserve;
    /** IcachePort to pass to the CPU.  Fetch1 is the only module that uses
     *  it. */
@ -233,26 +233,53 @@ class Fetch1 : public Named
    /** Stage cycle-by-cycle state */
-    FetchState state;
+    struct Fetch1ThreadInfo {
        /* Per-thread Fetch1 state.  One entry per thread is kept in
         * fetchInfo and indexed by ThreadID. */
-    /** Fetch PC value. This is updated by branches from Execute, branch
+        /** Constructor to initialize all fields. */
-     *  prediction targets from Fetch2 and by incrementing it as we fetch
+        Fetch1ThreadInfo() :
-     *  lines subsequent to those two sources. */
+            state(FetchWaitingForPC),
-    TheISA::PCState pc;
+            pc(TheISA::PCState(0)),
            streamSeqNum(InstId::firstStreamSeqNum),
            predictionSeqNum(InstId::firstPredictionSeqNum),
            blocked(false),
            wakeupGuard(false)
        { }
        /* Copy constructor: duplicates every field of other, including
         * wakeupGuard. */
-    /** Stream sequence number.  This changes on request from Execute and is
+        Fetch1ThreadInfo(const Fetch1ThreadInfo& other) :
-     *  used to tag instructions by the fetch stream to which they belong.
+            state(other.state),
-     *  Execute originates new prediction sequence numbers. */
+            pc(other.pc),
-    InstSeqNum streamSeqNum;
+            streamSeqNum(other.streamSeqNum),
            predictionSeqNum(other.predictionSeqNum),
            blocked(other.blocked),
            /* Must be copied as well: a member left out of the initializer
             * list would hold an indeterminate value in the copy, and
             * changeStream reads this flag */
            wakeupGuard(other.wakeupGuard)
        { }
-    /** Prediction sequence number.  This changes when requests from Execute
+        FetchState state;
     *  or Fetch2 ask for a change of fetch address and is used to tag lines
     *  by the prediction to which they belong.  Fetch2 originates
     *  prediction sequence numbers. */
    InstSeqNum predictionSeqNum;
-    /** Blocked indication for report */
+        /** Fetch PC value. This is updated by branches from Execute, branch
-    bool blocked;
+         *  prediction targets from Fetch2 and by incrementing it as we fetch
         *  lines subsequent to those two sources. */
        TheISA::PCState pc;
        /** Stream sequence number.  This changes on request from Execute and is
         *  used to tag instructions by the fetch stream to which they belong.
         *  Execute originates new stream sequence numbers. */
        InstSeqNum streamSeqNum;
        /** Prediction sequence number.  This changes when requests from Execute
         *  or Fetch2 ask for a change of fetch address and is used to tag lines
         *  by the prediction to which they belong.  Fetch2 originates
         *  prediction sequence numbers. */
        InstSeqNum predictionSeqNum;
        /** Blocked indication for report */
        bool blocked;
        /** Signal to guard against sleeping first cycle of wakeup */
        bool wakeupGuard;
    };
    std::vector<Fetch1ThreadInfo> fetchInfo;
    ThreadID threadPriority;
    /** State of memory access for head instruction fetch */
    enum IcacheState
@ -307,10 +334,15 @@ class Fetch1 : public Named
    friend std::ostream &operator <<(std::ostream &os,
        IcacheState state);
    /** Use the current threading policy to determine the next thread to
     *  fetch from. */
    ThreadID getScheduledThread();
    /** Insert a line fetch into the requests.  This can be a partial
     *  line request where the given address has a non-0 offset into a
     *  line. */
-    void fetchLine();
+    void fetchLine(ThreadID tid);
    /** Try and issue a fetch for a translated request at the
     *  head of the requests queue.  Also tries to move the request
@ -354,7 +386,7 @@ class Fetch1 : public Named
        Latch<BranchData>::Output inp_,
        Latch<ForwardLineData>::Input out_,
        Latch<BranchData>::Output prediction_,
-        Reservable &next_stage_input_buffer);
+        std::vector<InputBuffer<ForwardLineData>> &next_stage_input_buffer);
  public:
    /** Returns the IcachePort owned by this Fetch1 */
@ -363,6 +395,9 @@ class Fetch1 : public Named
    /** Pass on input/buffer data to the output if you can */
    void evaluate();
    /** Initiate fetch1 fetching */
    void wakeupFetch(ThreadID tid);
    void minorTrace() const;
    /** Is this stage drained?  For Fetch1, draining is initiated by
--- a/src/cpu/minor/fetch2.cc
+++ b/src/cpu/minor/fetch2.cc
@ -58,7 +58,7 @@ Fetch2::Fetch2(const std::string &name,
    Latch<BranchData>::Output branchInp_,
    Latch<BranchData>::Input predictionOut_,
    Latch<ForwardInstData>::Input out_,
-    Reservable &next_stage_input_buffer) :
+    std::vector<InputBuffer<ForwardInstData>> &next_stage_input_buffer) :
    Named(name),
    cpu(cpu_),
    inp(inp_),
@ -69,15 +69,8 @@ Fetch2::Fetch2(const std::string &name,
    outputWidth(params.decodeInputWidth),
    processMoreThanOneInput(params.fetch2CycleInput),
    branchPredictor(*params.branchPred),
-    inputBuffer(name + ".inputBuffer", "lines", params.fetch2InputBufferSize),
+    fetchInfo(params.numThreads),
-    inputIndex(0),
+    threadPriority(0)
    pc(TheISA::PCState(0)),
    havePC(false),
    lastStreamSeqNum(InstId::firstStreamSeqNum),
    fetchSeqNum(InstId::firstFetchSeqNum),
    expectedStreamSeqNum(InstId::firstStreamSeqNum),
    predictionSeqNum(InstId::firstPredictionSeqNum),
    blocked(false)
 {
    if (outputWidth < 1)
        fatal("%s: decodeInputWidth must be >= 1 (%d)\n", name, outputWidth);
@ -86,38 +79,46 @@ Fetch2::Fetch2(const std::string &name,
        fatal("%s: fetch2InputBufferSize must be >= 1 (%d)\n", name,
        params.fetch2InputBufferSize);
    }
    /* Per-thread input buffers */
    for (ThreadID tid = 0; tid < params.numThreads; tid++) {
        inputBuffer.push_back(
            InputBuffer<ForwardLineData>(
                name + ".inputBuffer" + std::to_string(tid), "lines",
                params.fetch2InputBufferSize));
    }
 }
 /** Look at the line at the head of a thread's input buffer without
  *  removing it.
  *  @param tid thread whose input buffer is inspected
  *  @return pointer to the front line, or NULL if that buffer is empty */
 const ForwardLineData *
-Fetch2::getInput()
+Fetch2::getInput(ThreadID tid)
 {
    /* Get a line from the inputBuffer to work with */
-    if (!inputBuffer.empty()) {
+    if (!inputBuffer[tid].empty()) {
-        return &(inputBuffer.front());
+        return &(inputBuffer[tid].front());
    } else {
        return NULL;
    }
 }
 /** Drop the line at the head of a thread's input buffer, if there is one,
  *  and reset that thread's inputIndex to 0.
  *  @param tid thread whose input buffer is popped */
 void
-Fetch2::popInput()
+Fetch2::popInput(ThreadID tid)
 {
    /* freeLine is called on the front line before it is removed from
     * the buffer */
-    if (!inputBuffer.empty()) {
+    if (!inputBuffer[tid].empty()) {
-        inputBuffer.front().freeLine();
+        inputBuffer[tid].front().freeLine();
-        inputBuffer.pop();
+        inputBuffer[tid].pop();
    }
    /* The index is reset even when the buffer was already empty */
-    inputIndex = 0;
+    fetchInfo[tid].inputIndex = 0;
 }
 /** Discard every line held in a thread's input buffer and reset that
  *  thread's inputIndex to 0.
  *  @param tid thread whose input buffer is emptied */
 void
-Fetch2::dumpAllInput()
+Fetch2::dumpAllInput(ThreadID tid)
 {
    DPRINTF(Fetch, "Dumping whole input buffer\n");
    /* Pop line by line so that each line goes through popInput */
-    while (!inputBuffer.empty())
+    while (!inputBuffer[tid].empty())
-        popInput();
+        popInput(tid);
    /* Covers the case where the buffer was empty and popInput never ran */
-    inputIndex = 0;
+    fetchInfo[tid].inputIndex = 0;
 }
 void
@ -139,9 +140,6 @@ Fetch2::updateBranchPrediction(const BranchData &branch)
      case BranchData::SuspendThread:
        /* Don't need to act on suspends */
        break;
      case BranchData::WakeupFetch:
        /* Don't need to act on wakeups, no instruction tied to action. */
        break;
      case BranchData::HaltFetch:
        /* Don't need to act on fetch halts */
        break;
@ -180,6 +178,7 @@ Fetch2::updateBranchPrediction(const BranchData &branch)
 void
 Fetch2::predictBranch(MinorDynInstPtr inst, BranchData &branch)
 {
    Fetch2ThreadInfo &thread = fetchInfo[inst->id.threadId];
    TheISA::PCState inst_pc = inst->pc;
    assert(!inst->predictedTaken);
@ -209,35 +208,37 @@ Fetch2::predictBranch(MinorDynInstPtr inst, BranchData &branch)
    if (inst->predictedTaken) {
        /* Update the predictionSeqNum and remember the streamSeqNum that it
         *  was associated with */
-        expectedStreamSeqNum = inst->id.streamSeqNum;
+        thread.expectedStreamSeqNum = inst->id.streamSeqNum;
        BranchData new_branch = BranchData(BranchData::BranchPrediction,
-            inst->id.streamSeqNum, predictionSeqNum + 1,
+            inst->id.threadId,
            inst->id.streamSeqNum, thread.predictionSeqNum + 1,
            inst->predictedTarget, inst);
        /* Mark with a new prediction number by the stream number of the
         *  instruction causing the prediction */
-        predictionSeqNum++;
+        thread.predictionSeqNum++;
        branch = new_branch;
        DPRINTF(Branch, "Branch predicted taken inst: %s target: %s"
            " new predictionSeqNum: %d\n",
-            *inst, inst->predictedTarget, predictionSeqNum);
+            *inst, inst->predictedTarget, thread.predictionSeqNum);
    }
 }
 void
 Fetch2::evaluate()
 {
-    inputBuffer.setTail(*inp.outputWire);
+    /* Push input onto appropriate input buffer */
    if (!inp.outputWire->isBubble())
        inputBuffer[inp.outputWire->id.threadId].setTail(*inp.outputWire);
    ForwardInstData &insts_out = *out.inputWire;
    BranchData prediction;
    BranchData &branch_inp = *branchInp.outputWire;
    assert(insts_out.isBubble());
    blocked = false;
    /* React to branches from Execute to update local branch prediction
     *  structures */
    updateBranchPrediction(branch_inp);
@ -247,39 +248,48 @@ Fetch2::evaluate()
    if (branch_inp.isStreamChange()) {
        DPRINTF(Fetch, "Dumping all input as a stream changing branch"
            " has arrived\n");
-        dumpAllInput();
+        dumpAllInput(branch_inp.threadId);
-        havePC = false;
+        fetchInfo[branch_inp.threadId].havePC = false;
    }
    assert(insts_out.isBubble());
    /* Even when blocked, clear out input lines with the wrong
     *  prediction sequence number */
-    {
+    for (ThreadID tid = 0; tid < cpu.numThreads; tid++) {
-        const ForwardLineData *line_in = getInput();
+        Fetch2ThreadInfo &thread = fetchInfo[tid];
        thread.blocked = !nextStageReserve[tid].canReserve();
        const ForwardLineData *line_in = getInput(tid);
        while (line_in &&
-            expectedStreamSeqNum == line_in->id.streamSeqNum &&
+            thread.expectedStreamSeqNum == line_in->id.streamSeqNum &&
-            predictionSeqNum != line_in->id.predictionSeqNum)
+            thread.predictionSeqNum != line_in->id.predictionSeqNum)
        {
            DPRINTF(Fetch, "Discarding line %s"
                " due to predictionSeqNum mismatch (expected: %d)\n",
-                line_in->id, predictionSeqNum);
+                line_in->id, thread.predictionSeqNum);
-            popInput();
+            popInput(tid);
-            havePC = false;
+            fetchInfo[tid].havePC = false;
            if (processMoreThanOneInput) {
                DPRINTF(Fetch, "Wrapping\n");
-                line_in = getInput();
+                line_in = getInput(tid);
            } else {
                line_in = NULL;
            }
        }
    }
-    if (!nextStageReserve.canReserve()) {
+    ThreadID tid = getScheduledThread();
-        blocked = true;
+    DPRINTF(Fetch, "Scheduled Thread: %d\n", tid);
-    } else {
+
-        const ForwardLineData *line_in = getInput();
+    assert(insts_out.isBubble());
    if (tid != InvalidThreadID) {
        Fetch2ThreadInfo &fetch_info = fetchInfo[tid];
        const ForwardLineData *line_in = getInput(tid);
        unsigned int output_index = 0;
@ -288,7 +298,7 @@ Fetch2::evaluate()
         * for faulting lines */
        while (line_in &&
            (line_in->isFault() ||
-                inputIndex < line_in->lineWidth) && /* More input */
+                fetch_info.inputIndex < line_in->lineWidth) && /* More input */
            output_index < outputWidth && /* More output to fill */
            prediction.isBubble() /* No predicted branch */)
        {
@ -298,26 +308,26 @@ Fetch2::evaluate()
            /* Discard line due to prediction sequence number being wrong but
             * without the streamSeqNum number having changed */
            bool discard_line =
-                expectedStreamSeqNum == line_in->id.streamSeqNum &&
+                fetch_info.expectedStreamSeqNum == line_in->id.streamSeqNum &&
-                predictionSeqNum != line_in->id.predictionSeqNum;
+                fetch_info.predictionSeqNum != line_in->id.predictionSeqNum;
            /* Set the PC if the stream changes.  Setting havePC to false in
             *  a previous cycle handles all other change of flow of control
             *  issues */
-            bool set_pc = lastStreamSeqNum != line_in->id.streamSeqNum;
+            bool set_pc = fetch_info.lastStreamSeqNum != line_in->id.streamSeqNum;
-            if (!discard_line && (!havePC || set_pc)) {
+            if (!discard_line && (!fetch_info.havePC || set_pc)) {
                /* Set the inputIndex to be the MachInst-aligned offset
                 *  from lineBaseAddr of the new PC value */
-                inputIndex =
+                fetch_info.inputIndex =
                    (line_in->pc.instAddr() & BaseCPU::PCMask) -
                    line_in->lineBaseAddr;
                DPRINTF(Fetch, "Setting new PC value: %s inputIndex: 0x%x"
                    " lineBaseAddr: 0x%x lineWidth: 0x%x\n",
-                    line_in->pc, inputIndex, line_in->lineBaseAddr,
+                    line_in->pc, fetch_info.inputIndex, line_in->lineBaseAddr,
                    line_in->lineWidth);
-                pc = line_in->pc;
+                fetch_info.pc = line_in->pc;
-                havePC = true;
+                fetch_info.havePC = true;
                decoder->reset();
            }
@ -330,7 +340,8 @@ Fetch2::evaluate()
                 *  stream */
                DPRINTF(Fetch, "Discarding line %s (from inputIndex: %d)"
                    " due to predictionSeqNum mismatch (expected: %d)\n",
-                    line_in->id, inputIndex, predictionSeqNum);
+                    line_in->id, fetch_info.inputIndex,
                    fetch_info.predictionSeqNum);
            } else if (line_in->isFault()) {
                /* Pack a fault as a MinorDynInst with ->fault set */
@ -339,13 +350,13 @@ Fetch2::evaluate()
                dyn_inst = new MinorDynInst(line_in->id);
                /* Fetch and prediction sequence numbers originate here */
-                dyn_inst->id.fetchSeqNum = fetchSeqNum;
+                dyn_inst->id.fetchSeqNum = fetch_info.fetchSeqNum;
-                dyn_inst->id.predictionSeqNum = predictionSeqNum;
+                dyn_inst->id.predictionSeqNum = fetch_info.predictionSeqNum;
                /* To complete the set, test that exec sequence number has
                 *  not been set */
                assert(dyn_inst->id.execSeqNum == 0);
-                dyn_inst->pc = pc;
+                dyn_inst->pc = fetch_info.pc;
                /* Pack a faulting instruction but allow other
                 *  instructions to be generated. (Fetch2 makes no
@ -361,13 +372,14 @@ Fetch2::evaluate()
                 *  assign */
                inst_word = TheISA::gtoh(
                    *(reinterpret_cast<TheISA::MachInst *>
-                    (line + inputIndex)));
+                    (line + fetch_info.inputIndex)));
                if (!decoder->instReady()) {
-                    decoder->moreBytes(pc,
+                    decoder->moreBytes(fetch_info.pc,
-                        line_in->lineBaseAddr + inputIndex, inst_word);
+                        line_in->lineBaseAddr + fetch_info.inputIndex,
-                    DPRINTF(Fetch, "Offering MachInst to decoder"
+                        inst_word);
-                        " addr: 0x%x\n", line_in->lineBaseAddr + inputIndex);
+                    DPRINTF(Fetch, "Offering MachInst to decoder addr: 0x%x\n",
                            line_in->lineBaseAddr + fetch_info.inputIndex);
                }
                /* Maybe make the above a loop to accommodate ISAs with
@ -379,8 +391,8 @@ Fetch2::evaluate()
                    dyn_inst = new MinorDynInst(line_in->id);
                    /* Fetch and prediction sequence numbers originate here */
-                    dyn_inst->id.fetchSeqNum = fetchSeqNum;
+                    dyn_inst->id.fetchSeqNum = fetch_info.fetchSeqNum;
-                    dyn_inst->id.predictionSeqNum = predictionSeqNum;
+                    dyn_inst->id.predictionSeqNum = fetch_info.predictionSeqNum;
                    /* To complete the set, test that exec sequence number
                     *  has not been set */
                    assert(dyn_inst->id.execSeqNum == 0);
@ -388,17 +400,19 @@ Fetch2::evaluate()
                    /* Note that the decoder can update the given PC.
                     *  Remember not to assign it until *after* calling
                     *  decode */
-                    StaticInstPtr decoded_inst = decoder->decode(pc);
+                    StaticInstPtr decoded_inst = decoder->decode(fetch_info.pc);
                    dyn_inst->staticInst = decoded_inst;
-                    dyn_inst->pc = pc;
+                    dyn_inst->pc = fetch_info.pc;
                    DPRINTF(Fetch, "decoder inst %s\n", *dyn_inst);
                    DPRINTF(Fetch, "Instruction extracted from line %s"
                        " lineWidth: %d output_index: %d inputIndex: %d"
                        " pc: %s inst: %s\n",
                        line_in->id,
-                        line_in->lineWidth, output_index, inputIndex,
+                        line_in->lineWidth, output_index, fetch_info.inputIndex,
-                        pc, *dyn_inst);
+                        fetch_info.pc, *dyn_inst);
 #if THE_ISA == X86_ISA || THE_ISA == ARM_ISA
                    /* In SE mode, it's possible to branch to a microop when
@ -415,12 +429,12 @@ Fetch2::evaluate()
                     * the case that, after a branch, the first un-advanced PC
                     * may be pointing to a microop other than 0.  Once
                     * advanced, however, the microop number *must* be 0 */
-                    pc.upc(0);
+                    fetch_info.pc.upc(0);
-                    pc.nupc(1);
+                    fetch_info.pc.nupc(1);
 #endif
                    /* Advance PC for the next instruction */
-                    TheISA::advancePC(pc, decoded_inst);
+                    TheISA::advancePC(fetch_info.pc, decoded_inst);
                    /* Predict any branches and issue a branch if
                     *  necessary */
@ -432,22 +446,23 @@ Fetch2::evaluate()
                /* Step on the pointer into the line if there's no
                 *  complete instruction waiting */
                if (decoder->needMoreBytes()) {
-                    inputIndex += sizeof(TheISA::MachInst);
+                    fetch_info.inputIndex += sizeof(TheISA::MachInst);
                DPRINTF(Fetch, "Updated inputIndex value PC: %s"
                    " inputIndex: 0x%x lineBaseAddr: 0x%x lineWidth: 0x%x\n",
-                    line_in->pc, inputIndex, line_in->lineBaseAddr,
+                    line_in->pc, fetch_info.inputIndex, line_in->lineBaseAddr,
                    line_in->lineWidth);
                }
            }
            if (dyn_inst) {
                /* Step to next sequence number */
-                fetchSeqNum++;
+                fetch_info.fetchSeqNum++;
                /* Correctly size the output before writing */
-                if (output_index == 0)
+                if (output_index == 0) {
                    insts_out.resize(outputWidth);
                }
                /* Pack the generated dynamic instruction into the output */
                insts_out.insts[output_index] = dyn_inst;
                output_index++;
@ -463,7 +478,7 @@ Fetch2::evaluate()
            /* Remember the streamSeqNum of this line so we can tell when
             *  we change stream */
-            lastStreamSeqNum = line_in->id.streamSeqNum;
+            fetch_info.lastStreamSeqNum = line_in->id.streamSeqNum;
            /* Asked to discard line or there was a branch or fault */
            if (!prediction.isBubble() || /* The remains of a
@ -471,33 +486,35 @@ Fetch2::evaluate()
                line_in->isFault() /* A line which is just a fault */)
            {
                DPRINTF(Fetch, "Discarding all input on branch/fault\n");
-                dumpAllInput();
+                dumpAllInput(tid);
-                havePC = false;
+                fetch_info.havePC = false;
                line_in = NULL;
            } else if (discard_line) {
                /* Just discard one line, ones behind it may have new
                 *  stream sequence numbers.  There's a DPRINTF above
                 *  for this event */
-                popInput();
+                popInput(tid);
-                havePC = false;
+                fetch_info.havePC = false;
                line_in = NULL;
-            } else if (inputIndex == line_in->lineWidth) {
+            } else if (fetch_info.inputIndex == line_in->lineWidth) {
                /* Got to end of a line, pop the line but keep PC
                 *  in case this is a line-wrapping inst. */
-                popInput();
+                popInput(tid);
                line_in = NULL;
            }
            if (!line_in && processMoreThanOneInput) {
                DPRINTF(Fetch, "Wrapping\n");
-                line_in = getInput();
+                line_in = getInput(tid);
            }
        }
        /* The rest of the output (if any) should already have been packed
         *  with bubble instructions by insts_out's initialisation */
    }
-
+    if (tid == InvalidThreadID) {
        assert(insts_out.isBubble());
    }
    /** Reserve a slot in the next stage and output data */
    *predictionOut.inputWire = prediction;
@ -506,24 +523,66 @@ Fetch2::evaluate()
    if (!insts_out.isBubble()) {
        /* Note activity of following buffer */
        cpu.activityRecorder->activity();
-        nextStageReserve.reserve();
+        insts_out.threadId = tid;
        nextStageReserve[tid].reserve();
    }
    /* If we still have input to process and somewhere to put it,
     *  mark stage as active */
-    if (getInput() && nextStageReserve.canReserve())
+    for (ThreadID i = 0; i < cpu.numThreads; i++)
-        cpu.activityRecorder->activateStage(Pipeline::Fetch2StageId);
+    {
        if (getInput(i) && nextStageReserve[i].canReserve()) {
            cpu.activityRecorder->activateStage(Pipeline::Fetch2StageId);
            break;
        }
    }
    /* Make sure the input (if any left) is pushed */
-    inputBuffer.pushTail();
+    if (!inp.outputWire->isBubble())
        inputBuffer[inp.outputWire->id.threadId].pushTail();
 }
 inline ThreadID
 Fetch2::getScheduledThread()
 {
    /* Build the candidate threads, in the order dictated by the CPU's
     *  configured thread scheduling policy */
    std::vector<ThreadID> candidates;
    switch (cpu.threadPolicy) {
      case Enums::SingleThreaded:
        candidates.push_back(0);
        break;
      case Enums::RoundRobin:
        candidates = cpu.roundRobinPriority(threadPriority);
        break;
      case Enums::Random:
        candidates = cpu.randomPriority();
        break;
      default:
        panic("Unknown fetch policy");
    }
    /* Take the first candidate which is active, has input waiting and
     *  is not blocked.  The winner is remembered in threadPriority, which
     *  is what is handed to roundRobinPriority on the next call */
    for (const ThreadID tid : candidates) {
        const bool is_active =
            cpu.getContext(tid)->status() == ThreadContext::Active;
        if (!is_active)
            continue;
        if (!getInput(tid) || fetchInfo[tid].blocked)
            continue;
        threadPriority = tid;
        return tid;
    }
    /* No thread can be scheduled this time */
    return InvalidThreadID;
 }
 bool
 Fetch2::isDrained()
 {
-    return inputBuffer.empty() &&
+    for (const auto &buffer : inputBuffer) {
-        (*inp.outputWire).isBubble() &&
+        if (!buffer.empty())
-        (*predictionOut.inputWire).isBubble();
+            return false;
    }
    return (*inp.outputWire).isBubble() &&
           (*predictionOut.inputWire).isBubble();
 }
 void
@ -531,14 +590,14 @@ Fetch2::minorTrace() const
 {
    std::ostringstream data;
-    if (blocked)
+    if (fetchInfo[0].blocked)
        data << 'B';
    else
        (*out.inputWire).reportData(data);
    MINORTRACE("inputIndex=%d havePC=%d predictionSeqNum=%d insts=%s\n",
-        inputIndex, havePC, predictionSeqNum, data.str());
+        fetchInfo[0].inputIndex, fetchInfo[0].havePC, fetchInfo[0].predictionSeqNum, data.str());
-    inputBuffer.minorTrace();
+    inputBuffer[0].minorTrace();
 }
 }
--- a/src/cpu/minor/fetch2.hh
+++ b/src/cpu/minor/fetch2.hh
@ -78,7 +78,7 @@ class Fetch2 : public Named
    Latch<ForwardInstData>::Input out;
    /** Interface to reserve space in the next stage */
-    Reservable &nextStageReserve;
+    std::vector<InputBuffer<ForwardInstData>> &nextStageReserve;
    /** Width of output of this stage/input of next in instructions */
    unsigned int outputWidth;
@ -92,61 +92,90 @@ class Fetch2 : public Named
  public:
    /* Public so that Pipeline can pass it to Fetch1 */
-    InputBuffer<ForwardLineData> inputBuffer;
+    std::vector<InputBuffer<ForwardLineData>> inputBuffer;
  protected:
    /** Data members after this line are cycle-to-cycle state */
-    /** Index into an incompletely processed input line that instructions
+    struct Fetch2ThreadInfo {
     *  are to be extracted from */
    unsigned int inputIndex;
-    /** Remembered program counter value.  Between contiguous lines, this
+        /** Default constructor */
-     *  is just updated with advancePC.  For lines following changes of
+        Fetch2ThreadInfo() :
-     *  stream, a new PC must be loaded and havePC be set.
+            inputIndex(0),
-     *  havePC is needed to accomodate instructions which span across
+            pc(TheISA::PCState(0)),
-     *  lines meaning that Fetch2 and the decoder need to remember a PC
+            havePC(false),
-     *  value and a partially-offered instruction from the previous line */
+            lastStreamSeqNum(InstId::firstStreamSeqNum),
-    TheISA::PCState pc;
+            fetchSeqNum(InstId::firstFetchSeqNum),
            expectedStreamSeqNum(InstId::firstStreamSeqNum),
            predictionSeqNum(InstId::firstPredictionSeqNum),
            blocked(false)
        { }
-    /** PC is currently valid.  Initially false, gets set to true when a
+        Fetch2ThreadInfo(const Fetch2ThreadInfo& other) :
-     *  change-of-stream line is received and false again when lines are
+            inputIndex(other.inputIndex),
-     *  discarded for any reason */
+            pc(other.pc),
-    bool havePC;
+            havePC(other.havePC),
            lastStreamSeqNum(other.lastStreamSeqNum),
            /* fetchSeqNum must be copied along with the rest of the
             *  per-thread state; leaving it out of this list would give
             *  the copy an uninitialised fetch sequence number */
            fetchSeqNum(other.fetchSeqNum),
            expectedStreamSeqNum(other.expectedStreamSeqNum),
            predictionSeqNum(other.predictionSeqNum),
            blocked(other.blocked)
        { }
-    /** Stream sequence number of the last seen line used to identify changes
+        /** Index into an incompletely processed input line that instructions
-     *  of instruction stream */
+         *  are to be extracted from */
-    InstSeqNum lastStreamSeqNum;
+        unsigned int inputIndex;
    /** Fetch2 is the source of fetch sequence numbers.  These represent the
     *  sequence that instructions were extracted from fetched lines. */
    InstSeqNum fetchSeqNum;
-    /** Stream sequence number remembered from last time the predictionSeqNum
+        /** Remembered program counter value.  Between contiguous lines, this
-     *  changed.  Lines should only be discarded when their predictionSeqNums
+         *  is just updated with advancePC.  For lines following changes of
-     *  disagree with Fetch2::predictionSeqNum *and* they are from the same
+         *  stream, a new PC must be loaded and havePC be set.
-     *  stream that bore that prediction number */
+         *  havePC is needed to accommodate instructions which span across
-    InstSeqNum expectedStreamSeqNum;
+         *  lines meaning that Fetch2 and the decoder need to remember a PC
         *  value and a partially-offered instruction from the previous line */
        TheISA::PCState pc;
-    /** Fetch2 is the source of prediction sequence numbers.  These represent
+        /** PC is currently valid.  Initially false, gets set to true when a
-     *  predicted changes of control flow sources from branch prediction in
+         *  change-of-stream line is received and false again when lines are
-     *  Fetch2. */
+         *  discarded for any reason */
-    InstSeqNum predictionSeqNum;
+        bool havePC;
-    /** Blocked indication for report */
+        /** Stream sequence number of the last seen line used to identify
-    bool blocked;
+         *  changes of instruction stream */
        InstSeqNum lastStreamSeqNum;
        /** Fetch2 is the source of fetch sequence numbers.  These represent the
         *  sequence that instructions were extracted from fetched lines. */
        InstSeqNum fetchSeqNum;
        /** Stream sequence number remembered from last time the
         *  predictionSeqNum changed.  Lines should only be discarded when their
         *  predictionSeqNums disagree with Fetch2::predictionSeqNum *and* they
         *  are from the same stream that bore that prediction number */
        InstSeqNum expectedStreamSeqNum;
        /** Fetch2 is the source of prediction sequence numbers.  These
         *  represent predicted changes of control flow sources from branch
         *  prediction in Fetch2. */
        InstSeqNum predictionSeqNum;
        /** Blocked indication for report */
        bool blocked;
    };
    std::vector<Fetch2ThreadInfo> fetchInfo;
    ThreadID threadPriority;
  protected:
    /** Get a piece of data to work on from the inputBuffer, or 0 if there
     *  is no data. */
-    const ForwardLineData *getInput();
+    const ForwardLineData *getInput(ThreadID tid);
    /** Pop an element off the input buffer, if there are any */
-    void popInput();
+    void popInput(ThreadID tid);
    /** Dump the whole contents of the input buffer.  Useful after a
     *  prediction changes control flow */
-    void dumpAllInput();
+    void dumpAllInput(ThreadID tid);
    /** Update local branch prediction structures from feedback from
     *  Execute. */
@ -157,6 +186,10 @@ class Fetch2 : public Named
     *  carries the prediction to Fetch1 */
    void predictBranch(MinorDynInstPtr inst, BranchData &branch);
    /** Use the current threading policy to determine the next thread to
     *  fetch from. */
    ThreadID getScheduledThread();
  public:
    Fetch2(const std::string &name,
        MinorCPU &cpu_,
@ -165,7 +198,7 @@ class Fetch2 : public Named
        Latch<BranchData>::Output branchInp_,
        Latch<BranchData>::Input predictionOut_,
        Latch<ForwardInstData>::Input out_,
-        Reservable &next_stage_input_buffer);
+        std::vector<InputBuffer<ForwardInstData>> &next_stage_input_buffer);
  public:
    /** Pass on input/buffer data to the output if you can */
--- a/src/cpu/minor/lsq.cc
+++ b/src/cpu/minor/lsq.cc
@ -216,13 +216,14 @@ operator <<(std::ostream &os, LSQ::LSQRequest::LSQRequestState state)
 void
 LSQ::clearMemBarrier(MinorDynInstPtr inst)
 {
-    bool is_last_barrier = inst->id.execSeqNum >= lastMemBarrier;
+    bool is_last_barrier =
        inst->id.execSeqNum >= lastMemBarrier[inst->id.threadId];
    DPRINTF(MinorMem, "Moving %s barrier out of store buffer inst: %s\n",
        (is_last_barrier ? "last" : "a"), *inst);
    if (is_last_barrier)
-        lastMemBarrier = 0;
+        lastMemBarrier[inst->id.threadId] = 0;
 }
 void
@ -676,7 +677,8 @@ LSQ::StoreBuffer::canForwardDataToLoad(LSQRequestPtr request,
    while (ret == NoAddrRangeCoverage && i != slots.rend()) {
        LSQRequestPtr slot = *i;
-        if (slot->packet) {
+        if (slot->packet &&
            slot->inst->id.threadId == request->inst->id.threadId) {
            AddrRangeCoverage coverage = slot->containsAddrRangeOf(request);
            if (coverage != NoAddrRangeCoverage) {
@ -1042,8 +1044,9 @@ LSQ::tryToSendToTransfers(LSQRequestPtr request)
            request->issuedToMemory = true;
        }
-        if (tryToSend(request))
+        if (tryToSend(request)) {
            moveFromRequestsToTransfers(request);
        }
    } else {
        request->setState(LSQRequest::Complete);
        moveFromRequestsToTransfers(request);
@ -1145,6 +1148,9 @@ LSQ::tryToSend(LSQRequestPtr request)
        }
    }
    if (ret)
        threadSnoop(request);
    return ret;
 }
@ -1293,7 +1299,7 @@ LSQ::LSQ(std::string name_, std::string dcache_port_name_,
    cpu(cpu_),
    execute(execute_),
    dcachePort(dcache_port_name_, *this, cpu_),
-    lastMemBarrier(0),
+    lastMemBarrier(cpu.numThreads, 0),
    state(MemoryRunning),
    inMemorySystemLimit(in_memory_system_limit),
    lineWidth((line_width == 0 ? cpu.cacheLineSize() : line_width)),
@ -1526,7 +1532,7 @@ LSQ::minorTrace() const
    MINORTRACE("state=%s in_tlb_mem=%d/%d stores_in_transfers=%d"
        " lastMemBarrier=%d\n",
        state, numAccessesInDTLB, numAccessesInMemorySystem,
-        numStoresInTransfers, lastMemBarrier);
+        numStoresInTransfers, lastMemBarrier[0]);
    requests.minorTrace();
    transfers.minorTrace();
    storeBuffer.minorTrace();
@ -1565,12 +1571,12 @@ void
 LSQ::issuedMemBarrierInst(MinorDynInstPtr inst)
 {
    assert(inst->isInst() && inst->staticInst->isMemBarrier());
-    assert(inst->id.execSeqNum > lastMemBarrier);
+    assert(inst->id.execSeqNum > lastMemBarrier[inst->id.threadId]);
    /* Remember the barrier.  We only have a notion of one
     *  barrier so this may result in some mem refs being
     *  delayed if they are between barriers */
-    lastMemBarrier = inst->id.execSeqNum;
+    lastMemBarrier[inst->id.threadId] = inst->id.execSeqNum;
 }
 void
@ -1616,10 +1622,40 @@ LSQ::recvTimingSnoopReq(PacketPtr pkt)
    /* LLSC operations in Minor can't be speculative and are executed from
     * the head of the requests queue.  We shouldn't need to do more than
     * this action on snoops. */
    for (ThreadID tid = 0; tid < cpu.numThreads; tid++) {
        if (cpu.getCpuAddrMonitor(tid)->doMonitor(pkt)) {
            cpu.wakeup(tid);
        }
    }
    /* THREAD */
    if (pkt->isInvalidate() || pkt->isWrite()) {
-        TheISA::handleLockedSnoop(cpu.getContext(0), pkt, cacheBlockMask);
+        for (ThreadID tid = 0; tid < cpu.numThreads; tid++) {
            TheISA::handleLockedSnoop(cpu.getContext(tid), pkt,
                                      cacheBlockMask);
        }
    }
 }
 void
 LSQ::threadSnoop(LSQRequestPtr request)
 {
    /* Show the packet of this request to every thread other than the
     *  one that issued it.  As with external snoops, LLSC operations in
     *  Minor can't be speculative and are executed from the head of the
     *  requests queue, so nothing more than this should be needed. */
    const ThreadID issuing_tid = request->inst->id.threadId;
    PacketPtr snoop_pkt = request->packet;
    for (ThreadID tid = 0; tid < cpu.numThreads; tid++) {
        /* The issuing thread does not snoop its own access */
        if (tid == issuing_tid)
            continue;
        /* Wake the thread if its address monitor matches the packet */
        if (cpu.getCpuAddrMonitor(tid)->doMonitor(snoop_pkt))
            cpu.wakeup(tid);
        /* Only invalidates and writes are passed to the locked snoop
         *  handler */
        if (snoop_pkt->isInvalidate() || snoop_pkt->isWrite()) {
            TheISA::handleLockedSnoop(cpu.getContext(tid), snoop_pkt,
                                      cacheBlockMask);
        }
    }
 }
--- a/src/cpu/minor/lsq.hh
+++ b/src/cpu/minor/lsq.hh
@ -537,7 +537,7 @@ class LSQ : public Named
    /** Most recent execSeqNum of a memory barrier instruction or
     *  0 if there are no in-flight barriers.  Useful as a
     *  dependency for early-issued memory operations */
-    InstSeqNum lastMemBarrier;
+    std::vector<InstSeqNum> lastMemBarrier;
  public:
    /** Retry state of last issued memory transfer */
@ -640,6 +640,9 @@ class LSQ : public Named
    /** Can a request be sent to the memory system */
    bool canSendToMemorySystem();
    /** Snoop other threads monitors on memory system accesses */
    void threadSnoop(LSQRequestPtr request);
  public:
    LSQ(std::string name_, std::string dcache_port_name_,
        MinorCPU &cpu_, Execute &execute_,
@ -691,7 +694,8 @@ class LSQ : public Named
    void issuedMemBarrierInst(MinorDynInstPtr inst);
    /** Get the execSeqNum of the last issued memory barrier */
-    InstSeqNum getLastMemBarrier() const { return lastMemBarrier; }
+    InstSeqNum getLastMemBarrier(ThreadID thread_id) const
    { return lastMemBarrier[thread_id]; }
    /** Is there nothing left in the LSQ */
    bool isDrained();
--- a/src/cpu/minor/pipe_data.cc
+++ b/src/cpu/minor/pipe_data.cc
@ -71,9 +71,6 @@ operator <<(std::ostream &os, BranchData::Reason reason)
      case BranchData::SuspendThread:
        os << "SuspendThread";
        break;
      case BranchData::WakeupFetch:
        os << "WakeupFetch";
        break;
      case BranchData::HaltFetch:
        os << "HaltFetch";
        break;
@ -102,7 +99,6 @@ BranchData::isStreamChange(const BranchData::Reason reason)
      case BadlyPredictedBranch:
      case SuspendThread:
      case Interrupt:
      case WakeupFetch:
      case HaltFetch:
        ret = true;
        break;
@ -123,7 +119,6 @@ BranchData::isBranch(const BranchData::Reason reason)
      case CorrectlyPredictedBranch:
      case SuspendThread:
      case Interrupt:
      case WakeupFetch:
      case HaltFetch:
        ret = false;
        break;
@ -228,8 +223,8 @@ ForwardLineData::reportData(std::ostream &os) const
        os << id;
 }
-ForwardInstData::ForwardInstData(unsigned int width) :
+ForwardInstData::ForwardInstData(unsigned int width, ThreadID tid) :
-    numInsts(width)
+    numInsts(width), threadId(tid)
 {
    bubbleFill();
 }
--- a/src/cpu/minor/pipe_data.hh
+++ b/src/cpu/minor/pipe_data.hh
@ -91,8 +91,6 @@ class BranchData /* : public ReportIF, public BubbleIF */
         * count it as stream changing itself and expect pc to be the PC
         * of the next instruction */
        SuspendThread,
        /* Wakeup fetching from Halted */
        WakeupFetch,
        /* Branch from an interrupt (no instruction) */
        Interrupt,
        /* Stop fetching in anticipation of draining */
@ -112,6 +110,9 @@ class BranchData /* : public ReportIF, public BubbleIF */
    /** Explanation for this branch */
    Reason reason;
    /** ThreadID associated with branch */
    ThreadID threadId;
    /** Sequence number of new stream/prediction to be adopted */
    InstSeqNum newStreamSeqNum;
    InstSeqNum newPredictionSeqNum;
@ -124,18 +125,20 @@ class BranchData /* : public ReportIF, public BubbleIF */
  public:
    BranchData() :
-        reason(NoBranch), newStreamSeqNum(0),
+        reason(NoBranch), threadId(InvalidThreadID), newStreamSeqNum(0),
        newPredictionSeqNum(0), target(TheISA::PCState(0)),
        inst(MinorDynInst::bubble())
    { }
    BranchData(
        Reason reason_,
        ThreadID thread_id,
        InstSeqNum new_stream_seq_num,
        InstSeqNum new_prediction_seq_num,
        TheISA::PCState target,
        MinorDynInstPtr inst_) :
        reason(reason_),
        threadId(thread_id),
        newStreamSeqNum(new_stream_seq_num),
        newPredictionSeqNum(new_prediction_seq_num),
        target(target),
@ -258,8 +261,12 @@ class ForwardInstData /* : public ReportIF, public BubbleIF */
    /** The number of insts slots that can be expected to be valid insts */
    unsigned int numInsts;
    /** Thread associated with these instructions */
    ThreadID threadId;
  public:
-    explicit ForwardInstData(unsigned int width = 0);
+    explicit ForwardInstData(unsigned int width = 0,
                             ThreadID tid = InvalidThreadID);
    ForwardInstData(const ForwardInstData &src);
--- a/src/cpu/minor/pipeline.cc
+++ b/src/cpu/minor/pipeline.cc
@ -187,9 +187,9 @@ Pipeline::getDataPort()
 }
 void
-Pipeline::wakeupFetch()
+Pipeline::wakeupFetch(ThreadID tid)
 {
-    execute.wakeupFetch();
+    fetch1.wakeupFetch(tid);
 }
 bool
@ -212,6 +212,11 @@ void
 Pipeline::drainResume()
 {
    DPRINTF(Drain, "Drain resume\n");
    for (ThreadID tid = 0; tid < cpu.numThreads; tid++) {
        fetch1.wakeupFetch(tid);
    }
    execute.drainResume();
 }
--- a/src/cpu/minor/pipeline.hh
+++ b/src/cpu/minor/pipeline.hh
@ -112,7 +112,7 @@ class Pipeline : public Ticked
  public:
    /** Wake up the Fetch unit.  This is needed on thread activation esp.
     *  after quiesce wakeup */
-    void wakeupFetch();
+    void wakeupFetch(ThreadID tid);
    /** Try to drain the CPU */
    bool drain();
--- a/src/sim/pseudo_inst.cc
+++ b/src/sim/pseudo_inst.cc
@ -261,7 +261,7 @@ quiesceSkip(ThreadContext *tc)
    EndQuiesceEvent *quiesceEvent = tc->getQuiesceEvent();
-    Tick resume = curTick() + 1;
+    Tick resume = cpu->nextCycle() + 1;
    cpu->reschedule(quiesceEvent, resume, true);
--- a/util/minorview/minor.pic
+++ b/util/minorview/minor.pic
@ -115,9 +115,9 @@ macro predictionFrame: decoder=frame stripDir=vert dataElement=predictionSeqNum
 # name ::= ? alphanumeric name with dots ?
 # value ::= "(<char-except-">)*", <char-except-' '>* }
-Fi: fetch2.inputBuffer inputBuffer decoder=lines
+Fi: fetch2.inputBuffer0 inputBuffer decoder=lines
-Di: decode.inputBuffer inputBuffer decoder=insts hideId=E
+Di: decode.inputBuffer0 inputBuffer decoder=insts hideId=E
-Ei: execute.inputBuffer inputBuffer stripDir=horiz decoder=insts border=mid
+Ei: execute.inputBuffer0 inputBuffer stripDir=horiz decoder=insts border=mid
 F1: fetch1 streamFrame blankStrips=11 name="Fetch1"
 fe: fetch1 decoder=lines border=thin name="Line"
 F2: fetch2 predictionFrame blankStrips=11 name="Fetch2"
@ -146,9 +146,9 @@ f3: execute.fu.3 fu shorten=2 name=Div
 f4: execute.fu.4 fu shorten=2 name=Float
 f5: execute.fu.5 fu shorten=2 name=Mem
 f6: execute.fu.6 fu shorten=2 name=Misc
-iq: execute.inFlightInsts fifo decoder=insts name="inFlightInsts"
+iq: execute.inFlightInsts0 fifo decoder=insts name="inFlightInsts"
-im: execute.inFUMemInsts fifo decoder=insts name="inFU..."
+im: execute.inFUMemInsts0 fifo decoder=insts name="inFU..."
-sc: execute.scoreboard name="scoreboard" decoder=indexedCounts \
+sc: execute.scoreboard0 name="scoreboard" decoder=indexedCounts \
    dataElement=busy border=mid name="scoreboard" strips=38 stripelems=3
 sa: activity dataElement=stages activity name="Stage activity"
 ac: activity dataElement=activity decoder=counts border=mid name="Activity"