a0d4019abd
valid field for GpuTlbEntry is not set in the default ctor, which can lead to strange behavior, and is also flagged by UBSAN.
465 lines
16 KiB
C++
465 lines
16 KiB
C++
/*
|
|
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* For use for simulation and test purposes only
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
* and/or other materials provided with the distribution.
|
|
*
|
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
|
* may be used to endorse or promote products derived from this software
|
|
* without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
* Author: Lisa Hsu
|
|
*/
|
|
|
|
#ifndef __GPU_TLB_HH__
|
|
#define __GPU_TLB_HH__
|
|
|
|
#include <cassert>
#include <deque>
#include <fstream>
#include <list>
#include <queue>
#include <string>
#include <unordered_map>
#include <vector>

#include "arch/generic/tlb.hh"
#include "arch/x86/pagetable.hh"
#include "arch/x86/pagetable_walker.hh"
#include "arch/x86/regs/segment.hh"
#include "base/callback.hh"
#include "base/misc.hh"
#include "base/statistics.hh"
#include "gpu-compute/compute_unit.hh"
#include "mem/mem_object.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "params/X86GPUTLB.hh"
#include "sim/sim_object.hh"
|
|
|
|
class BaseTLB;
|
|
class Packet;
|
|
class ThreadContext;
|
|
|
|
namespace X86ISA
|
|
{
|
|
class GpuTlbEntry : public TlbEntry
|
|
{
|
|
public:
|
|
GpuTlbEntry(Addr asn, Addr _vaddr, Addr _paddr, bool _valid)
|
|
: TlbEntry(asn, _vaddr, _paddr, false, false), valid(_valid) { }
|
|
|
|
GpuTlbEntry() : TlbEntry(), valid(false) { }
|
|
|
|
bool valid;
|
|
};
|
|
|
|
class GpuTLB : public MemObject
|
|
{
|
|
protected:
|
|
friend class Walker;
|
|
|
|
typedef std::list<GpuTlbEntry*> EntryList;
|
|
|
|
uint32_t configAddress;
|
|
|
|
// TLB clock: will inherit clock from shader's clock period in terms
|
|
// of nuber of ticks of curTime (aka global simulation clock)
|
|
// The assignment of TLB clock from shader clock is done in the python
|
|
// config files.
|
|
int clock;
|
|
|
|
public:
|
|
// clock related functions ; maps to-and-from Simulation ticks and
|
|
// object clocks.
|
|
Tick frequency() const { return SimClock::Frequency / clock; }
|
|
|
|
Tick
|
|
ticks(int numCycles) const
|
|
{
|
|
return (Tick)clock * numCycles;
|
|
}
|
|
|
|
Tick curCycle() const { return curTick() / clock; }
|
|
Tick tickToCycles(Tick val) const { return val / clock;}
|
|
|
|
typedef X86GPUTLBParams Params;
|
|
GpuTLB(const Params *p);
|
|
~GpuTLB();
|
|
|
|
typedef enum BaseTLB::Mode Mode;
|
|
|
|
class Translation
|
|
{
|
|
public:
|
|
virtual ~Translation() { }
|
|
|
|
/**
|
|
* Signal that the translation has been delayed due to a hw page
|
|
* table walk.
|
|
*/
|
|
virtual void markDelayed() = 0;
|
|
|
|
/**
|
|
* The memory for this object may be dynamically allocated, and it
|
|
* may be responsible for cleaning itslef up which will happen in
|
|
* this function. Once it's called the object is no longer valid.
|
|
*/
|
|
virtual void finish(Fault fault, RequestPtr req, ThreadContext *tc,
|
|
Mode mode) = 0;
|
|
};
|
|
|
|
void dumpAll();
|
|
GpuTlbEntry *lookup(Addr va, bool update_lru=true);
|
|
void setConfigAddress(uint32_t addr);
|
|
|
|
protected:
|
|
EntryList::iterator lookupIt(Addr va, bool update_lru=true);
|
|
Walker *walker;
|
|
|
|
public:
|
|
Walker *getWalker();
|
|
void invalidateAll();
|
|
void invalidateNonGlobal();
|
|
void demapPage(Addr va, uint64_t asn);
|
|
|
|
protected:
|
|
int size;
|
|
int assoc;
|
|
int numSets;
|
|
|
|
/**
|
|
* true if this is a fully-associative TLB
|
|
*/
|
|
bool FA;
|
|
Addr setMask;
|
|
|
|
/**
|
|
* Allocation Policy: true if we always allocate on a hit, false
|
|
* otherwise. Default is true.
|
|
*/
|
|
bool allocationPolicy;
|
|
|
|
/**
|
|
* if true, then this is not the last level TLB
|
|
*/
|
|
bool hasMemSidePort;
|
|
|
|
/**
|
|
* Print out accessDistance stats. One stat file
|
|
* per TLB.
|
|
*/
|
|
bool accessDistance;
|
|
|
|
std::vector<GpuTlbEntry> tlb;
|
|
|
|
/*
|
|
* It's a per-set list. As long as we have not reached
|
|
* the full capacity of the given set, grab an entry from
|
|
* the freeList.
|
|
*/
|
|
std::vector<EntryList> freeList;
|
|
|
|
/**
|
|
* An entryList per set is the equivalent of an LRU stack;
|
|
* it's used to guide replacement decisions. The head of the list
|
|
* contains the MRU TLB entry of the given set. If the freeList
|
|
* for this set is empty, the last element of the list
|
|
* is evicted (i.e., dropped on the floor).
|
|
*/
|
|
std::vector<EntryList> entryList;
|
|
|
|
Fault translateInt(RequestPtr req, ThreadContext *tc);
|
|
|
|
Fault translate(RequestPtr req, ThreadContext *tc,
|
|
Translation *translation, Mode mode, bool &delayedResponse,
|
|
bool timing, int &latency);
|
|
|
|
public:
|
|
// latencies for a TLB hit, miss and page fault
|
|
int hitLatency;
|
|
int missLatency1;
|
|
int missLatency2;
|
|
|
|
// local_stats are as seen from the TLB
|
|
// without taking into account coalescing
|
|
Stats::Scalar localNumTLBAccesses;
|
|
Stats::Scalar localNumTLBHits;
|
|
Stats::Scalar localNumTLBMisses;
|
|
Stats::Formula localTLBMissRate;
|
|
|
|
// global_stats are as seen from the
|
|
// CU's perspective taking into account
|
|
// all coalesced requests.
|
|
Stats::Scalar globalNumTLBAccesses;
|
|
Stats::Scalar globalNumTLBHits;
|
|
Stats::Scalar globalNumTLBMisses;
|
|
Stats::Formula globalTLBMissRate;
|
|
|
|
// from the CU perspective (global)
|
|
Stats::Scalar accessCycles;
|
|
// from the CU perspective (global)
|
|
Stats::Scalar pageTableCycles;
|
|
Stats::Scalar numUniquePages;
|
|
// from the perspective of this TLB
|
|
Stats::Scalar localCycles;
|
|
// from the perspective of this TLB
|
|
Stats::Formula localLatency;
|
|
// I take the avg. per page and then
|
|
// the avg. over all pages.
|
|
Stats::Scalar avgReuseDistance;
|
|
|
|
void regStats();
|
|
void updatePageFootprint(Addr virt_page_addr);
|
|
void printAccessPattern();
|
|
|
|
|
|
Fault translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode,
|
|
int &latency);
|
|
|
|
void translateTiming(RequestPtr req, ThreadContext *tc,
|
|
Translation *translation, Mode mode,
|
|
int &latency);
|
|
|
|
Tick doMmuRegRead(ThreadContext *tc, Packet *pkt);
|
|
Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt);
|
|
|
|
GpuTlbEntry *insert(Addr vpn, GpuTlbEntry &entry);
|
|
|
|
// Checkpointing
|
|
virtual void serialize(CheckpointOut& cp) const;
|
|
virtual void unserialize(CheckpointIn& cp);
|
|
void issueTranslation();
|
|
enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN};
|
|
bool tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats);
|
|
|
|
void handleTranslationReturn(Addr addr, tlbOutcome outcome,
|
|
PacketPtr pkt);
|
|
|
|
void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome);
|
|
|
|
void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
|
|
GpuTlbEntry *tlb_entry, Mode mode);
|
|
|
|
void updatePhysAddresses(Addr virt_page_addr, GpuTlbEntry *tlb_entry,
|
|
Addr phys_page_addr);
|
|
|
|
void issueTLBLookup(PacketPtr pkt);
|
|
|
|
// CpuSidePort is the TLB Port closer to the CPU/CU side
|
|
class CpuSidePort : public SlavePort
|
|
{
|
|
public:
|
|
CpuSidePort(const std::string &_name, GpuTLB * gpu_TLB,
|
|
PortID _index)
|
|
: SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
|
|
|
|
protected:
|
|
GpuTLB *tlb;
|
|
int index;
|
|
|
|
virtual bool recvTimingReq(PacketPtr pkt);
|
|
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
|
|
virtual void recvFunctional(PacketPtr pkt);
|
|
virtual void recvRangeChange() { }
|
|
virtual void recvReqRetry();
|
|
virtual void recvRespRetry() { assert(false); }
|
|
virtual AddrRangeList getAddrRanges() const;
|
|
};
|
|
|
|
/**
|
|
* MemSidePort is the TLB Port closer to the memory side
|
|
* If this is a last level TLB then this port will not be connected.
|
|
*
|
|
* Future action item: if we ever do real page walks, then this port
|
|
* should be connected to a RubyPort.
|
|
*/
|
|
class MemSidePort : public MasterPort
|
|
{
|
|
public:
|
|
MemSidePort(const std::string &_name, GpuTLB * gpu_TLB,
|
|
PortID _index)
|
|
: MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
|
|
|
|
std::deque<PacketPtr> retries;
|
|
|
|
protected:
|
|
GpuTLB *tlb;
|
|
int index;
|
|
|
|
virtual bool recvTimingResp(PacketPtr pkt);
|
|
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
|
|
virtual void recvFunctional(PacketPtr pkt) { }
|
|
virtual void recvRangeChange() { }
|
|
virtual void recvReqRetry();
|
|
};
|
|
|
|
// TLB ports on the cpu Side
|
|
std::vector<CpuSidePort*> cpuSidePort;
|
|
// TLB ports on the memory side
|
|
std::vector<MemSidePort*> memSidePort;
|
|
|
|
BaseMasterPort &getMasterPort(const std::string &if_name,
|
|
PortID idx=InvalidPortID);
|
|
|
|
BaseSlavePort &getSlavePort(const std::string &if_name,
|
|
PortID idx=InvalidPortID);
|
|
|
|
/**
|
|
* TLB TranslationState: this currently is a somewhat bastardization of
|
|
* the usage of SenderState, whereby the receiver of a packet is not
|
|
* usually supposed to need to look at the contents of the senderState,
|
|
* you're really only supposed to look at what you pushed on, pop it
|
|
* off, and send it back.
|
|
*
|
|
* However, since there is state that we want to pass to the TLBs using
|
|
* the send/recv Timing/Functional/etc. APIs, which don't allow for new
|
|
* arguments, we need a common TLB senderState to pass between TLBs,
|
|
* both "forwards" and "backwards."
|
|
*
|
|
* So, basically, the rule is that any packet received by a TLB port
|
|
* (cpuside OR memside) must be safely castable to a TranslationState.
|
|
*/
|
|
|
|
struct TranslationState : public Packet::SenderState
|
|
{
|
|
// TLB mode, read or write
|
|
Mode tlbMode;
|
|
// Thread context associated with this req
|
|
ThreadContext *tc;
|
|
|
|
/*
|
|
* TLB entry to be populated and passed back and filled in
|
|
* previous TLBs. Equivalent to the data cache concept of
|
|
* "data return."
|
|
*/
|
|
GpuTlbEntry *tlbEntry;
|
|
// Is this a TLB prefetch request?
|
|
bool prefetch;
|
|
// When was the req for this translation issued
|
|
uint64_t issueTime;
|
|
// Remember where this came from
|
|
std::vector<SlavePort*>ports;
|
|
|
|
// keep track of #uncoalesced reqs per packet per TLB level;
|
|
// reqCnt per level >= reqCnt higher level
|
|
std::vector<int> reqCnt;
|
|
// TLB level this packet hit in; 0 if it hit in the page table
|
|
int hitLevel;
|
|
Packet::SenderState *saved;
|
|
|
|
TranslationState(Mode tlb_mode, ThreadContext *_tc,
|
|
bool _prefetch=false,
|
|
Packet::SenderState *_saved=nullptr)
|
|
: tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr),
|
|
prefetch(_prefetch), issueTime(0),
|
|
hitLevel(0),saved(_saved) { }
|
|
};
|
|
|
|
// maximum number of permitted coalesced requests per cycle
|
|
int maxCoalescedReqs;
|
|
|
|
// Current number of outstandings coalesced requests.
|
|
// Should be <= maxCoalescedReqs
|
|
int outstandingReqs;
|
|
|
|
/**
|
|
* A TLBEvent is scheduled after the TLB lookup and helps us take the
|
|
* appropriate actions:
|
|
* (e.g., update TLB on a hit,
|
|
* send request to lower level TLB on a miss,
|
|
* or start a page walk if this was the last-level TLB).
|
|
*/
|
|
void translationReturn(Addr virtPageAddr, tlbOutcome outcome,
|
|
PacketPtr pkt);
|
|
|
|
class TLBEvent : public Event
|
|
{
|
|
private:
|
|
GpuTLB *tlb;
|
|
Addr virtPageAddr;
|
|
/**
|
|
* outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK
|
|
*/
|
|
tlbOutcome outcome;
|
|
PacketPtr pkt;
|
|
|
|
public:
|
|
TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome,
|
|
PacketPtr _pkt);
|
|
|
|
void process();
|
|
const char *description() const;
|
|
|
|
// updateOutcome updates the tlbOutcome of a TLBEvent
|
|
void updateOutcome(tlbOutcome _outcome);
|
|
Addr getTLBEventVaddr();
|
|
};
|
|
|
|
std::unordered_map<Addr, TLBEvent*> translationReturnEvent;
|
|
|
|
// this FIFO queue keeps track of the virt. page addresses
|
|
// that are pending cleanup
|
|
std::queue<Addr> cleanupQueue;
|
|
|
|
// the cleanupEvent is scheduled after a TLBEvent triggers in order to
|
|
// free memory and do the required clean-up
|
|
void cleanup();
|
|
|
|
EventWrapper<GpuTLB, &GpuTLB::cleanup> cleanupEvent;
|
|
|
|
/**
|
|
* This hash map will use the virtual page address as a key
|
|
* and will keep track of total number of accesses per page
|
|
*/
|
|
|
|
struct AccessInfo
|
|
{
|
|
unsigned int lastTimeAccessed; // last access to this page
|
|
unsigned int accessesPerPage;
|
|
// need to divide it by accessesPerPage at the end
|
|
unsigned int totalReuseDistance;
|
|
|
|
/**
|
|
* The field below will help us compute the access distance,
|
|
* that is the number of (coalesced) TLB accesses that
|
|
* happened in between each access to this page
|
|
*
|
|
* localTLBAccesses[x] is the value of localTLBNumAccesses
|
|
* when the page <Addr> was accessed for the <x>th time
|
|
*/
|
|
std::vector<unsigned int> localTLBAccesses;
|
|
unsigned int sumDistance;
|
|
unsigned int meanDistance;
|
|
};
|
|
|
|
typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable;
|
|
AccessPatternTable TLBFootprint;
|
|
|
|
// Called at the end of simulation to dump page access stats.
|
|
void exitCallback();
|
|
|
|
EventWrapper<GpuTLB, &GpuTLB::exitCallback> exitEvent;
|
|
};
|
|
}
|
|
|
|
#endif // __GPU_TLB_HH__
|