gem5/src/gpu-compute/gpu_tlb.hh
Tony Gutierrez a0d4019abd gpu-compute: init valid field of GpuTlbEntry in default ctor
valid field for GpuTlbEntry is not set in the default ctor, which can
lead to strange behavior, and is also flagged by UBSAN.
2016-11-21 15:38:30 -05:00

466 lines
16 KiB
C++

/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Lisa Hsu
*/
#ifndef __GPU_TLB_HH__
#define __GPU_TLB_HH__
#include <cassert>
#include <deque>
#include <fstream>
#include <list>
#include <queue>
#include <string>
#include <unordered_map>
#include <vector>

#include "arch/generic/tlb.hh"
#include "arch/x86/pagetable.hh"
#include "arch/x86/pagetable_walker.hh"
#include "arch/x86/regs/segment.hh"
#include "base/callback.hh"
#include "base/misc.hh"
#include "base/statistics.hh"
#include "gpu-compute/compute_unit.hh"
#include "mem/mem_object.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "params/X86GPUTLB.hh"
#include "sim/sim_object.hh"
class BaseTLB;
class Packet;
class ThreadContext;
namespace X86ISA
{
/**
 * A TlbEntry that additionally carries an explicit valid flag.
 */
class GpuTlbEntry : public TlbEntry
{
  public:
    /**
     * Build an entry from asn, _vaddr and _paddr and record whether it
     * is valid. The two trailing TlbEntry constructor arguments are
     * always passed as false.
     */
    GpuTlbEntry(Addr asn, Addr _vaddr, Addr _paddr, bool _valid)
        : TlbEntry(asn, _vaddr, _paddr, false, false),
          valid(_valid)
    {
    }

    /** A default-constructed entry is marked invalid. */
    GpuTlbEntry()
        : TlbEntry(),
          valid(false)
    {
    }

    /** True when this entry holds a usable translation. */
    bool valid;
};
/**
 * GPU TLB model. It is a MemObject that owns a set of GpuTlbEntry
 * objects, exposes CPU-side and memory-side ports, and keeps hit/miss
 * and page-footprint statistics (see the members declared below).
 */
class GpuTLB : public MemObject
{
protected:
// Walker is granted access to this class's non-public members.
friend class Walker;
// A list of pointers to TLB entries; used for the per-set free and
// LRU lists declared further down.
typedef std::list<GpuTlbEntry*> EntryList;
// Written through setConfigAddress().
// NOTE(review): its use is not visible in this header -- confirm.
uint32_t configAddress;
// TLB clock: will inherit clock from shader's clock period in terms
// of number of ticks of curTime (aka global simulation clock)
// The assignment of TLB clock from shader clock is done in the python
// config files.
int clock;
public:
    // Clock helpers: convert between simulation ticks and this object's
    // cycles, where `clock` is the cycle period expressed in ticks.

    /** SimClock::Frequency divided by the clock period. */
    Tick
    frequency() const
    {
        return SimClock::Frequency / clock;
    }

    /** Number of ticks spanned by numCycles cycles. */
    Tick
    ticks(int numCycles) const
    {
        const Tick period = static_cast<Tick>(clock);
        return period * numCycles;
    }

    /** The current simulation tick expressed in cycles. */
    Tick
    curCycle() const
    {
        return curTick() / clock;
    }

    /** Convert a tick count into cycles (integer division). */
    Tick
    tickToCycles(Tick val) const
    {
        return val / clock;
    }
// Parameter type accepted by the constructor.
typedef X86GPUTLBParams Params;
// Construct the TLB from its parameter object. The definition lives
// outside this header.
GpuTLB(const Params *p);
~GpuTLB();
// Local shorthand for BaseTLB::Mode, used throughout the translation
// interface below.
typedef enum BaseTLB::Mode Mode;
/**
 * Abstract callback interface handed to the TLB together with a
 * translation request.
 */
class Translation
{
  public:
    virtual ~Translation() = default;

    /**
     * Called to signal that the translation has been delayed because
     * of a hardware page table walk.
     */
    virtual void markDelayed() = 0;

    /**
     * Called with the outcome of the translation.
     *
     * The object may have been dynamically allocated and may be
     * responsible for cleaning itself up, which will happen inside
     * this function. Once it has been called the object must be
     * treated as no longer valid.
     */
    virtual void finish(Fault fault, RequestPtr req, ThreadContext *tc,
                        Mode mode) = 0;
};
// Lookup and invalidation interface. The definitions are not part of
// this header; remarks tagged NOTE(review) are inferred from names only.
void dumpAll();
// Returns a pointer to a GpuTlbEntry for virtual address va.
// NOTE(review): presumably nullptr on a miss, and update_lru decides
// whether a hit is promoted in the per-set LRU list -- confirm.
GpuTlbEntry *lookup(Addr va, bool update_lru=true);
// NOTE(review): presumably stores addr into configAddress -- confirm.
void setConfigAddress(uint32_t addr);
protected:
// Same parameters as lookup(), but yields an EntryList iterator instead
// of an entry pointer.
EntryList::iterator lookupIt(Addr va, bool update_lru=true);
// Page table walker associated with this TLB (see getWalker()).
Walker *walker;
public:
// NOTE(review): presumably returns `walker` -- confirm.
Walker *getWalker();
void invalidateAll();
void invalidateNonGlobal();
// NOTE(review): presumably removes the mapping for va within address
// space asn -- confirm against the definition.
void demapPage(Addr va, uint64_t asn);
protected:
// TLB geometry.
// NOTE(review): presumably size is the total number of entries, assoc
// the number of ways per set and numSets the number of sets -- confirm
// against the constructor.
int size;
int assoc;
int numSets;
/**
* true if this is a fully-associative TLB
*/
bool FA;
// NOTE(review): presumably the mask used to derive a set index from an
// address -- confirm.
Addr setMask;
/**
* Allocation Policy: true if we always allocate on a hit, false
* otherwise. Default is true.
*/
bool allocationPolicy;
/**
* if true, then this is not the last level TLB
*/
bool hasMemSidePort;
/**
* Print out accessDistance stats. One stat file
* per TLB.
*/
bool accessDistance;
// Storage for the TLB entries, held by value.
// NOTE(review): presumably the pointers kept in freeList/entryList
// refer to elements of this vector -- confirm.
std::vector<GpuTlbEntry> tlb;
/*
* It's a per-set list. As long as we have not reached
* the full capacity of the given set, grab an entry from
* the freeList.
*/
std::vector<EntryList> freeList;
/**
* An entryList per set is the equivalent of an LRU stack;
* it's used to guide replacement decisions. The head of the list
* contains the MRU TLB entry of the given set. If the freeList
* for this set is empty, the last element of the list
* is evicted (i.e., dropped on the floor).
*/
std::vector<EntryList> entryList;
// Internal translation entry points; both report their result as a
// Fault. The definitions live outside this header.
Fault translateInt(RequestPtr req, ThreadContext *tc);
// delayedResponse and latency are passed by non-const reference, so the
// callee can report values back through them.
// NOTE(review): presumably `timing` selects timing vs. atomic behaviour
// -- confirm against the definition.
Fault translate(RequestPtr req, ThreadContext *tc,
Translation *translation, Mode mode, bool &delayedResponse,
bool timing, int &latency);
public:
// latencies for a TLB hit, miss and page fault
// NOTE(review): how missLatency1 and missLatency2 map onto "miss" and
// "page fault" is not visible in this header -- confirm.
int hitLatency;
int missLatency1;
int missLatency2;
// local_stats are as seen from the TLB
// without taking into account coalescing
Stats::Scalar localNumTLBAccesses;
Stats::Scalar localNumTLBHits;
Stats::Scalar localNumTLBMisses;
Stats::Formula localTLBMissRate;
// global_stats are as seen from the
// CU's perspective taking into account
// all coalesced requests.
Stats::Scalar globalNumTLBAccesses;
Stats::Scalar globalNumTLBHits;
Stats::Scalar globalNumTLBMisses;
Stats::Formula globalTLBMissRate;
// from the CU perspective (global)
Stats::Scalar accessCycles;
// from the CU perspective (global)
Stats::Scalar pageTableCycles;
// NOTE(review): presumably the number of distinct pages recorded in
// TLBFootprint -- confirm.
Stats::Scalar numUniquePages;
// from the perspective of this TLB
Stats::Scalar localCycles;
// from the perspective of this TLB
Stats::Formula localLatency;
// I take the avg. per page and then
// the avg. over all pages.
Stats::Scalar avgReuseDistance;
// Public operations. All of these are only declared here; their
// definitions live outside this header.
void regStats();
// Page-footprint bookkeeping for the given virtual page address (see
// AccessInfo and TLBFootprint further down).
void updatePageFootprint(Addr virt_page_addr);
void printAccessPattern();
// Atomic-mode translation: returns a Fault and takes latency by
// non-const reference.
Fault translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode,
int &latency);
// Timing-mode translation: takes a Translation callback object and has
// no return value.
void translateTiming(RequestPtr req, ThreadContext *tc,
Translation *translation, Mode mode,
int &latency);
Tick doMmuRegRead(ThreadContext *tc, Packet *pkt);
Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt);
// Takes a vpn and an entry and returns a pointer to a GpuTlbEntry.
// NOTE(review): presumably the returned pointer is the stored copy of
// `entry` -- confirm.
GpuTlbEntry *insert(Addr vpn, GpuTlbEntry &entry);
// Checkpointing
virtual void serialize(CheckpointOut& cp) const;
virtual void unserialize(CheckpointIn& cp);
void issueTranslation();
// Possible results of a lookup, passed around by the functions and by
// the TLBEvent declared below.
enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN};
// NOTE(review): presumably returns true on a hit -- confirm.
bool tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats);
void handleTranslationReturn(Addr addr, tlbOutcome outcome,
PacketPtr pkt);
void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome);
void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
GpuTlbEntry *tlb_entry, Mode mode);
void updatePhysAddresses(Addr virt_page_addr, GpuTlbEntry *tlb_entry,
Addr phys_page_addr);
void issueTLBLookup(PacketPtr pkt);
/**
 * CpuSidePort is the TLB port closer to the CPU/CU side. It is a
 * SlavePort that remembers the owning GpuTLB and its own port index.
 */
class CpuSidePort : public SlavePort
{
  public:
    CpuSidePort(const std::string &_name, GpuTLB *gpu_TLB,
                PortID _index)
        : SlavePort(_name, gpu_TLB),
          tlb(gpu_TLB),
          index(_index)
    {
    }

  protected:
    /** The TLB this port belongs to. */
    GpuTLB *tlb;

    /** Index of this port as passed to the constructor. */
    int index;

    virtual bool recvTimingReq(PacketPtr pkt);

    /** Atomic requests are accepted and always report 0 ticks. */
    virtual Tick
    recvAtomic(PacketPtr pkt)
    {
        return 0;
    }

    virtual void recvFunctional(PacketPtr pkt);

    /** Range changes are ignored. */
    virtual void
    recvRangeChange()
    {
    }

    virtual void recvReqRetry();

    /**
     * Response retries are not expected on this port.
     * NOTE(review): assert(false) is compiled out when NDEBUG is
     * defined, making this a silent no-op in such builds; consider
     * gem5's panic() instead -- confirm against the build flags.
     */
    virtual void
    recvRespRetry()
    {
        assert(false);
    }

    virtual AddrRangeList getAddrRanges() const;
};
/**
 * MemSidePort is the TLB port closer to the memory side. It is left
 * unconnected when this is a last-level TLB.
 *
 * Future action item: if we ever do real page walks, then this port
 * should be connected to a RubyPort.
 */
class MemSidePort : public MasterPort
{
  public:
    MemSidePort(const std::string &_name, GpuTLB *gpu_TLB,
                PortID _index)
        : MasterPort(_name, gpu_TLB),
          tlb(gpu_TLB),
          index(_index)
    {
    }

    /**
     * Queue of packets associated with this port.
     * NOTE(review): presumably packets waiting to be re-sent after a
     * failed send -- confirm against recvReqRetry().
     */
    std::deque<PacketPtr> retries;

  protected:
    /** The TLB this port belongs to. */
    GpuTLB *tlb;

    /** Index of this port as passed to the constructor. */
    int index;

    virtual bool recvTimingResp(PacketPtr pkt);

    /** Always reports 0 ticks. */
    virtual Tick
    recvAtomic(PacketPtr pkt)
    {
        return 0;
    }

    /** Functional accesses are ignored. */
    virtual void
    recvFunctional(PacketPtr pkt)
    {
    }

    /** Range changes are ignored. */
    virtual void
    recvRangeChange()
    {
    }

    virtual void recvReqRetry();
};
// TLB ports on the cpu Side
std::vector<CpuSidePort*> cpuSidePort;
// TLB ports on the memory side
std::vector<MemSidePort*> memSidePort;
// Port lookup by interface name and optional index (idx defaults to
// InvalidPortID).
// NOTE(review): presumably these hand out elements of memSidePort and
// cpuSidePort respectively -- confirm against the definitions.
BaseMasterPort &getMasterPort(const std::string &if_name,
PortID idx=InvalidPortID);
BaseSlavePort &getSlavePort(const std::string &if_name,
PortID idx=InvalidPortID);
/**
 * Sender state shared by all TLB levels.
 *
 * This stretches the usual SenderState contract somewhat: normally the
 * receiver of a packet is not supposed to inspect the senderState; a
 * component only looks at what it pushed on itself, pops it off, and
 * sends the packet back.
 *
 * The send/recv Timing/Functional/etc. APIs do not allow extra
 * arguments, however, and there is state that has to be handed between
 * TLBs in both the "forwards" and the "backwards" direction. A common
 * TLB sender state is used for that.
 *
 * The resulting rule: any packet received by a TLB port (cpuside OR
 * memside) must be safely castable to a TranslationState.
 */
struct TranslationState : public Packet::SenderState
{
    /** TLB mode, read or write. */
    Mode tlbMode;

    /** Thread context associated with this request. */
    ThreadContext *tc;

    /**
     * TLB entry to be populated, passed back and filled into the
     * previous TLBs. Equivalent to the data cache concept of
     * "data return". Starts out as nullptr.
     */
    GpuTlbEntry *tlbEntry;

    /** Is this a TLB prefetch request? */
    bool prefetch;

    /** When the request for this translation was issued. */
    uint64_t issueTime;

    /** Remembers where this packet came from. */
    std::vector<SlavePort*> ports;

    /**
     * Number of uncoalesced requests per packet per TLB level;
     * reqCnt per level >= reqCnt of the higher level.
     */
    std::vector<int> reqCnt;

    /** TLB level this packet hit in; 0 if it hit in the page table. */
    int hitLevel;

    /** Sender state handed in by the creator; may be nullptr. */
    Packet::SenderState *saved;

    TranslationState(Mode tlb_mode, ThreadContext *_tc,
                     bool _prefetch=false,
                     Packet::SenderState *_saved=nullptr)
        : tlbMode(tlb_mode),
          tc(_tc),
          tlbEntry(nullptr),
          prefetch(_prefetch),
          issueTime(0),
          hitLevel(0),
          saved(_saved)
    {
    }
};
// maximum number of permitted coalesced requests per cycle
int maxCoalescedReqs;
// Current number of outstanding coalesced requests.
// Should be <= maxCoalescedReqs
int outstandingReqs;
/**
* A TLBEvent is scheduled after the TLB lookup and helps us take the
* appropriate actions:
* (e.g., update TLB on a hit,
* send request to lower level TLB on a miss,
* or start a page walk if this was the last-level TLB).
*
* NOTE(review): translationReturn() takes the same three values a
* TLBEvent stores (virtual page address, outcome, packet); presumably
* it is what TLBEvent::process() invokes -- confirm.
*/
void translationReturn(Addr virtPageAddr, tlbOutcome outcome,
PacketPtr pkt);
/**
 * Event that carries the state of one in-flight lookup: the owning
 * TLB, a virtual page address, the lookup outcome and the packet.
 */
class TLBEvent : public Event
{
  private:
    /** TLB that created this event. */
    GpuTLB *tlb;

    /** Virtual page address this event refers to. */
    Addr virtPageAddr;

    /**
     * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK
     */
    tlbOutcome outcome;

    /** Packet associated with the lookup. */
    PacketPtr pkt;

  public:
    TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome _outcome,
             PacketPtr _pkt);

    void process();

    const char *description() const;

    /** updateOutcome updates the tlbOutcome of a TLBEvent. */
    void updateOutcome(tlbOutcome _outcome);

    /**
     * Accessor returning an Addr.
     * NOTE(review): presumably returns virtPageAddr -- confirm.
     */
    Addr getTLBEventVaddr();
};
// TLBEvent pointers indexed by an Addr key.
// NOTE(review): presumably the key is the virtual page address of the
// pending translation -- confirm.
std::unordered_map<Addr, TLBEvent*> translationReturnEvent;
// this FIFO queue keeps track of the virt. page addresses
// that are pending cleanup
std::queue<Addr> cleanupQueue;
// the cleanupEvent is scheduled after a TLBEvent triggers in order to
// free memory and do the required clean-up
void cleanup();
// Event wrapper bound to GpuTLB::cleanup.
EventWrapper<GpuTLB, &GpuTLB::cleanup> cleanupEvent;
/**
 * Per-page access record. It is the mapped type of the
 * AccessPatternTable hash map, which uses the virtual page address as
 * its key and keeps track of the total number of accesses per page.
 *
 * Every counter starts at zero, so a default-constructed record is
 * fully defined without the creator having to assign each field.
 */
struct AccessInfo
{
    /** Last access to this page. */
    unsigned int lastTimeAccessed = 0;

    /** Number of accesses made to this page. */
    unsigned int accessesPerPage = 0;

    /** Need to divide it by accessesPerPage at the end. */
    unsigned int totalReuseDistance = 0;

    /**
     * The field below will help us compute the access distance,
     * that is the number of (coalesced) TLB accesses that
     * happened in between each access to this page.
     *
     * localTLBAccesses[x] is the value of localNumTLBAccesses
     * when the page <Addr> was accessed for the <x>th time.
     */
    std::vector<unsigned int> localTLBAccesses;

    /** Running sum used for the access-distance computation. */
    unsigned int sumDistance = 0;

    /** Mean access distance for this page. */
    unsigned int meanDistance = 0;
};
// Hash map from an Addr key to the AccessInfo record of a page.
typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable;
// The page access table of this TLB (see updatePageFootprint()).
AccessPatternTable TLBFootprint;
// Called at the end of simulation to dump page access stats.
void exitCallback();
// Event wrapper bound to GpuTLB::exitCallback.
EventWrapper<GpuTLB, &GpuTLB::exitCallback> exitEvent;
};
}
#endif // __GPU_TLB_HH__