mem: Split the hit_latency into tag_latency and data_latency
If the cache access mode is parallel, i.e. the "sequential_access" parameter is set to "False", tags and data are accessed in parallel. The hit latency is therefore the maximum of tag_latency and data_latency. If the access mode is sequential, i.e. "sequential_access" is set to "True", tags and data are accessed one after the other, and the hit latency is the sum of tag_latency and data_latency.

Signed-off-by: Jason Lowe-Power <jason@lowepower.com>
parent 047caf24ba
commit ce2722cdd9

15 changed files with 95 additions and 36 deletions
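For illustration: with tag_latency = 2 and data_latency = 2, a parallel cache reports a hit latency of max(2, 2) = 2 cycles, while a sequential cache reports 2 + 2 = 4 cycles. Below is a minimal config sketch of the new parameters, assuming the stock Cache SimObject from m5.objects; the class names ParallelL1 and SequentialL1 are illustrative only, not part of this commit.

    from m5.objects import Cache

    class ParallelL1(Cache):
        size = '32kB'
        assoc = 2
        tag_latency = 2       # tag lookup latency, in cycles
        data_latency = 2      # data array access latency, in cycles
        response_latency = 2
        mshrs = 4
        tgts_per_mshr = 20
        # Tags and data are probed in parallel, so a hit costs
        # max(tag_latency, data_latency) = 2 cycles.
        sequential_access = False

    class SequentialL1(ParallelL1):
        # Data is only read after the tag check passes, so a hit costs
        # tag_latency + data_latency = 4 cycles.
        sequential_access = True
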
@@ -48,7 +48,8 @@ from m5.objects import *
 
 class L1Cache(Cache):
     assoc = 2
-    hit_latency = 2
+    tag_latency = 2
+    data_latency = 2
     response_latency = 2
     mshrs = 4
     tgts_per_mshr = 20
@@ -63,7 +64,8 @@ class L1_DCache(L1Cache):
 
 class L2Cache(Cache):
     assoc = 8
-    hit_latency = 20
+    tag_latency = 20
+    data_latency = 20
     response_latency = 20
     mshrs = 20
     tgts_per_mshr = 12
@@ -71,7 +73,8 @@ class L2Cache(Cache):
 
 class IOCache(Cache):
     assoc = 8
-    hit_latency = 50
+    tag_latency = 50
+    data_latency = 50
     response_latency = 50
     mshrs = 20
     size = '1kB'
@@ -79,7 +82,8 @@ class IOCache(Cache):
 
 class PageTableWalkerCache(Cache):
     assoc = 2
-    hit_latency = 2
+    tag_latency = 2
+    data_latency = 2
     response_latency = 2
     mshrs = 10
     size = '1kB'

@@ -147,7 +147,8 @@ class O3_ARM_v7a_3(DerivO3CPU):
 
 # Instruction Cache
 class O3_ARM_v7a_ICache(Cache):
-    hit_latency = 1
+    tag_latency = 1
+    data_latency = 1
     response_latency = 1
     mshrs = 2
     tgts_per_mshr = 8
@@ -159,7 +160,8 @@ class O3_ARM_v7a_ICache(Cache):
 
 # Data Cache
 class O3_ARM_v7a_DCache(Cache):
-    hit_latency = 2
+    tag_latency = 2
+    data_latency = 2
     response_latency = 2
     mshrs = 6
     tgts_per_mshr = 8
@@ -172,7 +174,8 @@ class O3_ARM_v7a_DCache(Cache):
 # TLB Cache
 # Use a cache as a L2 TLB
 class O3_ARM_v7aWalkCache(Cache):
-    hit_latency = 4
+    tag_latency = 4
+    data_latency = 4
     response_latency = 4
     mshrs = 6
     tgts_per_mshr = 8
@@ -185,7 +188,8 @@ class O3_ARM_v7aWalkCache(Cache):
 
 # L2 Cache
 class O3_ARM_v7aL2(Cache):
-    hit_latency = 12
+    tag_latency = 12
+    data_latency = 12
     response_latency = 12
     mshrs = 16
     tgts_per_mshr = 8

@@ -45,7 +45,8 @@ from common.Caches import *
 from common import CpuConfig
 
 class L1I(L1_ICache):
-    hit_latency = 1
+    tag_latency = 1
+    data_latency = 1
     response_latency = 1
     mshrs = 4
     tgts_per_mshr = 8
@@ -54,7 +55,8 @@ class L1I(L1_ICache):
 
 
 class L1D(L1_DCache):
-    hit_latency = 2
+    tag_latency = 2
+    data_latency = 2
     response_latency = 1
     mshrs = 16
     tgts_per_mshr = 16
@@ -64,7 +66,8 @@ class L1D(L1_DCache):
 
 
 class WalkCache(PageTableWalkerCache):
-    hit_latency = 4
+    tag_latency = 4
+    data_latency = 4
     response_latency = 4
     mshrs = 6
     tgts_per_mshr = 8
@@ -74,7 +77,8 @@ class WalkCache(PageTableWalkerCache):
 
 
 class L2(L2Cache):
-    hit_latency = 12
+    tag_latency = 12
+    data_latency = 12
     response_latency = 5
     mshrs = 32
     tgts_per_mshr = 8
@@ -87,7 +91,8 @@ class L2(L2Cache):
 class L3(Cache):
     size = '16MB'
     assoc = 16
-    hit_latency = 20
+    tag_latency = 20
+    data_latency = 20
     response_latency = 20
     mshrs = 20
     tgts_per_mshr = 12

@@ -153,7 +153,7 @@ for t, m in zip(testerspec, multiplier):
 
 # Define a prototype L1 cache that we scale for all successive levels
 proto_l1 = Cache(size = '32kB', assoc = 4,
-                 hit_latency = 1, response_latency = 1,
+                 tag_latency = 1, data_latency = 1, response_latency = 1,
                  tgts_per_mshr = 8)
 
 if options.blocking:
@@ -175,7 +175,8 @@ for scale in cachespec[:-1]:
     prev = cache_proto[0]
     next = prev()
     next.size = prev.size * scale
-    next.hit_latency = prev.hit_latency * 10
+    next.tag_latency = prev.tag_latency * 10
+    next.data_latency = prev.data_latency * 10
     next.response_latency = prev.response_latency * 10
     next.assoc = prev.assoc * scale
     next.mshrs = prev.mshrs * scale

@@ -176,7 +176,7 @@ else:
 
 # Define a prototype L1 cache that we scale for all successive levels
 proto_l1 = Cache(size = '32kB', assoc = 4,
-                 hit_latency = 1, response_latency = 1,
+                 tag_latency = 1, data_latency = 1, response_latency = 1,
                  tgts_per_mshr = 8, clusivity = 'mostly_incl',
                  writeback_clean = True)
 
@@ -194,7 +194,8 @@ for scale in cachespec[:-1]:
     prev = cache_proto[0]
     next = prev()
     next.size = prev.size * scale
-    next.hit_latency = prev.hit_latency * 10
+    next.tag_latency = prev.tag_latency * 10
+    next.data_latency = prev.data_latency * 10
     next.response_latency = prev.response_latency * 10
     next.assoc = prev.assoc * scale
     next.mshrs = prev.mshrs * scale

@@ -45,7 +45,8 @@ class L1Cache(Cache):
     """Simple L1 Cache with default values"""
 
     assoc = 2
-    hit_latency = 2
+    tag_latency = 2
+    data_latency = 2
     response_latency = 2
     mshrs = 4
     tgts_per_mshr = 20
@@ -107,7 +108,8 @@ class L2Cache(Cache):
     # Default parameters
     size = '256kB'
     assoc = 8
-    hit_latency = 20
+    tag_latency = 20
+    data_latency = 20
     response_latency = 20
     mshrs = 20
     tgts_per_mshr = 12

src/mem/cache/Cache.py
@@ -53,7 +53,8 @@ class BaseCache(MemObject):
     size = Param.MemorySize("Capacity")
     assoc = Param.Unsigned("Associativity")
 
-    hit_latency = Param.Cycles("Hit latency")
+    tag_latency = Param.Cycles("Tag lookup latency")
+    data_latency = Param.Cycles("Data access latency")
     response_latency = Param.Cycles("Latency for the return path on a miss");
 
     max_miss_count = Param.Counter(0,

src/mem/cache/base.cc
@@ -72,9 +72,10 @@ BaseCache::BaseCache(const BaseCacheParams *p, unsigned blk_size)
       mshrQueue("MSHRs", p->mshrs, 0, p->demand_mshr_reserve), // see below
       writeBuffer("write buffer", p->write_buffers, p->mshrs), // see below
       blkSize(blk_size),
-      lookupLatency(p->hit_latency),
-      forwardLatency(p->hit_latency),
-      fillLatency(p->response_latency),
+      lookupLatency(p->tag_latency),
+      dataLatency(p->data_latency),
+      forwardLatency(p->tag_latency),
+      fillLatency(p->data_latency),
       responseLatency(p->response_latency),
       numTarget(p->tgts_per_mshr),
       forwardSnoops(true),

src/mem/cache/base.hh
@@ -264,6 +264,12 @@ class BaseCache : public MemObject
      */
     const Cycles lookupLatency;
 
+    /**
+     * The latency of data access of a cache. It occurs when there is
+     * an access to the cache.
+     */
+    const Cycles dataLatency;
+
     /**
      * This is the forward latency of the cache. It occurs when there
      * is a cache miss and a request is forwarded downstream, in

src/mem/cache/tags/Tags.py
@@ -49,17 +49,22 @@ class BaseTags(ClockedObject):
     # Get the block size from the parent (system)
     block_size = Param.Int(Parent.cache_line_size, "block size in bytes")
 
-    # Get the hit latency from the parent (cache)
-    hit_latency = Param.Cycles(Parent.hit_latency,
-                               "The hit latency for this cache")
+    # Get the tag lookup latency from the parent (cache)
+    tag_latency = Param.Cycles(Parent.tag_latency,
+                               "The tag lookup latency for this cache")
+
+    # Get the RAM access latency from the parent (cache)
+    data_latency = Param.Cycles(Parent.data_latency,
+                                "The data access latency for this cache")
+
+    sequential_access = Param.Bool(Parent.sequential_access,
+                                   "Whether to access tags and data sequentially")
 
 class BaseSetAssoc(BaseTags):
     type = 'BaseSetAssoc'
     abstract = True
     cxx_header = "mem/cache/tags/base_set_assoc.hh"
     assoc = Param.Int(Parent.assoc, "associativity")
-    sequential_access = Param.Bool(Parent.sequential_access,
-                                   "Whether to access tags and data sequentially")
 
 class LRU(BaseSetAssoc):
     type = 'LRU'

src/mem/cache/tags/base.cc
@@ -56,7 +56,11 @@ using namespace std;
 
 BaseTags::BaseTags(const Params *p)
     : ClockedObject(p), blkSize(p->block_size), size(p->size),
-      accessLatency(p->hit_latency), cache(nullptr), warmupBound(0),
+      lookupLatency(p->tag_latency),
+      accessLatency(p->sequential_access ?
+                    p->tag_latency + p->data_latency :
+                    std::max(p->tag_latency, p->data_latency)),
+      cache(nullptr), warmupBound(0),
       warmedUp(false), numBlocks(0)
 {
 }

src/mem/cache/tags/base.hh
@@ -69,7 +69,13 @@ class BaseTags : public ClockedObject
     const unsigned blkSize;
     /** The size of the cache. */
     const unsigned size;
-    /** The access latency of the cache. */
+    /** The tag lookup latency of the cache. */
+    const Cycles lookupLatency;
+    /**
+     * The total access latency of the cache. This latency
+     * is different depending on the cache access mode
+     * (parallel or sequential)
+     */
     const Cycles accessLatency;
     /** Pointer to the parent cache. */
     BaseCache *cache;

src/mem/cache/tags/base_set_assoc.hh
@@ -208,7 +208,6 @@ public:
         Addr tag = extractTag(addr);
         int set = extractSet(addr);
         BlkType *blk = sets[set].findBlk(tag, is_secure);
-        lat = accessLatency;;
 
         // Access all tags in parallel, hence one in each way. The data side
        // either accesses all blocks in parallel, or one block sequentially on
@@ -223,12 +222,20 @@ public:
         }
 
         if (blk != nullptr) {
-            if (blk->whenReady > curTick()
-                && cache->ticksToCycles(blk->whenReady - curTick())
-                > accessLatency) {
-                lat = cache->ticksToCycles(blk->whenReady - curTick());
+            // If a cache hit
+            lat = accessLatency;
+            // Check if the block to be accessed is available. If not,
+            // apply the accessLatency on top of block->whenReady.
+            if (blk->whenReady > curTick() &&
+                cache->ticksToCycles(blk->whenReady - curTick()) >
+                accessLatency) {
+                lat = cache->ticksToCycles(blk->whenReady - curTick()) +
+                    accessLatency;
             }
             blk->refCount += 1;
+        } else {
+            // If a cache miss
+            lat = lookupLatency;
         }
 
         return blk;

src/mem/cache/tags/fa_lru.cc
@@ -186,6 +186,16 @@ FALRU::accessBlock(Addr addr, bool is_secure, Cycles &lat, int context_src,
     FALRUBlk* blk = hashLookup(blkAddr);
 
     if (blk && blk->isValid()) {
+        // If a cache hit
+        lat = accessLatency;
+        // Check if the block to be accessed is available. If not,
+        // apply the accessLatency on top of block->whenReady.
+        if (blk->whenReady > curTick() &&
+            cache->ticksToCycles(blk->whenReady - curTick()) >
+            accessLatency) {
+            lat = cache->ticksToCycles(blk->whenReady - curTick()) +
+                accessLatency;
+        }
         assert(blk->tag == blkAddr);
         tmp_in_cache = blk->inCache;
         for (unsigned i = 0; i < numCaches; i++) {
@@ -200,6 +210,8 @@ FALRU::accessBlock(Addr addr, bool is_secure, Cycles &lat, int context_src,
             moveToHead(blk);
         }
     } else {
+        // If a cache miss
+        lat = lookupLatency;
         blk = nullptr;
         for (unsigned i = 0; i <= numCaches; ++i) {
             misses[i]++;
@@ -209,7 +221,6 @@ FALRU::accessBlock(Addr addr, bool is_secure, Cycles &lat, int context_src,
         *inCache = tmp_in_cache;
     }
 
-    lat = accessLatency;
     //assert(check());
     return blk;
 }

src/mem/cache/tags/fa_lru.hh
@@ -51,6 +51,7 @@
 #include <list>
 #include <unordered_map>
 
+#include "mem/cache/base.hh"
 #include "mem/cache/blk.hh"
 #include "mem/cache/tags/base.hh"
 #include "mem/packet.hh"