Mem: Fix issue with a dirty block being lost when the entire block is transferred to a non-cache.

This change fixes the problem for all the cases we actively use. If you want to try more creative I/O device attachments (e.g. sharing an L2), this won't work: you would need another level of caching between the I/O device and that cache (which you actually need anyway with our current code to make sure writes propagate). The extra level is required so that you can mark the cache in between as top level, which keeps it from trying to send ownership of a block to the I/O device. Asserts have been added that should catch any issues.
2011-03-17 19:20:19 -05:00 · 2011-03-17 19:20:19 -05:00 · a432d8e085
commit a432d8e085
parent 2f40b3b8ae
24 changed files with 54 additions and 4 deletions
--- a/configs/common/Caches.py
+++ b/configs/common/Caches.py
@ -34,6 +34,7 @@ class L1Cache(BaseCache):
    latency = '1ns'
    mshrs = 10
    tgts_per_mshr = 5
+    is_top_level = True

 class L2Cache(BaseCache):
    assoc = 8
@ -49,6 +50,7 @@ class PageTableWalkerCache(BaseCache):
    mshrs = 10
    size = '1kB'
    tgts_per_mshr = 12
+    is_top_level = True

 class IOCache(BaseCache):
    assoc = 8
@ -58,3 +60,4 @@ class IOCache(BaseCache):
    size = '1kB'
    tgts_per_mshr = 12
    forward_snoops = False
+    is_top_level = True
--- a/src/cpu/o3/fetch_impl.hh
+++ b/src/cpu/o3/fetch_impl.hh
@ -112,6 +112,9 @@ DefaultFetch<Impl>::IcachePort::recvTiming(PacketPtr pkt)
 {
    DPRINTF(Fetch, "Received timing\n");
    if (pkt->isResponse()) {
+        // We shouldn't ever get a block in ownership state
+        assert(!(pkt->memInhibitAsserted() && !pkt->sharedAsserted()));
+
        fetch->processCacheCompletion(pkt);
    }
    //else Snooped a coherence request, just return
--- a/src/dev/io_device.cc
+++ b/src/dev/io_device.cc
@ -139,6 +139,9 @@ DmaPort::recvTiming(PacketPtr pkt)
        assert(pendingCount >= 0);
        assert(state);

+        // We shouldn't ever get a block in ownership state
+        assert(!(pkt->memInhibitAsserted() && !pkt->sharedAsserted()));
+
        state->numBytes += pkt->req->getSize();
        assert(state->totBytes >= state->numBytes);
        if (state->totBytes == state->numBytes) {
--- a/src/mem/cache/BaseCache.py
+++ b/src/mem/cache/BaseCache.py
@ -48,6 +48,7 @@ class BaseCache(MemObject):
    size = Param.MemorySize("capacity in bytes")
    forward_snoops = Param.Bool(True,
        "forward snoops from mem side to cpu side")
+    is_top_level = Param.Bool(False, "Is this cache at the top level (e.g. L1)")
    subblock_size = Param.Int(0,
        "Size of subblock in IIC used for compression")
    tgts_per_mshr = Param.Int("max number of accesses per MSHR")
--- a/src/mem/cache/base.cc
+++ b/src/mem/cache/base.cc
@ -58,6 +58,7 @@ BaseCache::BaseCache(const Params *p)
      hitLatency(p->latency),
      numTarget(p->tgts_per_mshr),
      forwardSnoops(p->forward_snoops),
+      isTopLevel(p->is_top_level),
      blocked(0),
      noTargetMSHR(NULL),
      missCount(p->max_miss_count),
--- a/src/mem/cache/base.hh
+++ b/src/mem/cache/base.hh
@ -194,6 +194,11 @@ class BaseCache : public MemObject
    /** Do we forward snoops from mem side port through to cpu side port? */
    bool forwardSnoops;

+    /** Is this cache a toplevel cache (e.g. L1, I/O cache). If so we should
+     * never try to forward ownership and similar optimizations to the cpu
+     * side */
+    bool isTopLevel;
+
    /**
     * Bit vector of the blocking reasons for the access path.
     * @sa #BlockedCause
--- a/src/mem/cache/cache_impl.hh
+++ b/src/mem/cache/cache_impl.hh
@ -216,7 +216,7 @@ Cache<TagStore>::satisfyCpuSideRequest(PacketPtr pkt, BlkType *blk,
                
                if (blk->isDirty()) {
                    // special considerations if we're owner:
-                    if (!deferred_response) {
+                    if (!deferred_response && !isTopLevel) {
                        // if we are responding immediately and can
                        // signal that we're transferring ownership
                        // along with exclusivity, do so
--- a/tests/configs/inorder-timing.py
+++ b/tests/configs/inorder-timing.py
@ -37,8 +37,12 @@ class MyCache(BaseCache):
    mshrs = 10
    tgts_per_mshr = 5

+class MyL1Cache(MyCache):
+    is_top_level = True
+
 cpu = InOrderCPU(cpu_id=0)
-cpu.addTwoLevelCacheHierarchy(MyCache(size = '128kB'), MyCache(size = '256kB'),
+cpu.addTwoLevelCacheHierarchy(MyL1Cache(size = '128kB'),
+                              MyL1Cache(size = '256kB'),
                              MyCache(size = '2MB', latency='10ns'))

 cpu.clock = '2GHz'
--- a/tests/configs/memtest.py
+++ b/tests/configs/memtest.py
@ -38,6 +38,7 @@ class L1(BaseCache):
    block_size = 64
    mshrs = 12
    tgts_per_mshr = 8
+    is_top_level = True

 # ----------------------
 # Base L2 Cache
--- a/tests/configs/o3-timing-mp.py
+++ b/tests/configs/o3-timing-mp.py
@ -39,6 +39,7 @@ class L1(BaseCache):
    block_size = 64
    mshrs = 4
    tgts_per_mshr = 8
+    is_top_level = True

 # ----------------------
 # Base L2 Cache
--- a/tests/configs/o3-timing.py
+++ b/tests/configs/o3-timing.py
@ -37,8 +37,12 @@ class MyCache(BaseCache):
    mshrs = 10
    tgts_per_mshr = 5

+class MyL1Cache(MyCache):
+    is_top_level = True
+
 cpu = DerivO3CPU(cpu_id=0)
-cpu.addTwoLevelCacheHierarchy(MyCache(size = '128kB'), MyCache(size = '256kB'),
+cpu.addTwoLevelCacheHierarchy(MyL1Cache(size = '128kB'),
+                              MyL1Cache(size = '256kB'),
                              MyCache(size = '2MB'))
 cpu.clock = '2GHz'

--- a/tests/configs/pc-simple-atomic.py
+++ b/tests/configs/pc-simple-atomic.py
@ -43,6 +43,7 @@ class L1(BaseCache):
    block_size = 64
    mshrs = 4
    tgts_per_mshr = 8
+    is_top_level = True

 # ----------------------
 # Base L2 Cache
@ -65,6 +66,7 @@ class PageTableWalkerCache(BaseCache):
    mshrs = 10
    size = '1kB'
    tgts_per_mshr = 12
+    is_top_level = True

 # ---------------------
 # I/O Cache
@ -78,6 +80,7 @@ class IOCache(BaseCache):
    tgts_per_mshr = 12
    addr_range = AddrRange(0, size=mem_size)
    forward_snoops = False
+    is_top_level = True

 #cpu
 cpu = AtomicSimpleCPU(cpu_id=0)
--- a/tests/configs/pc-simple-timing.py
+++ b/tests/configs/pc-simple-timing.py
@ -44,6 +44,7 @@ class L1(BaseCache):
    block_size = 64
    mshrs = 4
    tgts_per_mshr = 8
+    is_top_level = True

 # ----------------------
 # Base L2 Cache
--- a/tests/configs/realview-simple-atomic.py
+++ b/tests/configs/realview-simple-atomic.py
@ -40,6 +40,7 @@ class L1(BaseCache):
    block_size = 64
    mshrs = 4
    tgts_per_mshr = 8
+    is_top_level = True

 # ----------------------
 # Base L2 Cache
--- a/tests/configs/realview-simple-timing.py
+++ b/tests/configs/realview-simple-timing.py
@ -41,6 +41,7 @@ class L1(BaseCache):
    block_size = 64
    mshrs = 4
    tgts_per_mshr = 8
+    is_top_level = True

 # ----------------------
 # Base L2 Cache
--- a/tests/configs/simple-atomic-mp.py
+++ b/tests/configs/simple-atomic-mp.py
@ -38,6 +38,7 @@ class L1(BaseCache):
    block_size = 64
    mshrs = 4
    tgts_per_mshr = 8
+    is_top_level = True

 # ----------------------
 # Base L2 Cache
--- a/tests/configs/simple-timing-mp.py
+++ b/tests/configs/simple-timing-mp.py
@ -38,6 +38,7 @@ class L1(BaseCache):
    block_size = 64
    mshrs = 4
    tgts_per_mshr = 8
+    is_top_level = True

 # ----------------------
 # Base L2 Cache
--- a/tests/configs/simple-timing.py
+++ b/tests/configs/simple-timing.py
@ -36,8 +36,12 @@ class MyCache(BaseCache):
    mshrs = 10
    tgts_per_mshr = 5

+class MyL1Cache(MyCache):
+    is_top_level = True
+
 cpu = TimingSimpleCPU(cpu_id=0)
-cpu.addTwoLevelCacheHierarchy(MyCache(size = '128kB'), MyCache(size = '256kB'),
+cpu.addTwoLevelCacheHierarchy(MyL1Cache(size = '128kB'),
+                              MyL1Cache(size = '256kB'),
                              MyCache(size = '2MB', latency='10ns'))
 system = System(cpu = cpu,
                physmem = PhysicalMemory(),
--- a/tests/configs/tsunami-o3-dual.py
+++ b/tests/configs/tsunami-o3-dual.py
@ -41,6 +41,7 @@ class L1(BaseCache):
    block_size = 64
    mshrs = 4
    tgts_per_mshr = 8
+    is_top_level = True

 # ----------------------
 # Base L2 Cache
@ -65,6 +66,7 @@ class IOCache(BaseCache):
    tgts_per_mshr = 12
    addr_range=AddrRange(0, size='8GB')
    forward_snoops = False
+    is_top_level = True

 #cpu
 cpus = [ DerivO3CPU(cpu_id=i) for i in xrange(2) ]
--- a/tests/configs/tsunami-o3.py
+++ b/tests/configs/tsunami-o3.py
@ -41,6 +41,7 @@ class L1(BaseCache):
    block_size = 64
    mshrs = 4
    tgts_per_mshr = 8
+    is_top_level = True

 # ----------------------
 # Base L2 Cache
@ -65,6 +66,7 @@ class IOCache(BaseCache):
    tgts_per_mshr = 12
    addr_range=AddrRange(0, size='8GB')
    forward_snoops = False
+    is_top_level = True

 #cpu
 cpu = DerivO3CPU(cpu_id=0)
--- a/tests/configs/tsunami-simple-atomic-dual.py
+++ b/tests/configs/tsunami-simple-atomic-dual.py
@ -40,6 +40,7 @@ class L1(BaseCache):
    block_size = 64
    mshrs = 4
    tgts_per_mshr = 8
+    is_top_level = True

 # ----------------------
 # Base L2 Cache
@ -64,6 +65,7 @@ class IOCache(BaseCache):
    tgts_per_mshr = 12
    addr_range=AddrRange(0, size='8GB')
    forward_snoops = False
+    is_top_level = True

 #cpu
 cpus = [ AtomicSimpleCPU(cpu_id=i) for i in xrange(2) ]
--- a/tests/configs/tsunami-simple-atomic.py
+++ b/tests/configs/tsunami-simple-atomic.py
@ -40,6 +40,7 @@ class L1(BaseCache):
    block_size = 64
    mshrs = 4
    tgts_per_mshr = 8
+    is_top_level = True

 # ----------------------
 # Base L2 Cache
@ -64,6 +65,7 @@ class IOCache(BaseCache):
    tgts_per_mshr = 12
    addr_range=AddrRange(0, size='8GB')
    forward_snoops = False
+    is_top_level = True

 #cpu
 cpu = AtomicSimpleCPU(cpu_id=0)
--- a/tests/configs/tsunami-simple-timing-dual.py
+++ b/tests/configs/tsunami-simple-timing-dual.py
@ -40,6 +40,7 @@ class L1(BaseCache):
    block_size = 64
    mshrs = 4
    tgts_per_mshr = 8
+    is_top_level = True

 # ----------------------
 # Base L2 Cache
@ -64,6 +65,7 @@ class IOCache(BaseCache):
    tgts_per_mshr = 12
    addr_range=AddrRange(0, size='8GB')
    forward_snoops = False
+    is_top_level = True

 #cpu
 cpus = [ TimingSimpleCPU(cpu_id=i) for i in xrange(2) ]
--- a/tests/configs/tsunami-simple-timing.py
+++ b/tests/configs/tsunami-simple-timing.py
@ -41,6 +41,7 @@ class L1(BaseCache):
    block_size = 64
    mshrs = 4
    tgts_per_mshr = 8
+    is_top_level = True

 # ----------------------
 # Base L2 Cache
@ -65,6 +66,7 @@ class IOCache(BaseCache):
    tgts_per_mshr = 12
    addr_range=AddrRange(0, size='8GB')
    forward_snoops = False
+    is_top_level = True

 #cpu
 cpu = TimingSimpleCPU(cpu_id=0)