From a432d8e0851de8d090676697e29ca6ed4be64fb7 Mon Sep 17 00:00:00 2001
From: Ali Saidi <Ali.Saidi@ARM.com>
Date: Thu, 17 Mar 2011 19:20:19 -0500
Subject: [PATCH] Mem: Fix issue with dirty block being lost when entire block
 transferred to non-cache.

This change fixes the problem for all the cases we actively use. If you want to try
more creative I/O device attachments (e.g. sharing an L2), this won't work. You
would need another level of caching between the I/O device and the cache
(which you actually need anyway with our current code to make sure writes
propagate). This is required so that you can mark the cache in between as
top level and it won't try to send ownership of a block to the I/O device.
Asserts have been added that should catch any issues.
---
 configs/common/Caches.py                    | 3 +++
 src/cpu/o3/fetch_impl.hh                    | 3 +++
 src/dev/io_device.cc                        | 3 +++
 src/mem/cache/BaseCache.py                  | 1 +
 src/mem/cache/base.cc                       | 1 +
 src/mem/cache/base.hh                       | 5 +++++
 src/mem/cache/cache_impl.hh                 | 2 +-
 tests/configs/inorder-timing.py             | 6 +++++-
 tests/configs/memtest.py                    | 1 +
 tests/configs/o3-timing-mp.py               | 1 +
 tests/configs/o3-timing.py                  | 6 +++++-
 tests/configs/pc-simple-atomic.py           | 3 +++
 tests/configs/pc-simple-timing.py           | 1 +
 tests/configs/realview-simple-atomic.py     | 1 +
 tests/configs/realview-simple-timing.py     | 1 +
 tests/configs/simple-atomic-mp.py           | 1 +
 tests/configs/simple-timing-mp.py           | 1 +
 tests/configs/simple-timing.py              | 6 +++++-
 tests/configs/tsunami-o3-dual.py            | 2 ++
 tests/configs/tsunami-o3.py                 | 2 ++
 tests/configs/tsunami-simple-atomic-dual.py | 2 ++
 tests/configs/tsunami-simple-atomic.py      | 2 ++
 tests/configs/tsunami-simple-timing-dual.py | 2 ++
 tests/configs/tsunami-simple-timing.py      | 2 ++
 24 files changed, 54 insertions(+), 4 deletions(-)
diff --git a/configs/common/Caches.py b/configs/common/Caches.py
index 3adc7e5c9..ffcd63c49 100644
--- a/configs/common/Caches.py
+++ b/configs/common/Caches.py
@@ -34,6 +34,7 @@ class L1Cache(BaseCache):
     latency = '1ns'
     mshrs = 10
     tgts_per_mshr = 5
+    is_top_level = True
 
 class L2Cache(BaseCache):
     assoc = 8
@@ -49,6 +50,7 @@ class PageTableWalkerCache(BaseCache):
     mshrs = 10
     size = '1kB'
     tgts_per_mshr = 12
+    is_top_level = True
 
 class IOCache(BaseCache):
     assoc = 8
@@ -58,3 +60,4 @@ class IOCache(BaseCache):
     size = '1kB'
     tgts_per_mshr = 12
     forward_snoops = False
+    is_top_level = True
diff --git a/src/cpu/o3/fetch_impl.hh b/src/cpu/o3/fetch_impl.hh
index a2f2b4f8a..3092bd937 100644
--- a/src/cpu/o3/fetch_impl.hh
+++ b/src/cpu/o3/fetch_impl.hh
@@ -112,6 +112,9 @@ DefaultFetch<Impl>::IcachePort::recvTiming(PacketPtr pkt)
 {
     DPRINTF(Fetch, "Received timing\n");
     if (pkt->isResponse()) {
+        // We shouldn't ever get a block in ownership state
+        assert(!(pkt->memInhibitAsserted() && !pkt->sharedAsserted()));
+
         fetch->processCacheCompletion(pkt);
     }
     //else Snooped a coherence request, just return
diff --git a/src/dev/io_device.cc b/src/dev/io_device.cc
index be97bc4ad..ffe8fdf06 100644
--- a/src/dev/io_device.cc
+++ b/src/dev/io_device.cc
@@ -139,6 +139,9 @@ DmaPort::recvTiming(PacketPtr pkt)
         assert(pendingCount >= 0);
         assert(state);
 
+        // We shouldn't ever get a block in ownership state
+        assert(!(pkt->memInhibitAsserted() && !pkt->sharedAsserted()));
+
         state->numBytes += pkt->req->getSize();
         assert(state->totBytes >= state->numBytes);
         if (state->totBytes == state->numBytes) {
diff --git a/src/mem/cache/BaseCache.py b/src/mem/cache/BaseCache.py
index dffac2234..5c7ae5274 100644
--- a/src/mem/cache/BaseCache.py
+++ b/src/mem/cache/BaseCache.py
@@ -48,6 +48,7 @@ class BaseCache(MemObject):
     size = Param.MemorySize("capacity in bytes")
     forward_snoops = Param.Bool(True,
         "forward snoops from mem side to cpu side")
+    is_top_level = Param.Bool(False, "Is this cache at the top level (e.g. L1)")
     subblock_size = Param.Int(0,
         "Size of subblock in IIC used for compression")
     tgts_per_mshr = Param.Int("max number of accesses per MSHR")
diff --git a/src/mem/cache/base.cc b/src/mem/cache/base.cc
index 9166e1a09..b7e331d54 100644
--- a/src/mem/cache/base.cc
+++ b/src/mem/cache/base.cc
@@ -58,6 +58,7 @@ BaseCache::BaseCache(const Params *p)
       hitLatency(p->latency),
       numTarget(p->tgts_per_mshr),
       forwardSnoops(p->forward_snoops),
+      isTopLevel(p->is_top_level),
       blocked(0),
       noTargetMSHR(NULL),
       missCount(p->max_miss_count),
diff --git a/src/mem/cache/base.hh b/src/mem/cache/base.hh
index e8a644296..28ddf5054 100644
--- a/src/mem/cache/base.hh
+++ b/src/mem/cache/base.hh
@@ -194,6 +194,11 @@ class BaseCache : public MemObject
     /** Do we forward snoops from mem side port through to cpu side port? */
     bool forwardSnoops;
 
+    /** Is this cache a toplevel cache (e.g. L1, I/O cache). If so we should
+     * never try to forward ownership and similar optimizations to the cpu
+     * side */
+    bool isTopLevel;
+
     /**
      * Bit vector of the blocking reasons for the access path.
      * @sa #BlockedCause
diff --git a/src/mem/cache/cache_impl.hh b/src/mem/cache/cache_impl.hh
index e4e4a3c92..0b2b273f9 100644
--- a/src/mem/cache/cache_impl.hh
+++ b/src/mem/cache/cache_impl.hh
@@ -216,7 +216,7 @@ Cache<TagStore>::satisfyCpuSideRequest(PacketPtr pkt, BlkType *blk,
                 
                 if (blk->isDirty()) {
                     // special considerations if we're owner:
-                    if (!deferred_response) {
+                    if (!deferred_response && !isTopLevel) {
                         // if we are responding immediately and can
                         // signal that we're transferring ownership
                         // along with exclusivity, do so
diff --git a/tests/configs/inorder-timing.py b/tests/configs/inorder-timing.py
index af58cafa5..ddf37b5ec 100644
--- a/tests/configs/inorder-timing.py
+++ b/tests/configs/inorder-timing.py
@@ -37,8 +37,12 @@ class MyCache(BaseCache):
     mshrs = 10
     tgts_per_mshr = 5
 
+class MyL1Cache(MyCache):
+    is_top_level = True
+
 cpu = InOrderCPU(cpu_id=0)
-cpu.addTwoLevelCacheHierarchy(MyCache(size = '128kB'), MyCache(size = '256kB'),
+cpu.addTwoLevelCacheHierarchy(MyL1Cache(size = '128kB'),
+                              MyL1Cache(size = '256kB'),
                               MyCache(size = '2MB', latency='10ns'))
 
 cpu.clock = '2GHz'
diff --git a/tests/configs/memtest.py b/tests/configs/memtest.py
index d75bd3d8c..f62381473 100644
--- a/tests/configs/memtest.py
+++ b/tests/configs/memtest.py
@@ -38,6 +38,7 @@ class L1(BaseCache):
     block_size = 64
     mshrs = 12
     tgts_per_mshr = 8
+    is_top_level = True
 
 # ----------------------
 # Base L2 Cache
diff --git a/tests/configs/o3-timing-mp.py b/tests/configs/o3-timing-mp.py
index 5c770cdbc..35811282c 100644
--- a/tests/configs/o3-timing-mp.py
+++ b/tests/configs/o3-timing-mp.py
@@ -39,6 +39,7 @@ class L1(BaseCache):
     block_size = 64
     mshrs = 4
     tgts_per_mshr = 8
+    is_top_level = True
 
 # ----------------------
 # Base L2 Cache
diff --git a/tests/configs/o3-timing.py b/tests/configs/o3-timing.py
index a4c054122..d4a69d94a 100644
--- a/tests/configs/o3-timing.py
+++ b/tests/configs/o3-timing.py
@@ -37,8 +37,12 @@ class MyCache(BaseCache):
     mshrs = 10
     tgts_per_mshr = 5
 
+class MyL1Cache(MyCache):
+    is_top_level = True
+
 cpu = DerivO3CPU(cpu_id=0)
-cpu.addTwoLevelCacheHierarchy(MyCache(size = '128kB'), MyCache(size = '256kB'),
+cpu.addTwoLevelCacheHierarchy(MyL1Cache(size = '128kB'),
+                              MyL1Cache(size = '256kB'),
                               MyCache(size = '2MB'))
 cpu.clock = '2GHz'
 
diff --git a/tests/configs/pc-simple-atomic.py b/tests/configs/pc-simple-atomic.py
index 382899eb5..1c35ff2d9 100644
--- a/tests/configs/pc-simple-atomic.py
+++ b/tests/configs/pc-simple-atomic.py
@@ -43,6 +43,7 @@ class L1(BaseCache):
     block_size = 64
     mshrs = 4
     tgts_per_mshr = 8
+    is_top_level = True
 
 # ----------------------
 # Base L2 Cache
@@ -65,6 +66,7 @@ class PageTableWalkerCache(BaseCache):
     mshrs = 10
     size = '1kB'
     tgts_per_mshr = 12
+    is_top_level = True
 
 # ---------------------
 # I/O Cache
@@ -78,6 +80,7 @@ class IOCache(BaseCache):
     tgts_per_mshr = 12
     addr_range = AddrRange(0, size=mem_size)
     forward_snoops = False
+    is_top_level = True
 
 #cpu
 cpu = AtomicSimpleCPU(cpu_id=0)
diff --git a/tests/configs/pc-simple-timing.py b/tests/configs/pc-simple-timing.py
index 7452e2542..9c9f4aeca 100644
--- a/tests/configs/pc-simple-timing.py
+++ b/tests/configs/pc-simple-timing.py
@@ -44,6 +44,7 @@ class L1(BaseCache):
     block_size = 64
     mshrs = 4
     tgts_per_mshr = 8
+    is_top_level = True
 
 # ----------------------
 # Base L2 Cache
diff --git a/tests/configs/realview-simple-atomic.py b/tests/configs/realview-simple-atomic.py
index ab6d612d4..7340be7a4 100644
--- a/tests/configs/realview-simple-atomic.py
+++ b/tests/configs/realview-simple-atomic.py
@@ -40,6 +40,7 @@ class L1(BaseCache):
     block_size = 64
     mshrs = 4
     tgts_per_mshr = 8
+    is_top_level = True
 
 # ----------------------
 # Base L2 Cache
diff --git a/tests/configs/realview-simple-timing.py b/tests/configs/realview-simple-timing.py
index 53b6ab2b2..83b643c52 100644
--- a/tests/configs/realview-simple-timing.py
+++ b/tests/configs/realview-simple-timing.py
@@ -41,6 +41,7 @@ class L1(BaseCache):
     block_size = 64
     mshrs = 4
     tgts_per_mshr = 8
+    is_top_level = True
 
 # ----------------------
 # Base L2 Cache
diff --git a/tests/configs/simple-atomic-mp.py b/tests/configs/simple-atomic-mp.py
index d88a9b395..4db741b8a 100644
--- a/tests/configs/simple-atomic-mp.py
+++ b/tests/configs/simple-atomic-mp.py
@@ -38,6 +38,7 @@ class L1(BaseCache):
     block_size = 64
     mshrs = 4
     tgts_per_mshr = 8
+    is_top_level = True
 
 # ----------------------
 # Base L2 Cache
diff --git a/tests/configs/simple-timing-mp.py b/tests/configs/simple-timing-mp.py
index f5793b282..6f4090ec2 100644
--- a/tests/configs/simple-timing-mp.py
+++ b/tests/configs/simple-timing-mp.py
@@ -38,6 +38,7 @@ class L1(BaseCache):
     block_size = 64
     mshrs = 4
     tgts_per_mshr = 8
+    is_top_level = True
 
 # ----------------------
 # Base L2 Cache
diff --git a/tests/configs/simple-timing.py b/tests/configs/simple-timing.py
index 739e11e55..bc9d016c5 100644
--- a/tests/configs/simple-timing.py
+++ b/tests/configs/simple-timing.py
@@ -36,8 +36,12 @@ class MyCache(BaseCache):
     mshrs = 10
     tgts_per_mshr = 5
 
+class MyL1Cache(MyCache):
+    is_top_level = True
+
 cpu = TimingSimpleCPU(cpu_id=0)
-cpu.addTwoLevelCacheHierarchy(MyCache(size = '128kB'), MyCache(size = '256kB'),
+cpu.addTwoLevelCacheHierarchy(MyL1Cache(size = '128kB'),
+                              MyL1Cache(size = '256kB'),
                               MyCache(size = '2MB', latency='10ns'))
 system = System(cpu = cpu,
                 physmem = PhysicalMemory(),
diff --git a/tests/configs/tsunami-o3-dual.py b/tests/configs/tsunami-o3-dual.py
index 7744560f9..125e228a7 100644
--- a/tests/configs/tsunami-o3-dual.py
+++ b/tests/configs/tsunami-o3-dual.py
@@ -41,6 +41,7 @@ class L1(BaseCache):
     block_size = 64
     mshrs = 4
     tgts_per_mshr = 8
+    is_top_level = True
 
 # ----------------------
 # Base L2 Cache
@@ -65,6 +66,7 @@ class IOCache(BaseCache):
     tgts_per_mshr = 12
     addr_range=AddrRange(0, size='8GB')
     forward_snoops = False
+    is_top_level = True
 
 #cpu
 cpus = [ DerivO3CPU(cpu_id=i) for i in xrange(2) ]
diff --git a/tests/configs/tsunami-o3.py b/tests/configs/tsunami-o3.py
index fd2d66431..13212d5d9 100644
--- a/tests/configs/tsunami-o3.py
+++ b/tests/configs/tsunami-o3.py
@@ -41,6 +41,7 @@ class L1(BaseCache):
     block_size = 64
     mshrs = 4
     tgts_per_mshr = 8
+    is_top_level = True
 
 # ----------------------
 # Base L2 Cache
@@ -65,6 +66,7 @@ class IOCache(BaseCache):
     tgts_per_mshr = 12
     addr_range=AddrRange(0, size='8GB')
     forward_snoops = False
+    is_top_level = True
 
 #cpu
 cpu = DerivO3CPU(cpu_id=0)
diff --git a/tests/configs/tsunami-simple-atomic-dual.py b/tests/configs/tsunami-simple-atomic-dual.py
index 9d3dbaa91..2e56ce851 100644
--- a/tests/configs/tsunami-simple-atomic-dual.py
+++ b/tests/configs/tsunami-simple-atomic-dual.py
@@ -40,6 +40,7 @@ class L1(BaseCache):
     block_size = 64
     mshrs = 4
     tgts_per_mshr = 8
+    is_top_level = True
 
 # ----------------------
 # Base L2 Cache
@@ -64,6 +65,7 @@ class IOCache(BaseCache):
     tgts_per_mshr = 12
     addr_range=AddrRange(0, size='8GB')
     forward_snoops = False
+    is_top_level = True
 
 #cpu
 cpus = [ AtomicSimpleCPU(cpu_id=i) for i in xrange(2) ]
diff --git a/tests/configs/tsunami-simple-atomic.py b/tests/configs/tsunami-simple-atomic.py
index cbacf1995..3c1981464 100644
--- a/tests/configs/tsunami-simple-atomic.py
+++ b/tests/configs/tsunami-simple-atomic.py
@@ -40,6 +40,7 @@ class L1(BaseCache):
     block_size = 64
     mshrs = 4
     tgts_per_mshr = 8
+    is_top_level = True
 
 # ----------------------
 # Base L2 Cache
@@ -64,6 +65,7 @@ class IOCache(BaseCache):
     tgts_per_mshr = 12
     addr_range=AddrRange(0, size='8GB')
     forward_snoops = False
+    is_top_level = True
 
 #cpu
 cpu = AtomicSimpleCPU(cpu_id=0)
diff --git a/tests/configs/tsunami-simple-timing-dual.py b/tests/configs/tsunami-simple-timing-dual.py
index f0105461d..747cdac18 100644
--- a/tests/configs/tsunami-simple-timing-dual.py
+++ b/tests/configs/tsunami-simple-timing-dual.py
@@ -40,6 +40,7 @@ class L1(BaseCache):
     block_size = 64
     mshrs = 4
     tgts_per_mshr = 8
+    is_top_level = True
 
 # ----------------------
 # Base L2 Cache
@@ -64,6 +65,7 @@ class IOCache(BaseCache):
     tgts_per_mshr = 12
     addr_range=AddrRange(0, size='8GB')
     forward_snoops = False
+    is_top_level = True
 
 #cpu
 cpus = [ TimingSimpleCPU(cpu_id=i) for i in xrange(2) ]
diff --git a/tests/configs/tsunami-simple-timing.py b/tests/configs/tsunami-simple-timing.py
index 9a262b3b2..110e6ee74 100644
--- a/tests/configs/tsunami-simple-timing.py
+++ b/tests/configs/tsunami-simple-timing.py
@@ -41,6 +41,7 @@ class L1(BaseCache):
     block_size = 64
     mshrs = 4
     tgts_per_mshr = 8
+    is_top_level = True
 
 # ----------------------
 # Base L2 Cache
@@ -65,6 +66,7 @@ class IOCache(BaseCache):
     tgts_per_mshr = 12
     addr_range=AddrRange(0, size='8GB')
     forward_snoops = False
+    is_top_level = True
 
 #cpu
 cpu = TimingSimpleCPU(cpu_id=0)