diff --git a/src/mem/cache/base.cc b/src/mem/cache/base.cc
index 2a285bf2f..70d1b4167 100644
--- a/src/mem/cache/base.cc
+++ b/src/mem/cache/base.cc
@@ -93,9 +93,9 @@ BaseCache::CacheSlavePort::setBlocked()
     // if we already scheduled a retry in this cycle, but it has not yet
     // happened, cancel it
     if (sendRetryEvent.scheduled()) {
-       owner.deschedule(sendRetryEvent);
-       DPRINTF(CachePort, "Cache port %s deschedule retry\n", name());
-       mustSendRetry = true;
+        owner.deschedule(sendRetryEvent);
+        DPRINTF(CachePort, "Cache port %s deschedule retry\n", name());
+        mustSendRetry = true;
     }
 }
 
diff --git a/src/mem/cache/base.hh b/src/mem/cache/base.hh
index 1567aaa62..297b80180 100644
--- a/src/mem/cache/base.hh
+++ b/src/mem/cache/base.hh
@@ -94,6 +94,7 @@ class BaseCache : public MemObject
         Blocked_NoMSHRs = MSHRQueue_MSHRs,
         Blocked_NoWBBuffers = MSHRQueue_WriteBuffer,
         Blocked_NoTargets,
+        Blocked_PendingWriteInvalidate,
         NUM_BLOCKED_CAUSES
     };
 
@@ -168,6 +169,8 @@ class BaseCache : public MemObject
         /** Return to normal operation and accept new requests. */
         void clearBlocked();
 
+        bool isBlocked() const { return blocked; }
+
       protected:
 
         CacheSlavePort(const std::string &_name, BaseCache *_cache,
diff --git a/src/mem/cache/blk.hh b/src/mem/cache/blk.hh
index 626b4818d..ff09b42c4 100644
--- a/src/mem/cache/blk.hh
+++ b/src/mem/cache/blk.hh
@@ -72,7 +72,10 @@ enum CacheBlkStatusBits {
     /** block was a hardware prefetch yet unaccessed*/
     BlkHWPrefetched =   0x20,
     /** block holds data from the secure memory space */
-    BlkSecure =         0x40
+    BlkSecure =         0x40,
+    /** can the block transition to E? (hasn't been shared with another cache)
+      * used to close a timing gap when handling WriteInvalidate packets */
+    BlkCanGoExclusive = 0x80
 };
 
 /**
diff --git a/src/mem/cache/cache.hh b/src/mem/cache/cache.hh
index 0ee1e353a..12fb3b0f0 100644
--- a/src/mem/cache/cache.hh
+++ b/src/mem/cache/cache.hh
@@ -180,6 +180,11 @@ class Cache : public BaseCache
      */
     const bool doFastWrites;
 
+    /**
+     * Turn line-sized writes into WriteInvalidate transactions.
+     */
+    void promoteWholeLineWrites(PacketPtr pkt);
+
     /**
      * Notify the prefetcher on every access, not just misses.
      */
diff --git a/src/mem/cache/cache_impl.hh b/src/mem/cache/cache_impl.hh
index 1a72f285f..a792de19d 100644
--- a/src/mem/cache/cache_impl.hh
+++ b/src/mem/cache/cache_impl.hh
@@ -312,30 +312,20 @@ Cache<TagStore>::access(PacketPtr pkt, BlkType *&blk,
             pkt->getAddr(), pkt->isSecure() ? "s" : "ns",
             blk ? "hit" : "miss", blk ? blk->print() : "");
 
-    if (blk != NULL) {
-
-        if (pkt->needsExclusive() ? blk->isWritable() : blk->isReadable()) {
-            // OK to satisfy access
-            incHitCount(pkt);
-            satisfyCpuSideRequest(pkt, blk);
-            return true;
-        }
-    }
-
-    // Can't satisfy access normally... either no block (blk == NULL)
-    // or have block but need exclusive & only have shared.
-
     // Writeback handling is special case.  We can write the block
     // into the cache without having a writeable copy (or any copy at
-    // all).
-    if (pkt->cmd == MemCmd::Writeback) {
+    // all).  Like writebacks, we write into the cache upon initial
+    // receipt of a write-invalidate packet as well.
+    if ((pkt->cmd == MemCmd::Writeback) ||
+       ((pkt->cmd == MemCmd::WriteInvalidateReq) && isTopLevel)) {
         assert(blkSize == pkt->getSize());
         if (blk == NULL) {
             // need to do a replacement
             blk = allocateBlock(pkt->getAddr(), pkt->isSecure(), writebacks);
             if (blk == NULL) {
                 // no replaceable block available, give up.
-                // writeback will be forwarded to next level.
+                // Writeback will be forwarded to next level,
+                // WriteInvalidate will be retried.
                 incMissCount(pkt);
                 return false;
             }
@@ -347,17 +337,41 @@ Cache<TagStore>::access(PacketPtr pkt, BlkType *&blk,
             }
         }
         std::memcpy(blk->data, pkt->getPtr<uint8_t>(), blkSize);
-        blk->status |= BlkDirty;
-        if (pkt->isSupplyExclusive()) {
-            blk->status |= BlkWritable;
+        if (pkt->cmd == MemCmd::Writeback) {
+            blk->status |= BlkDirty;
+            if (pkt->isSupplyExclusive()) {
+                blk->status |= BlkWritable;
+            }
+            // nothing else to do; writeback doesn't expect response
+            assert(!pkt->needsResponse());
+        } else if (pkt->cmd == MemCmd::WriteInvalidateReq) {
+            assert(blk->isReadable()); // implicitly checks for Valid bit also
+            blk->status |= (BlkDirty | BlkCanGoExclusive);
+            blk->status &= ~BlkWritable;
+            ++fastWrites;
         }
-        // nothing else to do; writeback doesn't expect response
-        assert(!pkt->needsResponse());
         DPRINTF(Cache, "%s new state is %s\n", __func__, blk->print());
         incHitCount(pkt);
         return true;
+    } else if ((pkt->cmd == MemCmd::WriteInvalidateReq) && !isTopLevel) {
+        if (blk != NULL) {
+            assert(blk != tempBlock);
+            tags->invalidate(blk);
+            blk->invalidate();
+        }
+        return true;
+    } else if ((blk != NULL) &&
+               (pkt->needsExclusive() ? blk->isWritable()
+                                      : blk->isReadable())) {
+        // OK to satisfy access
+        incHitCount(pkt);
+        satisfyCpuSideRequest(pkt, blk);
+        return true;
     }
 
+    // Can't satisfy access normally... either no block (blk == NULL)
+    // or have block but need exclusive & only have shared.
+
     incMissCount(pkt);
 
     if (blk == NULL && pkt->isLLSC() && pkt->isWrite()) {
@@ -413,6 +427,19 @@ Cache<TagStore>::recvTimingSnoopResp(PacketPtr pkt)
     memSidePort->schedTimingSnoopResp(pkt, time);
 }
 
+template<class TagStore>
+void
+Cache<TagStore>::promoteWholeLineWrites(PacketPtr pkt)
+{
+    // Full-block writes at offset 0, e.g. cache line clearing instructions
+    if (doFastWrites && (pkt->cmd == MemCmd::WriteReq) &&
+        (pkt->getSize() == blkSize) && (pkt->getOffset(blkSize) == 0)) {
+        pkt->cmd = MemCmd::WriteInvalidateReq;
+        DPRINTF(Cache, "packet promoted from Write to WriteInvalidate\n");
+        assert(isTopLevel); // should only happen at L1 or I/O cache
+    }
+}
+
 template<class TagStore>
 bool
 Cache<TagStore>::recvTimingReq(PacketPtr pkt)
@@ -439,6 +466,8 @@ Cache<TagStore>::recvTimingReq(PacketPtr pkt)
         return true;
     }
 
+    promoteWholeLineWrites(pkt);
+
     if (pkt->memInhibitAsserted()) {
         DPRINTF(Cache, "mem inhibited on 0x%x (%s): not responding\n",
                 pkt->getAddr(), pkt->isSecure() ? "s" : "ns");
@@ -496,35 +525,26 @@ Cache<TagStore>::recvTimingReq(PacketPtr pkt)
 
     bool satisfied = access(pkt, blk, lat, writebacks);
 
-#if 0
-    /** @todo make the fast write alloc (wh64) work with coherence. */
-
-    // If this is a block size write/hint (WH64) allocate the block here
-    // if the coherence protocol allows it.
-    if (!blk && pkt->getSize() >= blkSize && coherence->allowFastWrites() &&
-        (pkt->cmd == MemCmd::WriteReq
-         || pkt->cmd == MemCmd::WriteInvalidateReq) ) {
-        // not outstanding misses, can do this
-        MSHR *outstanding_miss = mshrQueue.findMatch(pkt->getAddr(),
-                                                     pkt->isSecure());
-        if (pkt->cmd == MemCmd::WriteInvalidateReq || !outstanding_miss) {
-            if (outstanding_miss) {
-                warn("WriteInv doing a fastallocate"
-                     "with an outstanding miss to the same address\n");
-            }
-            blk = handleFill(NULL, pkt, BlkValid | BlkWritable,
-                                   writebacks);
-            ++fastWrites;
-        }
-    }
-#endif
-
     // track time of availability of next prefetch, if any
     Tick next_pf_time = 0;
 
     bool needsResponse = pkt->needsResponse();
 
+    if (pkt->cmd == MemCmd::WriteInvalidateReq) {
+        if (!satisfied && isTopLevel) {
+            // access() tried to allocate a block but it could not; abort.
+            setBlocked(Blocked_PendingWriteInvalidate);
+            return false;
+        }
+        satisfied = false;
+        // we need to take the miss path (allocate MSHR, etc.) for
+        // WriteInvalidates because they always need to propagate
+        // throughout the memory system
+    }
+
     if (satisfied) {
+        // hit (for all other request types)
+
         if (prefetcher && (prefetchOnAccess || (blk && blk->wasPrefetched()))) {
             if (blk)
                 blk->status &= ~BlkHWPrefetched;
@@ -551,6 +571,16 @@ Cache<TagStore>::recvTimingReq(PacketPtr pkt)
         // @todo: Make someone pay for this
         pkt->busFirstWordDelay = pkt->busLastWordDelay = 0;
 
+        if (blk && blk->isValid() && (blk->status & BlkCanGoExclusive) &&
+            pkt->isWrite() && (pkt->cmd != MemCmd::WriteInvalidateReq)) {
+            // A Write packet (needs exclusive) should be delayed because a
+            // WriteInvalidate is pending.  Instead of going the MSHR route,
+            // the packet should be replayed, since if the block transitions
+            // to Exclusive the write can complete immediately.
+            setBlocked(Blocked_PendingWriteInvalidate);
+            return false;
+        }
+
         Addr blk_addr = blockAlign(pkt->getAddr());
         MSHR *mshr = mshrQueue.findMatch(blk_addr, pkt->isSecure());
 
@@ -639,7 +669,10 @@ Cache<TagStore>::recvTimingReq(PacketPtr pkt)
             if (pkt->cmd == MemCmd::Writeback) {
                 allocateWriteBuffer(pkt, time, true);
             } else {
-                if (blk && blk->isValid()) {
+                if (pkt->cmd == MemCmd::WriteInvalidateReq) {
+                    // a WriteInvalidate is not a normal write miss;
+                    // the assertions below are not applicable.
+                } else if (blk && blk->isValid()) {
                     // If we have a write miss to a valid block, we
                     // need to mark the block non-readable.  Otherwise
                     // if we allow reads while there's an outstanding
@@ -655,7 +688,8 @@ Cache<TagStore>::recvTimingReq(PacketPtr pkt)
                     // internally, and have a sufficiently weak memory
                     // model, this is probably unnecessary, but at some
                     // point it must have seemed like we needed it...
-                    assert(pkt->needsExclusive() && !blk->isWritable());
+                    assert(pkt->needsExclusive());
+                    assert(!blk->isWritable());
                     blk->status &= ~BlkReadable;
                 }
 
@@ -697,6 +731,12 @@ Cache<TagStore>::getBusPacket(PacketPtr cpu_pkt, BlkType *blk,
         return NULL;
     }
 
+    // WriteInvalidates for cache line clearing instructions don't
+    // require a read; just send directly to the bus.
+    if (cpu_pkt->cmd == MemCmd::WriteInvalidateReq) {
+        return NULL;
+    }
+
     if (!blkValid &&
         (cpu_pkt->cmd == MemCmd::Writeback || cpu_pkt->isUpgrade())) {
         // Writebacks that weren't allocated in access() and upgrades
@@ -716,7 +756,8 @@ Cache<TagStore>::getBusPacket(PacketPtr cpu_pkt, BlkType *blk,
     if (blkValid && useUpgrades) {
         // only reason to be here is that blk is shared
         // (read-only) and we need exclusive
-        assert(needsExclusive && !blk->isWritable());
+        assert(needsExclusive);
+        assert(!blk->isWritable());
         cmd = cpu_pkt->isLLSC() ? MemCmd::SCUpgradeReq : MemCmd::UpgradeReq;
     } else if (cpu_pkt->cmd == MemCmd::SCUpgradeFailReq ||
                cpu_pkt->cmd == MemCmd::StoreCondFailReq) {
@@ -751,6 +792,8 @@ Cache<TagStore>::recvAtomic(PacketPtr pkt)
     if (system->bypassCaches())
         return ticksToCycles(memSidePort->sendAtomic(pkt));
 
+    promoteWholeLineWrites(pkt);
+
     if (pkt->memInhibitAsserted()) {
         assert(!pkt->req->isUncacheable());
         // have to invalidate ourselves and any lower caches even if
@@ -788,6 +831,10 @@ Cache<TagStore>::recvAtomic(PacketPtr pkt)
 
     if (!access(pkt, blk, lat, writebacks)) {
         // MISS
+
+        // WriteInvalidates should never fail an access() in Atomic mode
+        assert(pkt->cmd != MemCmd::WriteInvalidateReq);
+
         PacketPtr bus_pkt = getBusPacket(pkt, blk, pkt->needsExclusive());
 
         bool is_forward = (bus_pkt == NULL);
@@ -858,7 +905,25 @@ Cache<TagStore>::recvAtomic(PacketPtr pkt)
         delete wbPkt;
     }
 
-    // We now have the block one way or another (hit or completed miss)
+    // We now have the block one way or another (hit or completed miss),
+    // except for Request types that perform an invalidate, where the point
+    // is to make sure there is no block.
+
+    if (pkt->cmd == MemCmd::WriteInvalidateReq) {
+        memSidePort->sendAtomic(pkt); // complete writeback
+        if (isTopLevel) {
+            // top level caches allocate and write the data
+            assert(blk->isDirty());
+            assert(!blk->isWritable());
+            assert(blk->status & BlkCanGoExclusive);
+            blk->status &= ~(BlkDirty | BlkCanGoExclusive); // and mark clean
+            blk->status |= BlkWritable;                     // i.e. O(+cgE) -> E
+        } else {
+            // other caches invalidate.
+            // if the block was found, it was invalidated.
+            assert(!blk || !blk->isValid());
+        }
+    }
 
     if (pkt->needsResponse()) {
         pkt->makeAtomicResponse();
@@ -1064,6 +1129,38 @@ Cache<TagStore>::recvTimingResp(PacketPtr pkt)
                 completion_time = clockEdge(responseLatency) +
                     pkt->busLastWordDelay;
                 target->pkt->req->setExtraData(0);
+            } else if (pkt->cmd == MemCmd::WriteInvalidateResp) {
+                if (blk) {
+                    assert(blk->isDirty() && !blk->isWritable());
+                    // The block, having been written back, is no longer dirty,
+                    // nor do we have any reason to see if it was snooped in the
+                    // meantime (which CanGoExclusive tracks).  If it can go
+                    // exclusive, we put it in that state, and otherwise S.
+                    // In short: O(+cgE) -> E, O(-cgE) -> S
+                    if (blk->status & BlkCanGoExclusive) {
+                        blk->status |= BlkWritable;
+                    }
+                    blk->status &= ~(BlkDirty | BlkCanGoExclusive);
+                }
+                if (isTopLevel) {
+                    // makeTimingResponse() will turn it into a WriteResp
+                    target->pkt->cmd = MemCmd::WriteReq;
+                    // Writes may have been blocked - quite rare case, but
+                    // it does happen. Prevent deadlock by telling the core
+                    if (isBlocked()) { // to retry.
+                        clearBlocked(Blocked_PendingWriteInvalidate);
+                    }
+                }
+                // If the block managed to get evicted before its own
+                // writeback (e.g. by a Read/Upgrade (from O(-cgE)/S to
+                // I/E) or ReadExclusive (direct to I/E); either way a
+                // cache-to-cache ownership transfer) completed, that's
+                // OK, we just ignore this response. If the new owner
+                // doesn't actually modify it, a superfluous writeback
+                // will occur for its impatience (since it will think it
+                // has dirty data), but it really can't be helped.
+                completion_time = clockEdge(responseLatency) +
+                    pkt->busLastWordDelay;
             } else {
                 // not a cache fill, just forwarding response
                 // responseLatency is the latency of the return path
@@ -1291,9 +1388,10 @@ Cache<TagStore>::allocateBlock(Addr addr, bool is_secure,
         Addr repl_addr = tags->regenerateBlkAddr(blk->tag, blk->set);
         MSHR *repl_mshr = mshrQueue.findMatch(repl_addr, blk->isSecure());
         if (repl_mshr) {
-            // must be an outstanding upgrade request on block
-            // we're about to replace...
-            assert(!blk->isWritable());
+            // must be an outstanding upgrade request (common case)
+            // or WriteInvalidate pending writeback (very uncommon case)
+            // on a block we're about to replace...
+            assert(!blk->isWritable() || blk->isDirty());
             assert(repl_mshr->needsExclusive());
             // too hard to replace block with transient state
             // allocation failed, block not inserted
@@ -1378,6 +1476,11 @@ Cache<TagStore>::handleFill(PacketPtr pkt, BlkType *blk,
             blk->status |= BlkDirty;
     }
 
+    if (pkt->cmd == MemCmd::WriteInvalidateReq) {
+        // a block written immediately, all at once, pre-writeback is dirty
+        blk->status |= BlkDirty;
+    }
+
     DPRINTF(Cache, "Block addr %x (%s) moving from state %x to %s\n",
             addr, is_secure ? "s" : "ns", old_state, blk->print());
 
@@ -1492,20 +1595,25 @@ Cache<TagStore>::handleSnoop(PacketPtr pkt, BlkType *blk,
         }
     }
 
-     if (!blk || !blk->isValid()) {
-         DPRINTF(Cache, "%s snoop miss for %s address %x size %d\n",
-                 __func__, pkt->cmdString(), pkt->getAddr(), pkt->getSize());
-         return;
-     } else {
-        DPRINTF(Cache, "%s snoop hit for %s for address %x size %d, "
-                "old state is %s\n", __func__, pkt->cmdString(),
-                pkt->getAddr(), pkt->getSize(), blk->print());
-     }
+    if (!blk || !blk->isValid()) {
+        DPRINTF(Cache, "%s snoop miss for %s address %x size %d\n",
+                __func__, pkt->cmdString(), pkt->getAddr(), pkt->getSize());
+        return;
+    } else {
+       DPRINTF(Cache, "%s snoop hit for %s for address %x size %d, "
+               "old state is %s\n", __func__, pkt->cmdString(),
+               pkt->getAddr(), pkt->getSize(), blk->print());
+    }
 
     // we may end up modifying both the block state and the packet (if
     // we respond in atomic mode), so just figure out what to do now
-    // and then do it later
-    bool respond = blk->isDirty() && pkt->needsResponse();
+    // and then do it later.  If we find dirty data while snooping for a
+    // WriteInvalidate, we don't care, since no merging needs to take place.
+    // We need the eviction to happen as normal, but the data needn't be
+    // sent anywhere, nor should the writeback be inhibited at the memory
+    // controller for any reason.
+    bool respond = blk->isDirty() && pkt->needsResponse()
+                                  && (pkt->cmd != MemCmd::WriteInvalidateReq);
     bool have_exclusive = blk->isWritable();
 
     // Invalidate any prefetch's from below that would strip write permissions
@@ -1522,7 +1630,7 @@ Cache<TagStore>::handleSnoop(PacketPtr pkt, BlkType *blk,
     if (pkt->isRead() && !invalidate) {
         assert(!needs_exclusive);
         pkt->assertShared();
-        int bits_to_clear = BlkWritable;
+        int bits_to_clear = BlkWritable | BlkCanGoExclusive;
         const bool haveOwnershipState = true; // for now
         if (!haveOwnershipState) {
             // if we don't support pure ownership (dirty && !writable),
@@ -1950,9 +2058,9 @@ Cache<TagStore>::CpuSidePort::recvTimingReq(PacketPtr pkt)
         // either already committed to send a retry, or blocked
         success = false;
     } else {
-        // for now this should always succeed
+        // pass it on to the cache, and let the cache decide if we
+        // have to retry or not
         success = cache->recvTimingReq(pkt);
-        assert(success);
     }
 
     // remember if we have to retry
diff --git a/src/mem/packet.cc b/src/mem/packet.cc
index 4ff531e80..5b0834285 100644
--- a/src/mem/packet.cc
+++ b/src/mem/packet.cc
@@ -98,11 +98,11 @@ MemCmd::commandInfo[] =
     /* HardPFResp */
     { SET4(IsRead, IsResponse, IsHWPrefetch, HasData),
             InvalidCmd, "HardPFResp" },
-    /* WriteInvalidateReq (currently unused, see packet.hh) */
+    /* WriteInvalidateReq */
     { SET6(IsWrite, NeedsExclusive, IsInvalidate,
            IsRequest, HasData, NeedsResponse),
             WriteInvalidateResp, "WriteInvalidateReq" },
-    /* WriteInvalidateResp (currently unused, see packet.hh) */
+    /* WriteInvalidateResp */
     { SET3(IsWrite, NeedsExclusive, IsResponse),
             InvalidCmd, "WriteInvalidateResp" },
     /* UpgradeReq */
diff --git a/src/mem/packet.hh b/src/mem/packet.hh
index f93725fcb..155a7ff82 100644
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -91,14 +91,6 @@ class MemCmd
         HardPFReq,
         SoftPFResp,
         HardPFResp,
-        // WriteInvalidateReq transactions used to be generated by the
-        // DMA ports when writing full blocks to memory, however, it
-        // is not used anymore since we put the I/O cache in place to
-        // deal with partial block writes. Hence, WriteInvalidateReq
-        // and WriteInvalidateResp are currently unused. The
-        // implication is that the I/O cache does read-exclusive
-        // operations on every full-cache-block DMA, and ultimately
-        // this needs to be fixed.
         WriteInvalidateReq,
         WriteInvalidateResp,
         UpgradeReq,