diff --git a/src/mem/SimpleDRAM.py b/src/mem/SimpleDRAM.py index ec76542d8..b066b27de 100644 --- a/src/mem/SimpleDRAM.py +++ b/src/mem/SimpleDRAM.py @@ -10,6 +10,9 @@ # unmodified and in its entirety in all distributions of the software, # modified or unmodified, in source code or in binary form. # +# Copyright (c) 2013 Amin Farmahini-Farahani +# All rights reserved. +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: redistributions of source code must retain the above copyright @@ -118,7 +121,12 @@ class SimpleDRAM(AbstractMemory): static_backend_latency = Param.Latency("10ns", "Static backend latency") # the physical organisation of the DRAM - lines_per_rowbuffer = Param.Unsigned("Row buffer size in cache lines") + device_bus_width = Param.Unsigned("data bus width in bits for each DRAM "\ + "device/chip") + burst_length = Param.Unsigned("Burst lenght (BL) in beats") + device_rowbuffer_size = Param.MemorySize("Page (row buffer) size per "\ + "device/chip") + devices_per_rank = Param.Unsigned("Number of devices/chips per rank") ranks_per_channel = Param.Unsigned("Number of ranks per channel") banks_per_rank = Param.Unsigned("Number of banks per rank") # only used for the address mapping as the controller by @@ -141,9 +149,9 @@ class SimpleDRAM(AbstractMemory): # time to complete a burst transfer, typically the burst length # divided by two due to the DDR bus, but by making it a parameter # it is easier to also evaluate SDR memories like WideIO. - # This parameter has to account for bus width and burst length. - # Adjustment also necessary if cache line size is greater than - # data size read/written by one full burst. + # This parameter has to account for burst length. + # Read/Write requests with data size larger than one full burst are broken + # down into multiple requests in the SimpleDRAM controller tBURST = Param.Latency("Burst duration (for DDR burst length / 2 cycles)") # time taken to complete one refresh cycle (N rows in all banks) @@ -170,15 +178,22 @@ class SimpleDRAM(AbstractMemory): # tRC - assumed to be 4 * tRP - # burst length for an access derived from the cache line size - # A single DDR3 x64 interface (one command and address bus), with # default timings based on DDR3-1600 4 Gbit parts in an 8x8 # configuration, which would amount to 4 Gbyte of memory. class DDR3_1600_x64(SimpleDRAM): - # Assuming 64 byte cache lines, and a 1kbyte page size per module + # 8x8 configuration, 8 devices each with an 8-bit interface + device_bus_width = 8 + + # DDR3 is a BL8 device + burst_length = 8 + + # Each device has a page (row buffer) size of 1KB # (this depends on the memory density) - lines_per_rowbuffer = 128 + device_rowbuffer_size = '1kB' + + # 8x8 configuration, so 8 devices + devices_per_rank = 8 # Use two ranks ranks_per_channel = 2 @@ -191,8 +206,8 @@ class DDR3_1600_x64(SimpleDRAM): tCL = '13.75ns' tRP = '13.75ns' - # Assuming 64 byte cache lines, across an x64 - # interface, translates to BL8, 4 clocks @ 800 MHz + # 8 beats across an x64 interface translates to 4 clocks @ 800 MHz. + # Note this is a BL8 DDR device. tBURST = '5ns' # DDR3, 4 Gbit has a tRFC of 240 CK and tCK = 1.25 ns @@ -213,9 +228,18 @@ class DDR3_1600_x64(SimpleDRAM): # default timings based on a LPDDR2-1066 4 Gbit part in a 1x32 # configuration. class LPDDR2_S4_1066_x32(SimpleDRAM): - # Assuming 64 byte cache lines, use a 1kbyte page size, this - # depends on the memory density - lines_per_rowbuffer = 16 + # 1x32 configuration, 1 device with a 32-bit interface + device_bus_width = 32 + + # LPDDR2_S4 is a BL4 and BL8 device + burst_length = 8 + + # Each device has a page (row buffer) size of 1KB + # (this depends on the memory density) + device_rowbuffer_size = '1kB' + + # 1x32 configuration, so 1 device + devices_per_rank = 1 # Use a single rank ranks_per_channel = 1 @@ -232,10 +256,11 @@ class LPDDR2_S4_1066_x32(SimpleDRAM): # Pre-charge one bank 15 ns (all banks 18 ns) tRP = '15ns' - # Assuming 64 byte cache lines, across a x32 DDR interface - # translates to two BL8, 8 clocks @ 533 MHz. Note that this is a - # simplification - tBURST = '15ns' + # 8 beats across an x32 DDR interface translates to 4 clocks @ 533 MHz. + # Note this is a BL8 DDR device. + # Requests larger than 32 bytes are broken down into multiple requests + # in the SimpleDRAM controller + tBURST = '7.5ns' # LPDDR2-S4, 4 Gbit tRFC = '130ns' @@ -251,9 +276,18 @@ class LPDDR2_S4_1066_x32(SimpleDRAM): # A single WideIO x128 interface (one command and address bus), with # default timings based on an estimated WIO-200 8 Gbit part. class WideIO_200_x128(SimpleDRAM): - # Assuming 64 byte cache lines, use a 4kbyte page size, this - # depends on the memory density - lines_per_rowbuffer = 64 + # 1x128 configuration, 1 device with a 128-bit interface + device_bus_width = 128 + + # This is a BL4 device + burst_length = 4 + + # Each device has a page (row buffer) size of 4KB + # (this depends on the memory density) + device_rowbuffer_size = '4kB' + + # 1x128 configuration, so 1 device + devices_per_rank = 1 # Use one rank for a one-high die stack ranks_per_channel = 1 @@ -266,8 +300,8 @@ class WideIO_200_x128(SimpleDRAM): tCL = '18ns' tRP = '18ns' - # Assuming 64 byte cache lines, across an x128 SDR interface, - # translates to BL4, 4 clocks @ 200 MHz + # 4 beats across an x128 SDR interface translates to 4 clocks @ 200 MHz. + # Note this is a BL4 SDR device. tBURST = '20ns' # WIO 8 Gb @@ -287,9 +321,18 @@ class WideIO_200_x128(SimpleDRAM): # default timings based on a LPDDR3-1600 4 Gbit part in a 1x32 # configuration class LPDDR3_1600_x32(SimpleDRAM): - # 4 Gbit and 8 Gbit devices use a 1 kByte page size, so ssuming 64 - # byte cache lines, that is 16 lines - lines_per_rowbuffer = 16 + # 1x32 configuration, 1 device with a 32-bit interface + device_bus_width = 32 + + # LPDDR3 is a BL8 device + burst_length = 8 + + # Each device has a page (row buffer) size of 1KB + # (this depends on the memory density) + device_rowbuffer_size = '1kB' + + # 1x32 configuration, so 1 device + devices_per_rank = 1 # Use a single rank ranks_per_channel = 1 @@ -306,9 +349,11 @@ class LPDDR3_1600_x32(SimpleDRAM): # Pre-charge one bank 15 ns (all banks 18 ns) tRP = '15ns' - # Assuming 64 byte cache lines, across a x32 DDR interface - # translates to two bursts of BL8, 8 clocks @ 800 MHz - tBURST = '10ns' + # 8 beats across a x32 DDR interface translates to 4 clocks @ 800 MHz. + # Note this is a BL8 DDR device. + # Requests larger than 32 bytes are broken down into multiple requests + # in the SimpleDRAM controller + tBURST = '5ns' # LPDDR3, 4 Gb tRFC = '130ns' diff --git a/src/mem/simple_dram.cc b/src/mem/simple_dram.cc index 9091288ec..faeedbb2b 100644 --- a/src/mem/simple_dram.cc +++ b/src/mem/simple_dram.cc @@ -11,6 +11,9 @@ * unmodified and in its entirety in all distributions of the software, * modified or unmodified, in source code or in binary form. * + * Copyright (c) 2013 Amin Farmahini-Farahani + * All rights reserved. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: redistributions of source code must retain the above copyright @@ -54,8 +57,11 @@ SimpleDRAM::SimpleDRAM(const SimpleDRAMParams* p) : rowHitFlag(false), stopReads(false), actTicks(p->activation_limit, 0), writeEvent(this), respondEvent(this), refreshEvent(this), nextReqEvent(this), drainManager(NULL), - bytesPerCacheLine(0), - linesPerRowBuffer(p->lines_per_rowbuffer), + deviceBusWidth(p->device_bus_width), burstLength(p->burst_length), + deviceRowBufferSize(p->device_rowbuffer_size), + devicesPerRank(p->devices_per_rank), + burstSize((devicesPerRank * burstLength * deviceBusWidth) / 8), + rowBufferSize(devicesPerRank * deviceRowBufferSize), ranksPerChannel(p->ranks_per_channel), banksPerRank(p->banks_per_rank), channels(p->channels), rowsPerBank(0), readBufferSize(p->read_buffer_size), @@ -93,22 +99,22 @@ SimpleDRAM::init() port.sendRangeChange(); } - // get the burst size from the connected port as it is currently - // assumed to be equal to the cache line size - bytesPerCacheLine = _system->cacheLineSize(); - // we could deal with plenty options here, but for now do a quick // sanity check - if (bytesPerCacheLine != 64 && bytesPerCacheLine != 32) - panic("Unexpected burst size %d", bytesPerCacheLine); + DPRINTF(DRAM, "Burst size %d bytes\n", burstSize); // determine the rows per bank by looking at the total capacity uint64_t capacity = ULL(1) << ceilLog2(AbstractMemory::size()); DPRINTF(DRAM, "Memory capacity %lld (%lld) bytes\n", capacity, AbstractMemory::size()); - rowsPerBank = capacity / (bytesPerCacheLine * linesPerRowBuffer * - banksPerRank * ranksPerChannel); + + columnsPerRowBuffer = rowBufferSize / burstSize; + + DPRINTF(DRAM, "Row buffer size %d bytes with %d columns per row buffer\n", + rowBufferSize, columnsPerRowBuffer); + + rowsPerBank = capacity / (rowBufferSize * banksPerRank * ranksPerChannel); if (range.interleaved()) { if (channels != range.stripes()) @@ -116,18 +122,17 @@ SimpleDRAM::init() name(), range.stripes(), channels); if (addrMapping == Enums::RaBaChCo) { - if (bytesPerCacheLine * linesPerRowBuffer != - range.granularity()) { + if (rowBufferSize != range.granularity()) { panic("Interleaving of %s doesn't match RaBaChCo address map\n", name()); } } else if (addrMapping == Enums::RaBaCoCh) { - if (bytesPerCacheLine != range.granularity()) { + if (burstSize != range.granularity()) { panic("Interleaving of %s doesn't match RaBaCoCh address map\n", name()); } } else if (addrMapping == Enums::CoRaBaCh) { - if (bytesPerCacheLine != range.granularity()) + if (burstSize != range.granularity()) panic("Interleaving of %s doesn't match CoRaBaCh address map\n", name()); } @@ -162,24 +167,26 @@ SimpleDRAM::recvAtomic(PacketPtr pkt) } bool -SimpleDRAM::readQueueFull() const +SimpleDRAM::readQueueFull(unsigned int neededEntries) const { - DPRINTF(DRAM, "Read queue limit %d current size %d\n", - readBufferSize, readQueue.size() + respQueue.size()); + DPRINTF(DRAM, "Read queue limit %d, current size %d, entries needed %d\n", + readBufferSize, readQueue.size() + respQueue.size(), + neededEntries); - return (readQueue.size() + respQueue.size()) == readBufferSize; + return + (readQueue.size() + respQueue.size() + neededEntries) > readBufferSize; } bool -SimpleDRAM::writeQueueFull() const +SimpleDRAM::writeQueueFull(unsigned int neededEntries) const { - DPRINTF(DRAM, "Write queue limit %d current size %d\n", - writeBufferSize, writeQueue.size()); - return writeQueue.size() == writeBufferSize; + DPRINTF(DRAM, "Write queue limit %d, current size %d, entries needed %d\n", + writeBufferSize, writeQueue.size(), neededEntries); + return (writeQueue.size() + neededEntries) > writeBufferSize; } SimpleDRAM::DRAMPacket* -SimpleDRAM::decodeAddr(PacketPtr pkt) +SimpleDRAM::decodeAddr(PacketPtr pkt, Addr dramPktAddr, unsigned size) { // decode the address based on the address mapping scheme, with // Ra, Co, Ba and Ch denoting rank, column, bank and channel, @@ -188,17 +195,15 @@ SimpleDRAM::decodeAddr(PacketPtr pkt) uint16_t bank; uint16_t row; - Addr addr = pkt->getAddr(); - // truncate the address to the access granularity - addr = addr / bytesPerCacheLine; + Addr addr = dramPktAddr / burstSize; // we have removed the lowest order address bits that denote the - // position within the cache line + // position within the column if (addrMapping == Enums::RaBaChCo) { // the lowest order bits denote the column to ensure that // sequential cache lines occupy the same row - addr = addr / linesPerRowBuffer; + addr = addr / columnsPerRowBuffer; // take out the channel part of the address addr = addr / channels; @@ -221,7 +226,7 @@ SimpleDRAM::decodeAddr(PacketPtr pkt) addr = addr / channels; // next, the column - addr = addr / linesPerRowBuffer; + addr = addr / columnsPerRowBuffer; // after the column bits, we get the bank bits to interleave // over the banks @@ -256,7 +261,7 @@ SimpleDRAM::decodeAddr(PacketPtr pkt) // next the column bits which we do not need to keep track of // and simply skip past - addr = addr / linesPerRowBuffer; + addr = addr / columnsPerRowBuffer; // lastly, get the row bits row = addr % rowsPerBank; @@ -269,54 +274,98 @@ SimpleDRAM::decodeAddr(PacketPtr pkt) assert(row < rowsPerBank); DPRINTF(DRAM, "Address: %lld Rank %d Bank %d Row %d\n", - pkt->getAddr(), rank, bank, row); + dramPktAddr, rank, bank, row); // create the corresponding DRAM packet with the entry time and // ready time set to the current tick, the latter will be updated // later - return new DRAMPacket(pkt, rank, bank, row, pkt->getAddr(), + return new DRAMPacket(pkt, rank, bank, row, dramPktAddr, size, banks[rank][bank]); } void -SimpleDRAM::addToReadQueue(PacketPtr pkt) +SimpleDRAM::addToReadQueue(PacketPtr pkt, unsigned int pktCount) { // only add to the read queue here. whenever the request is // eventually done, set the readyTime, and call schedule() assert(!pkt->isWrite()); - // First check write buffer to see if the data is already at - // the controller - list::const_iterator i; - Addr addr = pkt->getAddr(); + assert(pktCount != 0); - // @todo: add size check - for (i = writeQueue.begin(); i != writeQueue.end(); ++i) { - if ((*i)->addr == addr){ - servicedByWrQ++; - DPRINTF(DRAM, "Read to %lld serviced by write queue\n", addr); - bytesRead += bytesPerCacheLine; - bytesConsumedRd += pkt->getSize(); - accessAndRespond(pkt, frontendLatency); - return; + // if the request size is larger than burst size, the pkt is split into + // multiple DRAM packets + // Note if the pkt starting address is not aligened to burst size, the + // address of first DRAM packet is kept unaliged. Subsequent DRAM packets + // are aligned to burst size boundaries. This is to ensure we accurately + // check read packets against packets in write queue. + Addr addr = pkt->getAddr(); + unsigned pktsServicedByWrQ = 0; + BurstHelper* burst_helper = NULL; + for (int cnt = 0; cnt < pktCount; ++cnt) { + unsigned size = std::min((addr | (burstSize - 1)) + 1, + pkt->getAddr() + pkt->getSize()) - addr; + readPktSize[ceilLog2(size)]++; + readBursts++; + + // First check write buffer to see if the data is already at + // the controller + bool foundInWrQ = false; + list::const_iterator i; + for (i = writeQueue.begin(); i != writeQueue.end(); ++i) { + if ((*i)->addr == addr && (*i)->size >= size){ + foundInWrQ = true; + servicedByWrQ++; + pktsServicedByWrQ++; + DPRINTF(DRAM, "Read to addr %lld with size %d serviced by " + "write queue\n", addr, size); + bytesRead += burstSize; + bytesConsumedRd += size; + break; + } } + + // If not found in the write q, make a DRAM packet and + // push it onto the read queue + if (!foundInWrQ) { + + // Make the burst helper for split packets + if (pktCount > 1 && burst_helper == NULL) { + DPRINTF(DRAM, "Read to addr %lld translates to %d " + "dram requests\n", pkt->getAddr(), pktCount); + burst_helper = new BurstHelper(pktCount); + } + + DRAMPacket* dram_pkt = decodeAddr(pkt, addr, size); + dram_pkt->burstHelper = burst_helper; + + assert(!readQueueFull(1)); + rdQLenPdf[readQueue.size() + respQueue.size()]++; + + DPRINTF(DRAM, "Adding to read queue\n"); + + readQueue.push_back(dram_pkt); + + // Update stats + uint32_t bank_id = banksPerRank * dram_pkt->rank + dram_pkt->bank; + assert(bank_id < ranksPerChannel * banksPerRank); + perBankRdReqs[bank_id]++; + + avgRdQLen = readQueue.size() + respQueue.size(); + } + + // Starting address of next dram pkt (aligend to burstSize boundary) + addr = (addr | (burstSize - 1)) + 1; } - DRAMPacket* dram_pkt = decodeAddr(pkt); + // If all packets are serviced by write queue, we send the repsonse back + if (pktsServicedByWrQ == pktCount) { + accessAndRespond(pkt, frontendLatency); + return; + } - assert(readQueue.size() + respQueue.size() < readBufferSize); - rdQLenPdf[readQueue.size() + respQueue.size()]++; - - DPRINTF(DRAM, "Adding to read queue\n"); - - readQueue.push_back(dram_pkt); - - // Update stats - uint32_t bank_id = banksPerRank * dram_pkt->rank + dram_pkt->bank; - assert(bank_id < ranksPerChannel * banksPerRank); - perBankRdReqs[bank_id]++; - - avgRdQLen = readQueue.size() + respQueue.size(); + // Update how many split packets are serviced by write queue + if (burst_helper != NULL) + burst_helper->burstsServiced = pktsServicedByWrQ; // If we are not already scheduled to get the read request out of // the queue, do so now @@ -364,7 +413,7 @@ SimpleDRAM::processWriteEvent() bank.openRow = dram_pkt->row; bank.freeAt = schedTime + tBURST + std::max(accessLat, tCL); busBusyUntil = bank.freeAt - tCL; - bank.bytesAccessed += bytesPerCacheLine; + bank.bytesAccessed += burstSize; if (!rowHitFlag) { bank.tRASDoneAt = bank.freeAt + tRP; @@ -385,7 +434,7 @@ SimpleDRAM::processWriteEvent() "banks_id %d is %lld\n", dram_pkt->rank * banksPerRank + dram_pkt->bank, bank.freeAt); - bytesPerActivate.sample(bytesPerCacheLine); + bytesPerActivate.sample(burstSize); } else panic("Unknown page management policy chosen\n"); @@ -449,34 +498,49 @@ SimpleDRAM::triggerWrites() } void -SimpleDRAM::addToWriteQueue(PacketPtr pkt) +SimpleDRAM::addToWriteQueue(PacketPtr pkt, unsigned int pktCount) { // only add to the write queue here. whenever the request is // eventually done, set the readyTime, and call schedule() assert(pkt->isWrite()); - DRAMPacket* dram_pkt = decodeAddr(pkt); + // if the request size is larger than burst size, the pkt is split into + // multiple DRAM packets + Addr addr = pkt->getAddr(); + for (int cnt = 0; cnt < pktCount; ++cnt) { + unsigned size = std::min((addr | (burstSize - 1)) + 1, + pkt->getAddr() + pkt->getSize()) - addr; + writePktSize[ceilLog2(size)]++; + writeBursts++; - assert(writeQueue.size() < writeBufferSize); - wrQLenPdf[writeQueue.size()]++; + DRAMPacket* dram_pkt = decodeAddr(pkt, addr, size); - DPRINTF(DRAM, "Adding to write queue\n"); + assert(writeQueue.size() < writeBufferSize); + wrQLenPdf[writeQueue.size()]++; - writeQueue.push_back(dram_pkt); + DPRINTF(DRAM, "Adding to write queue\n"); - // Update stats - uint32_t bank_id = banksPerRank * dram_pkt->rank + dram_pkt->bank; - assert(bank_id < ranksPerChannel * banksPerRank); - perBankWrReqs[bank_id]++; + writeQueue.push_back(dram_pkt); - avgWrQLen = writeQueue.size(); + // Update stats + uint32_t bank_id = banksPerRank * dram_pkt->rank + dram_pkt->bank; + assert(bank_id < ranksPerChannel * banksPerRank); + perBankWrReqs[bank_id]++; + + avgWrQLen = writeQueue.size(); + + bytesConsumedWr += dram_pkt->size; + bytesWritten += burstSize; + + // Starting address of next dram pkt (aligend to burstSize boundary) + addr = (addr | (burstSize - 1)) + 1; + } // we do not wait for the writes to be send to the actual memory, // but instead take responsibility for the consistency here and // snoop the write queue for any upcoming reads - - bytesConsumedWr += pkt->getSize(); - bytesWritten += bytesPerCacheLine; + // @todo, if a pkt size is larger than burst size, we might need a + // different front end latency accessAndRespond(pkt, frontendLatency); // If your write buffer is starting to fill up, drain it! @@ -491,15 +555,18 @@ SimpleDRAM::printParams() const // Sanity check print of important parameters DPRINTF(DRAM, "Memory controller %s physical organization\n" \ - "Bytes per cacheline %d\n" \ - "Lines per row buffer %d\n" \ - "Rows per bank %d\n" \ - "Banks per rank %d\n" \ - "Ranks per channel %d\n" \ - "Total mem capacity %u\n", - name(), bytesPerCacheLine, linesPerRowBuffer, rowsPerBank, - banksPerRank, ranksPerChannel, bytesPerCacheLine * - linesPerRowBuffer * rowsPerBank * banksPerRank * ranksPerChannel); + "Number of devices per rank %d\n" \ + "Device bus width (in bits) %d\n" \ + "DRAM data bus burst %d\n" \ + "Row buffer size %d\n" \ + "Columns per row buffer %d\n" \ + "Rows per bank %d\n" \ + "Banks per rank %d\n" \ + "Ranks per channel %d\n" \ + "Total mem capacity %u\n", + name(), devicesPerRank, deviceBusWidth, burstSize, rowBufferSize, + columnsPerRowBuffer, rowsPerBank, banksPerRank, ranksPerChannel, + rowBufferSize * rowsPerBank * banksPerRank * ranksPerChannel); string scheduler = memSchedPolicy == Enums::fcfs ? "FCFS" : "FR-FCFS"; string address_mapping = addrMapping == Enums::RaBaChCo ? "RaBaChCo" : @@ -560,7 +627,7 @@ SimpleDRAM::recvTimingReq(PacketPtr pkt) // This is where we enter from the outside world DPRINTF(DRAM, "recvTimingReq: request %s addr %lld size %d\n", - pkt->cmdString(),pkt->getAddr(), pkt->getSize()); + pkt->cmdString(), pkt->getAddr(), pkt->getSize()); // simply drop inhibited packets for now if (pkt->memInhibitAsserted()) { @@ -569,9 +636,6 @@ SimpleDRAM::recvTimingReq(PacketPtr pkt) return true; } - if (pkt->getSize() == bytesPerCacheLine) - cpuReqs++; - // Every million accesses, print the state of the queues if (numReqs % 1000000 == 0) printQs(); @@ -582,37 +646,39 @@ SimpleDRAM::recvTimingReq(PacketPtr pkt) } prevArrival = curTick(); + + // Find out how many dram packets a pkt translates to + // If the burst size is equal or larger than the pkt size, then a pkt + // translates to only one dram packet. Otherwise, a pkt translates to + // multiple dram packets unsigned size = pkt->getSize(); - if (size > bytesPerCacheLine) - panic("Request size %d is greater than burst size %d", - size, bytesPerCacheLine); + unsigned offset = pkt->getAddr() & (burstSize - 1); + unsigned int dram_pkt_count = divCeil(offset + size, burstSize); // check local buffers and do not accept if full if (pkt->isRead()) { assert(size != 0); - if (readQueueFull()) { + if (readQueueFull(dram_pkt_count)) { DPRINTF(DRAM, "Read queue full, not accepting\n"); // remember that we have to retry this port retryRdReq = true; numRdRetry++; return false; } else { - readPktSize[ceilLog2(size)]++; - addToReadQueue(pkt); + addToReadQueue(pkt, dram_pkt_count); readReqs++; numReqs++; } } else if (pkt->isWrite()) { assert(size != 0); - if (writeQueueFull()) { + if (writeQueueFull(dram_pkt_count)) { DPRINTF(DRAM, "Write queue full, not accepting\n"); // remember that we have to retry this port retryWrReq = true; numWrRetry++; return false; } else { - writePktSize[ceilLog2(size)]++; - addToWriteQueue(pkt); + addToWriteQueue(pkt, dram_pkt_count); writeReqs++; numReqs++; } @@ -633,38 +699,54 @@ SimpleDRAM::processRespondEvent() DPRINTF(DRAM, "processRespondEvent(): Some req has reached its readyTime\n"); - PacketPtr pkt = respQueue.front()->pkt; + DRAMPacket* dram_pkt = respQueue.front(); - // Actually responds to the requestor - bytesConsumedRd += pkt->getSize(); - bytesRead += bytesPerCacheLine; - accessAndRespond(pkt, frontendLatency + backendLatency); + // Actually responds to the requestor + bytesConsumedRd += dram_pkt->size; + bytesRead += burstSize; + if (dram_pkt->burstHelper) { + // it is a split packet + dram_pkt->burstHelper->burstsServiced++; + if (dram_pkt->burstHelper->burstsServiced == + dram_pkt->burstHelper->burstCount) { + // we have now serviced all children packets of a system packet + // so we can now respond to the requester + // @todo we probably want to have a different front end and back + // end latency for split packets + accessAndRespond(dram_pkt->pkt, frontendLatency + backendLatency); + delete dram_pkt->burstHelper; + dram_pkt->burstHelper = NULL; + } + } else { + // it is not a split packet + accessAndRespond(dram_pkt->pkt, frontendLatency + backendLatency); + } - delete respQueue.front(); - respQueue.pop_front(); + delete respQueue.front(); + respQueue.pop_front(); - // Update stats - avgRdQLen = readQueue.size() + respQueue.size(); + // Update stats + avgRdQLen = readQueue.size() + respQueue.size(); - if (!respQueue.empty()) { - assert(respQueue.front()->readyTime >= curTick()); - assert(!respondEvent.scheduled()); - schedule(respondEvent, respQueue.front()->readyTime); - } else { - // if there is nothing left in any queue, signal a drain - if (writeQueue.empty() && readQueue.empty() && - drainManager) { - drainManager->signalDrainDone(); - drainManager = NULL; - } - } + if (!respQueue.empty()) { + assert(respQueue.front()->readyTime >= curTick()); + assert(!respondEvent.scheduled()); + schedule(respondEvent, respQueue.front()->readyTime); + } else { + // if there is nothing left in any queue, signal a drain + if (writeQueue.empty() && readQueue.empty() && + drainManager) { + drainManager->signalDrainDone(); + drainManager = NULL; + } + } - // We have made a location in the queue available at this point, - // so if there is a read that was forced to wait, retry now - if (retryRdReq) { - retryRdReq = false; - port.sendRetry(); - } + // We have made a location in the queue available at this point, + // so if there is a read that was forced to wait, retry now + if (retryRdReq) { + retryRdReq = false; + port.sendRetry(); + } } void @@ -911,7 +993,7 @@ SimpleDRAM::doDRAMAccess(DRAMPacket* dram_pkt) if (pageMgmt == Enums::open) { bank.openRow = dram_pkt->row; bank.freeAt = curTick() + addDelay + accessLat; - bank.bytesAccessed += bytesPerCacheLine; + bank.bytesAccessed += burstSize; // If you activated a new row do to this access, the next access // will have to respect tRAS for this bank. Assume tRAS ~= 3 * tRP. @@ -931,7 +1013,7 @@ SimpleDRAM::doDRAMAccess(DRAMPacket* dram_pkt) bank.freeAt = curTick() + addDelay + accessLat + tRP + tRP; recordActivate(bank.freeAt - tRP - tRP - tCL - tRCD); //essentially (freeAt - tRC) DPRINTF(DRAM,"doDRAMAccess::bank.freeAt is %lld\n",bank.freeAt); - bytesPerActivate.sample(bytesPerCacheLine); + bytesPerActivate.sample(burstSize); } else panic("No page management policy chosen\n"); @@ -1080,19 +1162,27 @@ SimpleDRAM::regStats() readReqs .name(name() + ".readReqs") - .desc("Total number of read requests seen"); + .desc("Total number of read requests accepted by DRAM controller"); writeReqs .name(name() + ".writeReqs") - .desc("Total number of write requests seen"); + .desc("Total number of write requests accepted by DRAM controller"); + + readBursts + .name(name() + ".readBursts") + .desc("Total number of DRAM read bursts. " + "Each DRAM read request translates to either one or multiple " + "DRAM read bursts"); + + writeBursts + .name(name() + ".writeBursts") + .desc("Total number of DRAM write bursts. " + "Each DRAM write request translates to either one or multiple " + "DRAM write bursts"); servicedByWrQ .name(name() + ".servicedByWrQ") - .desc("Number of read reqs serviced by write Q"); - - cpuReqs - .name(name() + ".cpureqs") - .desc("Reqs generatd by CPU via cache - shady"); + .desc("Number of DRAM read bursts serviced by write Q"); neitherReadNorWrite .name(name() + ".neitherReadNorWrite") @@ -1139,28 +1229,28 @@ SimpleDRAM::regStats() .desc("Average queueing delay per request") .precision(2); - avgQLat = totQLat / (readReqs - servicedByWrQ); + avgQLat = totQLat / (readBursts - servicedByWrQ); avgBankLat .name(name() + ".avgBankLat") .desc("Average bank access latency per request") .precision(2); - avgBankLat = totBankLat / (readReqs - servicedByWrQ); + avgBankLat = totBankLat / (readBursts - servicedByWrQ); avgBusLat .name(name() + ".avgBusLat") .desc("Average bus latency per request") .precision(2); - avgBusLat = totBusLat / (readReqs - servicedByWrQ); + avgBusLat = totBusLat / (readBursts - servicedByWrQ); avgMemAccLat .name(name() + ".avgMemAccLat") .desc("Average memory access latency") .precision(2); - avgMemAccLat = totMemAccLat / (readReqs - servicedByWrQ); + avgMemAccLat = totMemAccLat / (readBursts - servicedByWrQ); numRdRetry .name(name() + ".numRdRetry") @@ -1183,22 +1273,22 @@ SimpleDRAM::regStats() .desc("Row buffer hit rate for reads") .precision(2); - readRowHitRate = (readRowHits / (readReqs - servicedByWrQ)) * 100; + readRowHitRate = (readRowHits / (readBursts - servicedByWrQ)) * 100; writeRowHitRate .name(name() + ".writeRowHitRate") .desc("Row buffer hit rate for writes") .precision(2); - writeRowHitRate = (writeRowHits / writeReqs) * 100; + writeRowHitRate = (writeRowHits / writeBursts) * 100; readPktSize - .init(ceilLog2(bytesPerCacheLine) + 1) + .init(ceilLog2(burstSize) + 1) .name(name() + ".readPktSize") .desc("Categorize read packet sizes"); writePktSize - .init(ceilLog2(bytesPerCacheLine) + 1) + .init(ceilLog2(burstSize) + 1) .name(name() + ".writePktSize") .desc("Categorize write packet sizes"); @@ -1213,7 +1303,7 @@ SimpleDRAM::regStats() .desc("What write queue length does an incoming req see"); bytesPerActivate - .init(bytesPerCacheLine * linesPerRowBuffer) + .init(rowBufferSize) .name(name() + ".bytesPerActivate") .desc("Bytes accessed per row activation") .flags(nozero); @@ -1267,7 +1357,7 @@ SimpleDRAM::regStats() .desc("Theoretical peak bandwidth in MB/s") .precision(2); - peakBW = (SimClock::Frequency / tBURST) * bytesPerCacheLine / 1000000; + peakBW = (SimClock::Frequency / tBURST) * burstSize / 1000000; busUtil .name(name() + ".busUtil") diff --git a/src/mem/simple_dram.hh b/src/mem/simple_dram.hh index e4d20163a..313ad067b 100644 --- a/src/mem/simple_dram.hh +++ b/src/mem/simple_dram.hh @@ -11,6 +11,9 @@ * unmodified and in its entirety in all distributions of the software, * modified or unmodified, in source code or in binary form. * + * Copyright (c) 2013 Amin Farmahini-Farahani + * All rights reserved. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: redistributions of source code must retain the above copyright @@ -157,6 +160,27 @@ class SimpleDRAM : public AbstractMemory { } }; + /** + * A burst helper helps organize and manage a packet that is larger than + * the DRAM burst size. A system packet that is larger than the burst size + * is split into multiple DRAM packets and all those DRAM packets point to + * a single burst helper such that we know when the whole packet is served. + */ + class BurstHelper { + + public: + + /** Number of DRAM bursts requred for a system packet **/ + const unsigned int burstCount; + + /** Number of DRAM bursts serviced so far for a system packet **/ + unsigned int burstsServiced; + + BurstHelper(unsigned int _burstCount) + : burstCount(_burstCount), burstsServiced(0) + { } + }; + /** * A DRAM packet stores packets along with the timestamp of when * the packet entered the queue, and also the decoded address. @@ -178,14 +202,34 @@ class SimpleDRAM : public AbstractMemory const uint8_t rank; const uint16_t bank; const uint16_t row; + + /** + * The starting address of the DRAM packet. + * This address could be unaligned to burst size boundaries. The + * reason is to keep the address offset so we can accurately check + * incoming read packets with packets in the write queue. + */ const Addr addr; + + /** + * The size of this dram packet in bytes + * It is always equal or smaller than DRAM burst size + */ + const unsigned int size; + + /** + * A pointer to the BurstHelper if this DRAMPacket is a split packet + * If not a split packet (common case), this is set to NULL + */ + BurstHelper* burstHelper; Bank& bank_ref; - DRAMPacket(PacketPtr _pkt, uint8_t _rank, - uint16_t _bank, uint16_t _row, Addr _addr, Bank& _bank_ref) + DRAMPacket(PacketPtr _pkt, uint8_t _rank, uint16_t _bank, + uint16_t _row, Addr _addr, unsigned int _size, + Bank& _bank_ref) : entryTime(curTick()), readyTime(curTick()), pkt(_pkt), rank(_rank), bank(_bank), row(_row), addr(_addr), - bank_ref(_bank_ref) + size(_size), burstHelper(NULL), bank_ref(_bank_ref) { } }; @@ -212,28 +256,34 @@ class SimpleDRAM : public AbstractMemory /** * Check if the read queue has room for more entries * + * @param pktCount The number of entries needed in the read queue * @return true if read queue is full, false otherwise */ - bool readQueueFull() const; + bool readQueueFull(unsigned int pktCount) const; /** * Check if the write queue has room for more entries * + * @param pktCount The number of entries needed in the write queue * @return true if write queue is full, false otherwise */ - bool writeQueueFull() const; + bool writeQueueFull(unsigned int pktCount) const; /** * When a new read comes in, first check if the write q has a * pending request to the same address.\ If not, decode the - * address to populate rank/bank/row, create a "dram_pkt", and - * push it to the back of the read queue.\ If this is the only + * address to populate rank/bank/row, create one or mutliple + * "dram_pkt", and push them to the back of the read queue.\ + * If this is the only * read request in the system, schedule an event to start * servicing it. * * @param pkt The request packet from the outside world + * @param pktCount The number of DRAM bursts the pkt + * translate to. If pkt size is larger then one full burst, + * then pktCount is greater than one. */ - void addToReadQueue(PacketPtr pkt); + void addToReadQueue(PacketPtr pkt, unsigned int pktCount); /** * Decode the incoming pkt, create a dram_pkt and push to the @@ -242,8 +292,11 @@ class SimpleDRAM : public AbstractMemory * to get full, stop reads, and start draining writes. * * @param pkt The request packet from the outside world + * @param pktCount The number of DRAM bursts the pkt + * translate to. If pkt size is larger then one full burst, + * then pktCount is greater than one. */ - void addToWriteQueue(PacketPtr pkt); + void addToWriteQueue(PacketPtr pkt, unsigned int pktCount); /** * Actually do the DRAM access - figure out the latency it @@ -276,12 +329,16 @@ class SimpleDRAM : public AbstractMemory /** * Address decoder to figure out physical mapping onto ranks, - * banks, and rows. + * banks, and rows. This function is called multiple times on the same + * system packet if the pakcet is larger than burst of the memory. The + * dramPktAddr is used for the offset within the packet. * * @param pkt The packet from the outside world + * @param dramPktAddr The starting address of the DRAM packet + * @param size The size of the DRAM packet in bytes * @return A DRAMPacket pointer with the decoded information */ - DRAMPacket* decodeAddr(PacketPtr pkt); + DRAMPacket* decodeAddr(PacketPtr pkt, Addr dramPktAddr, unsigned int size); /** * The memory schduler/arbiter - picks which read request needs to @@ -376,18 +433,21 @@ class SimpleDRAM : public AbstractMemory /** * The following are basic design parameters of the memory - * controller, and are initialized based on parameter values. The - * bytesPerCacheLine is based on the neighbouring ports cache line - * size and thus determined outside the constructor. Similarly, - * the rowsPerBank is determined based on the capacity, number of - * ranks and banks, the cache line size, and the row buffer size. + * controller, and are initialized based on parameter values. + * The rowsPerBank is determined based on the capacity, number of + * ranks and banks, the burst size, and the row buffer size. */ - uint32_t bytesPerCacheLine; - const uint32_t linesPerRowBuffer; + const uint32_t deviceBusWidth; + const uint32_t burstLength; + const uint32_t deviceRowBufferSize; + const uint32_t devicesPerRank; + const uint32_t burstSize; + const uint32_t rowBufferSize; const uint32_t ranksPerChannel; const uint32_t banksPerRank; const uint32_t channels; uint32_t rowsPerBank; + uint32_t columnsPerRowBuffer; const uint32_t readBufferSize; const uint32_t writeBufferSize; const double writeThresholdPerc; @@ -441,7 +501,8 @@ class SimpleDRAM : public AbstractMemory // All statistics that the model needs to capture Stats::Scalar readReqs; Stats::Scalar writeReqs; - Stats::Scalar cpuReqs; + Stats::Scalar readBursts; + Stats::Scalar writeBursts; Stats::Scalar bytesRead; Stats::Scalar bytesWritten; Stats::Scalar bytesConsumedRd;