ext: add McPAT source
This patch adds the source for McPAT, a power, area, and timing modeling framework.
This commit is contained in:
parent
8d665ee166
commit
e553a7bfa7
415
ext/mcpat/ARM_A9.xml
Normal file
415
ext/mcpat/ARM_A9.xml
Normal file
|
@ -0,0 +1,415 @@
|
|||
<?xml version="1.0" ?>
|
||||
<component id="root" name="root">
|
||||
<component id="system" name="system">
|
||||
<!--McPAT will skip the components if number is set to 0 -->
|
||||
<param name="number_of_cores" value="2"/>
|
||||
<param name="number_of_L1Directories" value="2"/>
|
||||
<param name="number_of_L2Directories" value="0"/>
|
||||
<param name="number_of_L2s" value="0"/> <!-- Number of L2 clusters; each cluster can contain multiple banks/ports -->
|
||||
<param name="Private_L2" value="0"/><!--1 Private, 0 shared/coherent -->
|
||||
<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
|
||||
<param name="number_of_NoCs" value="1"/>
|
||||
<param name="homogeneous_cores" value="1"/><!--1 means homo -->
|
||||
<param name="homogeneous_L2s" value="1"/>
|
||||
<param name="homogeneous_L1Directorys" value="1"/>
|
||||
<param name="homogeneous_L2Directorys" value="1"/>
|
||||
<param name="homogeneous_L3s" value="1"/>
|
||||
<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
|
||||
<param name="homogeneous_NoCs" value="1"/>
|
||||
<param name="core_tech_node" value="40"/><!-- nm -->
|
||||
<param name="target_core_clockrate" value="2000"/><!--MHz -->
|
||||
<param name="temperature" value="380"/> <!-- Kelvin -->
|
||||
<param name="number_cache_levels" value="2"/>
|
||||
<param name="interconnect_projection_type" value="1"/><!--0: aggressive wire technology; 1: conservative wire technology -->
|
||||
<param name="device_type" value="1"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) -->
|
||||
<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when appropriate -->
|
||||
<param name="Embedded" value="1"/><!-- Embedded processor like ARM or general purpose processors? -->
|
||||
<param name="machine_bits" value="32"/>
|
||||
<param name="virtual_address_width" value="32"/>
|
||||
<param name="physical_address_width" value="32"/>
|
||||
<param name="virtual_memory_page_size" value="4096"/>
|
||||
<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller
|
||||
default value is machine_bits, if not set -->
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!--This page size (B) is completely different from the page size in the main memory section. This page size is the size of
|
||||
virtual memory from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank -->
|
||||
<!-- *********************** cores ******************* -->
|
||||
<component id="system.core0" name="core0">
|
||||
<!-- Core property -->
|
||||
<param name="clock_rate" value="2000"/>
|
||||
<!-- for cores with unknown timing, set to 0 to force off the opt flag -->
|
||||
<param name="opt_local" value="1"/>
|
||||
<param name="instruction_length" value="32"/>
|
||||
<param name="opcode_width" value="7"/>
|
||||
<param name="x86" value="0"/>
|
||||
<param name="micro_opcode_width" value="8"/>
|
||||
<param name="machine_type" value="0"/>
|
||||
<!-- inorder/OoO; 1 inorder; 0 OOO-->
|
||||
<param name="number_hardware_threads" value="1"/>
|
||||
<!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor,
|
||||
it only may be more than one in SMT processors. BTB ports always equals to fetch ports since
|
||||
branch information in consecutive branch instructions in the same fetch group can be read out from BTB once.-->
|
||||
<param name="fetch_width" value="2"/>
|
||||
<!-- fetch_width determines the size of cachelines of L1 cache block -->
|
||||
<param name="number_instruction_fetch_ports" value="1"/>
|
||||
<param name="decode_width" value="2"/>
|
||||
<!-- decode_width determines the number of ports of the
|
||||
renaming table (both RAM and CAM) scheme -->
|
||||
<param name="issue_width" value="4"/>
|
||||
<param name="peak_issue_width" value="7"/>
|
||||
<!-- issue_width determines the number of ports of Issue window and other logic
|
||||
as in the complexity effective processors paper; issue_width==dispatch_width -->
|
||||
<param name="commit_width" value="4"/>
|
||||
<!-- commit_width determines the number of ports of register files -->
|
||||
<param name="fp_issue_width" value="1"/>
|
||||
<param name="prediction_width" value="1"/>
|
||||
<!-- number of branch instructions that can be predicted simultaneously -->
|
||||
<!-- Current version of McPAT does not distinguish int and floating point pipelines
|
||||
These parameters are reserved for future use.-->
|
||||
<param name="pipelines_per_core" value="1,1"/>
|
||||
<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
|
||||
<param name="pipeline_depth" value="8,8"/>
|
||||
<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
|
||||
<!-- issue and exe unit-->
|
||||
<param name="ALU_per_core" value="3"/>
|
||||
<!-- contains an adder, a shifter, and a logical unit -->
|
||||
<param name="MUL_per_core" value="1"/>
|
||||
<!-- For MUL and Div -->
|
||||
<param name="FPU_per_core" value="1"/>
|
||||
<!-- buffer between IF and ID stage -->
|
||||
<param name="instruction_buffer_size" value="32"/>
|
||||
<!-- buffer between ID and sche/exe stage -->
|
||||
<param name="decoded_stream_buffer_size" value="16"/>
|
||||
<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
|
||||
<!-- McPAT supports 2 types of OoO cores, RS based and physical reg based-->
|
||||
<param name="instruction_window_size" value="20"/>
|
||||
<param name="fp_instruction_window_size" value="15"/>
|
||||
<!-- Numbers need to be confirmed -->
|
||||
<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
|
||||
<param name="ROB_size" value="0"/>
|
||||
<!-- each in-flight instruction has an entry in ROB -->
|
||||
<!-- registers -->
|
||||
<param name="archi_Regs_IRF_size" value="32"/>
|
||||
<param name="archi_Regs_FRF_size" value="32"/>
|
||||
<!-- if OoO processor, phy_reg number is needed for renaming logic,
|
||||
renaming logic is for both integer and floating point insts. -->
|
||||
<param name="phy_Regs_IRF_size" value="64"/>
|
||||
<param name="phy_Regs_FRF_size" value="64"/>
|
||||
<!-- rename logic -->
|
||||
<param name="rename_scheme" value="0"/>
|
||||
<!-- can be RAM based(0) or CAM based(1) rename scheme
|
||||
RAM-based scheme will have free list, status table;
|
||||
CAM-based scheme have the valid bit in the data field of the CAM
|
||||
both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
|
||||
Detailed RAT Implementation see TR -->
|
||||
<param name="register_windows_size" value="0"/>
|
||||
<!-- how many windows in the windowed register file, sun processors;
|
||||
no register windowing is used when this number is 0 -->
|
||||
<!-- In OoO cores, loads and stores can be issued whether inorder(Pentium Pro) or (OoO)out-of-order(Alpha),
|
||||
They will always try to execute out-of-order though. -->
|
||||
<param name="LSU_order" value="inorder"/>
|
||||
<param name="store_buffer_size" value="4"/>
|
||||
<!-- By default, in-order cores do not have load buffers -->
|
||||
<param name="load_buffer_size" value="0"/>
|
||||
<!-- number of ports refer to sustainable concurrent memory accesses -->
|
||||
<param name="memory_ports" value="1"/>
|
||||
<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
|
||||
as well as the ports of Dcache which is connected to LSU -->
|
||||
<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
|
||||
<param name="RAS_size" value="32"/>
|
||||
<!-- general stats, defines simulation periods; require total, idle, and busy cycles for sanity check -->
|
||||
<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
|
||||
<stat name="total_instructions" value="400000"/>
|
||||
<stat name="int_instructions" value="200000"/>
|
||||
<stat name="fp_instructions" value="100000"/>
|
||||
<stat name="branch_instructions" value="100000"/>
|
||||
<stat name="branch_mispredictions" value="0"/>
|
||||
<stat name="load_instructions" value="0"/>
|
||||
<stat name="store_instructions" value="50000"/>
|
||||
<stat name="committed_instructions" value="400000"/>
|
||||
<stat name="committed_int_instructions" value="200000"/>
|
||||
<stat name="committed_fp_instructions" value="100000"/>
|
||||
<stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
|
||||
<!-- the following cycle stats are used for heterogeneous cores only,
|
||||
please ignore them if homogeneous cores -->
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!-- instruction buffer stats -->
|
||||
<!-- ROB stats, both RS and Phy based OoOs have ROB
|
||||
performance simulator should capture the difference on accesses,
|
||||
otherwise, McPAT has to guess based on number of committed instructions. -->
|
||||
<stat name="ROB_reads" value="400000"/>
|
||||
<stat name="ROB_writes" value="400000"/>
|
||||
<!-- RAT accesses -->
|
||||
<stat name="rename_reads" value="800000"/> <!--lookup in renaming logic -->
|
||||
<stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic -->
|
||||
<stat name="fp_rename_reads" value="200000"/>
|
||||
<stat name="fp_rename_writes" value="100000"/>
|
||||
<!-- decode and rename stage use this, should be total ic - nop -->
|
||||
<!-- Inst window stats -->
|
||||
<stat name="inst_window_reads" value="400000"/>
|
||||
<stat name="inst_window_writes" value="400000"/>
|
||||
<stat name="inst_window_wakeup_accesses" value="800000"/>
|
||||
<stat name="fp_inst_window_reads" value="200000"/>
|
||||
<stat name="fp_inst_window_writes" value="200000"/>
|
||||
<stat name="fp_inst_window_wakeup_accesses" value="400000"/>
|
||||
<!-- RF accesses -->
|
||||
<stat name="int_regfile_reads" value="600000"/>
|
||||
<stat name="float_regfile_reads" value="100000"/>
|
||||
<stat name="int_regfile_writes" value="300000"/>
|
||||
<stat name="float_regfile_writes" value="50000"/>
|
||||
<!-- accesses to the working reg -->
|
||||
<stat name="function_calls" value="5"/>
|
||||
<stat name="context_switches" value="260343"/>
|
||||
<!-- Number of window switches (number of function calls and returns)-->
|
||||
<!-- Alu stats by default, the processor has one FPU that includes the divider and
|
||||
multiplier. The fpu accesses should include accesses to multiplier and divider -->
|
||||
<stat name="ialu_accesses" value="300000"/>
|
||||
<stat name="fpu_accesses" value="100000"/>
|
||||
<stat name="mul_accesses" value="200000"/>
|
||||
<stat name="cdb_alu_accesses" value="300000"/>
|
||||
<stat name="cdb_mul_accesses" value="200000"/>
|
||||
<stat name="cdb_fpu_accesses" value="100000"/>
|
||||
<!-- multiple cycle accesses should be counted multiple times,
|
||||
otherwise, McPAT can use internal counter for different floating point instructions
|
||||
to get final accesses. But that needs detailed info for floating point inst mix -->
|
||||
<!-- currently the performance simulator should
|
||||
make sure all the numbers are final numbers,
|
||||
including the explicit read/write accesses,
|
||||
and the implicit accesses such as replacements and etc.
|
||||
Future versions of McPAT may be able to reason the implicit access
|
||||
based on param and stats of last level cache
|
||||
The same rule applies to all cache access stats too! -->
|
||||
<!-- following is AF for max power computation.
|
||||
Do not change them, unless you understand them-->
|
||||
<stat name="IFU_duty_cycle" value="1"/>
|
||||
<stat name="LSU_duty_cycle" value="0.5"/>
|
||||
<stat name="MemManU_I_duty_cycle" value="1"/>
|
||||
<stat name="MemManU_D_duty_cycle" value="0.5"/>
|
||||
<stat name="ALU_duty_cycle" value="1"/>
|
||||
<stat name="MUL_duty_cycle" value="0.3"/>
|
||||
<stat name="FPU_duty_cycle" value="0.3"/>
|
||||
<stat name="ALU_cdb_duty_cycle" value="1"/>
|
||||
<stat name="MUL_cdb_duty_cycle" value="0.3"/>
|
||||
<stat name="FPU_cdb_duty_cycle" value="0.3"/>
|
||||
<param name="number_of_BPT" value="2"/>
|
||||
<component id="system.core0.predictor" name="PBT">
|
||||
<!-- branch predictor; tournament predictor see Alpha implementation -->
|
||||
<param name="local_predictor_size" value="10,3"/>
|
||||
<param name="local_predictor_entries" value="1024"/>
|
||||
<param name="global_predictor_entries" value="4096"/>
|
||||
<param name="global_predictor_bits" value="2"/>
|
||||
<param name="chooser_predictor_entries" value="4096"/>
|
||||
<param name="chooser_predictor_bits" value="2"/>
|
||||
<!-- These parameters can be combined like below in next version
|
||||
<param name="load_predictor" value="10,3,1024"/>
|
||||
<param name="global_predictor" value="4096,2"/>
|
||||
<param name="predictor_chooser" value="4096,2"/>
|
||||
-->
|
||||
</component>
|
||||
<component id="system.core0.itlb" name="itlb">
|
||||
<param name="number_entries" value="64"/>
|
||||
<stat name="total_accesses" value="200000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<!-- there are no write requests to itlb although writes happen to itlb after a miss,
|
||||
which is actually a replacement -->
|
||||
</component>
|
||||
<component id="system.core0.icache" name="icache">
|
||||
<!-- there are no write requests to icache although writes happen to it after a miss,
|
||||
which is actually a replacement -->
|
||||
<param name="icache_config" value="32768,8,4,1,10,10,32,0"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy, -->
|
||||
<!-- cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate -->
|
||||
<param name="buffer_sizes" value="4, 4, 4,0"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="read_misses" value="0"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dtlb" name="dtlb">
|
||||
<param name="number_entries" value="64"/><!--dual threads-->
|
||||
<stat name="total_accesses" value="400000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dcache" name="dcache">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="dcache_config" value="32768,8,4,1, 10,10, 32,1 "/>
|
||||
<param name="buffer_sizes" value="4, 4, 4, 4"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="800000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<param name="number_of_BTB" value="2"/>
|
||||
<component id="system.core0.BTB" name="BTB">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="BTB_config" value="2048,4,2, 2, 1,3"/> <!--should be 4096 + 1024 -->
|
||||
<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<stat name="read_accesses" value="400000"/> <!--See IFU code for guideline -->
|
||||
<stat name="write_accesses" value="0"/>
|
||||
</component>
|
||||
</component>
|
||||
<component id="system.L1Directory0" name="L1Directory0">
|
||||
<param name="Directory_type" value="0"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache -->
|
||||
<param name="Dir_config" value="2048,1,0,1, 4, 4, 8"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="8, 8, 8, 8"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="2000"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="800000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="20"/>
|
||||
<stat name="duty_cycle" value="0.1"/>
|
||||
</component>
|
||||
<component id="system.L2Directory0" name="L2Directory0">
|
||||
<param name="Directory_type" value="1"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache -->
|
||||
<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="8, 8, 8, 8"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="2000"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="58824"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="100"/>
|
||||
</component>
|
||||
<component id="system.L20" name="L20">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="L2_config" value="1048576,32, 8, 8, 8, 23, 32, 1"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<param name="clockrate" value="2000"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="1.0"/>
|
||||
</component>
|
||||
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.L30" name="L30">
|
||||
<param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="clockrate" value="800"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="11824"/>
|
||||
<stat name="write_accesses" value="11276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="1.0"/>
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.NoC0" name="noc0">
|
||||
<param name="clockrate" value="2000"/>
|
||||
<param name="type" value="0"/>
|
||||
<!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus
|
||||
at each time only one node can send req -->
|
||||
<param name="horizontal_nodes" value="1"/>
|
||||
<param name="vertical_nodes" value="1"/>
|
||||
<param name="has_global_link" value="0"/>
|
||||
<!-- 1 has global link, 0 does not have global link -->
|
||||
<param name="link_throughput" value="1"/><!--w.r.t clock -->
|
||||
<param name="link_latency" value="1"/><!--w.r.t clock -->
|
||||
<!-- throughput >= latency -->
|
||||
<!-- Router architecture -->
|
||||
<param name="input_ports" value="1"/>
|
||||
<param name="output_ports" value="1"/>
|
||||
<!-- For bus the I/O ports should be 1 -->
|
||||
<param name="flit_bits" value="128"/>
|
||||
<param name="chip_coverage" value="1"/>
|
||||
<!-- When multiple NOC present, one NOC will cover part of the whole chip.
|
||||
chip_coverage <=1 -->
|
||||
<param name="link_routing_over_percentage" value="0.5"/>
|
||||
<!-- Links can route over other components or occupy whole area.
|
||||
by default, 50% of the NoC global links routes over other
|
||||
components -->
|
||||
<stat name="total_accesses" value="100000"/>
|
||||
<!-- This is the number of total accesses within the whole network not for each router -->
|
||||
<stat name="duty_cycle" value="1"/>
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.mem" name="mem">
|
||||
<!-- Main memory property -->
|
||||
<param name="mem_tech_node" value="32"/>
|
||||
<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
|
||||
<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
|
||||
<param name="internal_prefetch_of_DRAM_chip" value="4"/>
|
||||
<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
|
||||
<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
|
||||
<!-- above numbers can be easily found from Wikipedia -->
|
||||
<param name="capacity_per_channel" value="4096"/> <!-- MB -->
|
||||
<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
|
||||
Current McPAT assumes single DIMMs are used.-->
|
||||
<param name="number_ranks" value="2"/>
|
||||
<param name="num_banks_of_DRAM_chip" value="8"/>
|
||||
<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
|
||||
<param name="output_width_of_DRAM_chip" value="8"/>
|
||||
<!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
|
||||
<!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
|
||||
<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
|
||||
<param name="burstlength_of_DRAM_chip" value="8"/>
|
||||
<stat name="memory_accesses" value="1052"/>
|
||||
<stat name="memory_reads" value="1052"/>
|
||||
<stat name="memory_writes" value="1052"/>
|
||||
</component>
|
||||
<component id="system.mc" name="mc">
|
||||
<!-- Memory controllers are for DDR(2,3...) DIMMs -->
|
||||
<!-- current version of McPAT uses published values for base parameters of memory controller
|
||||
improvements on MC will be added in later versions. -->
|
||||
<param name="mc_clock" value="400"/><!--MHz-->
|
||||
<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
|
||||
<param name="llc_line_length" value="64"/><!--B-->
|
||||
<param name="number_mcs" value="0"/>
|
||||
<!-- current McPAT only supports homogeneous memory controllers -->
|
||||
<param name="memory_channels_per_mc" value="1"/>
|
||||
<param name="number_ranks" value="2"/>
|
||||
<!-- # of ranks of each channel-->
|
||||
<param name="req_window_size_per_channel" value="32"/>
|
||||
<param name="IO_buffer_size_per_channel" value="32"/>
|
||||
<param name="databus_width" value="128"/>
|
||||
<param name="addressbus_width" value="51"/>
|
||||
<!-- McPAT will add the control bus width to the addressbus width automatically -->
|
||||
<stat name="memory_accesses" value="66666"/>
|
||||
<stat name="memory_reads" value="33333"/>
|
||||
<stat name="memory_writes" value="33333"/>
|
||||
<!-- McPAT does not track individual mc, instead, it takes the total accesses and calculate
|
||||
the average power per MC or per channel. This is sufficient for most applications.
|
||||
Further trackdown can be easily added in later versions. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
</component>
|
||||
</component>
|
463
ext/mcpat/ARM_A9_2000.xml
Normal file
463
ext/mcpat/ARM_A9_2000.xml
Normal file
|
@ -0,0 +1,463 @@
|
|||
<?xml version="1.0" ?>
|
||||
<component id="root" name="root">
|
||||
<component id="system" name="system">
|
||||
<!--McPAT will skip the components if number is set to 0 -->
|
||||
<!--Duty cycles in this file are set according to "ARM MPcore
|
||||
Architecture performance Enhancement" in MPF Japan 2008 -->
|
||||
<param name="number_of_cores" value="2"/>
|
||||
<param name="number_of_L1Directories" value="2"/>
|
||||
<param name="number_of_L2Directories" value="0"/>
|
||||
<param name="number_of_L2s" value="0"/> <!-- This number is how many L2 clusters there are; within each cluster there can be multiple banks/ports -->
|
||||
<param name="Private_L2" value="0"/><!--1 Private, 0 shared/coherent -->
|
||||
<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
|
||||
<param name="number_of_NoCs" value="1"/>
|
||||
<param name="homogeneous_cores" value="1"/><!--1 means homo -->
|
||||
<param name="homogeneous_L2s" value="1"/>
|
||||
<param name="homogeneous_L1Directorys" value="1"/>
|
||||
<param name="homogeneous_L2Directorys" value="1"/>
|
||||
<param name="homogeneous_L3s" value="1"/>
|
||||
<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
|
||||
<param name="homogeneous_NoCs" value="1"/>
|
||||
<param name="core_tech_node" value="22"/><!-- nm -->
|
||||
<param name="target_core_clockrate" value="2000"/><!--MHz -->
|
||||
<param name="temperature" value="340"/> <!-- Kelvin -->
|
||||
<param name="number_cache_levels" value="2"/>
|
||||
<param name="interconnect_projection_type" value="1"/><!--0: aggressive wire technology; 1: conservative wire technology -->
|
||||
<param name="device_type" value="2"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) -->
|
||||
<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when appropriate -->
|
||||
<param name="Embedded" value="1"/><!-- Embedded processor like ARM or general purpose processors? -->
|
||||
<param name="opt_clockrate" value="1"/>
|
||||
<param name="machine_bits" value="32"/>
|
||||
<param name="virtual_address_width" value="32"/>
|
||||
<param name="physical_address_width" value="32"/>
|
||||
<param name="virtual_memory_page_size" value="4096"/>
|
||||
<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller
|
||||
default value is machine_bits, if not set -->
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!--This page size (virtual_memory_page_size, in bytes) is completely different from the page size in the main memory section. This page size is the size of
|
||||
virtual memory from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank -->
|
||||
<!-- *********************** cores ******************* -->
|
||||
<component id="system.core0" name="core0">
|
||||
<!-- Core property -->
|
||||
<param name="clock_rate" value="2000"/>
|
||||
<!-- for cores with unknown timing, set to 0 to force off the opt flag -->
|
||||
<param name="opt_local" value="1"/>
|
||||
<param name="instruction_length" value="32"/>
|
||||
<param name="opcode_width" value="7"/>
|
||||
<param name="x86" value="0"/>
|
||||
<param name="micro_opcode_width" value="8"/>
|
||||
<param name="machine_type" value="0"/>
|
||||
<!-- inorder/OoO; 1 inorder; 0 OOO-->
|
||||
<param name="number_hardware_threads" value="1"/>
|
||||
<!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor,
|
||||
it may be more than one only in SMT processors. The number of BTB ports always equals the number of fetch ports since
|
||||
branch information in consecutive branch instructions in the same fetch group can be read out from BTB once.-->
|
||||
<param name="fetch_width" value="2"/>
|
||||
<!-- fetch_width determines the size of cachelines of L1 cache block -->
|
||||
<param name="number_instruction_fetch_ports" value="1"/>
|
||||
<param name="decode_width" value="2"/>
|
||||
<!-- decode_width determines the number of ports of the
|
||||
renaming table (both RAM and CAM) scheme -->
|
||||
<param name="issue_width" value="4"/>
|
||||
<param name="peak_issue_width" value="7"/>
|
||||
<!-- issue_width determines the number of ports of Issue window and other logic
|
||||
as in the complexity effective processors paper; issue_width==dispatch_width -->
|
||||
<param name="commit_width" value="4"/>
|
||||
<!-- commit_width determines the number of ports of register files -->
|
||||
<param name="fp_issue_width" value="1"/>
|
||||
<param name="prediction_width" value="1"/>
|
||||
<!-- number of branch instructions that can be predicted simultaneously -->
|
||||
<!-- Current version of McPAT does not distinguish int and floating point pipelines
|
||||
These parameters are reserved for future use.-->
|
||||
<param name="pipelines_per_core" value="1,1"/>
|
||||
<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
|
||||
<param name="pipeline_depth" value="8,8"/>
|
||||
<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
|
||||
<!-- issue and exe unit-->
|
||||
<param name="ALU_per_core" value="3"/>
|
||||
<!-- contains an adder, a shifter, and a logical unit -->
|
||||
<param name="MUL_per_core" value="1"/>
|
||||
<!-- For MUL and Div -->
|
||||
<param name="FPU_per_core" value="1"/>
|
||||
<!-- buffer between IF and ID stage -->
|
||||
<param name="instruction_buffer_size" value="32"/>
|
||||
<!-- buffer between ID and sche/exe stage -->
|
||||
<param name="decoded_stream_buffer_size" value="16"/>
|
||||
<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
|
||||
<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
|
||||
<param name="instruction_window_size" value="20"/>
|
||||
<param name="fp_instruction_window_size" value="15"/>
|
||||
<!-- Numbers need to be confirmed -->
|
||||
<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
|
||||
<param name="ROB_size" value="0"/>
|
||||
<!-- each in-flight instruction has an entry in ROB -->
|
||||
<!-- registers -->
|
||||
<param name="archi_Regs_IRF_size" value="32"/>
|
||||
<param name="archi_Regs_FRF_size" value="32"/>
|
||||
<!-- if OoO processor, phy_reg number is needed for renaming logic,
|
||||
renaming logic is for both integer and floating point insts. -->
|
||||
<param name="phy_Regs_IRF_size" value="64"/>
|
||||
<param name="phy_Regs_FRF_size" value="64"/>
|
||||
<!-- rename logic -->
|
||||
<param name="rename_scheme" value="0"/>
|
||||
<!-- can be RAM based(0) or CAM based(1) rename scheme
|
||||
RAM-based scheme will have free list, status table;
|
||||
CAM-based scheme have the valid bit in the data field of the CAM
|
||||
both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
|
||||
Detailed RAT Implementation see TR -->
|
||||
<param name="register_windows_size" value="0"/>
|
||||
<!-- how many windows in the windowed register file, sun processors;
|
||||
no register windowing is used when this number is 0 -->
|
||||
<!-- In OoO cores, loads and stores can be issued either in-order (Pentium Pro) or out-of-order (OoO, Alpha),
|
||||
They will always try to execute out-of-order though. -->
|
||||
<param name="LSU_order" value="inorder"/>
|
||||
<param name="store_buffer_size" value="4"/>
|
||||
<!-- By default, in-order cores do not have load buffers -->
|
||||
<param name="load_buffer_size" value="0"/>
|
||||
<!-- number of ports refer to sustainable concurrent memory accesses -->
|
||||
<param name="memory_ports" value="1"/>
|
||||
<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
|
||||
as well as the ports of Dcache which is connected to LSU -->
|
||||
<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
|
||||
<param name="RAS_size" value="4"/>
|
||||
<!-- general stats, defines simulation periods; requires total, idle, and busy cycles for sanity check -->
|
||||
<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
|
||||
<stat name="total_instructions" value="400000"/>
|
||||
<stat name="int_instructions" value="200000"/>
|
||||
<stat name="fp_instructions" value="100000"/>
|
||||
<stat name="branch_instructions" value="100000"/>
|
||||
<stat name="branch_mispredictions" value="0"/>
|
||||
<stat name="load_instructions" value="0"/>
|
||||
<stat name="store_instructions" value="50000"/>
|
||||
<stat name="committed_instructions" value="400000"/>
|
||||
<stat name="committed_int_instructions" value="200000"/>
|
||||
<stat name="committed_fp_instructions" value="100000"/>
|
||||
<stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogeneous -->
|
||||
<!-- the following cycle stats are used for heterogeneous cores only,
|
||||
please ignore them if homogeneous cores -->
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!-- instruction buffer stats -->
|
||||
<!-- ROB stats, both RS and Phy based OoOs have ROB
|
||||
performance simulator should capture the difference on accesses,
|
||||
otherwise, McPAT has to guess based on number of committed instructions. -->
|
||||
<stat name="ROB_reads" value="400000"/>
|
||||
<stat name="ROB_writes" value="400000"/>
|
||||
<!-- RAT accesses -->
|
||||
<stat name="rename_reads" value="800000"/> <!--lookup in renaming logic -->
|
||||
<stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic -->
|
||||
<stat name="fp_rename_reads" value="200000"/>
|
||||
<stat name="fp_rename_writes" value="100000"/>
|
||||
<!-- decode and rename stage use this, should be total ic - nop -->
|
||||
<!-- Inst window stats -->
|
||||
<stat name="inst_window_reads" value="400000"/>
|
||||
<stat name="inst_window_writes" value="400000"/>
|
||||
<stat name="inst_window_wakeup_accesses" value="800000"/>
|
||||
<stat name="fp_inst_window_reads" value="200000"/>
|
||||
<stat name="fp_inst_window_writes" value="200000"/>
|
||||
<stat name="fp_inst_window_wakeup_accesses" value="400000"/>
|
||||
<!-- RF accesses -->
|
||||
<stat name="int_regfile_reads" value="600000"/>
|
||||
<stat name="float_regfile_reads" value="100000"/>
|
||||
<stat name="int_regfile_writes" value="300000"/>
|
||||
<stat name="float_regfile_writes" value="50000"/>
|
||||
<!-- accesses to the working reg -->
|
||||
<stat name="function_calls" value="5"/>
|
||||
<stat name="context_switches" value="260343"/>
|
||||
<!-- Number of window switches (number of function calls and returns)-->
|
||||
<!-- Alu stats by default, the processor has one FPU that includes the divider and
|
||||
multiplier. The fpu accesses should include accesses to multiplier and divider -->
|
||||
<stat name="ialu_accesses" value="300000"/>
|
||||
<stat name="fpu_accesses" value="100000"/>
|
||||
<stat name="mul_accesses" value="200000"/>
|
||||
<stat name="cdb_alu_accesses" value="300000"/>
|
||||
<stat name="cdb_mul_accesses" value="200000"/>
|
||||
<stat name="cdb_fpu_accesses" value="100000"/>
|
||||
<!-- multiple cycle accesses should be counted multiple times,
|
||||
otherwise, McPAT can use internal counter for different floating point instructions
|
||||
to get final accesses. But that needs detailed info for floating point inst mix -->
|
||||
<!-- currently the performance simulator should
|
||||
make sure all the numbers are final numbers,
|
||||
including the explicit read/write accesses,
|
||||
and the implicit accesses such as replacements, etc.
|
||||
Future versions of McPAT may be able to infer the implicit accesses
|
||||
based on param and stats of last level cache
|
||||
The same rule applies to all cache access stats too! -->
|
||||
<!-- following is AF for max power computation.
|
||||
Do not change them, unless you understand them-->
|
||||
<stat name="IFU_duty_cycle" value="0.9"/>
|
||||
<stat name="BR_duty_cycle" value="0.72"/><!--branch-->
|
||||
<stat name="LSU_duty_cycle" value="0.71"/>
|
||||
<stat name="MemManU_I_duty_cycle" value="0.9"/>
|
||||
<stat name="MemManU_D_duty_cycle" value="0.71"/>
|
||||
<stat name="ALU_duty_cycle" value="0.76"/>
|
||||
<!-- (.78*2+.71)/3 -->
|
||||
<stat name="MUL_duty_cycle" value="0.82"/>
|
||||
<stat name="FPU_duty_cycle" value="0.0"/>
|
||||
<stat name="ALU_cdb_duty_cycle" value="0.76"/>
|
||||
<stat name="MUL_cdb_duty_cycle" value="0.82"/>
|
||||
<stat name="FPU_cdb_duty_cycle" value="0.0"/>
|
||||
<param name="number_of_BPT" value="2"/>
|
||||
<component id="system.core0.predictor" name="PBT">
|
||||
<!-- branch predictor; tournament predictor see Alpha implementation -->
|
||||
<param name="local_predictor_size" value="10,3"/>
|
||||
<param name="local_predictor_entries" value="4"/>
|
||||
<param name="global_predictor_entries" value="4096"/>
|
||||
<param name="global_predictor_bits" value="2"/>
|
||||
<param name="chooser_predictor_entries" value="4096"/>
|
||||
<param name="chooser_predictor_bits" value="2"/>
|
||||
<!-- These parameters can be combined like below in next version
|
||||
<param name="load_predictor" value="10,3,1024"/>
|
||||
<param name="global_predictor" value="4096,2"/>
|
||||
<param name="predictor_chooser" value="4096,2"/>
|
||||
-->
|
||||
</component>
|
||||
<component id="system.core0.itlb" name="itlb">
|
||||
<param name="number_entries" value="64"/>
|
||||
<stat name="total_accesses" value="200000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<!-- there are no write requests to itlb although writes happen to itlb after a miss,
|
||||
which is actually a replacement -->
|
||||
</component>
|
||||
<component id="system.core0.icache" name="icache">
|
||||
<!-- there are no write requests to icache although writes happen to it after a miss,
|
||||
which is actually a replacement -->
|
||||
<param name="icache_config" value="32768,8,4,1,10,10,32,0"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy, -->
|
||||
<!-- cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate -->
|
||||
<param name="buffer_sizes" value="4, 4, 4,0"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="read_misses" value="0"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dtlb" name="dtlb">
|
||||
<param name="number_entries" value="64"/><!--dual threads-->
|
||||
<stat name="total_accesses" value="400000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dcache" name="dcache">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="dcache_config" value="32768,8,4,1, 10,10, 32,1 "/>
|
||||
<param name="buffer_sizes" value="4, 4, 4, 4"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="800000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<param name="number_of_BTB" value="2"/>
|
||||
<component id="system.core0.BTB" name="BTB">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="BTB_config" value="4096,4,2, 2, 1,1"/>
|
||||
<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<stat name="read_accesses" value="400000"/> <!--See IFU code for guideline -->
|
||||
<stat name="write_accesses" value="0"/>
|
||||
</component>
|
||||
</component>
|
||||
<component id="system.L1Directory0" name="L1Directory0">
|
||||
<param name="Directory_type" value="0"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache -->
|
||||
<param name="Dir_config" value="2048,1,0,1, 4, 4, 8"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="8, 8, 8, 8"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="2000"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="2"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="800000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="20"/>
|
||||
<stat name="duty_cycle" value="0.1"/>
|
||||
</component>
|
||||
<component id="system.L2Directory0" name="L2Directory0">
|
||||
<param name="Directory_type" value="1"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache -->
|
||||
<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="8, 8, 8, 8"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="3400"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="58824"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="100"/>
|
||||
<stat name="duty_cycle" value="0.1"/>
|
||||
</component>
|
||||
<component id="system.L20" name="L20">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="L2_config" value="1048576,32, 8, 8, 8, 23, 32, 1"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<param name="clockrate" value="3400"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="1.0"/>
|
||||
</component>
|
||||
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.L30" name="L30">
|
||||
<param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="clockrate" value="800"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="11824"/>
|
||||
<stat name="write_accesses" value="11276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="1.0"/>
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.NoC0" name="noc0">
|
||||
<param name="clockrate" value="2000"/>
|
||||
<param name="type" value="0"/>
|
||||
<!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus
|
||||
at each time only one node can send req -->
|
||||
<param name="horizontal_nodes" value="1"/>
|
||||
<param name="vertical_nodes" value="1"/>
|
||||
<param name="has_global_link" value="0"/>
|
||||
<!-- 1 has global link, 0 does not have global link -->
|
||||
<param name="link_throughput" value="1"/><!--w.r.t clock -->
|
||||
<param name="link_latency" value="1"/><!--w.r.t clock -->
|
||||
<!-- throughput >= latency -->
|
||||
<!-- Router architecture -->
|
||||
<param name="input_ports" value="1"/>
|
||||
<param name="output_ports" value="1"/>
|
||||
<!-- For bus the I/O ports should be 1 -->
|
||||
<param name="flit_bits" value="64"/>
|
||||
<param name="chip_coverage" value="1"/>
|
||||
<!-- When multiple NOC present, one NOC will cover part of the whole chip.
|
||||
chip_coverage <=1 -->
|
||||
<param name="link_routing_over_percentage" value="0.5"/>
|
||||
<!-- Links can route over other components or occupy whole area.
|
||||
by default, 50% of the NoC global links routes over other
|
||||
components -->
|
||||
<stat name="total_accesses" value="100000"/>
|
||||
<!-- This is the number of total accesses within the whole network not for each router -->
|
||||
<stat name="duty_cycle" value="0.2"/>
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.mem" name="mem">
|
||||
<!-- Main memory property -->
|
||||
<param name="mem_tech_node" value="32"/>
|
||||
<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
|
||||
<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
|
||||
<param name="internal_prefetch_of_DRAM_chip" value="4"/>
|
||||
<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
|
||||
<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
|
||||
<!-- above numbers can be easily found from Wikipedia -->
|
||||
<param name="capacity_per_channel" value="4096"/> <!-- MB -->
|
||||
<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
|
||||
Current McPAT assumes single DIMMs are used.-->
|
||||
<param name="number_ranks" value="2"/>
|
||||
<param name="num_banks_of_DRAM_chip" value="8"/>
|
||||
<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
|
||||
<param name="output_width_of_DRAM_chip" value="8"/>
|
||||
<!-- number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip -->
|
||||
<!-- number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip -->
|
||||
<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
|
||||
<param name="burstlength_of_DRAM_chip" value="8"/>
|
||||
<stat name="memory_accesses" value="1052"/>
|
||||
<stat name="memory_reads" value="1052"/>
|
||||
<stat name="memory_writes" value="1052"/>
|
||||
</component>
|
||||
<component id="system.mc" name="mc">
|
||||
<!-- Memory controllers are for DDR(2,3...) DIMMs -->
|
||||
<!-- current version of McPAT uses published values for base parameters of memory controller
|
||||
improvements on MC will be added in later versions. -->
|
||||
<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="mc_clock" value="400"/><!--MHz-->
|
||||
<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
|
||||
<param name="block_size" value="64"/><!--(B) the block size of last level cache, which is the unit for one memory burst transfer -->
|
||||
<param name="number_mcs" value="1"/>
|
||||
<!-- current McPAT only supports homogeneous memory controllers -->
|
||||
<param name="memory_channels_per_mc" value="1"/>
|
||||
<param name="number_ranks" value="0"/>
|
||||
<!-- # of ranks of each channel-->
|
||||
<param name="req_window_size_per_channel" value="32"/>
|
||||
<param name="IO_buffer_size_per_channel" value="32"/>
|
||||
<param name="databus_width" value="128"/>
|
||||
<param name="addressbus_width" value="51"/>
|
||||
<!-- McPAT will add the control bus width to the addressbus width automatically -->
|
||||
<stat name="memory_accesses" value="66666"/>
|
||||
<stat name="memory_reads" value="33333"/>
|
||||
<stat name="memory_writes" value="33333"/>
|
||||
<param name="withPHY" value="1"/>
|
||||
<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
|
||||
the average power per MC or per channel. This is sufficient for most applications.
|
||||
Finer-grained tracking can easily be added in later versions. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.niu" name="niu">
|
||||
<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller -->
|
||||
<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns.
|
||||
the lower bound of the clock rate of a 10Gb MAC is 150MHz -->
|
||||
<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="clockrate" value="350"/>
|
||||
<param name="number_units" value="1"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates
|
||||
the average power per nic or per channel. This is sufficient for most applications. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.pcie" name="pcie">
|
||||
<!-- On chip PCIe controller, including Phy-->
|
||||
<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns.
|
||||
the lower bound of the clock rate of the PCIe per-lane logic is 120MHz -->
|
||||
<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="withPHY" value="1"/>
|
||||
<param name="clockrate" value="350"/>
|
||||
<param name="number_units" value="1"/>
|
||||
<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual pcie controllers; instead, it takes the total accesses and calculates
|
||||
the average power per pcie controller or per channel. This is sufficient for most applications. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.flashc" name="flashc">
|
||||
<param name="number_flashcs" value="1"/>
|
||||
<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="withPHY" value="1"/>
|
||||
<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
|
||||
the average power per fc or per channel. This is sufficient for most applications -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
|
||||
</component>
|
||||
</component>
|
463
ext/mcpat/ARM_A9_800.xml
Normal file
463
ext/mcpat/ARM_A9_800.xml
Normal file
|
@ -0,0 +1,463 @@
|
|||
<?xml version="1.0" ?>
|
||||
<component id="root" name="root">
|
||||
<component id="system" name="system">
|
||||
<!--McPAT will skip the components if number is set to 0 -->
|
||||
<!--Duty cycles in this file are set according to "ARM MPcore
|
||||
ARchitecture performance Enhancement" in MPF Japan 2008 -->
|
||||
<param name="number_of_cores" value="2"/>
|
||||
<param name="number_of_L1Directories" value="2"/>
|
||||
<param name="number_of_L2Directories" value="0"/>
|
||||
<param name="number_of_L2s" value="0"/> <!-- The number of L2 clusters; each cluster can have multiple banks/ports -->
|
||||
<param name="Private_L2" value="0"/><!--1 Private, 0 shared/coherent -->
|
||||
<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
|
||||
<param name="number_of_NoCs" value="1"/>
|
||||
<param name="homogeneous_cores" value="1"/><!--1 means homo -->
|
||||
<param name="homogeneous_L2s" value="1"/>
|
||||
<param name="homogeneous_L1Directorys" value="1"/>
|
||||
<param name="homogeneous_L2Directorys" value="1"/>
|
||||
<param name="homogeneous_L3s" value="1"/>
|
||||
<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
|
||||
<param name="homogeneous_NoCs" value="1"/>
|
||||
<param name="core_tech_node" value="32"/><!-- nm -->
|
||||
<param name="target_core_clockrate" value="800"/><!--MHz -->
|
||||
<param name="temperature" value="340"/> <!-- Kelvin -->
|
||||
<param name="number_cache_levels" value="2"/>
|
||||
<param name="interconnect_projection_type" value="1"/><!--0: aggressive wire technology; 1: conservative wire technology -->
|
||||
<param name="device_type" value="2"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) -->
|
||||
<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when appropriate -->
|
||||
<param name="Embedded" value="1"/><!-- Embedded processor like ARM or general purpose processors? -->
|
||||
<param name="opt_clockrate" value="0"/>
|
||||
<param name="machine_bits" value="32"/>
|
||||
<param name="virtual_address_width" value="32"/>
|
||||
<param name="physical_address_width" value="32"/>
|
||||
<param name="virtual_memory_page_size" value="4096"/>
|
||||
<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller
|
||||
default value is machine_bits, if not set -->
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!--This page size(B) is completely different from the page size in the main memory section. This page size is the size of
|
||||
virtual memory from OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank -->
|
||||
<!-- *********************** cores ******************* -->
|
||||
<component id="system.core0" name="core0">
|
||||
<!-- Core property -->
|
||||
<param name="clock_rate" value="800"/>
|
||||
<!-- for cores with unknown timing, set to 0 to force off the opt flag -->
|
||||
<param name="opt_local" value="1"/>
|
||||
<param name="instruction_length" value="32"/>
|
||||
<param name="opcode_width" value="7"/>
|
||||
<param name="x86" value="0"/>
|
||||
<param name="micro_opcode_width" value="8"/>
|
||||
<param name="machine_type" value="0"/>
|
||||
<!-- inorder/OoO; 1 inorder; 0 OOO-->
|
||||
<param name="number_hardware_threads" value="1"/>
|
||||
<!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor,
|
||||
it only may be more than one in SMT processors. BTB ports always equals to fetch ports since
|
||||
branch information in consecutive branch instructions in the same fetch group can be read out from BTB once.-->
|
||||
<param name="fetch_width" value="2"/>
|
||||
<!-- fetch_width determines the size of cachelines of L1 cache block -->
|
||||
<param name="number_instruction_fetch_ports" value="1"/>
|
||||
<param name="decode_width" value="2"/>
|
||||
<!-- decode_width determines the number of ports of the
|
||||
renaming table (both RAM and CAM) scheme -->
|
||||
<param name="issue_width" value="4"/>
|
||||
<param name="peak_issue_width" value="7"/>
|
||||
<!-- issue_width determines the number of ports of Issue window and other logic
|
||||
as in the complexity effective processors paper; issue_width==dispatch_width -->
|
||||
<param name="commit_width" value="4"/>
|
||||
<!-- commit_width determines the number of ports of register files -->
|
||||
<param name="fp_issue_width" value="1"/>
|
||||
<param name="prediction_width" value="1"/>
|
||||
<!-- number of branch instructions that can be predicted simultaneously -->
|
||||
<!-- Current version of McPAT does not distinguish int and floating point pipelines
|
||||
These parameters are reserved for future use.-->
|
||||
<param name="pipelines_per_core" value="1,1"/>
|
||||
<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
|
||||
<param name="pipeline_depth" value="8,8"/>
|
||||
<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
|
||||
<!-- issue and exe unit-->
|
||||
<param name="ALU_per_core" value="3"/>
|
||||
<!-- contains an adder, a shifter, and a logical unit -->
|
||||
<param name="MUL_per_core" value="1"/>
|
||||
<!-- For MUL and Div -->
|
||||
<param name="FPU_per_core" value="1"/>
|
||||
<!-- buffer between IF and ID stage -->
|
||||
<param name="instruction_buffer_size" value="32"/>
|
||||
<!-- buffer between ID and sche/exe stage -->
|
||||
<param name="decoded_stream_buffer_size" value="16"/>
|
||||
<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
|
||||
<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
|
||||
<param name="instruction_window_size" value="20"/>
|
||||
<param name="fp_instruction_window_size" value="15"/>
|
||||
<!-- Numbers need to be confirmed -->
|
||||
<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
|
||||
<param name="ROB_size" value="0"/>
|
||||
<!-- each in-flight instruction has an entry in ROB -->
|
||||
<!-- registers -->
|
||||
<param name="archi_Regs_IRF_size" value="32"/>
|
||||
<param name="archi_Regs_FRF_size" value="32"/>
|
||||
<!-- if OoO processor, phy_reg number is needed for renaming logic,
|
||||
renaming logic is for both integer and floating point insts. -->
|
||||
<param name="phy_Regs_IRF_size" value="64"/>
|
||||
<param name="phy_Regs_FRF_size" value="64"/>
|
||||
<!-- rename logic -->
|
||||
<param name="rename_scheme" value="0"/>
|
||||
<!-- can be RAM based(0) or CAM based(1) rename scheme
|
||||
RAM-based scheme will have free list, status table;
|
||||
CAM-based scheme have the valid bit in the data field of the CAM
|
||||
both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
|
||||
Detailed RAT Implementation see TR -->
|
||||
<param name="register_windows_size" value="0"/>
|
||||
<!-- how many windows in the windowed register file, sun processors;
|
||||
no register windowing is used when this number is 0 -->
|
||||
<!-- In OoO cores, loads and stores can be issued whether inorder(Pentium Pro) or (OoO)out-of-order(Alpha),
|
||||
They will always try to execute out-of-order though. -->
|
||||
<param name="LSU_order" value="inorder"/>
|
||||
<param name="store_buffer_size" value="4"/>
|
||||
<!-- By default, in-order cores do not have load buffers -->
|
||||
<param name="load_buffer_size" value="0"/>
|
||||
<!-- number of ports refer to sustainable concurrent memory accesses -->
|
||||
<param name="memory_ports" value="1"/>
|
||||
<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
|
||||
as well as the ports of Dcache which is connected to LSU -->
|
||||
<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
|
||||
<param name="RAS_size" value="4"/>
|
||||
<!-- general stats, defines simulation periods; require total, idle, and busy cycles for sanity check -->
|
||||
<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
|
||||
<stat name="total_instructions" value="400000"/>
|
||||
<stat name="int_instructions" value="200000"/>
|
||||
<stat name="fp_instructions" value="100000"/>
|
||||
<stat name="branch_instructions" value="100000"/>
|
||||
<stat name="branch_mispredictions" value="0"/>
|
||||
<stat name="load_instructions" value="0"/>
|
||||
<stat name="store_instructions" value="50000"/>
|
||||
<stat name="committed_instructions" value="400000"/>
|
||||
<stat name="committed_int_instructions" value="200000"/>
|
||||
<stat name="committed_fp_instructions" value="100000"/>
|
||||
<stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogeneous -->
|
||||
<!-- the following cycle stats are used for heterogeneous cores only,
|
||||
please ignore them if homogeneous cores -->
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!-- instruction buffer stats -->
|
||||
<!-- ROB stats, both RS and Phy based OoOs have ROB
|
||||
performance simulator should capture the difference on accesses,
|
||||
otherwise, McPAT has to guess based on number of committed instructions. -->
|
||||
<stat name="ROB_reads" value="400000"/>
|
||||
<stat name="ROB_writes" value="400000"/>
|
||||
<!-- RAT accesses -->
|
||||
<stat name="rename_reads" value="800000"/> <!--lookup in renaming logic -->
|
||||
<stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic -->
|
||||
<stat name="fp_rename_reads" value="200000"/>
|
||||
<stat name="fp_rename_writes" value="100000"/>
|
||||
<!-- decode and rename stage use this, should be total ic - nop -->
|
||||
<!-- Inst window stats -->
|
||||
<stat name="inst_window_reads" value="400000"/>
|
||||
<stat name="inst_window_writes" value="400000"/>
|
||||
<stat name="inst_window_wakeup_accesses" value="800000"/>
|
||||
<stat name="fp_inst_window_reads" value="200000"/>
|
||||
<stat name="fp_inst_window_writes" value="200000"/>
|
||||
<stat name="fp_inst_window_wakeup_accesses" value="400000"/>
|
||||
<!-- RF accesses -->
|
||||
<stat name="int_regfile_reads" value="600000"/>
|
||||
<stat name="float_regfile_reads" value="100000"/>
|
||||
<stat name="int_regfile_writes" value="300000"/>
|
||||
<stat name="float_regfile_writes" value="50000"/>
|
||||
<!-- accesses to the working reg -->
|
||||
<stat name="function_calls" value="5"/>
|
||||
<stat name="context_switches" value="260343"/>
|
||||
<!-- Number of window switches (number of function calls and returns)-->
|
||||
<!-- Alu stats by default, the processor has one FPU that includes the divider and
|
||||
multiplier. The fpu accesses should include accesses to multiplier and divider -->
|
||||
<stat name="ialu_accesses" value="300000"/>
|
||||
<stat name="fpu_accesses" value="100000"/>
|
||||
<stat name="mul_accesses" value="200000"/>
|
||||
<stat name="cdb_alu_accesses" value="300000"/>
|
||||
<stat name="cdb_mul_accesses" value="200000"/>
|
||||
<stat name="cdb_fpu_accesses" value="100000"/>
|
||||
<!-- multiple cycle accesses should be counted multiple times,
|
||||
otherwise, McPAT can use internal counter for different floating point instructions
|
||||
to get final accesses. But that needs detailed info for floating point inst mix -->
|
||||
<!-- currently the performance simulator should
|
||||
make sure all the numbers are final numbers,
|
||||
including the explicit read/write accesses,
|
||||
and the implicit accesses such as replacements, etc.
|
||||
Future versions of McPAT may be able to reason about the implicit accesses
|
||||
based on param and stats of last level cache
|
||||
The same rule applies to all cache access stats too! -->
|
||||
<!-- following is AF for max power computation.
|
||||
Do not change them, unless you understand them-->
|
||||
<stat name="IFU_duty_cycle" value="0.9"/>
|
||||
<stat name="BR_duty_cycle" value="0.72"/><!--branch-->
|
||||
<stat name="LSU_duty_cycle" value="0.71"/>
|
||||
<stat name="MemManU_I_duty_cycle" value="0.9"/>
|
||||
<stat name="MemManU_D_duty_cycle" value="0.71"/>
|
||||
<stat name="ALU_duty_cycle" value="0.76"/>
|
||||
<!-- (.78*2+.71)/3 -->
|
||||
<stat name="MUL_duty_cycle" value="0.82"/>
|
||||
<stat name="FPU_duty_cycle" value="0.0"/>
|
||||
<stat name="ALU_cdb_duty_cycle" value="0.76"/>
|
||||
<stat name="MUL_cdb_duty_cycle" value="0.82"/>
|
||||
<stat name="FPU_cdb_duty_cycle" value="0.0"/>
|
||||
<param name="number_of_BPT" value="2"/>
|
||||
<component id="system.core0.predictor" name="PBT">
|
||||
<!-- branch predictor; tournament predictor see Alpha implementation -->
|
||||
<param name="local_predictor_size" value="10,3"/>
|
||||
<param name="local_predictor_entries" value="4"/>
|
||||
<param name="global_predictor_entries" value="4096"/>
|
||||
<param name="global_predictor_bits" value="2"/>
|
||||
<param name="chooser_predictor_entries" value="4096"/>
|
||||
<param name="chooser_predictor_bits" value="2"/>
|
||||
<!-- These parameters can be combined like below in next version
|
||||
<param name="load_predictor" value="10,3,1024"/>
|
||||
<param name="global_predictor" value="4096,2"/>
|
||||
<param name="predictor_chooser" value="4096,2"/>
|
||||
-->
|
||||
</component>
|
||||
<component id="system.core0.itlb" name="itlb">
|
||||
<param name="number_entries" value="64"/>
|
||||
<stat name="total_accesses" value="200000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<!-- there are no write requests to itlb, although writes happen to itlb after a miss,
|
||||
which is actually a replacement -->
|
||||
</component>
|
||||
<component id="system.core0.icache" name="icache">
|
||||
<!-- there are no write requests to icache, although writes happen to it after a miss,
|
||||
which is actually a replacement -->
|
||||
<param name="icache_config" value="32768,8,4,1,10,10,32,0"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy, -->
|
||||
<!-- cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate -->
|
||||
<param name="buffer_sizes" value="4, 4, 4,0"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="read_misses" value="0"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dtlb" name="dtlb">
|
||||
<param name="number_entries" value="64"/><!--dual threads-->
|
||||
<stat name="total_accesses" value="400000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dcache" name="dcache">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="dcache_config" value="32768,8,4,1, 10,10, 32,1 "/>
|
||||
<param name="buffer_sizes" value="4, 4, 4, 4"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="800000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<param name="number_of_BTB" value="2"/>
|
||||
<component id="system.core0.BTB" name="BTB">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="BTB_config" value="4096,4,2, 2, 1,1"/>
|
||||
<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<stat name="read_accesses" value="400000"/> <!--See IFU code for guideline -->
|
||||
<stat name="write_accesses" value="0"/>
|
||||
</component>
|
||||
</component>
|
||||
<component id="system.L1Directory0" name="L1Directory0">
|
||||
<param name="Directory_type" value="0"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache -->
|
||||
<param name="Dir_config" value="2048,1,0,1, 4, 4, 8"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="8, 8, 8, 8"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="800"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="2"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="800000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="20"/>
|
||||
<stat name="duty_cycle" value="0.1"/>
|
||||
</component>
|
||||
<component id="system.L2Directory0" name="L2Directory0">
|
||||
<param name="Directory_type" value="1"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache -->
|
||||
<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="8, 8, 8, 8"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="3400"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="58824"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="100"/>
|
||||
<stat name="duty_cycle" value="0.1"/>
|
||||
</component>
|
||||
<component id="system.L20" name="L20">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="L2_config" value="1048576,32, 8, 8, 8, 23, 32, 1"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<param name="clockrate" value="3400"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="1.0"/>
|
||||
</component>
|
||||
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.L30" name="L30">
|
||||
<param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="clockrate" value="800"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="11824"/>
|
||||
<stat name="write_accesses" value="11276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="1.0"/>
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.NoC0" name="noc0">
|
||||
<param name="clockrate" value="800"/>
|
||||
<param name="type" value="0"/>
|
||||
<!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus
|
||||
at each time only one node can send req -->
|
||||
<param name="horizontal_nodes" value="1"/>
|
||||
<param name="vertical_nodes" value="1"/>
|
||||
<param name="has_global_link" value="0"/>
|
||||
<!-- 1 has global link, 0 does not have global link -->
|
||||
<param name="link_throughput" value="1"/><!--w.r.t clock -->
|
||||
<param name="link_latency" value="1"/><!--w.r.t clock -->
|
||||
<!-- throughput >= latency -->
|
||||
<!-- Router architecture -->
|
||||
<param name="input_ports" value="1"/>
|
||||
<param name="output_ports" value="1"/>
|
||||
<!-- For bus the I/O ports should be 1 -->
|
||||
<param name="flit_bits" value="64"/>
|
||||
<param name="chip_coverage" value="1"/>
|
||||
<!-- When multiple NOC present, one NOC will cover part of the whole chip.
|
||||
chip_coverage <=1 -->
|
||||
<param name="link_routing_over_percentage" value="0.5"/>
|
||||
<!-- Links can route over other components or occupy whole area.
|
||||
by default, 50% of the NoC global links routes over other
|
||||
components -->
|
||||
<stat name="total_accesses" value="100000"/>
|
||||
<!-- This is the number of total accesses within the whole network not for each router -->
|
||||
<stat name="duty_cycle" value="0.2"/>
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.mem" name="mem">
|
||||
<!-- Main memory property -->
|
||||
<param name="mem_tech_node" value="32"/>
|
||||
<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
|
||||
<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
|
||||
<param name="internal_prefetch_of_DRAM_chip" value="4"/>
|
||||
<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
|
||||
<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
|
||||
<!-- above numbers can be easily found from Wikipedia -->
|
||||
<param name="capacity_per_channel" value="4096"/> <!-- MB -->
|
||||
<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
|
||||
Current McPAT assumes single DIMMs are used.-->
|
||||
<param name="number_ranks" value="2"/>
|
||||
<param name="num_banks_of_DRAM_chip" value="8"/>
|
||||
<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
|
||||
<param name="output_width_of_DRAM_chip" value="8"/>
|
||||
<!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
|
||||
<!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
|
||||
<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
|
||||
<param name="burstlength_of_DRAM_chip" value="8"/>
|
||||
<stat name="memory_accesses" value="1052"/>
|
||||
<stat name="memory_reads" value="1052"/>
|
||||
<stat name="memory_writes" value="1052"/>
|
||||
</component>
|
||||
<component id="system.mc" name="mc">
|
||||
<!-- Memory controllers are for DDR(2,3...) DIMMs -->
|
||||
<!-- current version of McPAT uses published values for base parameters of memory controller
|
||||
improvements on MC will be added in later versions. -->
|
||||
<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="mc_clock" value="400"/><!--MHz-->
|
||||
<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
|
||||
<param name="block_size" value="64"/><!--(B) the block size of last level cache, which is the unit for one memory burst transfer -->
|
||||
<param name="number_mcs" value="0"/>
|
||||
<!-- current McPAT only supports homogeneous memory controllers -->
|
||||
<param name="memory_channels_per_mc" value="1"/>
|
||||
<param name="number_ranks" value="0"/>
|
||||
<!-- # of ranks of each channel-->
|
||||
<param name="req_window_size_per_channel" value="32"/>
|
||||
<param name="IO_buffer_size_per_channel" value="32"/>
|
||||
<param name="databus_width" value="128"/>
|
||||
<param name="addressbus_width" value="51"/>
|
||||
<!-- McPAT will add the control bus width to the addressbus width automatically -->
|
||||
<stat name="memory_accesses" value="66666"/>
|
||||
<stat name="memory_reads" value="33333"/>
|
||||
<stat name="memory_writes" value="33333"/>
|
||||
<param name="withPHY" value="1"/>
|
||||
<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
|
||||
the average power per MC or per channel. This is sufficient for most applications.
|
||||
Further trackdown can be easily added in later versions. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.niu" name="niu">
|
||||
<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller -->
|
||||
<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns.
|
||||
the low bound of clock rate of a 10Gb MAC is 150Mhz -->
|
||||
<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="clockrate" value="350"/>
|
||||
<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates
|
||||
the average power per nic or per channel. This is sufficient for most applications. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.pcie" name="pcie">
|
||||
<!-- On chip PCIe controller, including Phy-->
|
||||
<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns.
|
||||
the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
|
||||
<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="withPHY" value="1"/>
|
||||
<param name="clockrate" value="350"/>
|
||||
<param name="number_units" value="0"/>
|
||||
<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates
|
||||
the average power per pcie controller or per channel. This is sufficient for most applications. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.flashc" name="flashc">
|
||||
<param name="number_flashcs" value="0"/>
|
||||
<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="withPHY" value="1"/>
|
||||
<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
|
||||
the average power per fc or per channel. This is sufficient for most applications -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
|
||||
</component>
|
||||
</component>
|
456
ext/mcpat/Alpha21364.xml
Normal file
456
ext/mcpat/Alpha21364.xml
Normal file
|
@ -0,0 +1,456 @@
|
|||
<?xml version="1.0" ?>
|
||||
<component id="root" name="root">
|
||||
<component id="system" name="system">
|
||||
<!--McPAT will skip the components if number is set to 0 -->
|
||||
<param name="number_of_cores" value="1"/>
|
||||
<param name="number_of_L1Directories" value="0"/>
|
||||
<param name="number_of_L2Directories" value="1"/>
|
||||
<param name="number_of_L2s" value="1"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports -->
|
||||
<param name="Private_L2" value="0"/><!--1 Private, 0 shared/coherent -->
|
||||
<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
|
||||
<param name="number_of_NoCs" value="1"/>
|
||||
<param name="homogeneous_cores" value="1"/><!--1 means homo -->
|
||||
<param name="homogeneous_L2s" value="1"/>
|
||||
<param name="homogeneous_L1Directorys" value="1"/>
|
||||
<param name="homogeneous_L2Directorys" value="1"/>
|
||||
<param name="homogeneous_L3s" value="1"/>
|
||||
<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
|
||||
<param name="homogeneous_NoCs" value="1"/>
|
||||
<param name="core_tech_node" value="90"/><!-- nm -->
|
||||
<param name="target_core_clockrate" value="1200"/><!--MHz -->
|
||||
<param name="temperature" value="380"/> <!-- Kelvin -->
|
||||
<param name="number_cache_levels" value="2"/>
|
||||
<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
|
||||
<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) -->
|
||||
<param name="longer_channel_device" value="0"/><!-- 0 no use; 1 use when appropriate -->
|
||||
<param name="machine_bits" value="64"/>
|
||||
<param name="virtual_address_width" value="64"/>
|
||||
<param name="physical_address_width" value="52"/>
|
||||
<param name="virtual_memory_page_size" value="4096"/>
|
||||
<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller
|
||||
default value is machine_bits, if not set -->
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!--This page size(B) is completely different from the page size in the Main memory section. This page size is the size of
|
||||
virtual memory from the OS/architecture perspective; the page size in the Main memory section is the actual physical line in a DRAM bank -->
|
||||
<!-- *********************** cores ******************* -->
|
||||
<component id="system.core0" name="core0">
|
||||
<!-- Core property -->
|
||||
<param name="clock_rate" value="1200"/>
|
||||
<!-- for cores with unknown timing, set to 0 to force off the opt flag -->
|
||||
<param name="opt_local" value="1"/>
|
||||
<param name="instruction_length" value="32"/>
|
||||
<param name="opcode_width" value="7"/>
|
||||
<param name="x86" value="0"/>
|
||||
<param name="micro_opcode_width" value="8"/>
|
||||
<param name="machine_type" value="0"/>
|
||||
<!-- inorder/OoO; 1 inorder; 0 OOO-->
|
||||
<param name="number_hardware_threads" value="1"/>
|
||||
<!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor,
|
||||
it only may be more than one in SMT processors. BTB ports always equals to fetch ports since
|
||||
branch information in consecutive branch instructions in the same fetch group can be read out from BTB once.-->
|
||||
<param name="fetch_width" value="4"/>
|
||||
<!-- fetch_width determines the size of cachelines of L1 cache block -->
|
||||
<param name="number_instruction_fetch_ports" value="1"/>
|
||||
<param name="decode_width" value="4"/>
|
||||
<!-- decode_width determines the number of ports of the
|
||||
renaming table (both RAM and CAM) scheme -->
|
||||
<param name="issue_width" value="4"/>
|
||||
<param name="peak_issue_width" value="6"/>
|
||||
<!-- issue_width determines the number of ports of Issue window and other logic
|
||||
as in the complexity effective processors paper; issue_width==dispatch_width -->
|
||||
<param name="commit_width" value="4"/>
|
||||
<!-- commit_width determines the number of ports of register files -->
|
||||
<param name="fp_issue_width" value="2"/>
|
||||
<param name="prediction_width" value="1"/>
|
||||
<!-- number of branch instructions that can be predicted simultaneously -->
|
||||
<!-- Current version of McPAT does not distinguish int and floating point pipelines
|
||||
These parameters are reserved for future use.-->
|
||||
<param name="pipelines_per_core" value="1,1"/>
|
||||
<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
|
||||
<param name="pipeline_depth" value="7,7"/>
|
||||
<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
|
||||
<!-- issue and exe unit-->
|
||||
<param name="ALU_per_core" value="4"/>
|
||||
<!-- contains an adder, a shifter, and a logical unit -->
|
||||
<param name="MUL_per_core" value="0"/>
|
||||
<!-- For MUL and Div -->
|
||||
<param name="FPU_per_core" value="1"/>
|
||||
<!-- buffer between IF and ID stage -->
|
||||
<param name="instruction_buffer_size" value="32"/>
|
||||
<!-- buffer between ID and sche/exe stage -->
|
||||
<param name="decoded_stream_buffer_size" value="16"/>
|
||||
<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
|
||||
<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
|
||||
<param name="instruction_window_size" value="20"/>
|
||||
<param name="fp_instruction_window_size" value="15"/>
|
||||
<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
|
||||
<param name="ROB_size" value="80"/>
|
||||
<!-- each in-flight instruction has an entry in ROB -->
|
||||
<!-- registers -->
|
||||
<param name="archi_Regs_IRF_size" value="32"/>
|
||||
<param name="archi_Regs_FRF_size" value="32"/>
|
||||
<!-- if OoO processor, phy_reg number is needed for renaming logic,
|
||||
renaming logic is for both integer and floating point insts. -->
|
||||
<param name="phy_Regs_IRF_size" value="80"/>
|
||||
<param name="phy_Regs_FRF_size" value="72"/>
|
||||
<!-- rename logic -->
|
||||
<param name="rename_scheme" value="1"/>
|
||||
<!-- can be RAM based(0) or CAM based(1) rename scheme
|
||||
RAM-based scheme will have free list, status table;
|
||||
CAM-based scheme have the valid bit in the data field of the CAM
|
||||
both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
|
||||
Detailed RAT Implementation see TR -->
|
||||
<param name="register_windows_size" value="0"/>
|
||||
<!-- how many windows in the windowed register file, sun processors;
|
||||
no register windowing is used when this number is 0 -->
|
||||
<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha),
|
||||
They will always try to execute out-of-order though. -->
|
||||
<param name="LSU_order" value="inorder"/>
|
||||
<param name="store_buffer_size" value="32"/>
|
||||
<!-- By default, in-order cores do not have load buffers -->
|
||||
<param name="load_buffer_size" value="32"/>
|
||||
<!-- number of ports refer to sustainable concurrent memory accesses -->
|
||||
<param name="memory_ports" value="2"/>
|
||||
<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
|
||||
as well as the ports of Dcache which is connected to LSU -->
|
||||
<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
|
||||
<param name="RAS_size" value="32"/>
|
||||
<!-- general stats, defines simulation periods; require total, idle, and busy cycles for sanity check -->
|
||||
<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
|
||||
<stat name="total_instructions" value="400000"/>
|
||||
<stat name="int_instructions" value="200000"/>
|
||||
<stat name="fp_instructions" value="100000"/>
|
||||
<stat name="branch_instructions" value="100000"/>
|
||||
<stat name="branch_mispredictions" value="0"/>
|
||||
<stat name="load_instructions" value="0"/>
|
||||
<stat name="store_instructions" value="50000"/>
|
||||
<stat name="committed_instructions" value="400000"/>
|
||||
<stat name="committed_int_instructions" value="200000"/>
|
||||
<stat name="committed_fp_instructions" value="100000"/>
|
||||
<stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
|
||||
<!-- the following cycle stats are used for heterogeneous cores only,
|
||||
please ignore them if homogeneous cores -->
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!-- instruction buffer stats -->
|
||||
<!-- ROB stats, both RS and Phy based OoOs have ROB
|
||||
performance simulator should capture the difference on accesses,
|
||||
otherwise, McPAT has to guess based on number of committed instructions. -->
|
||||
<stat name="ROB_reads" value="400000"/>
|
||||
<stat name="ROB_writes" value="400000"/>
|
||||
<!-- RAT accesses -->
|
||||
<stat name="rename_reads" value="800000"/> <!--lookup in renaming logic -->
|
||||
<stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic -->
|
||||
<stat name="fp_rename_reads" value="200000"/>
|
||||
<stat name="fp_rename_writes" value="100000"/>
|
||||
<!-- decode and rename stage use this, should be total ic - nop -->
|
||||
<!-- Inst window stats -->
|
||||
<stat name="inst_window_reads" value="400000"/>
|
||||
<stat name="inst_window_writes" value="400000"/>
|
||||
<stat name="inst_window_wakeup_accesses" value="800000"/>
|
||||
<stat name="fp_inst_window_reads" value="200000"/>
|
||||
<stat name="fp_inst_window_writes" value="200000"/>
|
||||
<stat name="fp_inst_window_wakeup_accesses" value="400000"/>
|
||||
<!-- RF accesses -->
|
||||
<stat name="int_regfile_reads" value="600000"/>
|
||||
<stat name="float_regfile_reads" value="100000"/>
|
||||
<stat name="int_regfile_writes" value="300000"/>
|
||||
<stat name="float_regfile_writes" value="50000"/>
|
||||
<!-- accesses to the working reg -->
|
||||
<stat name="function_calls" value="5"/>
|
||||
<stat name="context_switches" value="260343"/>
|
||||
<!-- Number of window switches (number of function calls and returns)-->
|
||||
<!-- Alu stats by default, the processor has one FPU that includes the divider and
|
||||
multiplier. The fpu accesses should include accesses to multiplier and divider -->
|
||||
<stat name="ialu_accesses" value="300000"/>
|
||||
<stat name="fpu_accesses" value="100000"/>
|
||||
<stat name="mul_accesses" value="200000"/>
|
||||
<stat name="cdb_alu_accesses" value="300000"/>
|
||||
<stat name="cdb_mul_accesses" value="200000"/>
|
||||
<stat name="cdb_fpu_accesses" value="100000"/>
|
||||
<!-- multiple cycle accesses should be counted multiple times,
|
||||
otherwise, McPAT can use internal counter for different floating point instructions
|
||||
to get final accesses. But that needs detailed info for floating point inst mix -->
|
||||
<!-- currently the performance simulator should
|
||||
make sure all the numbers are final numbers,
|
||||
including the explicit read/write accesses,
|
||||
and the implicit accesses such as replacements, etc.
|
||||
Future versions of McPAT may be able to infer the implicit accesses
|
||||
based on param and stats of last level cache
|
||||
The same rule applies to all cache access stats too! -->
|
||||
<!-- following is AF for max power computation.
|
||||
Do not change them, unless you understand them-->
|
||||
<stat name="IFU_duty_cycle" value="1"/>
|
||||
<stat name="LSU_duty_cycle" value="1"/>
|
||||
<stat name="MemManU_I_duty_cycle" value="1"/>
|
||||
<stat name="MemManU_D_duty_cycle" value="1"/>
|
||||
<stat name="ALU_duty_cycle" value="1"/>
|
||||
<stat name="MUL_duty_cycle" value="0.3"/>
|
||||
<stat name="FPU_duty_cycle" value="1"/>
|
||||
<stat name="ALU_cdb_duty_cycle" value="1"/>
|
||||
<stat name="MUL_cdb_duty_cycle" value="0.3"/>
|
||||
<stat name="FPU_cdb_duty_cycle" value="1"/>
|
||||
<param name="number_of_BPT" value="2"/>
|
||||
<component id="system.core0.predictor" name="PBT">
|
||||
<!-- branch predictor; tournament predictor see Alpha implementation -->
|
||||
<param name="local_predictor_size" value="10,3"/>
|
||||
<param name="local_predictor_entries" value="1024"/>
|
||||
<param name="global_predictor_entries" value="4096"/>
|
||||
<param name="global_predictor_bits" value="2"/>
|
||||
<param name="chooser_predictor_entries" value="4096"/>
|
||||
<param name="chooser_predictor_bits" value="2"/>
|
||||
<!-- These parameters can be combined like below in next version
|
||||
<param name="load_predictor" value="10,3,1024"/>
|
||||
<param name="global_predictor" value="4096,2"/>
|
||||
<param name="predictor_chooser" value="4096,2"/>
|
||||
-->
|
||||
</component>
|
||||
<component id="system.core0.itlb" name="itlb">
|
||||
<param name="number_entries" value="128"/>
|
||||
<stat name="total_accesses" value="200000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<!-- there is no write requests to itlb although writes happen to itlb after miss,
|
||||
which is actually a replacement -->
|
||||
</component>
|
||||
<component id="system.core0.icache" name="icache">
|
||||
<!-- there are no write requests to icache although writes happen to it after miss,
|
||||
which is actually a replacement -->
|
||||
<param name="icache_config" value="65536,16,2,1,1,2,16,0"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy, -->
|
||||
<!-- cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate -->
|
||||
<param name="buffer_sizes" value="16, 16, 16,0"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="read_misses" value="0"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dtlb" name="dtlb">
|
||||
<param name="number_entries" value="128"/><!--dual threads-->
|
||||
<stat name="total_accesses" value="400000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dcache" name="dcache">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="dcache_config" value="65536,16,2,1,1,3,16,0"/>
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="800000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<param name="number_of_BTB" value="2"/>
|
||||
<component id="system.core0.BTB" name="BTB">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="BTB_config" value="6144,4,2,1, 1,3"/> <!--48Kbits -->
|
||||
<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<stat name="read_accesses" value="400000"/> <!--See IFU code for guideline -->
|
||||
<stat name="write_accesses" value="0"/>
|
||||
</component>
|
||||
</component>
|
||||
<component id="system.L1Directory0" name="L1Directory0">
|
||||
<param name="Directory_type" value="0"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache -->
|
||||
<param name="Dir_config" value="4096,2,0,1,100,100, 8"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="8, 8, 8, 8"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="3400"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="800000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="20"/>
|
||||
</component>
|
||||
<component id="system.L2Directory0" name="L2Directory0">
|
||||
<param name="Directory_type" value="0"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache -->
|
||||
<param name="Dir_config" value="512,4,0,1,1, 1"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="1200"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="58824"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="100"/>
|
||||
</component>
|
||||
<component id="system.L20" name="L20">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="L2_config" value="1835008,16, 8, 16, 32, 32, 12, 1"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<param name="clockrate" value="1200"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="1.0"/>
|
||||
</component>
|
||||
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.L30" name="L30">
|
||||
<param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="clockrate" value="850"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="11824"/>
|
||||
<stat name="write_accesses" value="11276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="1.0"/>
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.NoC0" name="noc0">
|
||||
<param name="clockrate" value="1200"/>
|
||||
<param name="type" value="1"/>
|
||||
<!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus
|
||||
at each time only one node can send req -->
|
||||
<param name="horizontal_nodes" value="1"/>
|
||||
<param name="vertical_nodes" value="1"/>
|
||||
<param name="has_global_link" value="1"/>
|
||||
<!-- 1 has global link, 0 does not have global link -->
|
||||
<param name="link_throughput" value="1"/><!--w.r.t clock -->
|
||||
<param name="link_latency" value="1"/><!--w.r.t clock -->
|
||||
<!-- throughput >= latency -->
|
||||
<!-- Router architecture -->
|
||||
<param name="input_ports" value="8"/>
|
||||
<param name="output_ports" value="7"/>
|
||||
<!-- For bus the I/O ports should be 1 -->
|
||||
<param name="virtual_channel_per_port" value="2"/>
|
||||
<param name="input_buffer_entries_per_vc" value="128"/>
|
||||
<param name="flit_bits" value="40"/>
|
||||
<param name="chip_coverage" value="1"/>
|
||||
<!-- When multiple NOC present, one NOC will cover part of the whole chip.
|
||||
chip_coverage <=1 -->
|
||||
<param name="link_routing_over_percentage" value="1.0"/>
|
||||
<!-- Links can route over other components or occupy whole area.
|
||||
by default, 50% of the NoC global links routes over other
|
||||
components -->
|
||||
<stat name="total_accesses" value="100000"/>
|
||||
<!-- This is the number of total accesses within the whole network not for each router -->
|
||||
<stat name="duty_cycle" value="1"/>
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.mem" name="mem">
|
||||
<!-- Main memory property -->
|
||||
<param name="mem_tech_node" value="180"/>
|
||||
<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
|
||||
<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
|
||||
<param name="internal_prefetch_of_DRAM_chip" value="4"/>
|
||||
<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
|
||||
<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
|
||||
<!-- above numbers can be easily found from Wikipedia -->
|
||||
<param name="capacity_per_channel" value="4096"/> <!-- MB -->
|
||||
<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
|
||||
Current McPAT assumes single DIMMs are used.-->
|
||||
<param name="number_ranks" value="2"/>
|
||||
<param name="num_banks_of_DRAM_chip" value="8"/>
|
||||
<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
|
||||
<param name="output_width_of_DRAM_chip" value="8"/>
|
||||
<!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
|
||||
<!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
|
||||
<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
|
||||
<param name="burstlength_of_DRAM_chip" value="8"/>
|
||||
<stat name="memory_accesses" value="1052"/>
|
||||
<stat name="memory_reads" value="1052"/>
|
||||
<stat name="memory_writes" value="1052"/>
|
||||
</component>
|
||||
<component id="system.mc" name="mc">
|
||||
<!-- Memory controllers are for DDR(2,3...) DIMMs -->
|
||||
<!-- current version of McPAT uses published values for base parameters of memory controller
|
||||
improvements on MC will be added in later versions. -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="mc_clock" value="800"/><!--MHz-->
|
||||
<param name="peak_transfer_rate" value="1600"/><!--MB/S-->
|
||||
<param name="block_size" value="16"/><!--B-->
|
||||
<param name="number_mcs" value="2"/>
|
||||
<!-- current McPAT only supports homogeneous memory controllers -->
|
||||
<param name="memory_channels_per_mc" value="2"/>
|
||||
<param name="number_ranks" value="2"/>
|
||||
<param name="withPHY" value="0"/>
|
||||
<!-- # of ranks of each channel-->
|
||||
<param name="req_window_size_per_channel" value="32"/>
|
||||
<param name="IO_buffer_size_per_channel" value="32"/>
|
||||
<param name="databus_width" value="32"/>
|
||||
<param name="addressbus_width" value="32"/>
|
||||
<!-- McPAT will add the control bus width to the addressbus width automatically -->
|
||||
<stat name="memory_accesses" value="6666"/>
|
||||
<stat name="memory_reads" value="3333"/>
|
||||
<stat name="memory_writes" value="3333"/>
|
||||
<!-- McPAT does not track individual mc, instead, it takes the total accesses and calculate
|
||||
the average power per MC or per channel. This is sufficient for most applications.
|
||||
Further trackdown can be easily added in later versions. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.niu" name="niu">
|
||||
<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller -->
|
||||
<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns.
|
||||
the low bound of clock rate of a 10Gb MAC is 150Mhz -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="clockrate" value="350"/>
|
||||
<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only has one port -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual nic, instead, it takes the total accesses and calculate
|
||||
the average power per nic or per channel. This is sufficient for most applications. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.pcie" name="pcie">
|
||||
<!-- On chip PCIe controller, including Phy-->
|
||||
<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns.
|
||||
the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="withPHY" value="1"/>
|
||||
<param name="clockrate" value="350"/>
|
||||
<param name="number_units" value="0"/>
|
||||
<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual pcie controllers, instead, it takes the total accesses and calculate
|
||||
the average power per pcie controller or per channel. This is sufficient for most applications. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.flashc" name="flashc">
|
||||
<param name="number_flashcs" value="0"/>
|
||||
<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="withPHY" value="1"/>
|
||||
<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual flash controller, instead, it takes the total accesses and calculate
|
||||
the average power per fc or per channel. This is sufficient for most applications -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
|
||||
</component>
|
||||
</component>
|
442
ext/mcpat/Niagara1.xml
Normal file
442
ext/mcpat/Niagara1.xml
Normal file
|
@ -0,0 +1,442 @@
|
|||
<?xml version="1.0" ?>
|
||||
<component id="root" name="root">
|
||||
<component id="system" name="system">
|
||||
<!--McPAT will skip the components if number is set to 0 -->
|
||||
<param name="number_of_cores" value="8"/>
|
||||
<param name="number_of_L1Directories" value="4"/>
|
||||
<param name="number_of_L2Directories" value="0"/>
|
||||
<param name="number_of_L2s" value="4"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports -->
|
||||
<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
|
||||
<param name="number_of_NoCs" value="1"/>
|
||||
<param name="homogeneous_cores" value="1"/><!--1 means homo -->
|
||||
<param name="homogeneous_L2s" value="1"/>
|
||||
<param name="homogeneous_L1Directorys" value="1"/>
|
||||
<param name="homogeneous_L2Directorys" value="1"/>
|
||||
<param name="homogeneous_L3s" value="1"/>
|
||||
<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
|
||||
<param name="homogeneous_NoCs" value="1"/>
|
||||
<param name="core_tech_node" value="90"/><!-- nm -->
|
||||
<param name="target_core_clockrate" value="1200"/><!--MHz -->
|
||||
<param name="temperature" value="380"/> <!-- Kelvin -->
|
||||
<param name="number_cache_levels" value="2"/>
|
||||
<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
|
||||
<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) -->
|
||||
<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
|
||||
<param name="machine_bits" value="64"/>
|
||||
<param name="virtual_address_width" value="64"/>
|
||||
<param name="physical_address_width" value="52"/>
|
||||
<param name="virtual_memory_page_size" value="4096"/>
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!--This page size (B) is completely different from the page size in the main memory section. This page size is the size of
|
||||
virtual memory from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank -->
|
||||
<!-- *********************** cores ******************* -->
|
||||
<component id="system.core0" name="core0">
|
||||
<!-- Core property -->
|
||||
<param name="clock_rate" value="1200"/>
|
||||
<param name="instruction_length" value="32"/>
|
||||
<param name="opcode_width" value="9"/>
|
||||
<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller
|
||||
default value is machine_bits, if not set -->
|
||||
<param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
|
||||
<!-- inorder/OoO -->
|
||||
<param name="number_hardware_threads" value="4"/>
|
||||
<!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor,
|
||||
it only may be more than one in SMT processors. BTB ports always equals to fetch ports since
|
||||
branch information in consecutive branch instructions in the same fetch group can be read out from BTB once.-->
|
||||
<param name="fetch_width" value="1"/>
|
||||
<!-- fetch_width determines the size of cachelines of L1 cache block -->
|
||||
<param name="number_instruction_fetch_ports" value="1"/>
|
||||
<param name="decode_width" value="1"/>
|
||||
<!-- decode_width determines the number of ports of the
|
||||
renaming table (both RAM and CAM) scheme -->
|
||||
<param name="issue_width" value="1"/>
|
||||
<!-- issue_width determines the number of ports of Issue window and other logic
|
||||
as in the complexity effective processors paper; issue_width==dispatch_width -->
|
||||
<param name="commit_width" value="1"/>
|
||||
<!-- commit_width determines the number of ports of register files -->
|
||||
<param name="fp_issue_width" value="1"/>
|
||||
<param name="prediction_width" value="0"/>
|
||||
<!-- number of branch instructions that can be predicted simultaneously -->
|
||||
<!-- Current version of McPAT does not distinguish int and floating point pipelines
|
||||
These parameters are reserved for future use.-->
|
||||
<param name="pipelines_per_core" value="1,1"/>
|
||||
<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
|
||||
<param name="pipeline_depth" value="6,6"/>
|
||||
<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
|
||||
<!-- issue and exe unit-->
|
||||
<param name="ALU_per_core" value="1"/>
|
||||
<!-- contains an adder, a shifter, and a logical unit -->
|
||||
<param name="MUL_per_core" value="1"/>
|
||||
<!-- For MUL and Div -->
|
||||
<param name="FPU_per_core" value="0.125"/>
|
||||
<!-- buffer between IF and ID stage -->
|
||||
<param name="instruction_buffer_size" value="16"/>
|
||||
<!-- buffer between ID and sche/exe stage -->
|
||||
<param name="decoded_stream_buffer_size" value="16"/>
|
||||
<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
|
||||
<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
|
||||
<param name="instruction_window_size" value="16"/>
|
||||
<param name="fp_instruction_window_size" value="16"/>
|
||||
<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
|
||||
<param name="ROB_size" value="80"/>
|
||||
<!-- each in-flight instruction has an entry in ROB -->
|
||||
<!-- registers -->
|
||||
<param name="archi_Regs_IRF_size" value="32"/>
|
||||
<param name="archi_Regs_FRF_size" value="32"/>
|
||||
<!-- if OoO processor, phy_reg number is needed for renaming logic,
|
||||
renaming logic is for both integer and floating point insts. -->
|
||||
<param name="phy_Regs_IRF_size" value="80"/>
|
||||
<param name="phy_Regs_FRF_size" value="80"/>
|
||||
<!-- rename logic -->
|
||||
<param name="rename_scheme" value="0"/>
|
||||
<!-- can be RAM based(0) or CAM based(1) rename scheme
|
||||
RAM-based scheme will have free list, status table;
|
||||
CAM-based scheme have the valid bit in the data field of the CAM
|
||||
both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
|
||||
Detailed RAT Implementation see TR -->
|
||||
<param name="register_windows_size" value="8"/>
|
||||
<!-- how many windows in the windowed register file, sun processors;
|
||||
no register windowing is used when this number is 0 -->
|
||||
<!-- In OoO cores, loads and stores can be issued whether inorder(Pentium Pro) or (OoO)out-of-order(Alpha),
|
||||
They will always try to execute out-of-order though. -->
|
||||
<param name="LSU_order" value="inorder"/>
|
||||
<param name="store_buffer_size" value="32"/>
|
||||
<!-- By default, in-order cores do not have load buffers -->
|
||||
<param name="load_buffer_size" value="32"/>
|
||||
<!-- number of ports refer to sustainable concurrent memory accesses -->
|
||||
<param name="memory_ports" value="1"/>
|
||||
<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
|
||||
as well as the ports of Dcache which is connected to LSU -->
|
||||
<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
|
||||
<param name="RAS_size" value="32"/>
|
||||
<!-- general stats, defines simulation periods; require total, idle, and busy cycles for sanity check -->
|
||||
<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
|
||||
<stat name="total_instructions" value="800000"/>
|
||||
<stat name="int_instructions" value="600000"/>
|
||||
<stat name="fp_instructions" value="20000"/>
|
||||
<stat name="branch_instructions" value="0"/>
|
||||
<stat name="branch_mispredictions" value="0"/>
|
||||
<stat name="load_instructions" value="100000"/>
|
||||
<stat name="store_instructions" value="100000"/>
|
||||
<stat name="committed_instructions" value="800000"/>
|
||||
<stat name="committed_int_instructions" value="600000"/>
|
||||
<stat name="committed_fp_instructions" value="20000"/>
|
||||
<stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogeneous -->
|
||||
<!-- the following cycle stats are used for heterogeneous cores only,
|
||||
please ignore them if homogeneous cores -->
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!-- instruction buffer stats -->
|
||||
<!-- ROB stats, both RS and Phy based OoOs have ROB
|
||||
performance simulator should capture the difference on accesses,
|
||||
otherwise, McPAT has to guess based on number of committed instructions. -->
|
||||
<stat name="ROB_reads" value="263886"/>
|
||||
<stat name="ROB_writes" value="263886"/>
|
||||
<!-- RAT accesses -->
|
||||
<stat name="rename_accesses" value="263886"/>
|
||||
<stat name="fp_rename_accesses" value="263886"/>
|
||||
<!-- decode and rename stage use this, should be total ic - nop -->
|
||||
<!-- Inst window stats -->
|
||||
<stat name="inst_window_reads" value="263886"/>
|
||||
<stat name="inst_window_writes" value="263886"/>
|
||||
<stat name="inst_window_wakeup_accesses" value="263886"/>
|
||||
<stat name="fp_inst_window_reads" value="263886"/>
|
||||
<stat name="fp_inst_window_writes" value="263886"/>
|
||||
<stat name="fp_inst_window_wakeup_accesses" value="263886"/>
|
||||
<!-- RF accesses -->
|
||||
<stat name="int_regfile_reads" value="1600000"/>
|
||||
<stat name="float_regfile_reads" value="40000"/>
|
||||
<stat name="int_regfile_writes" value="800000"/>
|
||||
<stat name="float_regfile_writes" value="20000"/>
|
||||
<!-- accesses to the working reg -->
|
||||
<stat name="function_calls" value="5"/>
|
||||
<stat name="context_switches" value="260343"/>
|
||||
<!-- Number of window switches (number of function calls and returns)-->
|
||||
<!-- Alu stats by default, the processor has one FPU that includes the divider and
|
||||
multiplier. The fpu accesses should include accesses to multiplier and divider -->
|
||||
<stat name="ialu_accesses" value="800000"/>
|
||||
<stat name="fpu_accesses" value="10000"/>
|
||||
<stat name="mul_accesses" value="100000"/>
|
||||
<stat name="cdb_alu_accesses" value="1000000"/>
|
||||
<stat name="cdb_mul_accesses" value="0"/>
|
||||
<stat name="cdb_fpu_accesses" value="0"/>
|
||||
<!-- multiple cycle accesses should be counted multiple times,
|
||||
otherwise, McPAT can use internal counter for different floating point instructions
|
||||
to get final accesses. But that needs detailed info for floating point inst mix -->
|
||||
<!-- currently the performance simulator should
|
||||
make sure all the numbers are final numbers,
|
||||
including the explicit read/write accesses,
|
||||
and the implicit accesses such as replacements, etc.
|
||||
Future versions of McPAT may be able to reason about the implicit accesses
|
||||
based on param and stats of last level cache
|
||||
The same rule applies to all cache access stats too! -->
|
||||
<!-- following is AF for max power computation.
|
||||
Do not change them, unless you understand them-->
|
||||
<stat name="IFU_duty_cycle" value="0.25"/>
|
||||
<stat name="LSU_duty_cycle" value="0.25"/>
|
||||
<stat name="MemManU_I_duty_cycle" value="1"/>
|
||||
<stat name="MemManU_D_duty_cycle" value="0.25"/>
|
||||
<stat name="ALU_duty_cycle" value="0.9"/>
|
||||
<stat name="MUL_duty_cycle" value="0.5"/>
|
||||
<stat name="FPU_duty_cycle" value="0.4"/>
|
||||
<stat name="ALU_cdb_duty_cycle" value="0.9"/>
|
||||
<stat name="MUL_cdb_duty_cycle" value="0.5"/>
|
||||
<stat name="FPU_cdb_duty_cycle" value="0.4"/>
|
||||
<component id="system.core0.predictor" name="PBT">
|
||||
<!-- branch predictor; tournament predictor see Alpha implementation -->
|
||||
<param name="local_predictor_size" value="10,3"/>
|
||||
<param name="local_predictor_entries" value="1024"/>
|
||||
<param name="global_predictor_entries" value="4096"/>
|
||||
<param name="global_predictor_bits" value="2"/>
|
||||
<param name="chooser_predictor_entries" value="4096"/>
|
||||
<param name="chooser_predictor_bits" value="2"/>
|
||||
<!-- These parameters can be combined like below in next version
|
||||
<param name="load_predictor" value="10,3,1024"/>
|
||||
<param name="global_predictor" value="4096,2"/>
|
||||
<param name="predictor_chooser" value="4096,2"/>
|
||||
-->
|
||||
</component>
|
||||
<component id="system.core0.itlb" name="itlb">
|
||||
<param name="number_entries" value="64"/>
|
||||
<stat name="total_accesses" value="800000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<!-- there are no write requests to itlb, although writes happen to itlb after a miss,
|
||||
which is actually a replacement -->
|
||||
</component>
|
||||
<component id="system.core0.icache" name="icache">
|
||||
<!-- there are no write requests to icache, although writes happen to it after a miss,
|
||||
which is actually a replacement -->
|
||||
<param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
|
||||
<!-- cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate -->
|
||||
<param name="buffer_sizes" value="16, 16, 16,0"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="read_misses" value="0"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dtlb" name="dtlb">
|
||||
<param name="number_entries" value="64"/>
|
||||
<stat name="total_accesses" value="200000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dcache" name="dcache">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="dcache_config" value="8192,16,4,1,1,3,16,0"/>
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.BTB" name="BTB">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="BTB_config" value="8192,4,2,1, 1,3"/>
|
||||
<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
</component>
|
||||
</component>
|
||||
<component id="system.L1Directory0" name="L1Directory0">
|
||||
<param name="Directory_type" value="0"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache -->
|
||||
<param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="8, 8, 8, 8"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="1200"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="800000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="20"/>
|
||||
<stat name="duty_cycle" value="0.45"/>
|
||||
</component>
|
||||
<component id="system.L2Directory0" name="L2Directory0">
|
||||
<param name="Directory_type" value="1"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache -->
|
||||
<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="8, 8, 8, 8"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="1200"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="58824"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="100"/>
|
||||
<stat name="duty_cycle" value="0.45"/>
|
||||
</component>
|
||||
<component id="system.L20" name="L20">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="L2_config" value="786432,64,16,1, 4,23, 64, 1"/>
|
||||
<!-- consider 4-way bank interleaving for Niagara 1 -->
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<param name="clockrate" value="1200"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="write_accesses" value="0"/>
|
||||
<stat name="read_misses" value="0"/>
|
||||
<stat name="write_misses" value="0"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="0.5"/>
|
||||
</component>
|
||||
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.L30" name="L30">
|
||||
<param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
|
||||
<param name="clockrate" value="3500"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="58824"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="0.35"/>
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.NoC0" name="noc0">
|
||||
<param name="clockrate" value="1200"/>
|
||||
<param name="type" value="1"/>
|
||||
<!-- 1 NoC, 0 bus -->
|
||||
<param name="horizontal_nodes" value="2"/>
|
||||
<param name="vertical_nodes" value="1"/>
|
||||
<param name="has_global_link" value="0"/>
|
||||
<!-- 1 has global link, 0 does not have global link -->
|
||||
<param name="link_throughput" value="1"/><!--w.r.t clock -->
|
||||
<param name="link_latency" value="1"/><!--w.r.t clock -->
|
||||
<!-- throughput >= latency -->
|
||||
<!-- Router architecture -->
|
||||
<param name="input_ports" value="8"/>
|
||||
<param name="output_ports" value="5"/>
|
||||
<param name="virtual_channel_per_port" value="1"/>
|
||||
<!-- input buffer; in classic routers only input ports need buffers -->
|
||||
<param name="flit_bits" value="136"/>
|
||||
<param name="input_buffer_entries_per_vc" value="2"/><!--VCs within the same ports share input buffers whose size is proportional to the number of VCs-->
|
||||
<param name="chip_coverage" value="1"/>
|
||||
<!-- When multiple NOC present, one NOC will cover part of the whole chip. chip_coverage <=1 -->
|
||||
<stat name="total_accesses" value="360000"/>
|
||||
<!-- This is the number of total accesses within the whole network not for each router -->
|
||||
<stat name="duty_cycle" value="0.6"/>
|
||||
</component>
|
||||
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.mem" name="mem">
|
||||
<!-- Main memory property -->
|
||||
<param name="mem_tech_node" value="32"/>
|
||||
<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
|
||||
<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
|
||||
<param name="internal_prefetch_of_DRAM_chip" value="4"/>
|
||||
<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
|
||||
<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
|
||||
<!-- above numbers can be easily found from Wikipedia -->
|
||||
<param name="capacity_per_channel" value="4096"/> <!-- MB -->
|
||||
<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
|
||||
Current McPAT assumes single DIMMs are used.-->
|
||||
<param name="number_ranks" value="2"/>
|
||||
<param name="num_banks_of_DRAM_chip" value="8"/>
|
||||
<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
|
||||
<param name="output_width_of_DRAM_chip" value="8"/>
|
||||
<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
|
||||
<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
|
||||
<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
|
||||
<param name="burstlength_of_DRAM_chip" value="8"/>
|
||||
<stat name="memory_accesses" value="1052"/>
|
||||
<stat name="memory_reads" value="1052"/>
|
||||
<stat name="memory_writes" value="1052"/>
|
||||
</component>
|
||||
<component id="system.mc" name="mc">
|
||||
<!-- Memory controllers are for DDR(2,3...) DIMMs -->
|
||||
<!-- current version of McPAT uses published values for base parameters of memory controller
|
||||
improvements on MC will be added in later versions. -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1-->
|
||||
<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
|
||||
<param name="block_size" value="64"/><!--B-->
|
||||
<param name="number_mcs" value="4"/>
|
||||
<!-- current McPAT only supports homogeneous memory controllers -->
|
||||
<param name="memory_channels_per_mc" value="1"/>
|
||||
<param name="number_ranks" value="2"/>
|
||||
<param name="withPHY" value="0"/>
|
||||
<!-- # of ranks of each channel-->
|
||||
<param name="req_window_size_per_channel" value="32"/>
|
||||
<param name="IO_buffer_size_per_channel" value="32"/>
|
||||
<param name="databus_width" value="128"/>
|
||||
<param name="addressbus_width" value="51"/>
|
||||
<!-- McPAT will add the control bus width to the addressbus width automatically -->
|
||||
<stat name="memory_accesses" value="33333"/>
|
||||
<stat name="memory_reads" value="16667"/>
|
||||
<stat name="memory_writes" value="16667"/>
|
||||
<!-- McPAT does not track individual mc, instead, it takes the total accesses and calculate
|
||||
the average power per MC or per channel. This is sufficient for most applications.
|
||||
Further trackdown can be easily added in later versions. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.niu" name="niu">
|
||||
<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller -->
|
||||
<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns.
|
||||
the low bound of clock rate of a 10Gb MAC is 150Mhz -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="clockrate" value="350"/>
|
||||
<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only has one port -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual nic, instead, it takes the total accesses and calculate
|
||||
the average power per nic or per channel. This is sufficient for most applications. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.pcie" name="pcie">
|
||||
<!-- On chip PCIe controller, including Phy-->
|
||||
<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns.
|
||||
the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="withPHY" value="1"/>
|
||||
<param name="clockrate" value="350"/>
|
||||
<param name="number_units" value="0"/>
|
||||
<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual pcie controllers, instead, it takes the total accesses and calculate
|
||||
the average power per pcie controller or per channel. This is sufficient for most applications. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.flashc" name="flashc">
|
||||
<param name="number_flashcs" value="0"/>
|
||||
<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="withPHY" value="1"/>
|
||||
<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual flash controller, instead, it takes the total accesses and calculate
|
||||
the average power per fc or per channel. This is sufficient for most applications -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
|
||||
</component>
|
||||
</component>
|
400
ext/mcpat/Niagara1_sharing.xml
Normal file
400
ext/mcpat/Niagara1_sharing.xml
Normal file
|
@ -0,0 +1,400 @@
|
|||
<?xml version="1.0" ?>
|
||||
<component id="root" name="root">
|
||||
<component id="system" name="system">
|
||||
<!--McPAT will skip the components if number is set to 0 -->
|
||||
<param name="number_of_cores" value="64"/>
|
||||
<param name="number_of_L1Directories" value="0"/>
|
||||
<param name="number_of_L2Directories" value="0"/>
|
||||
<param name="number_of_L2s" value="64"/> <!-- This number is how many L2 clusters there are; each cluster can have multiple banks/ports -->
|
||||
<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
|
||||
<param name="number_of_NoCs" value="1"/>
|
||||
<param name="homogeneous_cores" value="1"/><!--1 means homo -->
|
||||
<param name="homogeneous_L2s" value="1"/>
|
||||
<param name="homogeneous_L1Directorys" value="1"/>
|
||||
<param name="homogeneous_L2Directorys" value="1"/>
|
||||
<param name="homogeneous_L3s" value="1"/>
|
||||
<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
|
||||
<param name="homogeneous_NoCs" value="1"/>
|
||||
<param name="core_tech_node" value="22"/><!-- nm -->
|
||||
<param name="target_core_clockrate" value="3500"/><!--MHz -->
|
||||
<param name="temperature" value="360"/> <!-- Kelvin -->
|
||||
<param name="number_cache_levels" value="2"/>
|
||||
<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
|
||||
<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) -->
|
||||
<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
|
||||
<param name="machine_bits" value="64"/>
|
||||
<param name="virtual_address_width" value="64"/>
|
||||
<param name="physical_address_width" value="52"/>
|
||||
<param name="virtual_memory_page_size" value="4096"/>
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!--This page size (B) is completely different from the page size in the main memory section. This page size is the size of
|
||||
virtual memory from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank -->
|
||||
<!-- *********************** cores ******************* -->
|
||||
<component id="system.core0" name="core0">
|
||||
<!-- Core property -->
|
||||
<param name="clock_rate" value="3500"/>
|
||||
<param name="instruction_length" value="32"/>
|
||||
<param name="opcode_width" value="9"/>
|
||||
<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller
|
||||
default value is machine_bits, if not set -->
|
||||
<param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
|
||||
<!-- inorder/OoO -->
|
||||
<param name="number_hardware_threads" value="4"/>
|
||||
<!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor,
|
||||
it only may be more than one in SMT processors. BTB ports always equals to fetch ports since
|
||||
branch information in consecutive branch instructions in the same fetch group can be read out from the BTB at once.-->
|
||||
<param name="fetch_width" value="1"/>
|
||||
<!-- fetch_width determines the size of cachelines of L1 cache block -->
|
||||
<param name="number_instruction_fetch_ports" value="1"/>
|
||||
<param name="decode_width" value="1"/>
|
||||
<!-- decode_width determines the number of ports of the
|
||||
renaming table (both RAM and CAM) scheme -->
|
||||
<param name="issue_width" value="1"/>
|
||||
<!-- issue_width determines the number of ports of Issue window and other logic
|
||||
as in the complexity-effective processors paper; issue_width==dispatch_width -->
|
||||
<param name="commit_width" value="1"/>
|
||||
<!-- commit_width determines the number of ports of register files -->
|
||||
<param name="fp_issue_width" value="1"/>
|
||||
<param name="prediction_width" value="0"/>
|
||||
<!-- number of branch instructions that can be predicted simultaneously -->
|
||||
<!-- Current version of McPAT does not distinguish int and floating point pipelines
|
||||
These parameters are reserved for future use.-->
|
||||
<param name="pipelines_per_core" value="1,1"/>
|
||||
<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
|
||||
<param name="pipeline_depth" value="6,6"/>
|
||||
<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
|
||||
<!-- issue and exe unit-->
|
||||
<param name="ALU_per_core" value="1"/>
|
||||
<!-- contains an adder, a shifter, and a logical unit -->
|
||||
<param name="MUL_per_core" value="1"/>
|
||||
<!-- For MUL and Div -->
|
||||
<param name="FPU_per_core" value="0.125"/>
|
||||
<!-- buffer between IF and ID stage -->
|
||||
<param name="instruction_buffer_size" value="16"/>
|
||||
<!-- buffer between ID and sche/exe stage -->
|
||||
<param name="decoded_stream_buffer_size" value="16"/>
|
||||
<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
|
||||
<!-- McPAT supports 2 types of OoO cores, RS based and physical reg based-->
|
||||
<param name="instruction_window_size" value="16"/>
|
||||
<param name="fp_instruction_window_size" value="16"/>
|
||||
<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
|
||||
<param name="ROB_size" value="80"/>
|
||||
<!-- each in-flight instruction has an entry in ROB -->
|
||||
<!-- registers -->
|
||||
<param name="archi_Regs_IRF_size" value="32"/>
|
||||
<param name="archi_Regs_FRF_size" value="32"/>
|
||||
<!-- if OoO processor, phy_reg number is needed for renaming logic,
|
||||
renaming logic is for both integer and floating point insts. -->
|
||||
<param name="phy_Regs_IRF_size" value="80"/>
|
||||
<param name="phy_Regs_FRF_size" value="80"/>
|
||||
<!-- rename logic -->
|
||||
<param name="rename_scheme" value="0"/>
|
||||
<!-- can be RAM based(0) or CAM based(1) rename scheme
|
||||
RAM-based scheme will have free list, status table;
|
||||
CAM-based scheme have the valid bit in the data field of the CAM
|
||||
both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
|
||||
Detailed RAT Implementation see TR -->
|
||||
<param name="register_windows_size" value="8"/>
|
||||
<!-- how many windows in the windowed register file, sun processors;
|
||||
no register windowing is used when this number is 0 -->
|
||||
<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha),
|
||||
They will always try to execute out-of-order though. -->
|
||||
<param name="LSU_order" value="inorder"/>
|
||||
<param name="store_buffer_size" value="32"/>
|
||||
<!-- By default, in-order cores do not have load buffers -->
|
||||
<param name="load_buffer_size" value="32"/>
|
||||
<!-- number of ports refer to sustainable concurrent memory accesses -->
|
||||
<param name="memory_ports" value="1"/>
|
||||
<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
|
||||
as well as the ports of Dcache which is connected to LSU -->
|
||||
<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
|
||||
<param name="RAS_size" value="32"/>
|
||||
<!-- general stats, defines simulation periods; requires total, idle, and busy cycles for sanity check -->
|
||||
<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
|
||||
<stat name="total_instructions" value="800000"/>
|
||||
<stat name="int_instructions" value="600000"/>
|
||||
<stat name="fp_instructions" value="20000"/>
|
||||
<stat name="branch_instructions" value="0"/>
|
||||
<stat name="branch_mispredictions" value="0"/>
|
||||
<stat name="load_instructions" value="100000"/>
|
||||
<stat name="store_instructions" value="100000"/>
|
||||
<stat name="committed_instructions" value="800000"/>
|
||||
<stat name="committed_int_instructions" value="600000"/>
|
||||
<stat name="committed_fp_instructions" value="20000"/>
|
||||
<stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogeneous -->
|
||||
<!-- the following cycle stats are used for heterogeneous cores only,
|
||||
please ignore them if the cores are homogeneous -->
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!-- instruction buffer stats -->
|
||||
<!-- ROB stats, both RS and Phy based OoOs have ROB
|
||||
performance simulator should capture the difference on accesses,
|
||||
otherwise, McPAT has to guess based on number of committed instructions. -->
|
||||
<stat name="ROB_reads" value="263886"/>
|
||||
<stat name="ROB_writes" value="263886"/>
|
||||
<!-- RAT accesses -->
|
||||
<stat name="rename_accesses" value="263886"/>
|
||||
<stat name="fp_rename_accesses" value="263886"/>
|
||||
<!-- decode and rename stage use this, should be total ic - nop -->
|
||||
<!-- Inst window stats -->
|
||||
<stat name="inst_window_reads" value="263886"/>
|
||||
<stat name="inst_window_writes" value="263886"/>
|
||||
<stat name="inst_window_wakeup_accesses" value="263886"/>
|
||||
<stat name="fp_inst_window_reads" value="263886"/>
|
||||
<stat name="fp_inst_window_writes" value="263886"/>
|
||||
<stat name="fp_inst_window_wakeup_accesses" value="263886"/>
|
||||
<!-- RF accesses -->
|
||||
<stat name="int_regfile_reads" value="1600000"/>
|
||||
<stat name="float_regfile_reads" value="40000"/>
|
||||
<stat name="int_regfile_writes" value="800000"/>
|
||||
<stat name="float_regfile_writes" value="20000"/>
|
||||
<!-- accesses to the working reg -->
|
||||
<stat name="function_calls" value="5"/>
|
||||
<stat name="context_switches" value="260343"/>
|
||||
<!-- Number of window switches (number of function calls and returns)-->
|
||||
<!-- Alu stats by default, the processor has one FPU that includes the divider and
|
||||
multiplier. The fpu accesses should include accesses to multiplier and divider -->
|
||||
<stat name="ialu_accesses" value="800000"/>
|
||||
<stat name="fpu_accesses" value="10000"/>
|
||||
<stat name="mul_accesses" value="100000"/>
|
||||
<stat name="cdb_alu_accesses" value="1000000"/>
|
||||
<stat name="cdb_mul_accesses" value="0"/>
|
||||
<stat name="cdb_fpu_accesses" value="0"/>
|
||||
<!-- multiple cycle accesses should be counted multiple times,
|
||||
otherwise, McPAT can use internal counter for different floating point instructions
|
||||
to get final accesses. But that needs detailed info for floating point inst mix -->
|
||||
<!-- currently the performance simulator should
|
||||
make sure all the numbers are final numbers,
|
||||
including the explicit read/write accesses,
|
||||
and the implicit accesses such as replacements, etc.
|
||||
Future versions of McPAT may be able to infer the implicit accesses
|
||||
based on param and stats of last level cache
|
||||
The same rule applies to all cache access stats too! -->
|
||||
<!-- following is AF for max power computation.
|
||||
Do not change them, unless you understand them-->
|
||||
<stat name="IFU_duty_cycle" value="0.25"/>
|
||||
<stat name="LSU_duty_cycle" value="0.25"/>
|
||||
<stat name="MemManU_I_duty_cycle" value="1"/>
|
||||
<stat name="MemManU_D_duty_cycle" value="0.25"/>
|
||||
<stat name="ALU_duty_cycle" value="0.9"/>
|
||||
<stat name="MUL_duty_cycle" value="0.5"/>
|
||||
<stat name="FPU_duty_cycle" value="0.4"/>
|
||||
<stat name="ALU_cdb_duty_cycle" value="0.9"/>
|
||||
<stat name="MUL_cdb_duty_cycle" value="0.5"/>
|
||||
<stat name="FPU_cdb_duty_cycle" value="0.4"/>
|
||||
<component id="system.core0.predictor" name="PBT">
|
||||
<!-- branch predictor; tournament predictor see Alpha implementation -->
|
||||
<param name="local_predictor_size" value="10,3"/>
|
||||
<param name="local_predictor_entries" value="1024"/>
|
||||
<param name="global_predictor_entries" value="4096"/>
|
||||
<param name="global_predictor_bits" value="2"/>
|
||||
<param name="chooser_predictor_entries" value="4096"/>
|
||||
<param name="chooser_predictor_bits" value="2"/>
|
||||
<!-- These parameters can be combined like below in next version
|
||||
<param name="load_predictor" value="10,3,1024"/>
|
||||
<param name="global_predictor" value="4096,2"/>
|
||||
<param name="predictor_chooser" value="4096,2"/>
|
||||
-->
|
||||
</component>
|
||||
<component id="system.core0.itlb" name="itlb">
|
||||
<param name="number_entries" value="64"/>
|
||||
<stat name="total_accesses" value="800000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<!-- there is no write requests to itlb although writes happen to itlb after miss,
|
||||
which is actually a replacement -->
|
||||
</component>
|
||||
<component id="system.core0.icache" name="icache">
|
||||
<!-- there are no write requests to icache although writes happen to it after a miss,
|
||||
which is actually a replacement -->
|
||||
<param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
|
||||
<!-- cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate -->
|
||||
<param name="buffer_sizes" value="16, 16, 16,0"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="read_misses" value="0"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dtlb" name="dtlb">
|
||||
<param name="number_entries" value="64"/>
|
||||
<stat name="total_accesses" value="200000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dcache" name="dcache">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="dcache_config" value="8192,16,4,1,1,3,16,0"/>
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.BTB" name="BTB">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="BTB_config" value="8192,4,2,1, 1,3"/>
|
||||
<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
</component>
|
||||
</component>
|
||||
<component id="system.L1Directory0" name="L1Directory0">
|
||||
<param name="Directory_type" value="0"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache -->
|
||||
<param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="8, 8, 8, 8"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="3500"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="800000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="20"/>
|
||||
<stat name="duty_cycle" value="0.45"/>
|
||||
</component>
|
||||
<component id="system.L2Directory0" name="L2Directory0">
|
||||
<param name="Directory_type" value="1"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache -->
|
||||
<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="8, 8, 8, 8"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="3500"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="58824"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="100"/>
|
||||
<stat name="duty_cycle" value="0.45"/>
|
||||
</component>
|
||||
<component id="system.L20" name="L20">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="L2_config" value="1048576,64,16,1, 4,23, 64, 1"/>
|
||||
<!-- consider 4-way bank interleaving for Niagara 1 -->
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<param name="clockrate" value="3500"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="write_accesses" value="0"/>
|
||||
<stat name="read_misses" value="0"/>
|
||||
<stat name="write_misses" value="0"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="0.5"/>
|
||||
</component>
|
||||
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.L30" name="L30">
|
||||
<param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
|
||||
<param name="clockrate" value="3500"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="58824"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="0.35"/>
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.NoC0" name="noc0">
|
||||
<param name="clockrate" value="3500"/>
|
||||
<param name="type" value="1"/>
|
||||
<!-- 1 NoC, 0 bus -->
|
||||
<param name="horizontal_nodes" value="8"/>
|
||||
<param name="vertical_nodes" value="8"/>
|
||||
<param name="has_global_link" value="1"/>
|
||||
<!-- 1 has global link, 0 does not have global link -->
|
||||
<param name="link_throughput" value="1"/><!--w.r.t clock -->
|
||||
<param name="link_latency" value="1"/><!--w.r.t clock -->
|
||||
<!-- throughput >= latency -->
|
||||
<!-- Router architecture -->
|
||||
<param name="input_ports" value="5"/>
|
||||
<param name="output_ports" value="5"/>
|
||||
<param name="virtual_channel_per_port" value="1"/>
|
||||
<!-- input buffer; in classic routers only input ports need buffers -->
|
||||
<param name="flit_bits" value="256"/>
|
||||
<param name="input_buffer_entries_per_vc" value="4"/><!--VCs within the same port share input buffers whose size is proportional to the number of VCs-->
|
||||
<param name="chip_coverage" value="1"/>
|
||||
<!-- When multiple NoCs are present, one NoC will cover part of the whole chip. chip_coverage <=1 -->
|
||||
<stat name="total_accesses" value="360000"/>
|
||||
<!-- This is the number of total accesses within the whole network not for each router -->
|
||||
<stat name="duty_cycle" value="0.1"/>
|
||||
</component>
|
||||
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.mem" name="mem">
|
||||
<!-- Main memory property -->
|
||||
<param name="mem_tech_node" value="32"/>
|
||||
<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
|
||||
<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
|
||||
<param name="internal_prefetch_of_DRAM_chip" value="4"/>
|
||||
<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
|
||||
<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
|
||||
<!-- above numbers can be easily found from Wikipedia -->
|
||||
<param name="capacity_per_channel" value="4096"/> <!-- MB -->
|
||||
<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
|
||||
Current McPAT assumes single DIMMs are used.-->
|
||||
<param name="number_ranks" value="2"/>
|
||||
<param name="num_banks_of_DRAM_chip" value="8"/>
|
||||
<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
|
||||
<param name="output_width_of_DRAM_chip" value="8"/>
|
||||
<!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
|
||||
<!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
|
||||
<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
|
||||
<param name="burstlength_of_DRAM_chip" value="8"/>
|
||||
<stat name="memory_accesses" value="1052"/>
|
||||
<stat name="memory_reads" value="1052"/>
|
||||
<stat name="memory_writes" value="1052"/>
|
||||
</component>
|
||||
<component id="system.mc" name="mc">
|
||||
<!-- Memory controllers are for DDR(2,3...) DIMMs -->
|
||||
<!-- current version of McPAT uses published values for base parameters of memory controller
|
||||
improvements on MC will be added in later versions. -->
|
||||
<param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1-->
|
||||
<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
|
||||
<param name="llc_line_length" value="64"/><!--B-->
|
||||
<param name="number_mcs" value="4"/>
|
||||
<!-- current McPAT only supports homogeneous memory controllers -->
|
||||
<param name="memory_channels_per_mc" value="1"/>
|
||||
<param name="number_ranks" value="2"/>
|
||||
<!-- # of ranks of each channel-->
|
||||
<param name="req_window_size_per_channel" value="32"/>
|
||||
<param name="IO_buffer_size_per_channel" value="32"/>
|
||||
<param name="databus_width" value="128"/>
|
||||
<param name="addressbus_width" value="51"/>
|
||||
<!-- McPAT will add the control bus width to the addressbus width automatically -->
|
||||
<stat name="memory_accesses" value="33333"/>
|
||||
<stat name="memory_reads" value="16667"/>
|
||||
<stat name="memory_writes" value="16667"/>
|
||||
<!-- McPAT does not track individual mc, instead, it takes the total accesses and calculate
|
||||
the average power per MC or per channel. This is sufficient for most applications.
|
||||
Further trackdown can be easily added in later versions. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
</component>
|
||||
</component>
|
442
ext/mcpat/Niagara1_sharing_DC.xml
Normal file
442
ext/mcpat/Niagara1_sharing_DC.xml
Normal file
|
@ -0,0 +1,442 @@
|
|||
<?xml version="1.0" ?>
|
||||
<component id="root" name="root">
|
||||
<component id="system" name="system">
|
||||
<!--McPAT will skip the components if number is set to 0 -->
|
||||
<param name="number_of_cores" value="64"/>
|
||||
<param name="number_of_L1Directories" value="0"/>
|
||||
<param name="number_of_L2Directories" value="8"/>
|
||||
<param name="number_of_L2s" value="64"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports -->
|
||||
<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
|
||||
<param name="number_of_NoCs" value="1"/>
|
||||
<param name="homogeneous_cores" value="1"/><!--1 means homo -->
|
||||
<param name="homogeneous_L2s" value="1"/>
|
||||
<param name="homogeneous_L1Directorys" value="1"/>
|
||||
<param name="homogeneous_L2Directorys" value="1"/>
|
||||
<param name="homogeneous_L3s" value="1"/>
|
||||
<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
|
||||
<param name="homogeneous_NoCs" value="1"/>
|
||||
<param name="core_tech_node" value="22"/><!-- nm -->
|
||||
<param name="target_core_clockrate" value="3500"/><!--MHz -->
|
||||
<param name="temperature" value="360"/> <!-- Kelvin -->
|
||||
<param name="number_cache_levels" value="2"/>
|
||||
<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
|
||||
<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) -->
|
||||
<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
|
||||
<param name="machine_bits" value="64"/>
|
||||
<param name="virtual_address_width" value="64"/>
|
||||
<param name="physical_address_width" value="52"/>
|
||||
<param name="virtual_memory_page_size" value="4096"/>
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!--This page size (B) is completely different from the page size in the main memory section; this page size is the size of
|
||||
virtual memory from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank -->
|
||||
<!-- *********************** cores ******************* -->
|
||||
<component id="system.core0" name="core0">
|
||||
<!-- Core property -->
|
||||
<param name="clock_rate" value="3500"/>
|
||||
<param name="instruction_length" value="32"/>
|
||||
<param name="opcode_width" value="9"/>
|
||||
<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller
|
||||
default value is machine_bits, if not set -->
|
||||
<param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
|
||||
<!-- inorder/OoO -->
|
||||
<param name="number_hardware_threads" value="4"/>
|
||||
<!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor,
|
||||
it may only be more than one in SMT processors. BTB ports always equal fetch ports since
|
||||
branch information in consecutive branch instructions in the same fetch group can be read out from BTB once.-->
|
||||
<param name="fetch_width" value="1"/>
|
||||
<!-- fetch_width determines the size of cachelines of L1 cache block -->
|
||||
<param name="number_instruction_fetch_ports" value="1"/>
|
||||
<param name="decode_width" value="1"/>
|
||||
<!-- decode_width determines the number of ports of the
|
||||
renaming table (both RAM and CAM) scheme -->
|
||||
<param name="issue_width" value="1"/>
|
||||
<!-- issue_width determines the number of ports of Issue window and other logic
|
||||
as in the complexity effective processors paper; issue_width==dispatch_width -->
|
||||
<param name="commit_width" value="1"/>
|
||||
<!-- commit_width determines the number of ports of register files -->
|
||||
<param name="fp_issue_width" value="1"/>
|
||||
<param name="prediction_width" value="0"/>
|
||||
<!-- number of branch instructions that can be predicted simultaneously -->
|
||||
<!-- Current version of McPAT does not distinguish int and floating point pipelines
|
||||
These parameters are reserved for future use.-->
|
||||
<param name="pipelines_per_core" value="1,1"/>
|
||||
<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
|
||||
<param name="pipeline_depth" value="6,6"/>
|
||||
<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
|
||||
<!-- issue and exe unit-->
|
||||
<param name="ALU_per_core" value="1"/>
|
||||
<!-- contains an adder, a shifter, and a logical unit -->
|
||||
<param name="MUL_per_core" value="1"/>
|
||||
<!-- For MUL and Div -->
|
||||
<param name="FPU_per_core" value="0.125"/>
|
||||
<!-- buffer between IF and ID stage -->
|
||||
<param name="instruction_buffer_size" value="16"/>
|
||||
<!-- buffer between ID and sche/exe stage -->
|
||||
<param name="decoded_stream_buffer_size" value="16"/>
|
||||
<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
|
||||
<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
|
||||
<param name="instruction_window_size" value="16"/>
|
||||
<param name="fp_instruction_window_size" value="16"/>
|
||||
<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
|
||||
<param name="ROB_size" value="80"/>
|
||||
<!-- each in-flight instruction has an entry in ROB -->
|
||||
<!-- registers -->
|
||||
<param name="archi_Regs_IRF_size" value="32"/>
|
||||
<param name="archi_Regs_FRF_size" value="32"/>
|
||||
<!-- if OoO processor, phy_reg number is needed for renaming logic,
|
||||
renaming logic is for both integer and floating point insts. -->
|
||||
<param name="phy_Regs_IRF_size" value="80"/>
|
||||
<param name="phy_Regs_FRF_size" value="80"/>
|
||||
<!-- rename logic -->
|
||||
<param name="rename_scheme" value="0"/>
|
||||
<!-- can be RAM based(0) or CAM based(1) rename scheme
|
||||
RAM-based scheme will have free list, status table;
|
||||
CAM-based scheme have the valid bit in the data field of the CAM
|
||||
both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
|
||||
Detailed RAT Implementation see TR -->
|
||||
<param name="register_windows_size" value="8"/>
|
||||
<!-- how many windows in the windowed register file, sun processors;
|
||||
no register windowing is used when this number is 0 -->
|
||||
<!-- In OoO cores, loads and stores can be issued whether inorder(Pentium Pro) or (OoO)out-of-order(Alpha),
|
||||
They will always try to execute out-of-order though. -->
|
||||
<param name="LSU_order" value="inorder"/>
|
||||
<param name="store_buffer_size" value="32"/>
|
||||
<!-- By default, in-order cores do not have load buffers -->
|
||||
<param name="load_buffer_size" value="32"/>
|
||||
<!-- number of ports refer to sustainable concurrent memory accesses -->
|
||||
<param name="memory_ports" value="1"/>
|
||||
<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
|
||||
as well as the ports of Dcache which is connected to LSU -->
|
||||
<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
|
||||
<param name="RAS_size" value="32"/>
|
||||
<!-- general stats, defines simulation periods; require total, idle, and busy cycles for sanity check -->
|
||||
<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
|
||||
<stat name="total_instructions" value="800000"/>
|
||||
<stat name="int_instructions" value="600000"/>
|
||||
<stat name="fp_instructions" value="20000"/>
|
||||
<stat name="branch_instructions" value="0"/>
|
||||
<stat name="branch_mispredictions" value="0"/>
|
||||
<stat name="load_instructions" value="100000"/>
|
||||
<stat name="store_instructions" value="100000"/>
|
||||
<stat name="committed_instructions" value="800000"/>
|
||||
<stat name="committed_int_instructions" value="600000"/>
|
||||
<stat name="committed_fp_instructions" value="20000"/>
|
||||
<stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
|
||||
<!-- the following cycle stats are used for heterogeneous cores only,
|
||||
please ignore them if homogeneous cores -->
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!-- instruction buffer stats -->
|
||||
<!-- ROB stats, both RS and Phy based OoOs have ROB
|
||||
performance simulator should capture the difference on accesses,
|
||||
otherwise, McPAT has to guess based on number of committed instructions. -->
|
||||
<stat name="ROB_reads" value="263886"/>
|
||||
<stat name="ROB_writes" value="263886"/>
|
||||
<!-- RAT accesses -->
|
||||
<stat name="rename_accesses" value="263886"/>
|
||||
<stat name="fp_rename_accesses" value="263886"/>
|
||||
<!-- decode and rename stage use this, should be total ic - nop -->
|
||||
<!-- Inst window stats -->
|
||||
<stat name="inst_window_reads" value="263886"/>
|
||||
<stat name="inst_window_writes" value="263886"/>
|
||||
<stat name="inst_window_wakeup_accesses" value="263886"/>
|
||||
<stat name="fp_inst_window_reads" value="263886"/>
|
||||
<stat name="fp_inst_window_writes" value="263886"/>
|
||||
<stat name="fp_inst_window_wakeup_accesses" value="263886"/>
|
||||
<!-- RF accesses -->
|
||||
<stat name="int_regfile_reads" value="1600000"/>
|
||||
<stat name="float_regfile_reads" value="40000"/>
|
||||
<stat name="int_regfile_writes" value="800000"/>
|
||||
<stat name="float_regfile_writes" value="20000"/>
|
||||
<!-- accesses to the working reg -->
|
||||
<stat name="function_calls" value="5"/>
|
||||
<stat name="context_switches" value="260343"/>
|
||||
<!-- Number of window switches (number of function calls and returns)-->
|
||||
<!-- Alu stats by default, the processor has one FPU that includes the divider and
|
||||
multiplier. The fpu accesses should include accesses to multiplier and divider -->
|
||||
<stat name="ialu_accesses" value="800000"/>
|
||||
<stat name="fpu_accesses" value="10000"/>
|
||||
<stat name="mul_accesses" value="100000"/>
|
||||
<stat name="cdb_alu_accesses" value="1000000"/>
|
||||
<stat name="cdb_mul_accesses" value="0"/>
|
||||
<stat name="cdb_fpu_accesses" value="0"/>
|
||||
<!-- multiple cycle accesses should be counted multiple times,
|
||||
otherwise, McPAT can use internal counter for different floating point instructions
|
||||
to get final accesses. But that needs detailed info for floating point inst mix -->
|
||||
<!-- currently the performance simulator should
|
||||
make sure all the numbers are final numbers,
|
||||
including the explicit read/write accesses,
|
||||
and the implicit accesses such as replacements, etc.
|
||||
Future versions of McPAT may be able to infer the implicit accesses
|
||||
based on param and stats of last level cache
|
||||
The same rule applies to all cache access stats too! -->
|
||||
<!-- following is AF for max power computation.
|
||||
Do not change them, unless you understand them-->
|
||||
<stat name="IFU_duty_cycle" value="0.25"/>
|
||||
<stat name="LSU_duty_cycle" value="0.25"/>
|
||||
<stat name="MemManU_I_duty_cycle" value="1"/>
|
||||
<stat name="MemManU_D_duty_cycle" value="0.25"/>
|
||||
<stat name="ALU_duty_cycle" value="0.9"/>
|
||||
<stat name="MUL_duty_cycle" value="0.5"/>
|
||||
<stat name="FPU_duty_cycle" value="0.4"/>
|
||||
<stat name="ALU_cdb_duty_cycle" value="0.9"/>
|
||||
<stat name="MUL_cdb_duty_cycle" value="0.5"/>
|
||||
<stat name="FPU_cdb_duty_cycle" value="0.4"/>
|
||||
<component id="system.core0.predictor" name="PBT">
|
||||
<!-- branch predictor; tournament predictor see Alpha implementation -->
|
||||
<param name="local_predictor_size" value="10,3"/>
|
||||
<param name="local_predictor_entries" value="1024"/>
|
||||
<param name="global_predictor_entries" value="4096"/>
|
||||
<param name="global_predictor_bits" value="2"/>
|
||||
<param name="chooser_predictor_entries" value="4096"/>
|
||||
<param name="chooser_predictor_bits" value="2"/>
|
||||
<!-- These parameters can be combined like below in next version
|
||||
<param name="load_predictor" value="10,3,1024"/>
|
||||
<param name="global_predictor" value="4096,2"/>
|
||||
<param name="predictor_chooser" value="4096,2"/>
|
||||
-->
|
||||
</component>
|
||||
<component id="system.core0.itlb" name="itlb">
|
||||
<param name="number_entries" value="64"/>
|
||||
<stat name="total_accesses" value="800000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<!-- there is no write requests to itlb although writes happen to itlb after miss,
|
||||
which is actually a replacement -->
|
||||
</component>
|
||||
<component id="system.core0.icache" name="icache">
|
||||
<!-- there are no write requests to icache although writes happen to it after a miss,
|
||||
which is actually a replacement -->
|
||||
<param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
|
||||
<!-- cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate -->
|
||||
<param name="buffer_sizes" value="16, 16, 16,0"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="read_misses" value="0"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dtlb" name="dtlb">
|
||||
<param name="number_entries" value="64"/>
|
||||
<stat name="total_accesses" value="200000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dcache" name="dcache">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="dcache_config" value="8192,16,4,1,1,3,16,0"/>
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.BTB" name="BTB">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="BTB_config" value="8192,4,2,1, 1,3"/>
|
||||
<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
</component>
|
||||
</component>
|
||||
<component id="system.L1Directory0" name="L1Directory0">
|
||||
<param name="Directory_type" value="0"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache -->
|
||||
<param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="8, 8, 8, 8"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="3500"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="800000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="20"/>
|
||||
<stat name="duty_cycle" value="0.45"/>
|
||||
</component>
|
||||
<component id="system.L2Directory0" name="L2Directory0">
|
||||
<param name="Directory_type" value="1"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache -->
|
||||
<param name="Dir_config" value="1048576,9,16,1,2, 100"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="8, 8, 8, 8"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="3500"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="58824"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="100"/>
|
||||
<stat name="duty_cycle" value="0.45"/>
|
||||
</component>
|
||||
<component id="system.L20" name="L20">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="L2_config" value="1048576,64,16,1, 4,23, 64, 1"/>
|
||||
<!-- consider 4-way bank interleaving for Niagara 1 -->
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<param name="clockrate" value="3500"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="write_accesses" value="0"/>
|
||||
<stat name="read_misses" value="0"/>
|
||||
<stat name="write_misses" value="0"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="0.5"/>
|
||||
</component>
|
||||
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.L30" name="L30">
|
||||
<param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
|
||||
<param name="clockrate" value="3500"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="58824"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="0.35"/>
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.NoC0" name="noc0">
|
||||
<param name="clockrate" value="3500"/>
|
||||
<param name="type" value="1"/>
|
||||
<!-- 1 NoC, 0 bus -->
|
||||
<param name="horizontal_nodes" value="8"/>
|
||||
<param name="vertical_nodes" value="8"/>
|
||||
<param name="has_global_link" value="1"/>
|
||||
<!-- 1 has global link, 0 does not have global link -->
|
||||
<param name="link_throughput" value="1"/><!--w.r.t clock -->
|
||||
<param name="link_latency" value="1"/><!--w.r.t clock -->
|
||||
<!-- throughput >= latency -->
|
||||
<!-- Router architecture -->
|
||||
<param name="input_ports" value="5"/>
|
||||
<param name="output_ports" value="5"/>
|
||||
<param name="virtual_channel_per_port" value="1"/>
|
||||
<!-- input buffer; in classic routers only input ports need buffers -->
|
||||
<param name="flit_bits" value="256"/>
|
||||
<param name="input_buffer_entries_per_vc" value="4"/><!--VCs within the same ports share input buffers whose size is proportional to the number of VCs-->
|
||||
<param name="chip_coverage" value="1"/>
|
||||
<!-- When multiple NOC present, one NOC will cover part of the whole chip. chip_coverage <=1 -->
|
||||
<stat name="total_accesses" value="360000"/>
|
||||
<!-- This is the number of total accesses within the whole network not for each router -->
|
||||
<stat name="duty_cycle" value="0.1"/>
|
||||
</component>
|
||||
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.mem" name="mem">
|
||||
<!-- Main memory property -->
|
||||
<param name="mem_tech_node" value="32"/>
|
||||
<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
|
||||
<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
|
||||
<param name="internal_prefetch_of_DRAM_chip" value="4"/>
|
||||
<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
|
||||
<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
|
||||
<!-- above numbers can be easily found from Wikipedia -->
|
||||
<param name="capacity_per_channel" value="4096"/> <!-- MB -->
|
||||
<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
|
||||
Current McPAT assumes single DIMMs are used.-->
|
||||
<param name="number_ranks" value="2"/>
|
||||
<param name="num_banks_of_DRAM_chip" value="8"/>
|
||||
<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
|
||||
<param name="output_width_of_DRAM_chip" value="8"/>
|
||||
<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
|
||||
<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
|
||||
<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
|
||||
<param name="burstlength_of_DRAM_chip" value="8"/>
|
||||
<stat name="memory_accesses" value="1052"/>
|
||||
<stat name="memory_reads" value="1052"/>
|
||||
<stat name="memory_writes" value="1052"/>
|
||||
</component>
|
||||
<component id="system.mc" name="mc">
|
||||
<!-- Memory controllers are for DDR(2,3...) DIMMs -->
|
||||
<!-- current version of McPAT uses published values for base parameters of memory controller
|
||||
improvements on MC will be added in later versions. -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1-->
|
||||
<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
|
||||
<param name="block_size" value="64"/><!--B-->
|
||||
<param name="number_mcs" value="0"/>
|
||||
<!-- current McPAT only supports homogeneous memory controllers -->
|
||||
<param name="memory_channels_per_mc" value="1"/>
|
||||
<param name="number_ranks" value="2"/>
|
||||
<param name="withPHY" value="0"/>
|
||||
<!-- # of ranks of each channel-->
|
||||
<param name="req_window_size_per_channel" value="32"/>
|
||||
<param name="IO_buffer_size_per_channel" value="32"/>
|
||||
<param name="databus_width" value="128"/>
|
||||
<param name="addressbus_width" value="51"/>
|
||||
<!-- McPAT will add the control bus width to the addressbus width automatically -->
|
||||
<stat name="memory_accesses" value="33333"/>
|
||||
<stat name="memory_reads" value="16667"/>
|
||||
<stat name="memory_writes" value="16667"/>
|
||||
<!-- McPAT does not track individual mc, instead, it takes the total accesses and calculates
|
||||
the average power per MC or per channel. This is sufficient for most applications.
|
||||
Further trackdown can be easily added in later versions. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.niu" name="niu">
|
||||
<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller -->
|
||||
<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns.
|
||||
the low bound of clock rate of a 10Gb MAC is 150Mhz -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="clockrate" value="350"/>
|
||||
<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only have one port -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual nic, instead, it takes the total accesses and calculate
|
||||
the average power per nic or per channel. This is sufficient for most applications. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.pcie" name="pcie">
|
||||
<!-- On chip PCIe controller, including Phy-->
|
||||
<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns.
|
||||
the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="withPHY" value="1"/>
|
||||
<param name="clockrate" value="350"/>
|
||||
<param name="number_units" value="0"/>
|
||||
<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual pcie controllers, instead, it takes the total accesses and calculate
|
||||
the average power per pcie controller or per channel. This is sufficient for most applications. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.flashc" name="flashc">
|
||||
<param name="number_flashcs" value="0"/>
|
||||
<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="withPHY" value="1"/>
|
||||
<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual flash controller, instead, it takes the total accesses and calculate
|
||||
the average power per fc or per channel. This is sufficient for most applications -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
|
||||
</component>
|
||||
</component>
|
455
ext/mcpat/Niagara1_sharing_SBT.xml
Normal file
455
ext/mcpat/Niagara1_sharing_SBT.xml
Normal file
|
@ -0,0 +1,455 @@
|
|||
<?xml version="1.0" ?>
|
||||
<component id="root" name="root">
|
||||
<component id="system" name="system">
|
||||
<!--McPAT will skip the components if number is set to 0 -->
|
||||
<param name="number_of_cores" value="64"/>
|
||||
<param name="number_of_L1Directories" value="0"/>
|
||||
<param name="number_of_L2Directories" value="0"/>
|
||||
<param name="number_of_L2s" value="64"/> <!-- This number means how many L2 clusters; in each cluster there can be multiple banks/ports -->
|
||||
<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
|
||||
<param name="number_of_NoCs" value="1"/>
|
||||
<param name="homogeneous_cores" value="1"/><!--1 means homo -->
|
||||
<param name="homogeneous_L2s" value="1"/>
|
||||
<param name="homogeneous_L1Directorys" value="1"/>
|
||||
<param name="homogeneous_L2Directorys" value="1"/>
|
||||
<param name="homogeneous_L3s" value="1"/>
|
||||
<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
|
||||
<param name="homogeneous_NoCs" value="1"/>
|
||||
<param name="core_tech_node" value="22"/><!-- nm -->
|
||||
<param name="target_core_clockrate" value="3500"/><!--MHz -->
|
||||
<param name="temperature" value="360"/> <!-- Kelvin -->
|
||||
<param name="number_cache_levels" value="2"/>
|
||||
<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
|
||||
<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) -->
|
||||
<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
|
||||
<param name="machine_bits" value="64"/>
|
||||
<param name="virtual_address_width" value="64"/>
|
||||
<param name="physical_address_width" value="52"/>
|
||||
<param name="virtual_memory_page_size" value="4096"/>
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!--This page size(B) is completely different from the page size in Main memo section. This page size is the size of
|
||||
virtual memory from OS/Archi perspective; the page size in Main memo section is the actual physical line in a DRAM bank -->
|
||||
<!-- *********************** cores ******************* -->
|
||||
<component id="system.core0" name="core0">
|
||||
<!-- Core property -->
|
||||
<param name="clock_rate" value="3500"/>
|
||||
<param name="instruction_length" value="32"/>
|
||||
<param name="opcode_width" value="9"/>
|
||||
<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller
|
||||
default value is machine_bits, if not set -->
|
||||
<param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
|
||||
<!-- inorder/OoO -->
|
||||
<param name="number_hardware_threads" value="4"/>
|
||||
<!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor,
|
||||
it only may be more than one in SMT processors. BTB ports always equals to fetch ports since
|
||||
branch information in consecutive branch instructions in the same fetch group can be read out from BTB once.-->
|
||||
<param name="fetch_width" value="1"/>
|
||||
<!-- fetch_width determines the size of cachelines of L1 cache block -->
|
||||
<param name="number_instruction_fetch_ports" value="1"/>
|
||||
<param name="decode_width" value="1"/>
|
||||
<!-- decode_width determines the number of ports of the
|
||||
renaming table (both RAM and CAM) scheme -->
|
||||
<param name="issue_width" value="1"/>
|
||||
<!-- issue_width determines the number of ports of Issue window and other logic
|
||||
as in the complexity effective processors paper; issue_width==dispatch_width -->
|
||||
<param name="commit_width" value="1"/>
|
||||
<!-- commit_width determines the number of ports of register files -->
|
||||
<param name="fp_issue_width" value="1"/>
|
||||
<param name="prediction_width" value="0"/>
|
||||
<!-- number of branch instructions that can be predicted simultaneously -->
|
||||
<!-- Current version of McPAT does not distinguish int and floating point pipelines
|
||||
These parameters are reserved for future use.-->
|
||||
<param name="pipelines_per_core" value="1,1"/>
|
||||
<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
|
||||
<param name="pipeline_depth" value="6,6"/>
|
||||
<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
|
||||
<!-- issue and exe unit-->
|
||||
<param name="ALU_per_core" value="1"/>
|
||||
<!-- contains an adder, a shifter, and a logical unit -->
|
||||
<param name="MUL_per_core" value="1"/>
|
||||
<!-- For MUL and Div -->
|
||||
<param name="FPU_per_core" value="0.125"/>
|
||||
<!-- buffer between IF and ID stage -->
|
||||
<param name="instruction_buffer_size" value="16"/>
|
||||
<!-- buffer between ID and sche/exe stage -->
|
||||
<param name="decoded_stream_buffer_size" value="16"/>
|
||||
<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
|
||||
<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
|
||||
<param name="instruction_window_size" value="16"/>
|
||||
<param name="fp_instruction_window_size" value="16"/>
|
||||
<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
|
||||
<param name="ROB_size" value="80"/>
|
||||
<!-- each in-flight instruction has an entry in ROB -->
|
||||
<!-- registers -->
|
||||
<param name="archi_Regs_IRF_size" value="32"/>
|
||||
<param name="archi_Regs_FRF_size" value="32"/>
|
||||
<!-- if OoO processor, phy_reg number is needed for renaming logic,
|
||||
renaming logic is for both integer and floating point insts. -->
|
||||
<param name="phy_Regs_IRF_size" value="80"/>
|
||||
<param name="phy_Regs_FRF_size" value="80"/>
|
||||
<!-- rename logic -->
|
||||
<param name="rename_scheme" value="0"/>
|
||||
<!-- can be RAM based(0) or CAM based(1) rename scheme
|
||||
RAM-based scheme will have free list, status table;
|
||||
CAM-based scheme have the valid bit in the data field of the CAM
|
||||
both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
|
||||
Detailed RAT Implementation see TR -->
|
||||
<param name="register_windows_size" value="8"/>
|
||||
<!-- how many windows in the windowed register file, sun processors;
|
||||
no register windowing is used when this number is 0 -->
|
||||
<!-- In OoO cores, loads and stores can be issued either inorder(Pentium Pro) or (OoO)out-of-order(Alpha),
|
||||
They will always try to execute out-of-order though. -->
|
||||
<param name="LSU_order" value="inorder"/>
|
||||
<param name="store_buffer_size" value="32"/>
|
||||
<!-- By default, in-order cores do not have load buffers -->
|
||||
<param name="load_buffer_size" value="32"/>
|
||||
<!-- number of ports refer to sustainable concurrent memory accesses -->
|
||||
<param name="memory_ports" value="1"/>
|
||||
<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
|
||||
as well as the ports of Dcache which is connected to LSU -->
|
||||
<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
|
||||
<param name="RAS_size" value="32"/>
|
||||
<!-- general stats, defines simulation periods; require total, idle, and busy cycles for sanity check -->
|
||||
<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
|
||||
<stat name="total_instructions" value="800000"/>
|
||||
<stat name="int_instructions" value="600000"/>
|
||||
<stat name="fp_instructions" value="20000"/>
|
||||
<stat name="branch_instructions" value="0"/>
|
||||
<stat name="branch_mispredictions" value="0"/>
|
||||
<stat name="load_instructions" value="100000"/>
|
||||
<stat name="store_instructions" value="100000"/>
|
||||
<stat name="committed_instructions" value="800000"/>
|
||||
<stat name="committed_int_instructions" value="600000"/>
|
||||
<stat name="committed_fp_instructions" value="20000"/>
|
||||
<stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
|
||||
<!-- the following cycle stats are used for heterogeneous cores only,
|
||||
please ignore them if homogeneous cores -->
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!-- instruction buffer stats -->
|
||||
<!-- ROB stats, both RS and Phy based OoOs have ROB
|
||||
performance simulator should capture the difference on accesses,
|
||||
otherwise, McPAT has to guess based on number of committed instructions. -->
|
||||
<stat name="ROB_reads" value="263886"/>
|
||||
<stat name="ROB_writes" value="263886"/>
|
||||
<!-- RAT accesses -->
|
||||
<stat name="rename_accesses" value="263886"/>
|
||||
<stat name="fp_rename_accesses" value="263886"/>
|
||||
<!-- decode and rename stage use this, should be total ic - nop -->
|
||||
<!-- Inst window stats -->
|
||||
<stat name="inst_window_reads" value="263886"/>
|
||||
<stat name="inst_window_writes" value="263886"/>
|
||||
<stat name="inst_window_wakeup_accesses" value="263886"/>
|
||||
<stat name="fp_inst_window_reads" value="263886"/>
|
||||
<stat name="fp_inst_window_writes" value="263886"/>
|
||||
<stat name="fp_inst_window_wakeup_accesses" value="263886"/>
|
||||
<!-- RF accesses -->
|
||||
<stat name="int_regfile_reads" value="1600000"/>
|
||||
<stat name="float_regfile_reads" value="40000"/>
|
||||
<stat name="int_regfile_writes" value="800000"/>
|
||||
<stat name="float_regfile_writes" value="20000"/>
|
||||
<!-- accesses to the working reg -->
|
||||
<stat name="function_calls" value="5"/>
|
||||
<stat name="context_switches" value="260343"/>
|
||||
<!-- Number of window switches (number of function calls and returns)-->
|
||||
<!-- Alu stats by default, the processor has one FPU that includes the divider and
|
||||
multiplier. The fpu accesses should include accesses to multiplier and divider -->
|
||||
<stat name="ialu_accesses" value="800000"/>
|
||||
<stat name="fpu_accesses" value="10000"/>
|
||||
<stat name="mul_accesses" value="100000"/>
|
||||
<stat name="cdb_alu_accesses" value="1000000"/>
|
||||
<stat name="cdb_mul_accesses" value="0"/>
|
||||
<stat name="cdb_fpu_accesses" value="0"/>
|
||||
<!-- multiple cycle accesses should be counted multiple times,
|
||||
otherwise, McPAT can use internal counter for different floating point instructions
|
||||
to get final accesses. But that needs detailed info for floating point inst mix -->
|
||||
<!-- currently the performance simulator should
|
||||
make sure all the numbers are final numbers,
|
||||
including the explicit read/write accesses,
|
||||
and the implicit accesses such as replacements, etc.
|
||||
Future versions of McPAT may be able to reason about the implicit accesses
|
||||
based on param and stats of last level cache
|
||||
The same rule applies to all cache access stats too! -->
|
||||
<!-- following is AF for max power computation.
|
||||
Do not change them, unless you understand them-->
|
||||
<stat name="IFU_duty_cycle" value="0.25"/>
|
||||
<stat name="LSU_duty_cycle" value="0.25"/>
|
||||
<stat name="MemManU_I_duty_cycle" value="1"/>
|
||||
<stat name="MemManU_D_duty_cycle" value="0.25"/>
|
||||
<stat name="ALU_duty_cycle" value="0.9"/>
|
||||
<stat name="MUL_duty_cycle" value="0.5"/>
|
||||
<stat name="FPU_duty_cycle" value="0.4"/>
|
||||
<stat name="ALU_cdb_duty_cycle" value="0.9"/>
|
||||
<stat name="MUL_cdb_duty_cycle" value="0.5"/>
|
||||
<stat name="FPU_cdb_duty_cycle" value="0.4"/>
|
||||
<component id="system.core0.predictor" name="PBT">
|
||||
<!-- branch predictor; tournament predictor see Alpha implementation -->
|
||||
<param name="local_predictor_size" value="10,3"/>
|
||||
<param name="local_predictor_entries" value="1024"/>
|
||||
<param name="global_predictor_entries" value="4096"/>
|
||||
<param name="global_predictor_bits" value="2"/>
|
||||
<param name="chooser_predictor_entries" value="4096"/>
|
||||
<param name="chooser_predictor_bits" value="2"/>
|
||||
<!-- These parameters can be combined like below in next version
|
||||
<param name="load_predictor" value="10,3,1024"/>
|
||||
<param name="global_predictor" value="4096,2"/>
|
||||
<param name="predictor_chooser" value="4096,2"/>
|
||||
-->
|
||||
</component>
|
||||
<component id="system.core0.itlb" name="itlb">
|
||||
<param name="number_entries" value="64"/>
|
||||
<stat name="total_accesses" value="800000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<!-- there are no write requests to itlb although writes happen to itlb after a miss,
|
||||
which is actually a replacement -->
|
||||
</component>
|
||||
<component id="system.core0.icache" name="icache">
|
||||
<!-- there are no write requests to icache although writes happen to it after a miss,
|
||||
which is actually a replacement -->
|
||||
<param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
|
||||
<!-- cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate -->
|
||||
<param name="buffer_sizes" value="16, 16, 16,0"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="read_misses" value="0"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dtlb" name="dtlb">
|
||||
<param name="number_entries" value="64"/>
|
||||
<stat name="total_accesses" value="200000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dcache" name="dcache">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="dcache_config" value="8192,16,4,1,1,3,16,0"/>
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.BTB" name="BTB">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="BTB_config" value="8192,4,2,1, 1,3"/>
|
||||
<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
</component>
|
||||
</component>
|
||||
<component id="system.L1Directory0" name="L1Directory0">
|
||||
<param name="Directory_type" value="0"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache, 2 static-cache bank -->
|
||||
<param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="8, 8, 8, 8"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="3500"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="800000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="20"/>
|
||||
<stat name="duty_cycle" value="0.45"/>
|
||||
</component>
|
||||
<component id="system.L2Directory0" name="L2Directory0">
|
||||
<param name="Directory_type" value="0"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache, 2 static-cache bank -->
|
||||
<param name="Dir_config" value="8388608,9,0,1,100, 100"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="8, 8, 8, 8"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="3500"/>
|
||||
<param name="ports" value="1,1,8"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="58824"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="100"/>
|
||||
<stat name="duty_cycle" value="0.45"/>
|
||||
</component>
|
||||
<component id="system.L20" name="L20">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="merged_dir" value="1"/><!--if static bank tag is used as the directory -->
|
||||
<param name="L2_config" value="1048576,64,16,1, 4,23, 64, 1"/>
|
||||
<!-- consider 4-way bank interleaving for Niagara 1 -->
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<param name="clockrate" value="3500"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="write_accesses" value="0"/>
|
||||
<stat name="read_misses" value="0"/>
|
||||
<stat name="write_misses" value="0"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="0.5"/>
|
||||
<stat name="coherent_read_accesses" value="400000"/>
|
||||
<stat name="coherent_write_accesses" value="0"/>
|
||||
<stat name="coherent_read_misses" value="400000"/>
|
||||
<stat name="coherent_write_misses" value="0"/>
|
||||
<stat name="dir_duty_cycle" value="0.5"/>
|
||||
|
||||
</component>
|
||||
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.L30" name="L30">
|
||||
<param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
|
||||
<param name="clockrate" value="3500"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="58824"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="0.35"/>
|
||||
<param name="Merged_dir" value="1"/><!--if static bank tag is used as the directory -->
|
||||
<stat name="coherent_read_accesses" value="400000"/>
|
||||
<stat name="coherent_write_accesses" value="0"/>
|
||||
<stat name="coherent_read_misses" value="400000"/>
|
||||
<stat name="coherent_write_misses" value="0"/>
|
||||
<stat name="dir_duty_cycle" value="0.5"/>
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.NoC0" name="noc0">
|
||||
<param name="clockrate" value="3500"/>
|
||||
<param name="type" value="1"/>
|
||||
<!-- 1 NoC, 0 bus -->
|
||||
<param name="horizontal_nodes" value="8"/>
|
||||
<param name="vertical_nodes" value="8"/>
|
||||
<param name="has_global_link" value="1"/>
|
||||
<!-- 1 has global link, 0 does not have global link -->
|
||||
<param name="link_throughput" value="1"/><!--w.r.t clock -->
|
||||
<param name="link_latency" value="1"/><!--w.r.t clock -->
|
||||
<!-- throughput >= latency -->
|
||||
<!-- Router architecture -->
|
||||
<param name="input_ports" value="5"/>
|
||||
<param name="output_ports" value="5"/>
|
||||
<param name="virtual_channel_per_port" value="1"/>
|
||||
<!-- input buffer; in classic routers only input ports need buffers -->
|
||||
<param name="flit_bits" value="256"/>
|
||||
<param name="input_buffer_entries_per_vc" value="4"/><!--VCs within the same ports share input buffers whose size is proportional to the number of VCs-->
|
||||
<param name="chip_coverage" value="1"/>
|
||||
<!-- When multiple NoCs are present, one NoC will cover part of the whole chip. chip_coverage <=1 -->
|
||||
<stat name="total_accesses" value="360000"/>
|
||||
<!-- This is the number of total accesses within the whole network not for each router -->
|
||||
<stat name="duty_cycle" value="0.1"/>
|
||||
</component>
|
||||
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.mem" name="mem">
|
||||
<!-- Main memory property -->
|
||||
<param name="mem_tech_node" value="32"/>
|
||||
<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
|
||||
<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
|
||||
<param name="internal_prefetch_of_DRAM_chip" value="4"/>
|
||||
<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
|
||||
<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
|
||||
<!-- above numbers can be easily found from Wikipedia -->
|
||||
<param name="capacity_per_channel" value="4096"/> <!-- MB -->
|
||||
<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
|
||||
Current McPAT assumes single DIMMs are used.-->
|
||||
<param name="number_ranks" value="2"/>
|
||||
<param name="num_banks_of_DRAM_chip" value="8"/>
|
||||
<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
|
||||
<param name="output_width_of_DRAM_chip" value="8"/>
|
||||
<!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
|
||||
<!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
|
||||
<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
|
||||
<param name="burstlength_of_DRAM_chip" value="8"/>
|
||||
<stat name="memory_accesses" value="1052"/>
|
||||
<stat name="memory_reads" value="1052"/>
|
||||
<stat name="memory_writes" value="1052"/>
|
||||
</component>
|
||||
<component id="system.mc" name="mc">
|
||||
<!-- Memory controllers are for DDR(2,3...) DIMMs -->
|
||||
<!-- current version of McPAT uses published values for base parameters of memory controller
|
||||
improvements on MC will be added in later versions. -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1-->
|
||||
<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
|
||||
<param name="block_size" value="64"/><!--B-->
|
||||
<param name="number_mcs" value="0"/>
|
||||
<!-- current McPAT only supports homogeneous memory controllers -->
|
||||
<param name="memory_channels_per_mc" value="1"/>
|
||||
<param name="number_ranks" value="2"/>
|
||||
<param name="withPHY" value="0"/>
|
||||
<!-- # of ranks of each channel-->
|
||||
<param name="req_window_size_per_channel" value="32"/>
|
||||
<param name="IO_buffer_size_per_channel" value="32"/>
|
||||
<param name="databus_width" value="128"/>
|
||||
<param name="addressbus_width" value="51"/>
|
||||
<!-- McPAT will add the control bus width to the addressbus width automatically -->
|
||||
<stat name="memory_accesses" value="33333"/>
|
||||
<stat name="memory_reads" value="16667"/>
|
||||
<stat name="memory_writes" value="16667"/>
|
||||
<!-- McPAT does not track individual mc, instead, it takes the total accesses and calculate
|
||||
the average power per MC or per channel. This is sufficient for most applications.
|
||||
Further trackdown can be easily added in later versions. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.niu" name="niu">
|
||||
<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller -->
|
||||
<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns.
|
||||
the low bound of clock rate of a 10Gb MAC is 150Mhz -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="clockrate" value="350"/>
|
||||
<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual nic, instead, it takes the total accesses and calculate
|
||||
the average power per nic or per channel. This is sufficient for most applications. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.pcie" name="pcie">
|
||||
<!-- On chip PCIe controller, including Phy-->
|
||||
<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns.
|
||||
the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="withPHY" value="1"/>
|
||||
<param name="clockrate" value="350"/>
|
||||
<param name="number_units" value="0"/>
|
||||
<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual pcie controllers, instead, it takes the total accesses and calculate
|
||||
the average power per pcie controller or per channel. This is sufficient for most applications. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.flashc" name="flashc">
|
||||
<param name="number_flashcs" value="0"/>
|
||||
<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="withPHY" value="1"/>
|
||||
<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual flash controller, instead, it takes the total accesses and calculate
|
||||
the average power per fc or per channel. This is sufficient for most applications -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
|
||||
</component>
|
||||
</component>
|
443
ext/mcpat/Niagara1_sharing_ST.xml
Normal file
443
ext/mcpat/Niagara1_sharing_ST.xml
Normal file
|
@ -0,0 +1,443 @@
|
|||
<?xml version="1.0" ?>
|
||||
<component id="root" name="root">
|
||||
<component id="system" name="system">
|
||||
<!--McPAT will skip the components if number is set to 0 -->
|
||||
<param name="number_of_cores" value="64"/>
|
||||
<param name="number_of_L1Directories" value="0"/>
|
||||
<param name="number_of_L2Directories" value="1"/>
|
||||
<param name="number_of_L2s" value="64"/> <!-- This number means how many L2 clusters there are; in each cluster there can be multiple banks/ports -->
|
||||
<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
|
||||
<param name="number_of_NoCs" value="1"/>
|
||||
<param name="homogeneous_cores" value="1"/><!--1 means homo -->
|
||||
<param name="homogeneous_L2s" value="1"/>
|
||||
<param name="homogeneous_L1Directorys" value="1"/>
|
||||
<param name="homogeneous_L2Directorys" value="1"/>
|
||||
<param name="homogeneous_L3s" value="1"/>
|
||||
<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
|
||||
<param name="homogeneous_NoCs" value="1"/>
|
||||
<param name="core_tech_node" value="22"/><!-- nm -->
|
||||
<param name="target_core_clockrate" value="3500"/><!--MHz -->
|
||||
<param name="temperature" value="360"/> <!-- Kelvin -->
|
||||
<param name="number_cache_levels" value="2"/>
|
||||
<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
|
||||
<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) -->
|
||||
<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
|
||||
<param name="machine_bits" value="64"/>
|
||||
<param name="virtual_address_width" value="64"/>
|
||||
<param name="physical_address_width" value="52"/>
|
||||
<param name="virtual_memory_page_size" value="4096"/>
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!--This page size (B) is completely different from the page size in the main memory section. This page size is the size of
|
||||
a virtual memory page from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank -->
|
||||
<!-- *********************** cores ******************* -->
|
||||
<component id="system.core0" name="core0">
|
||||
<!-- Core property -->
|
||||
<param name="clock_rate" value="3500"/>
|
||||
<param name="instruction_length" value="32"/>
|
||||
<param name="opcode_width" value="9"/>
|
||||
<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller
|
||||
default value is machine_bits, if not set -->
|
||||
<param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
|
||||
<!-- inorder/OoO -->
|
||||
<param name="number_hardware_threads" value="4"/>
|
||||
<!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor,
|
||||
it only may be more than one in SMT processors. BTB ports always equals to fetch ports since
|
||||
branch information in consecutive branch instructions in the same fetch group can be read out from BTB once.-->
|
||||
<param name="fetch_width" value="1"/>
|
||||
<!-- fetch_width determines the size of cachelines of L1 cache block -->
|
||||
<param name="number_instruction_fetch_ports" value="1"/>
|
||||
<param name="decode_width" value="1"/>
|
||||
<!-- decode_width determines the number of ports of the
|
||||
renaming table (both RAM and CAM) scheme -->
|
||||
<param name="issue_width" value="1"/>
|
||||
<!-- issue_width determines the number of ports of Issue window and other logic
|
||||
as in the complexity-effective processors paper; issue_width==dispatch_width -->
|
||||
<param name="commit_width" value="1"/>
|
||||
<!-- commit_width determines the number of ports of register files -->
|
||||
<param name="fp_issue_width" value="1"/>
|
||||
<param name="prediction_width" value="0"/>
|
||||
<!-- number of branch instructions that can be predicted simultaneously -->
|
||||
<!-- Current version of McPAT does not distinguish int and floating point pipelines
|
||||
These parameters are reserved for future use.-->
|
||||
<param name="pipelines_per_core" value="1,1"/>
|
||||
<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
|
||||
<param name="pipeline_depth" value="6,6"/>
|
||||
<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
|
||||
<!-- issue and exe unit-->
|
||||
<param name="ALU_per_core" value="1"/>
|
||||
<!-- contains an adder, a shifter, and a logical unit -->
|
||||
<param name="MUL_per_core" value="1"/>
|
||||
<!-- For MUL and Div -->
|
||||
<param name="FPU_per_core" value="0.125"/>
|
||||
<!-- buffer between IF and ID stage -->
|
||||
<param name="instruction_buffer_size" value="16"/>
|
||||
<!-- buffer between ID and sche/exe stage -->
|
||||
<param name="decoded_stream_buffer_size" value="16"/>
|
||||
<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
|
||||
<!-- McPAT supports 2 types of OoO cores, RS based and physical reg based-->
|
||||
<param name="instruction_window_size" value="16"/>
|
||||
<param name="fp_instruction_window_size" value="16"/>
|
||||
<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
|
||||
<param name="ROB_size" value="80"/>
|
||||
<!-- each in-flight instruction has an entry in ROB -->
|
||||
<!-- registers -->
|
||||
<param name="archi_Regs_IRF_size" value="32"/>
|
||||
<param name="archi_Regs_FRF_size" value="32"/>
|
||||
<!-- if OoO processor, phy_reg number is needed for renaming logic,
|
||||
renaming logic is for both integer and floating point insts. -->
|
||||
<param name="phy_Regs_IRF_size" value="80"/>
|
||||
<param name="phy_Regs_FRF_size" value="80"/>
|
||||
<!-- rename logic -->
|
||||
<param name="rename_scheme" value="0"/>
|
||||
<!-- can be RAM based(0) or CAM based(1) rename scheme
|
||||
RAM-based scheme will have free list, status table;
|
||||
CAM-based scheme have the valid bit in the data field of the CAM
|
||||
both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
|
||||
Detailed RAT Implementation see TR -->
|
||||
<param name="register_windows_size" value="8"/>
|
||||
<!-- how many windows in the windowed register file, sun processors;
|
||||
no register windowing is used when this number is 0 -->
|
||||
<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (OoO, Alpha),
|
||||
They will always try to execute out-of-order though. -->
|
||||
<param name="LSU_order" value="inorder"/>
|
||||
<param name="store_buffer_size" value="32"/>
|
||||
<!-- By default, in-order cores do not have load buffers -->
|
||||
<param name="load_buffer_size" value="32"/>
|
||||
<!-- number of ports refer to sustainable concurrent memory accesses -->
|
||||
<param name="memory_ports" value="1"/>
|
||||
<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
|
||||
as well as the ports of Dcache which is connected to LSU -->
|
||||
<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
|
||||
<param name="RAS_size" value="32"/>
|
||||
<!-- general stats, defines simulation periods; requires total, idle, and busy cycles for sanity check -->
|
||||
<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
|
||||
<stat name="total_instructions" value="800000"/>
|
||||
<stat name="int_instructions" value="600000"/>
|
||||
<stat name="fp_instructions" value="20000"/>
|
||||
<stat name="branch_instructions" value="0"/>
|
||||
<stat name="branch_mispredictions" value="0"/>
|
||||
<stat name="load_instructions" value="100000"/>
|
||||
<stat name="store_instructions" value="100000"/>
|
||||
<stat name="committed_instructions" value="800000"/>
|
||||
<stat name="committed_int_instructions" value="600000"/>
|
||||
<stat name="committed_fp_instructions" value="20000"/>
|
||||
<stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogeneous -->
|
||||
<!-- the following cycle stats are used for heterogeneous cores only,
|
||||
please ignore them if the cores are homogeneous -->
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!-- instruction buffer stats -->
|
||||
<!-- ROB stats, both RS and Phy based OoOs have ROB
|
||||
performance simulator should capture the difference on accesses,
|
||||
otherwise, McPAT has to guess based on number of committed instructions. -->
|
||||
<stat name="ROB_reads" value="263886"/>
|
||||
<stat name="ROB_writes" value="263886"/>
|
||||
<!-- RAT accesses -->
|
||||
<stat name="rename_accesses" value="263886"/>
|
||||
<stat name="fp_rename_accesses" value="263886"/>
|
||||
<!-- decode and rename stage use this, should be total ic - nop -->
|
||||
<!-- Inst window stats -->
|
||||
<stat name="inst_window_reads" value="263886"/>
|
||||
<stat name="inst_window_writes" value="263886"/>
|
||||
<stat name="inst_window_wakeup_accesses" value="263886"/>
|
||||
<stat name="fp_inst_window_reads" value="263886"/>
|
||||
<stat name="fp_inst_window_writes" value="263886"/>
|
||||
<stat name="fp_inst_window_wakeup_accesses" value="263886"/>
|
||||
<!-- RF accesses -->
|
||||
<stat name="int_regfile_reads" value="1600000"/>
|
||||
<stat name="float_regfile_reads" value="40000"/>
|
||||
<stat name="int_regfile_writes" value="800000"/>
|
||||
<stat name="float_regfile_writes" value="20000"/>
|
||||
<!-- accesses to the working reg -->
|
||||
<stat name="function_calls" value="5"/>
|
||||
<stat name="context_switches" value="260343"/>
|
||||
<!-- Number of window switches (number of function calls and returns)-->
|
||||
<!-- Alu stats by default, the processor has one FPU that includes the divider and
|
||||
multiplier. The fpu accesses should include accesses to multiplier and divider -->
|
||||
<stat name="ialu_accesses" value="800000"/>
|
||||
<stat name="fpu_accesses" value="10000"/>
|
||||
<stat name="mul_accesses" value="100000"/>
|
||||
<stat name="cdb_alu_accesses" value="1000000"/>
|
||||
<stat name="cdb_mul_accesses" value="0"/>
|
||||
<stat name="cdb_fpu_accesses" value="0"/>
|
||||
<!-- multiple cycle accesses should be counted multiple times,
|
||||
otherwise, McPAT can use internal counter for different floating point instructions
|
||||
to get final accesses. But that needs detailed info for floating point inst mix -->
|
||||
<!-- currently the performance simulator should
|
||||
make sure all the numbers are final numbers,
|
||||
including the explicit read/write accesses,
|
||||
and the implicit accesses such as replacements, etc.
|
||||
Future versions of McPAT may be able to infer the implicit accesses
|
||||
based on param and stats of last level cache
|
||||
The same rule applies to all cache access stats too! -->
|
||||
<!-- following is AF for max power computation.
|
||||
Do not change them, unless you understand them-->
|
||||
<stat name="IFU_duty_cycle" value="0.25"/>
|
||||
<stat name="LSU_duty_cycle" value="0.25"/>
|
||||
<stat name="MemManU_I_duty_cycle" value="1"/>
|
||||
<stat name="MemManU_D_duty_cycle" value="0.25"/>
|
||||
<stat name="ALU_duty_cycle" value="0.9"/>
|
||||
<stat name="MUL_duty_cycle" value="0.5"/>
|
||||
<stat name="FPU_duty_cycle" value="0.4"/>
|
||||
<stat name="ALU_cdb_duty_cycle" value="0.9"/>
|
||||
<stat name="MUL_cdb_duty_cycle" value="0.5"/>
|
||||
<stat name="FPU_cdb_duty_cycle" value="0.4"/>
|
||||
<component id="system.core0.predictor" name="PBT">
|
||||
<!-- branch predictor; tournament predictor see Alpha implementation -->
|
||||
<param name="local_predictor_size" value="10,3"/>
|
||||
<param name="local_predictor_entries" value="1024"/>
|
||||
<param name="global_predictor_entries" value="4096"/>
|
||||
<param name="global_predictor_bits" value="2"/>
|
||||
<param name="chooser_predictor_entries" value="4096"/>
|
||||
<param name="chooser_predictor_bits" value="2"/>
|
||||
<!-- These parameters can be combined like below in next version
|
||||
<param name="load_predictor" value="10,3,1024"/>
|
||||
<param name="global_predictor" value="4096,2"/>
|
||||
<param name="predictor_chooser" value="4096,2"/>
|
||||
-->
|
||||
</component>
|
||||
<component id="system.core0.itlb" name="itlb">
|
||||
<param name="number_entries" value="64"/>
|
||||
<stat name="total_accesses" value="800000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<!-- there is no write requests to itlb although writes happen to itlb after miss,
|
||||
which is actually a replacement -->
|
||||
</component>
|
||||
<component id="system.core0.icache" name="icache">
|
||||
<!-- there are no write requests to the icache, although writes happen to it after a miss,
|
||||
which is actually a replacement -->
|
||||
<param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
|
||||
<!-- cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate -->
|
||||
<param name="buffer_sizes" value="16, 16, 16,0"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="read_misses" value="0"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dtlb" name="dtlb">
|
||||
<param name="number_entries" value="64"/>
|
||||
<stat name="total_accesses" value="200000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dcache" name="dcache">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="dcache_config" value="8192,16,4,1,1,3,16,0"/>
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.BTB" name="BTB">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="BTB_config" value="8192,4,2,1, 1,3"/>
|
||||
<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
</component>
|
||||
</component>
|
||||
<component id="system.L1Directory0" name="L1Directory0">
|
||||
<param name="Directory_type" value="0"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache, 2 static-cache bank -->
|
||||
<param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="8, 8, 8, 8"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="3500"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="800000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="20"/>
|
||||
<stat name="duty_cycle" value="0.45"/>
|
||||
</component>
|
||||
<component id="system.L2Directory0" name="L2Directory0">
|
||||
<param name="Directory_type" value="0"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache, 2 static-cache bank -->
|
||||
<param name="Dir_config" value="8388608,9,0,1,100, 100"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="8, 8, 8, 8"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="3500"/>
|
||||
<param name="ports" value="0,0,8"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="58824"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="100"/>
|
||||
<stat name="duty_cycle" value="0.45"/>
|
||||
</component>
|
||||
<component id="system.L20" name="L20">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="L2_config" value="1048576,64,16,1, 4,23, 64, 1"/>
|
||||
<param name="Merged_dir" value="1"/>
|
||||
<!-- consider 4-way bank interleaving for Niagara 1 -->
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<param name="clockrate" value="3500"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="write_accesses" value="0"/>
|
||||
<stat name="read_misses" value="0"/>
|
||||
<stat name="write_misses" value="0"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="0.5"/>
|
||||
</component>
|
||||
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.L30" name="L30">
|
||||
<param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
|
||||
<param name="Merged_dir" value="1"/>
|
||||
<param name="clockrate" value="3500"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="58824"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="0.35"/>
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.NoC0" name="noc0">
|
||||
<param name="clockrate" value="3500"/>
|
||||
<param name="type" value="1"/>
|
||||
<!-- 1 NoC, 0 bus -->
|
||||
<param name="horizontal_nodes" value="8"/>
|
||||
<param name="vertical_nodes" value="8"/>
|
||||
<param name="has_global_link" value="1"/>
|
||||
<!-- 1 has global link, 0 does not have global link -->
|
||||
<param name="link_throughput" value="1"/><!--w.r.t clock -->
|
||||
<param name="link_latency" value="1"/><!--w.r.t clock -->
|
||||
<!-- throughput >= latency -->
|
||||
<!-- Router architecture -->
|
||||
<param name="input_ports" value="5"/>
|
||||
<param name="output_ports" value="5"/>
|
||||
<param name="virtual_channel_per_port" value="1"/>
|
||||
<!-- input buffer; in classic routers only input ports need buffers -->
|
||||
<param name="flit_bits" value="256"/>
|
||||
<param name="input_buffer_entries_per_vc" value="4"/><!--VCs within the same ports share input buffers whose size is proportional to the number of VCs-->
|
||||
<param name="chip_coverage" value="1"/>
|
||||
<!-- When multiple NoCs are present, one NoC will cover part of the whole chip. chip_coverage <=1 -->
|
||||
<stat name="total_accesses" value="360000"/>
|
||||
<!-- This is the number of total accesses within the whole network not for each router -->
|
||||
<stat name="duty_cycle" value="0.1"/>
|
||||
</component>
|
||||
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.mem" name="mem">
|
||||
<!-- Main memory property -->
|
||||
<param name="mem_tech_node" value="32"/>
|
||||
<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
|
||||
<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
|
||||
<param name="internal_prefetch_of_DRAM_chip" value="4"/>
|
||||
<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
|
||||
<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
|
||||
<!-- above numbers can be easily found from Wikipedia -->
|
||||
<param name="capacity_per_channel" value="4096"/> <!-- MB -->
|
||||
<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
|
||||
Current McPAT assumes single DIMMs are used.-->
|
||||
<param name="number_ranks" value="2"/>
|
||||
<param name="num_banks_of_DRAM_chip" value="8"/>
|
||||
<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
|
||||
<param name="output_width_of_DRAM_chip" value="8"/>
|
||||
<!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
|
||||
<!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
|
||||
<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
|
||||
<param name="burstlength_of_DRAM_chip" value="8"/>
|
||||
<stat name="memory_accesses" value="1052"/>
|
||||
<stat name="memory_reads" value="1052"/>
|
||||
<stat name="memory_writes" value="1052"/>
|
||||
</component>
|
||||
<component id="system.mc" name="mc">
|
||||
<!-- Memory controllers are for DDR(2,3...) DIMMs -->
|
||||
<!-- current version of McPAT uses published values for base parameters of memory controller
|
||||
improvements on MC will be added in later versions. -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1-->
|
||||
<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
|
||||
<param name="block_size" value="64"/><!--B-->
|
||||
<param name="number_mcs" value="0"/>
|
||||
<!-- current McPAT only supports homogeneous memory controllers -->
|
||||
<param name="memory_channels_per_mc" value="1"/>
|
||||
<param name="number_ranks" value="2"/>
|
||||
<param name="withPHY" value="0"/>
|
||||
<!-- # of ranks of each channel-->
|
||||
<param name="req_window_size_per_channel" value="32"/>
|
||||
<param name="IO_buffer_size_per_channel" value="32"/>
|
||||
<param name="databus_width" value="128"/>
|
||||
<param name="addressbus_width" value="51"/>
|
||||
<!-- McPAT will add the control bus width to the addressbus width automatically -->
|
||||
<stat name="memory_accesses" value="33333"/>
|
||||
<stat name="memory_reads" value="16667"/>
|
||||
<stat name="memory_writes" value="16667"/>
|
||||
<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
|
||||
the average power per MC or per channel. This is sufficient for most applications.
|
||||
Further trackdown can be easily added in later versions. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.niu" name="niu">
|
||||
<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller -->
|
||||
<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns.
|
||||
the lower bound on the clock rate of a 10Gb MAC is 150 MHz -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="clockrate" value="350"/>
|
||||
<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only has one port -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates
|
||||
the average power per NIC or per channel. This is sufficient for most applications. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.pcie" name="pcie">
|
||||
<!-- On chip PCIe controller, including Phy-->
|
||||
<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns.
|
||||
the lower bound on the clock rate of the PCIe per-lane logic is 120 MHz -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="withPHY" value="1"/>
|
||||
<param name="clockrate" value="350"/>
|
||||
<param name="number_units" value="0"/>
|
||||
<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- Ratio of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates
|
||||
the average power per PCIe controller or per channel. This is sufficient for most applications. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.flashc" name="flashc">
|
||||
<param name="number_flashcs" value="0"/>
|
||||
<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="withPHY" value="1"/>
|
||||
<param name="peak_transfer_rate" value="200"/><!--Per-controller sustainable peak rate, MB/s -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- Ratio of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
|
||||
the average power per fc or per channel. This is sufficient for most applications -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
</component>
|
||||
</component>
|
438
ext/mcpat/Niagara2.xml
Normal file
438
ext/mcpat/Niagara2.xml
Normal file
|
@ -0,0 +1,438 @@
|
|||
<?xml version="1.0" ?>
|
||||
<component id="root" name="root">
|
||||
<component id="system" name="system">
|
||||
<!--McPAT will skip the components if number is set to 0 -->
|
||||
<param name="number_of_cores" value="8"/>
|
||||
<param name="number_of_L1Directories" value="8"/>
|
||||
<param name="number_of_L2Directories" value="0"/>
|
||||
<param name="number_of_L2s" value="8"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports -->
|
||||
<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
|
||||
<param name="number_of_NoCs" value="1"/>
|
||||
<param name="homogeneous_cores" value="1"/><!--1 means homo -->
|
||||
<param name="homogeneous_L2s" value="1"/>
|
||||
<param name="homogeneous_L1Directorys" value="1"/>
|
||||
<param name="homogeneous_L2Directorys" value="1"/>
|
||||
<param name="homogeneous_L3s" value="1"/>
|
||||
<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
|
||||
<param name="homogeneous_NoCs" value="1"/>
|
||||
<param name="core_tech_node" value="65"/><!-- nm -->
|
||||
<param name="target_core_clockrate" value="1400"/><!--MHz -->
|
||||
<param name="temperature" value="380"/> <!-- Kelvin -->
|
||||
<param name="number_cache_levels" value="2"/>
|
||||
<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
|
||||
<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) -->
|
||||
<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
|
||||
<param name="machine_bits" value="64"/>
|
||||
<param name="virtual_address_width" value="64"/>
|
||||
<param name="physical_address_width" value="52"/>
|
||||
<param name="virtual_memory_page_size" value="4096"/>
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!--This page size (B) is completely different from the page size in the main memory section. This page size is the size of
|
||||
virtual memory from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank -->
|
||||
<!-- *********************** cores ******************* -->
|
||||
<component id="system.core0" name="core0">
|
||||
<!-- Core property -->
|
||||
<param name="clock_rate" value="1400"/>
|
||||
<param name="instruction_length" value="32"/>
|
||||
<param name="opcode_width" value="9"/>
|
||||
<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller
|
||||
default value is machine_bits, if not set -->
|
||||
<param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
|
||||
<!-- inorder/OoO -->
|
||||
<param name="number_hardware_threads" value="4"/>
|
||||
<!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor,
|
||||
it only may be more than one in SMT processors. BTB ports always equals to fetch ports since
|
||||
branch information in consecutive branch instructions in the same fetch group can be read out from BTB once.-->
|
||||
<param name="fetch_width" value="1"/>
|
||||
<!-- fetch_width determines the size of cachelines of L1 cache block -->
|
||||
<param name="number_instruction_fetch_ports" value="1"/>
|
||||
<param name="decode_width" value="1"/>
|
||||
<!-- decode_width determines the number of ports of the
|
||||
renaming table (both RAM and CAM) scheme -->
|
||||
<param name="issue_width" value="1"/>
|
||||
<!-- issue_width determines the number of ports of Issue window and other logic
|
||||
as in the complexity effective processors paper; issue_width==dispatch_width -->
|
||||
<param name="commit_width" value="1"/>
|
||||
<!-- commit_width determines the number of ports of register files -->
|
||||
<param name="fp_issue_width" value="1"/>
|
||||
<param name="prediction_width" value="0"/>
|
||||
<!-- number of branch instructions that can be predicted simultaneously -->
|
||||
<!-- Current version of McPAT does not distinguish int and floating point pipelines
|
||||
These parameters are reserved for future use.-->
|
||||
<param name="pipelines_per_core" value="2,1"/>
|
||||
<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
|
||||
<param name="pipeline_depth" value="8,8"/>
|
||||
<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
|
||||
<!-- issue and exe unit-->
|
||||
<param name="ALU_per_core" value="2"/>
|
||||
<!-- contains an adder, a shifter, and a logical unit -->
|
||||
<param name="MUL_per_core" value="0"/>
|
||||
<!-- For MUL and Div -->
|
||||
<param name="FPU_per_core" value="1"/>
|
||||
<!-- buffer between IF and ID stage -->
|
||||
<param name="instruction_buffer_size" value="32"/>
|
||||
<!-- buffer between ID and sche/exe stage -->
|
||||
<param name="decoded_stream_buffer_size" value="16"/>
|
||||
<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
|
||||
<!-- McPAT supports 2 types of OoO cores, RS based and physical reg based-->
|
||||
<param name="instruction_window_size" value="16"/>
|
||||
<param name="fp_instruction_window_size" value="16"/>
|
||||
<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
|
||||
<param name="ROB_size" value="80"/>
|
||||
<!-- each in-flight instruction has an entry in ROB -->
|
||||
<!-- registers -->
|
||||
<param name="archi_Regs_IRF_size" value="32"/>
|
||||
<param name="archi_Regs_FRF_size" value="32"/>
|
||||
<!-- if OoO processor, phy_reg number is needed for renaming logic,
|
||||
renaming logic is for both integer and floating point insts. -->
|
||||
<param name="phy_Regs_IRF_size" value="80"/>
|
||||
<param name="phy_Regs_FRF_size" value="80"/>
|
||||
<!-- rename logic -->
|
||||
<param name="rename_scheme" value="0"/>
|
||||
<!-- can be RAM based(0) or CAM based(1) rename scheme
|
||||
RAM-based scheme will have free list, status table;
|
||||
CAM-based scheme have the valid bit in the data field of the CAM
|
||||
both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
|
||||
Detailed RAT Implementation see TR -->
|
||||
<param name="register_windows_size" value="8"/>
|
||||
<!-- how many windows in the windowed register file, sun processors;
|
||||
no register windowing is used when this number is 0 -->
|
||||
<!-- In OoO cores, loads and stores can be issued whether inorder(Pentium Pro) or (OoO)out-of-order(Alpha),
|
||||
They will always try to execute out-of-order though. -->
|
||||
<param name="LSU_order" value="inorder"/>
|
||||
<param name="store_buffer_size" value="64"/>
|
||||
<!-- By default, in-order cores do not have load buffers -->
|
||||
<param name="load_buffer_size" value="64"/>
|
||||
<!-- number of ports refer to sustainable concurrent memory accesses -->
|
||||
<param name="memory_ports" value="1"/>
|
||||
<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
|
||||
as well as the ports of Dcache which is connected to LSU -->
|
||||
<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
|
||||
<param name="RAS_size" value="32"/>
|
||||
<!-- general stats, defines simulation periods; require total, idle, and busy cycles for sanity check -->
|
||||
<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
|
||||
<stat name="total_instructions" value="1600000"/>
|
||||
<stat name="int_instructions" value="1200000"/>
|
||||
<stat name="fp_instructions" value="40000"/>
|
||||
<stat name="branch_instructions" value="0"/>
|
||||
<stat name="branch_mispredictions" value="0"/>
|
||||
<stat name="load_instructions" value="200000"/>
|
||||
<stat name="store_instructions" value="200000"/>
|
||||
<stat name="committed_instructions" value="1600000"/>
|
||||
<stat name="committed_int_instructions" value="1200000"/>
|
||||
<stat name="committed_fp_instructions" value="40000"/>
|
||||
<stat name="pipeline_duty_cycle" value="0.5"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogeneous -->
|
||||
<!-- the following cycle stats are used for heterogeneous cores only,
|
||||
please ignore them if homogeneous cores -->
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!-- instruction buffer stats -->
|
||||
<!-- ROB stats, both RS and Phy based OoOs have ROB
|
||||
performance simulator should capture the difference on accesses,
|
||||
otherwise, McPAT has to guess based on number of committed instructions. -->
|
||||
<stat name="ROB_reads" value="263886"/>
|
||||
<stat name="ROB_writes" value="263886"/>
|
||||
<!-- RAT accesses -->
|
||||
<stat name="rename_accesses" value="263886"/>
|
||||
<stat name="fp_rename_accesses" value="263886"/>
|
||||
<!-- decode and rename stage use this, should be total ic - nop -->
|
||||
<!-- Inst window stats -->
|
||||
<stat name="inst_window_reads" value="263886"/>
|
||||
<stat name="inst_window_writes" value="263886"/>
|
||||
<stat name="inst_window_wakeup_accesses" value="263886"/>
|
||||
<stat name="fp_inst_window_reads" value="263886"/>
|
||||
<stat name="fp_inst_window_writes" value="263886"/>
|
||||
<stat name="fp_inst_window_wakeup_accesses" value="263886"/>
|
||||
<!-- RF accesses -->
|
||||
<stat name="int_regfile_reads" value="3200000"/>
|
||||
<stat name="float_regfile_reads" value="80000"/>
|
||||
<stat name="int_regfile_writes" value="1600000"/>
|
||||
<stat name="float_regfile_writes" value="40000"/>
|
||||
<!-- accesses to the working reg -->
|
||||
<stat name="function_calls" value="5"/>
|
||||
<stat name="context_switches" value="260343"/>
|
||||
<!-- Number of window switches (number of function calls and returns)-->
|
||||
<!-- Alu stats by default, the processor has one FPU that includes the divider and
|
||||
multiplier. The fpu accesses should include accesses to multiplier and divider -->
|
||||
<stat name="ialu_accesses" value="1600000"/>
|
||||
<stat name="fpu_accesses" value="10000"/>
|
||||
<stat name="mul_accesses" value="100000"/>
|
||||
<stat name="cdb_alu_accesses" value="1200000"/>
|
||||
<stat name="cdb_mul_accesses" value="0"/>
|
||||
<stat name="cdb_fpu_accesses" value="0"/>
|
||||
<!-- multiple cycle accesses should be counted multiple times,
|
||||
otherwise, McPAT can use internal counter for different floating point instructions
|
||||
to get final accesses. But that needs detailed info for floating point inst mix -->
|
||||
<!-- currently the performance simulator should
|
||||
make sure all the numbers are final numbers,
|
||||
including the explicit read/write accesses,
|
||||
and the implicit accesses such as replacements, etc.
|
||||
Future versions of McPAT may be able to reason the implicit access
|
||||
based on param and stats of last level cache
|
||||
The same rule applies to all cache access stats too! -->
|
||||
<!-- following is AF for max power computation.
|
||||
Do not change them, unless you understand them-->
|
||||
<stat name="IFU_duty_cycle" value="0.5"/>
|
||||
<stat name="LSU_duty_cycle" value="0.25"/>
|
||||
<stat name="MemManU_I_duty_cycle" value="0.5"/>
|
||||
<stat name="MemManU_D_duty_cycle" value="0.25"/>
|
||||
<stat name="ALU_duty_cycle" value="0.9"/>
|
||||
<stat name="MUL_duty_cycle" value="0"/>
|
||||
<stat name="FPU_duty_cycle" value="0.6"/>
|
||||
<!--FPU also handles Mul/div -->
|
||||
<stat name="ALU_cdb_duty_cycle" value="0.9"/>
|
||||
<stat name="MUL_cdb_duty_cycle" value="0"/>
|
||||
<stat name="FPU_cdb_duty_cycle" value="0.6"/>
|
||||
<component id="system.core0.predictor" name="PBT">
|
||||
<!-- branch predictor; tournament predictor see Alpha implementation -->
|
||||
<param name="local_predictor_size" value="10,3"/>
|
||||
<param name="local_predictor_entries" value="1024"/>
|
||||
<param name="global_predictor_entries" value="4096"/>
|
||||
<param name="global_predictor_bits" value="2"/>
|
||||
<param name="chooser_predictor_entries" value="4096"/>
|
||||
<param name="chooser_predictor_bits" value="2"/>
|
||||
<!-- These parameters can be combined like below in next version
|
||||
<param name="load_predictor" value="10,3,1024"/>
|
||||
<param name="global_predictor" value="4096,2"/>
|
||||
<param name="predictor_chooser" value="4096,2"/>
|
||||
-->
|
||||
</component>
|
||||
<component id="system.core0.itlb" name="itlb">
|
||||
<param name="number_entries" value="64"/>
|
||||
<stat name="total_accesses" value="800000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<!-- there are no write requests to itlb although writes happen to itlb after miss,
|
||||
which is actually a replacement -->
|
||||
</component>
|
||||
<component id="system.core0.icache" name="icache">
|
||||
<!-- there are no write requests to icache although writes happen to it after miss,
|
||||
which is actually a replacement -->
|
||||
<param name="icache_config" value="16384,32,8,1,1,7,8,0"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
|
||||
<!-- cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate -->
|
||||
<param name="buffer_sizes" value="16, 16, 16,0"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="read_misses" value="0"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dtlb" name="dtlb">
|
||||
<param name="number_entries" value="128"/>
|
||||
<stat name="total_accesses" value="200000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dcache" name="dcache">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="dcache_config" value="8192,16,4,1, 1,3, 16,0"/>
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.BTB" name="BTB">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="BTB_config" value="8192,4,2,1, 1,3"/>
|
||||
<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
</component>
|
||||
</component>
|
||||
<component id="system.L1Directory0" name="L1Directory0">
|
||||
<param name="Directory_type" value="0"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache -->
|
||||
<param name="Dir_config" value="1024,2,0,1,1,1, 8"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="8, 8, 8, 8"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="1400"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="800000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="20"/>
|
||||
</component>
|
||||
<component id="system.L2Directory0" name="L2Directory0">
|
||||
<param name="Directory_type" value="1"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache -->
|
||||
<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="8, 8, 8, 8"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="1400"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="58824"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="100"/>
|
||||
</component>
|
||||
<component id="system.L20" name="L20">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="L2_config" value="524228,64,16,1, 8,23, 64,1"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<param name="clockrate" value="1400"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<stat name="read_accesses" value="400000"/>
|
||||
<stat name="write_accesses" value="0"/>
|
||||
<stat name="read_misses" value="0"/>
|
||||
<stat name="write_misses" value="0"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="1"/>
|
||||
</component>
|
||||
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.L30" name="L30">
|
||||
<param name="L3_config" value="1048576,64,16,1, 2,100, 64, 1"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="clockrate" value="3500"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="58824"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="0.35"/>
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.NoC0" name="noc0">
|
||||
<param name="clockrate" value="1400"/>
|
||||
<param name="horizontal_nodes" value="2"/>
|
||||
<param name="vertical_nodes" value="1"/>
|
||||
<param name="has_global_link" value="0"/>
|
||||
<!-- 1 has global link, 0 does not have global link -->
|
||||
<param name="link_throughput" value="1"/><!--w.r.t clock -->
|
||||
<param name="link_latency" value="1"/><!--w.r.t clock -->
|
||||
<!-- throughput >= latency -->
|
||||
<!-- Router architecture -->
|
||||
<param name="input_ports" value="9"/>
|
||||
<param name="output_ports" value="8"/>
|
||||
<param name="virtual_channel_per_port" value="1"/>
|
||||
<!-- input buffer; in classic routers only input ports need buffers -->
|
||||
<param name="flit_bits" value="136"/>
|
||||
<param name="input_buffer_entries_per_vc" value="16"/><!--VCs within the same ports share input buffers whose size is proportional to the number of VCs-->
|
||||
<param name="chip_coverage" value="1"/>
|
||||
<!-- When multiple NoCs are present, one NoC will cover part of the whole chip. chip_coverage <=1 -->
|
||||
<stat name="total_accesses" value="160000"/>
|
||||
<!-- This is the number of total accesses within the whole network not for each router -->
|
||||
<stat name="duty_cycle" value="0.1"/>
|
||||
</component>
|
||||
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.mem" name="mem">
|
||||
<!-- Main memory property -->
|
||||
<param name="mem_tech_node" value="32"/>
|
||||
<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
|
||||
<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
|
||||
<param name="internal_prefetch_of_DRAM_chip" value="4"/>
|
||||
<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
|
||||
<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
|
||||
<!-- above numbers can be easily found from Wikipedia -->
|
||||
<param name="capacity_per_channel" value="4096"/> <!-- MB -->
|
||||
<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
|
||||
Current McPAT assumes single DIMMs are used.-->
|
||||
<param name="number_ranks" value="2"/>
|
||||
<param name="num_banks_of_DRAM_chip" value="8"/>
|
||||
<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
|
||||
<param name="output_width_of_DRAM_chip" value="8"/>
|
||||
<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
|
||||
<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
|
||||
<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
|
||||
<param name="burstlength_of_DRAM_chip" value="8"/>
|
||||
<stat name="memory_accesses" value="1052"/>
|
||||
<stat name="memory_reads" value="1052"/>
|
||||
<stat name="memory_writes" value="1052"/>
|
||||
</component>
|
||||
<component id="system.mc" name="mc">
|
||||
<!-- Memory controllers are for DDR(2,3...) DIMMs -->
|
||||
<!-- current version of McPAT uses published values for base parameters of memory controller
|
||||
improvements on MC will be added in later versions. -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="mc_clock" value="400"/><!--MHz-->
|
||||
<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
|
||||
<param name="block_size" value="64"/><!--(B) the block size of last level cache, which is the unit for one memory burst transfer -->
|
||||
<param name="number_mcs" value="4"/>
|
||||
<!-- current McPAT only supports homogeneous memory controllers -->
|
||||
<param name="memory_channels_per_mc" value="1"/>
|
||||
<param name="number_ranks" value="2"/>
|
||||
<param name="withPHY" value="0"/>
|
||||
<!-- # of ranks of each channel-->
|
||||
<param name="req_window_size_per_channel" value="32"/>
|
||||
<param name="IO_buffer_size_per_channel" value="32"/>
|
||||
<param name="databus_width" value="128"/>
|
||||
<param name="addressbus_width" value="51"/>
|
||||
<!-- McPAT will add the control bus width to the addressbus width automatically -->
|
||||
<stat name="memory_accesses" value="66666"/>
|
||||
<stat name="memory_reads" value="33333"/>
|
||||
<stat name="memory_writes" value="33333"/>
|
||||
<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
|
||||
the average power per MC or per channel. This is sufficient for most applications.
|
||||
Further trackdown can be easily added in later versions. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.niu" name="niu">
|
||||
<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller -->
|
||||
<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns.
|
||||
the low bound of clock rate of a 10Gb MAC is 150Mhz -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="clockrate" value="350"/>
|
||||
<param name="number_units" value="2"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates
|
||||
the average power per NIC or per channel. This is sufficient for most applications. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.pcie" name="pcie">
|
||||
<!-- On chip PCIe controller, including Phy-->
|
||||
<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns.
|
||||
the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="withPHY" value="1"/>
|
||||
<param name="clockrate" value="350"/>
|
||||
<param name="number_units" value="1"/>
|
||||
<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates
|
||||
the average power per PCIe controller or per channel. This is sufficient for most applications. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.flashc" name="flashc">
|
||||
<param name="number_flashcs" value="0"/>
|
||||
<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="withPHY" value="1"/>
|
||||
<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
|
||||
the average power per fc or per channel. This is sufficient for most applications -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
|
||||
</component>
|
||||
</component>
|
456
ext/mcpat/Penryn.xml
Normal file
456
ext/mcpat/Penryn.xml
Normal file
|
@ -0,0 +1,456 @@
|
|||
<?xml version="1.0" ?>
|
||||
<component id="root" name="root">
|
||||
<component id="system" name="system">
|
||||
<!--McPAT will skip the components if number is set to 0 -->
|
||||
<param name="number_of_cores" value="2"/>
|
||||
<param name="number_of_L1Directories" value="0"/>
|
||||
<param name="number_of_L2Directories" value="0"/>
|
||||
<param name="number_of_L2s" value="1"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports -->
|
||||
<param name="Private_L2" value="0"/><!--1 Private, 0 shared/coherent -->
|
||||
<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
|
||||
<param name="number_of_NoCs" value="1"/>
|
||||
<param name="homogeneous_cores" value="1"/><!--1 means homo -->
|
||||
<param name="homogeneous_L2s" value="1"/>
|
||||
<param name="homogeneous_L1Directorys" value="1"/>
|
||||
<param name="homogeneous_L2Directorys" value="1"/>
|
||||
<param name="homogeneous_L3s" value="1"/>
|
||||
<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
|
||||
<param name="homogeneous_NoCs" value="1"/>
|
||||
<param name="core_tech_node" value="45"/><!-- nm -->
|
||||
<param name="target_core_clockrate" value="3700"/><!--MHz -->
|
||||
<param name="temperature" value="380"/> <!-- Kelvin -->
|
||||
<param name="number_cache_levels" value="2"/>
|
||||
<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
|
||||
<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) -->
|
||||
<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when appropriate -->
|
||||
<param name="machine_bits" value="64"/>
|
||||
<param name="virtual_address_width" value="64"/>
|
||||
<param name="physical_address_width" value="52"/>
|
||||
<param name="virtual_memory_page_size" value="4096"/>
|
||||
<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller
|
||||
default value is machine_bits, if not set -->
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!--This page size(B) is completely different from the page size in the Main memory section. This page size is the size of
|
||||
virtual memory from OS/Archi perspective; the page size in the Main memory section is the actual physical line in a DRAM bank -->
|
||||
<!-- *********************** cores ******************* -->
|
||||
<component id="system.core0" name="core0">
|
||||
<!-- Core property -->
|
||||
<param name="clock_rate" value="3700"/>
|
||||
<!-- for cores with unknown timing, set to 0 to force off the opt flag -->
|
||||
<param name="opt_local" value="1"/>
|
||||
<param name="instruction_length" value="32"/>
|
||||
<param name="opcode_width" value="16"/>
|
||||
<param name="x86" value="1"/>
|
||||
<param name="micro_opcode_width" value="8"/>
|
||||
<param name="machine_type" value="0"/>
|
||||
<!-- inorder/OoO; 1 inorder; 0 OOO-->
|
||||
<param name="number_hardware_threads" value="1"/>
|
||||
<!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor,
|
||||
it only may be more than one in SMT processors. BTB ports always equals to fetch ports since
|
||||
branch information in consecutive branch instructions in the same fetch group can be read out from BTB once.-->
|
||||
<param name="fetch_width" value="4"/>
|
||||
<!-- fetch_width determines the size of cachelines of L1 cache block -->
|
||||
<param name="number_instruction_fetch_ports" value="1"/>
|
||||
<param name="decode_width" value="4"/>
|
||||
<!-- decode_width determines the number of ports of the
|
||||
renaming table (both RAM and CAM) scheme -->
|
||||
<param name="issue_width" value="4"/>
|
||||
<param name="peak_issue_width" value="6"/><!--As shown in Wiki figure which has max 5 ports, store data/address is modeled
|
||||
as a single port.-->
|
||||
<!-- issue_width determines the number of ports of Issue window and other logic
|
||||
as in the complexity effective processors paper; issue_width==dispatch_width -->
|
||||
<param name="commit_width" value="4"/>
|
||||
<!-- commit_width determines the number of ports of register files -->
|
||||
<param name="fp_issue_width" value="2"/>
|
||||
<param name="prediction_width" value="1"/>
|
||||
<!-- number of branch instructions that can be predicted simultaneously -->
|
||||
<!-- Current version of McPAT does not distinguish int and floating point pipelines
|
||||
These parameters are reserved for future use.-->
|
||||
<param name="pipelines_per_core" value="1,1"/>
|
||||
<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
|
||||
<param name="pipeline_depth" value="14,14"/>
|
||||
<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
|
||||
<!-- issue and exe unit-->
|
||||
<param name="ALU_per_core" value="6"/>
|
||||
<!-- contains an adder, a shifter, and a logical unit -->
|
||||
<param name="MUL_per_core" value="1"/>
|
||||
<!-- For MUL and Div -->
|
||||
<param name="FPU_per_core" value="2"/>
|
||||
<!-- buffer between IF and ID stage -->
|
||||
<param name="instruction_buffer_size" value="32"/><!--Inst. + micro-op -->
|
||||
<!-- buffer between ID and sche/exe stage -->
|
||||
<param name="decoded_stream_buffer_size" value="16"/>
|
||||
<param name="instruction_window_scheme" value="1"/><!-- 0 PHYREG based, 1 RSBASED-->
|
||||
<!-- McPAT supports 2 types of OoO cores, RS based and physical reg based-->
|
||||
<param name="instruction_window_size" value="32"/>
|
||||
<param name="fp_instruction_window_size" value="32"/>
|
||||
<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
|
||||
<param name="ROB_size" value="96"/>
|
||||
<!-- each in-flight instruction has an entry in ROB -->
|
||||
<!-- registers -->
|
||||
<param name="archi_Regs_IRF_size" value="16"/><!-- X86-64 has 16GPR -->
|
||||
<param name="archi_Regs_FRF_size" value="32"/><!-- MMX + XMM -->
|
||||
<!-- if OoO processor, phy_reg number is needed for renaming logic,
|
||||
renaming logic is for both integer and floating point insts. -->
|
||||
<param name="phy_Regs_IRF_size" value="256"/>
|
||||
<param name="phy_Regs_FRF_size" value="256"/>
|
||||
<!-- rename logic -->
|
||||
<param name="rename_scheme" value="0"/>
|
||||
<!-- can be RAM based(0) or CAM based(1) rename scheme
|
||||
RAM-based scheme will have free list, status table;
|
||||
CAM-based scheme have the valid bit in the data field of the CAM
|
||||
both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
|
||||
Detailed RAT Implementation see TR -->
|
||||
<param name="register_windows_size" value="0"/>
|
||||
<!-- how many windows in the windowed register file, sun processors;
|
||||
no register windowing is used when this number is 0 -->
|
||||
<!-- In OoO cores, loads and stores can be issued either inorder (Pentium Pro) or out-of-order (OoO, Alpha),
|
||||
They will always try to execute out-of-order though. -->
|
||||
<param name="LSU_order" value="inorder"/>
|
||||
<param name="store_buffer_size" value="96"/>
|
||||
<!-- By default, in-order cores do not have load buffers -->
|
||||
<param name="load_buffer_size" value="48"/>
|
||||
<!-- number of ports refer to sustainable concurrent memory accesses -->
|
||||
<param name="memory_ports" value="2"/>
|
||||
<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
|
||||
as well as the ports of Dcache which is connected to LSU -->
|
||||
<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
|
||||
<param name="RAS_size" value="64"/>
|
||||
<!-- general stats, defines simulation periods; requires total, idle, and busy cycles for sanity check -->
|
||||
<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
|
||||
<stat name="total_instructions" value="400000"/>
|
||||
<stat name="int_instructions" value="200000"/>
|
||||
<stat name="fp_instructions" value="100000"/>
|
||||
<stat name="branch_instructions" value="100000"/>
|
||||
<stat name="branch_mispredictions" value="0"/>
|
||||
<stat name="load_instructions" value="0"/>
|
||||
<stat name="store_instructions" value="50000"/>
|
||||
<stat name="committed_instructions" value="400000"/>
|
||||
<stat name="committed_int_instructions" value="200000"/>
|
||||
<stat name="committed_fp_instructions" value="100000"/>
|
||||
<stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogeneous -->
|
||||
<!-- the following cycle stats are used for heterogeneous cores only,
|
||||
please ignore them if homogeneous cores -->
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!-- instruction buffer stats -->
|
||||
<!-- ROB stats, both RS and Phy based OoOs have ROB
|
||||
performance simulator should capture the difference on accesses,
|
||||
otherwise, McPAT has to guess based on number of committed instructions. -->
|
||||
<stat name="ROB_reads" value="400000"/>
|
||||
<stat name="ROB_writes" value="400000"/>
|
||||
<!-- RAT accesses -->
|
||||
<stat name="rename_reads" value="800000"/> <!--lookup in renaming logic -->
|
||||
<stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic -->
|
||||
<stat name="fp_rename_reads" value="200000"/>
|
||||
<stat name="fp_rename_writes" value="100000"/>
|
||||
<!-- decode and rename stage use this, should be total ic - nop -->
|
||||
<!-- Inst window stats -->
|
||||
<stat name="inst_window_reads" value="400000"/>
|
||||
<stat name="inst_window_writes" value="400000"/>
|
||||
<stat name="inst_window_wakeup_accesses" value="800000"/>
|
||||
<stat name="fp_inst_window_reads" value="200000"/>
|
||||
<stat name="fp_inst_window_writes" value="200000"/>
|
||||
<stat name="fp_inst_window_wakeup_accesses" value="400000"/>
|
||||
<!-- RF accesses -->
|
||||
<stat name="int_regfile_reads" value="600000"/>
|
||||
<stat name="float_regfile_reads" value="100000"/>
|
||||
<stat name="int_regfile_writes" value="300000"/>
|
||||
<stat name="float_regfile_writes" value="50000"/>
|
||||
<!-- accesses to the working reg -->
|
||||
<stat name="function_calls" value="5"/>
|
||||
<stat name="context_switches" value="260343"/>
|
||||
<!-- Number of window switches (number of function calls and returns)-->
|
||||
<!-- Alu stats by default, the processor has one FPU that includes the divider and
|
||||
multiplier. The fpu accesses should include accesses to multiplier and divider -->
|
||||
<stat name="ialu_accesses" value="300000"/>
|
||||
<stat name="fpu_accesses" value="100000"/>
|
||||
<stat name="mul_accesses" value="200000"/>
|
||||
<stat name="cdb_alu_accesses" value="300000"/>
|
||||
<stat name="cdb_mul_accesses" value="200000"/>
|
||||
<stat name="cdb_fpu_accesses" value="100000"/>
|
||||
<!-- multiple cycle accesses should be counted multiple times,
|
||||
otherwise, McPAT can use internal counter for different floating point instructions
|
||||
to get final accesses. But that needs detailed info for floating point inst mix -->
|
||||
<!-- currently the performance simulator should
|
||||
make sure all the numbers are final numbers,
|
||||
including the explicit read/write accesses,
|
||||
and the implicit accesses such as replacements, etc.
|
||||
Future versions of McPAT may be able to reason about the implicit accesses
|
||||
based on param and stats of last level cache
|
||||
The same rule applies to all cache access stats too! -->
|
||||
<!-- following is AF for max power computation.
|
||||
Do not change them, unless you understand them-->
|
||||
<stat name="IFU_duty_cycle" value="1"/>
|
||||
<stat name="LSU_duty_cycle" value="0.5"/>
|
||||
<stat name="MemManU_I_duty_cycle" value="1"/>
|
||||
<stat name="MemManU_D_duty_cycle" value="0.5"/>
|
||||
<stat name="ALU_duty_cycle" value="1"/>
|
||||
<stat name="MUL_duty_cycle" value="0.3"/>
|
||||
<stat name="FPU_duty_cycle" value="0.3"/>
|
||||
<stat name="ALU_cdb_duty_cycle" value="1"/>
|
||||
<stat name="MUL_cdb_duty_cycle" value="0.3"/>
|
||||
<stat name="FPU_cdb_duty_cycle" value="0.3"/>
|
||||
<param name="number_of_BPT" value="2"/>
|
||||
<component id="system.core0.predictor" name="PBT">
|
||||
<!-- branch predictor; tournament predictor see Alpha implementation -->
|
||||
<param name="local_predictor_size" value="10,3"/>
|
||||
<param name="local_predictor_entries" value="1024"/>
|
||||
<param name="global_predictor_entries" value="4096"/>
|
||||
<param name="global_predictor_bits" value="2"/>
|
||||
<param name="chooser_predictor_entries" value="4096"/>
|
||||
<param name="chooser_predictor_bits" value="2"/>
|
||||
<!-- These parameters can be combined like below in next version
|
||||
<param name="load_predictor" value="10,3,1024"/>
|
||||
<param name="global_predictor" value="4096,2"/>
|
||||
<param name="predictor_chooser" value="4096,2"/>
|
||||
-->
|
||||
</component>
|
||||
<component id="system.core0.itlb" name="itlb">
|
||||
<param name="number_entries" value="128"/>
|
||||
<stat name="total_accesses" value="200000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<!-- there are no write requests to the itlb, although writes happen to the itlb after a miss,
|
||||
which is actually a replacement -->
|
||||
</component>
|
||||
<component id="system.core0.icache" name="icache">
|
||||
<!-- there are no write requests to the icache, although writes happen to it after a miss,
|
||||
which is actually a replacement -->
|
||||
<param name="icache_config" value="32768,32,8,1,4,4,32,0"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy, -->
|
||||
<!-- cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate -->
|
||||
<param name="buffer_sizes" value="16, 16, 16,0"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="read_misses" value="0"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dtlb" name="dtlb">
|
||||
<param name="number_entries" value="256"/><!--dual threads-->
|
||||
<stat name="total_accesses" value="400000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dcache" name="dcache">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="dcache_config" value="32768,32,8,1, 4,6, 32,1 "/>
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="800000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<param name="number_of_BTB" value="2"/>
|
||||
<component id="system.core0.BTB" name="BTB">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="BTB_config" value="5120,4,2,1, 1,3"/> <!--should be 4096 + 1024 -->
|
||||
<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<stat name="read_accesses" value="400000"/> <!--See IFU code for guideline -->
|
||||
<stat name="write_accesses" value="0"/>
|
||||
</component>
|
||||
</component>
|
||||
<component id="system.L1Directory0" name="L1Directory0">
|
||||
<param name="Directory_type" value="0"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache -->
|
||||
<param name="Dir_config" value="4096,2,0,1,100,100, 8"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="8, 8, 8, 8"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="3400"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="800000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="20"/>
|
||||
</component>
|
||||
<component id="system.L2Directory0" name="L2Directory0">
|
||||
<param name="Directory_type" value="1"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache -->
|
||||
<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="8, 8, 8, 8"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="3400"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="58824"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="100"/>
|
||||
</component>
|
||||
<component id="system.L20" name="L20">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="L2_config" value="6291456,64, 16, 8, 8, 23, 32, 1"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<param name="clockrate" value="3700"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="1.0"/>
|
||||
</component>
|
||||
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.L30" name="L30">
|
||||
<param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="clockrate" value="850"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="11824"/>
|
||||
<stat name="write_accesses" value="11276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="1.0"/>
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.NoC0" name="noc0">
|
||||
<param name="clockrate" value="3400"/>
|
||||
<param name="type" value="0"/>
|
||||
<!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus
|
||||
at each time only one node can send req -->
|
||||
<param name="horizontal_nodes" value="1"/>
|
||||
<param name="vertical_nodes" value="1"/>
|
||||
<param name="has_global_link" value="0"/>
|
||||
<!-- 1 has global link, 0 does not have global link -->
|
||||
<param name="link_throughput" value="1"/><!--w.r.t clock -->
|
||||
<param name="link_latency" value="1"/><!--w.r.t clock -->
|
||||
<!-- throughput >= latency -->
|
||||
<!-- Router architecture -->
|
||||
<param name="input_ports" value="1"/>
|
||||
<param name="output_ports" value="1"/>
|
||||
<!-- For bus the I/O ports should be 1 -->
|
||||
<param name="flit_bits" value="256"/>
|
||||
<param name="chip_coverage" value="1"/>
|
||||
<!-- When multiple NoCs are present, one NoC will cover part of the whole chip.
|
||||
chip_coverage <=1 -->
|
||||
<param name="link_routing_over_percentage" value="0.5"/>
|
||||
<!-- Links can route over other components or occupy whole area.
|
||||
by default, 50% of the NoC global links routes over other
|
||||
components -->
|
||||
<stat name="total_accesses" value="100000"/>
|
||||
<!-- This is the number of total accesses within the whole network not for each router -->
|
||||
<stat name="duty_cycle" value="1"/>
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.mem" name="mem">
|
||||
<!-- Main memory property -->
|
||||
<param name="mem_tech_node" value="32"/>
|
||||
<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
|
||||
<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
|
||||
<param name="internal_prefetch_of_DRAM_chip" value="4"/>
|
||||
<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
|
||||
<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
|
||||
<!-- above numbers can be easily found from Wikipedia -->
|
||||
<param name="capacity_per_channel" value="4096"/> <!-- MB -->
|
||||
<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
|
||||
Current McPAT assumes single DIMMs are used.-->
|
||||
<param name="number_ranks" value="2"/>
|
||||
<param name="num_banks_of_DRAM_chip" value="8"/>
|
||||
<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
|
||||
<param name="output_width_of_DRAM_chip" value="8"/>
|
||||
<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
|
||||
<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
|
||||
<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
|
||||
<param name="burstlength_of_DRAM_chip" value="8"/>
|
||||
<stat name="memory_accesses" value="1052"/>
|
||||
<stat name="memory_reads" value="1052"/>
|
||||
<stat name="memory_writes" value="1052"/>
|
||||
</component>
|
||||
<component id="system.mc" name="mc">
|
||||
<!-- Memory controllers are for DDR(2,3...) DIMMs -->
|
||||
<!-- current version of McPAT uses published values for base parameters of memory controller
|
||||
improvements on MC will be added in later versions. -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1-->
|
||||
<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
|
||||
<param name="block_size" value="64"/><!--B-->
|
||||
<param name="number_mcs" value="0"/>
|
||||
<!-- current McPAT only supports homogeneous memory controllers -->
|
||||
<param name="memory_channels_per_mc" value="1"/>
|
||||
<param name="number_ranks" value="2"/>
|
||||
<param name="withPHY" value="0"/>
|
||||
<!-- # of ranks of each channel-->
|
||||
<param name="req_window_size_per_channel" value="32"/>
|
||||
<param name="IO_buffer_size_per_channel" value="32"/>
|
||||
<param name="databus_width" value="128"/>
|
||||
<param name="addressbus_width" value="51"/>
|
||||
<!-- McPAT will add the control bus width to the addressbus width automatically -->
|
||||
<stat name="memory_accesses" value="33333"/>
|
||||
<stat name="memory_reads" value="16667"/>
|
||||
<stat name="memory_writes" value="16667"/>
|
||||
<!-- McPAT does not track individual mc, instead, it takes the total accesses and calculate
|
||||
the average power per MC or per channel. This is sufficient for most applications.
|
||||
Further trackdown can be easily added in later versions. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.niu" name="niu">
|
||||
<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller -->
|
||||
<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns.
|
||||
the low bound of clock rate of a 10Gb MAC is 150Mhz -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="clockrate" value="350"/>
|
||||
<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only have one port -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual nic, instead, it takes the total accesses and calculate
|
||||
the average power per nic or per channel. This is sufficient for most applications. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.pcie" name="pcie">
|
||||
<!-- On chip PCIe controller, including Phy-->
|
||||
<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns.
|
||||
the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="withPHY" value="1"/>
|
||||
<param name="clockrate" value="350"/>
|
||||
<param name="number_units" value="0"/>
|
||||
<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual pcie controllers, instead, it takes the total accesses and calculate
|
||||
the average power per pcie controller or per channel. This is sufficient for most applications. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.flashc" name="flashc">
|
||||
<param name="number_flashcs" value="0"/>
|
||||
<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="withPHY" value="1"/>
|
||||
<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual flash controller, instead, it takes the total accesses and calculate
|
||||
the average power per fc or per channel. This is sufficient for most applications -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
|
||||
</component>
|
||||
</component>
|
||||
|
226
ext/mcpat/README
Normal file
226
ext/mcpat/README
Normal file
|
@ -0,0 +1,226 @@
|
|||
__ __ ____ _ _____ ____ _
|
||||
| \/ | ___| _ \ / \|_ _| | __ ) ___| |_ __ _
|
||||
| |\/| |/ __| |_) / _ \ | | | _ \ / _ \ __|/ _` |
|
||||
| | | | (__| __/ ___ \| | | |_) | __/ |_| (_| |
|
||||
|_| |_|\___|_| /_/ \_\_| |____/ \___|\__|\__,_|
|
||||
|
||||
McPAT: Multicore Power, Area, and Timing
|
||||
Current version 0.8Beta
|
||||
===============================
|
||||
|
||||
McPAT is an architectural modeling tool for chip multiprocessors (CMP)
|
||||
The main focus of McPAT is accurate power and area
|
||||
modeling, and a target clock rate is used as a design constraint.
|
||||
McPAT performs automatic extensive search to find optimal designs
|
||||
that satisfy the target clock frequency.
|
||||
|
||||
For complete documentation of McPAT, please refer to the McPAT 1.0
|
||||
technical report and the following paper,
|
||||
"McPAT: An Integrated Power, Area, and Timing Modeling
|
||||
Framework for Multicore and Manycore Architectures",
|
||||
that appears in MICRO 2009. Please cite the paper, if you use
|
||||
McPAT in your work. The bibtex entry is provided below for your convenience.
|
||||
|
||||
@inproceedings{mcpat:micro,
|
||||
author = {Sheng Li and Jung Ho Ahn and Richard D. Strong and Jay B. Brockman and Dean M. Tullsen and Norman P. Jouppi},
|
||||
title = "{McPAT: An Integrated Power, Area, and Timing Modeling Framework for Multicore and Manycore Architectures}",
|
||||
booktitle = {MICRO 42: Proceedings of the 42nd Annual IEEE/ACM International Symposium on Microarchitecture},
|
||||
year = {2009},
|
||||
pages = {469--480},
|
||||
}
|
||||
|
||||
Current McPAT is in its beta release.
|
||||
List of features of beta release
|
||||
===============================
|
||||
The following is the list of features supported by the tool.
|
||||
|
||||
* Power, area, and timing models for CMPs with:
|
||||
Inorder cores both single and multithreaded
|
||||
OOO cores both single and multithreaded
|
||||
Shared/coherent caches with directory hardware:
|
||||
including directory cache, shadowed tag directory
|
||||
and static bank mapped tag directory
|
||||
Network-on-Chip
|
||||
On-chip memory controllers
|
||||
|
||||
* Internal models are based on real modern processors:
|
||||
Inorder models are based on Sun Niagara family
|
||||
OOO models are based on Intel P6 for reservation
|
||||
station based OOO cores, and on Intel Netburst and
|
||||
Alpha 21264 for physical register file based OOO cores.
|
||||
|
||||
* Leakage power modeling considers both sub-threshold leakage
|
||||
and gate leakage power. The impact of operating temperature
|
||||
on both types of leakage power is considered. Longer channel devices
|
||||
that can reduce leakage significantly with modest performance
|
||||
penalty are also modeled.
|
||||
|
||||
* McPAT supports automatic extensive search to find optimal designs
|
||||
that satisfy the target clock frequency. The timing constraints
|
||||
include both throughput and latency.
|
||||
|
||||
* Interconnect model with different delay, power, and area
|
||||
properties, as well as both the aggressive and conservative
|
||||
interconnect projections on wire technologies.
|
||||
|
||||
* All process specific values used by the McPAT are obtained
|
||||
from ITRS and currently, the McPAT supports 90nm, 65nm, 45nm,
|
||||
32nm, and 22nm technology nodes. At 32nm and 22nm nodes, SOI
|
||||
and DG devices are used. After 45nm, Hi-K metal gates are used.
|
||||
|
||||
How to use the tool?
|
||||
====================
|
||||
|
||||
McPAT takes input parameters from an XML-based interface,
|
||||
then it computes area and peak power of the target processor.
|
||||
Please note that the peak power is the absolute worst case power,
|
||||
which could be even higher than TDP.
|
||||
|
||||
1. Steps to run McPAT:
|
||||
-> define the target processor using inorder.xml or OOO.xml
|
||||
-> run the "mcpat" binary:
|
||||
./mcpat -infile <*.xml> -print_level < level of detailed output>
|
||||
./mcpat -h (or mcpat --help) will show the quick help message.
|
||||
|
||||
Rather than being hardwired to certain simulators, McPAT
|
||||
uses an XML-based interface to enable easy integration
|
||||
with various performance simulators. Our collaborator,
|
||||
Richard Strong, at University of California, San Diego,
|
||||
designed an experimental parser for the M5 simulator, aiming for
|
||||
streamlining the integration of McPAT and M5. Please check the M5
|
||||
repository for the latest version of the parser.
|
||||
|
||||
2. Optimize:
|
||||
McPAT will try its best to satisfy the target clock rate.
|
||||
When it cannot find a valid solution, it gives out warnings,
|
||||
while still giving a solution that is closest to the timing
|
||||
constraints and calculating power based on it. The optimization
|
||||
will lead to larger power/area numbers for a higher target clock
|
||||
rate. McPAT also provides the option "-opt_for_clk" to turn on
|
||||
("-opt_for_clk 1") and off this strict optimization for the
|
||||
timing constraint. When it is off, McPAT always optimizes
|
||||
component for ED^2P without worrying about meeting the
|
||||
target clock frequency. By turning it off, the computation time
|
||||
can be reduced, which suits situations where the target clock rate
|
||||
is conservative.
|
||||
|
||||
3. The output:
|
||||
McPAT outputs results in a hierarchical manner. Increasing
|
||||
the "-print_level" will show detailed results inside each
|
||||
component. For each component, major parts are shown, and associated
|
||||
pipeline registers/control logic are added up in total area/power of each
|
||||
component. In general, McPAT does not model the area/overhead of the pad
|
||||
frame used in a processor die.
|
||||
|
||||
4. How to use the XML interface for McPAT
|
||||
4.1 Set up the parameters
|
||||
Parameters of target designs need to be set in the *.xml file for
|
||||
entries tagged as "param". McPAT has very detailed parameter settings.
|
||||
Please remove the structure parameter from the file if you want
|
||||
to use the default values. Otherwise, the parameters in the xml file
|
||||
will override the default values.
|
||||
|
||||
4.2 Pass the statistics
|
||||
There are two options to get the correct stats: a) the performance
|
||||
simulator can capture all the stats in detail and pass them to McPAT;
|
||||
b) the performance simulator can capture only partial stats and pass
|
||||
them to McPAT, while McPAT can reason about the complete stats using
|
||||
the partial information and the configuration. Therefore, there is
|
||||
some overlap for the stats.
|
||||
|
||||
4.3 Interface XML file structures (PLEASE READ!)
|
||||
The XML is hierarchical from processor level to micro-architecture
|
||||
level. McPAT supports both heterogeneous and homogeneous manycore processors.
|
||||
|
||||
1). For heterogeneous processor setup, each component (core, NoC, cache,
|
||||
etc.) must have its own instantiations (core0, core1, ..., coreN).
|
||||
Each instantiation will have different parameters as well as its stats.
|
||||
Thus, the XML file must have multiple "instantiation" of each type of
|
||||
heterogeneous components and the corresponding hetero flags must be set
|
||||
in the XML file. Then the stats in the XML should be the stats of "a" instantiation
|
||||
(e.g. "a" cores). The reported runtime dynamic is of a single instantiation
|
||||
(e.g. "a" cores). Since the stats for each (e.g. "a" cores) may be different,
|
||||
we will see a whole list of (e.g. "a" cores) with different dynamic power,
|
||||
and total power is just a sum of them.
|
||||
|
||||
2). For homogeneous processors, the same method for heterogeneous can
|
||||
also be used by treating all homogeneous instantiations as heterogeneous.
|
||||
However, a preferred approach is to use a single representative for all
|
||||
the same components (e.g. core0 to represent all cores) and set the
|
||||
processor to have homogeneous components (e.g. <param name="homogeneous_cores
|
||||
" value="1"/> ). Thus, the XML file only has one instantiation to represent
|
||||
all others with the same architectural parameters. The corresponding homo
|
||||
flags must be set in the XML file. Then, the stats in the XML should be
|
||||
the aggregated stats of the sum of all instantiations (e.g. aggregated stats
|
||||
of all cores). In the final results, McPAT will only report a single
|
||||
instantiation of each type of component, and the reported runtime dynamic power
|
||||
is the sum of all instantiations of the same type. This approach can run fast
|
||||
and use much less memory.
|
||||
|
||||
5. Guide for integrating McPAT into performance simulators and bypassing the XML interface
|
||||
The detailed work flow of McPAT has two phases: the initialization phase and
|
||||
the computation phase. Specifically, in order to start the initialization phase a
|
||||
user specifies static configurations, including parameters at all three levels,
|
||||
namely, architectural, circuit, and technology levels. During the initialization
|
||||
phase, McPAT will generate the internal chip representation using the configurations
|
||||
set by the user.
|
||||
The computation phase of McPAT is called by McPAT or the performance simulator
|
||||
during simulation to generate runtime power numbers. Before calling McPAT to
|
||||
compute runtime power numbers, the performance simulator needs to pass the
|
||||
statistics, namely, the activity factors of each individual component to McPAT
|
||||
via the XML interface.
|
||||
The initialization phase is very time-consuming, since it will repeat many
|
||||
times until valid configurations are found or the possible configurations are
|
||||
exhausted. To reduce the overhead, a user can let the simulator call McPAT
|
||||
directly for computation phase and only call initialization phase once at the
|
||||
beginning of simulation. In this case, the XML interface file is bypassed,
|
||||
please refer to processor.cc to see how the two phases are called.
|
||||
|
||||
6. Sample input files:
|
||||
This package provides sample XML files for validating target processors. Please find the
|
||||
enclosed Niagara1.xml (for the Sun Niagara1 processor), Niagara2.xml (for the Sun Niagara2
|
||||
processor), Alpha21364.xml (for the Alpha21364 processor), and Xeon.xml (for the Intel
|
||||
Xeon Tulsa processor).
|
||||
|
||||
Special instructions for using Xeon.xml:
|
||||
McPAT uses ITRS device types including HP, LSTP, and LOP. Although most
|
||||
designs follow ITRS projections, there are designs with special technologies.
|
||||
For example, the 65nm Xeon Tulsa processor uses 1.25 V rather than 1.1V
|
||||
for the core voltage domain, which results in the changes in threshold voltage,
|
||||
leakage current density, saturation current, etc., besides the different
|
||||
supply voltage. We use MASTAR to match the special technology as used in Xeon
|
||||
core domain. Therefore, in order to generate accurate results of Xeon
|
||||
Tulsa cores, users need to run "make TAR=mcpatXeonCore" and use the generated
|
||||
special executable. The L3 cache and buses must be computed using standard
|
||||
ITRS technology.
|
||||
|
||||
|
||||
====================
|
||||
McPAT is in its beginning stage. We are still improving
|
||||
the tool and refining the code. Please come back to its website
|
||||
for newer versions. If you have any comments,
|
||||
questions, or suggestions, please write to us.
|
||||
|
||||
Version history and roadmap
|
||||
|
||||
McPAT Alpha: released Sep. 2009 Experimental release
|
||||
McPAT Beta (0.6): released Nov. 2009 New code base and technology base
|
||||
McPAT Beta (0.7): released May. 2010 Added various new models,
|
||||
including long channel devices, buses model; together
|
||||
with bug fixes and extensive code optimization to reduce
|
||||
memory usage.
|
||||
McPAT Beta (0.8): released Aug. 2010 Added various new models,
|
||||
including on-chip 10Gb ethernet units, PCIe, and flash controllers.
|
||||
Next major release:
|
||||
McPAT 1.0: including advanced power-saving states
|
||||
|
||||
Future releases may include the modeling of embedded low-power
|
||||
processors as well as vector processors and GPGPUs.
|
||||
|
||||
|
||||
Sheng Li
|
||||
sheng.li@hp.com
|
||||
|
||||
|
||||
|
||||
|
1798
ext/mcpat/XML_Parse.cc
Normal file
1798
ext/mcpat/XML_Parse.cc
Normal file
File diff suppressed because it is too large
Load diff
591
ext/mcpat/XML_Parse.h
Normal file
591
ext/mcpat/XML_Parse.h
Normal file
|
@ -0,0 +1,591 @@
|
|||
/*****************************************************************************
|
||||
* McPAT
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef XML_PARSE_H_
|
||||
#define XML_PARSE_H_
|
||||
|
||||
|
||||
//#ifdef WIN32
|
||||
//#define _CRT_SECURE_NO_DEPRECATE
|
||||
//#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "xmlParser.h"
|
||||
using namespace std;
|
||||
|
||||
/*
|
||||
void myfree(char *t); // {free(t);}
|
||||
ToXMLStringTool tx,tx2;
|
||||
*/
|
||||
//all subnodes at the level of system.core(0-n)
|
||||
//cache_policy is added into cache property arrays;//0 no write or write-through with non-write allocate;1 write-back with write-allocate
|
||||
|
||||
// Predictor sub-node of a core; instantiated as system_core::predictor.
// Holds predictor sizing fields plus a single access counter.
typedef struct{
|
||||
int prediction_width;
|
||||
char prediction_scheme[20];
|
||||
int predictor_size;
|
||||
int predictor_entries;
|
||||
int local_predictor_size[20];
|
||||
int local_predictor_entries;
|
||||
int global_predictor_entries;
|
||||
int global_predictor_bits;
|
||||
int chooser_predictor_entries;
|
||||
int chooser_predictor_bits;
|
||||
double predictor_accesses;
|
||||
} predictor_systemcore;
|
||||
// ITLB sub-node of a core; instantiated as system_core::itlb.
typedef struct{
|
||||
int number_entries;
|
||||
int cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate
|
||||
double total_hits;
|
||||
double total_accesses;
|
||||
double total_misses;
|
||||
double conflicts;
|
||||
} itlb_systemcore;
|
||||
// Instruction-cache sub-node of a core; instantiated as system_core::icache.
// Configuration arrays come first, followed by access counters.
typedef struct{
|
||||
//params
|
||||
double icache_config[20];
|
||||
int buffer_sizes[20];
|
||||
int cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate
|
||||
//stats
|
||||
double total_accesses;
|
||||
double read_accesses;
|
||||
double read_misses;
|
||||
double replacements;
|
||||
double read_hits;
|
||||
double total_hits;
|
||||
double total_misses;
|
||||
// NOTE: singular "access" here; system_L2/system_L3 name the same kind of counter miss_buffer_accesses.
double miss_buffer_access;
|
||||
double fill_buffer_accesses;
|
||||
double prefetch_buffer_accesses;
|
||||
double prefetch_buffer_writes;
|
||||
double prefetch_buffer_reads;
|
||||
double prefetch_buffer_hits;
|
||||
double conflicts;
|
||||
} icache_systemcore;
|
||||
// DTLB sub-node of a core; instantiated as system_core::dtlb.
// Unlike itlb_systemcore, this record also splits counters into read/write.
typedef struct{
|
||||
//params
|
||||
int number_entries;
|
||||
int cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate
|
||||
//stats
|
||||
double total_accesses;
|
||||
double read_accesses;
|
||||
double write_accesses;
|
||||
double write_hits;
|
||||
double read_hits;
|
||||
double read_misses;
|
||||
double write_misses;
|
||||
double total_hits;
|
||||
double total_misses;
|
||||
double conflicts;
|
||||
} dtlb_systemcore;
|
||||
// Data-cache sub-node of a core; instantiated as system_core::dcache.
// Configuration arrays come first, followed by access counters.
typedef struct{
|
||||
//params
|
||||
double dcache_config[20];
|
||||
int buffer_sizes[20];
|
||||
int cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate
|
||||
//stats
|
||||
double total_accesses;
|
||||
double read_accesses;
|
||||
double write_accesses;
|
||||
double total_hits;
|
||||
double total_misses;
|
||||
double read_hits;
|
||||
double write_hits;
|
||||
double read_misses;
|
||||
double write_misses;
|
||||
double replacements;
|
||||
double write_backs;
|
||||
// NOTE: singular "access" here; system_L2/system_L3 name the same kind of counter miss_buffer_accesses.
double miss_buffer_access;
|
||||
double fill_buffer_accesses;
|
||||
double prefetch_buffer_accesses;
|
||||
double prefetch_buffer_writes;
|
||||
double prefetch_buffer_reads;
|
||||
double prefetch_buffer_hits;
|
||||
double wbb_writes;
|
||||
double wbb_reads;
|
||||
double conflicts;
|
||||
} dcache_systemcore;
|
||||
// BTB sub-node of a core; instantiated as system_core::BTB.
typedef struct{
|
||||
//params
|
||||
int BTB_config[20];
|
||||
//stats
|
||||
double total_accesses;
|
||||
double read_accesses;
|
||||
double write_accesses;
|
||||
double total_hits;
|
||||
double total_misses;
|
||||
double read_hits;
|
||||
double write_hits;
|
||||
double read_misses;
|
||||
double write_misses;
|
||||
double replacements;
|
||||
} BTB_systemcore;
|
||||
// One core record (system.core(0-n)). Layout, in order:
//   1. per-core parameters,
//   2. per-core statistics (stored as double),
//   3. per-unit duty cycles,
//   4. the per-core sub-node records (predictor, TLBs, caches, BTB).
typedef struct{
|
||||
//all params at the level of system.core(0-n)
|
||||
int clock_rate;
|
||||
bool opt_local;
|
||||
bool x86;
|
||||
int machine_bits;
|
||||
int virtual_address_width;
|
||||
int physical_address_width;
|
||||
int opcode_width;
|
||||
int micro_opcode_width;
|
||||
int instruction_length;
|
||||
int machine_type;
|
||||
int internal_datapath_width;
|
||||
int number_hardware_threads;
|
||||
int fetch_width;
|
||||
int number_instruction_fetch_ports;
|
||||
int decode_width;
|
||||
int issue_width;
|
||||
int peak_issue_width;
|
||||
int commit_width;
|
||||
int pipelines_per_core[20];
|
||||
int pipeline_depth[20];
|
||||
char FPU[20];
|
||||
char divider_multiplier[20];
|
||||
int ALU_per_core;
|
||||
// Note: a double, unlike the int ALU_per_core / MUL_per_core counts.
double FPU_per_core;
|
||||
int MUL_per_core;
|
||||
int instruction_buffer_size;
|
||||
int decoded_stream_buffer_size;
|
||||
int instruction_window_scheme;
|
||||
int instruction_window_size;
|
||||
int fp_instruction_window_size;
|
||||
int ROB_size;
|
||||
int archi_Regs_IRF_size;
|
||||
int archi_Regs_FRF_size;
|
||||
int phy_Regs_IRF_size;
|
||||
int phy_Regs_FRF_size;
|
||||
int rename_scheme;
|
||||
int register_windows_size;
|
||||
char LSU_order[20];
|
||||
int store_buffer_size;
|
||||
int load_buffer_size;
|
||||
int memory_ports;
|
||||
char Dcache_dual_pump[20];
|
||||
int RAS_size;
|
||||
int fp_issue_width;
|
||||
int prediction_width;
|
||||
int number_of_BTB;
|
||||
int number_of_BPT;
|
||||
|
||||
//all stats at the level of system.core(0-n)
|
||||
double total_instructions;
|
||||
double int_instructions;
|
||||
double fp_instructions;
|
||||
double branch_instructions;
|
||||
double branch_mispredictions;
|
||||
double committed_instructions;
|
||||
double committed_int_instructions;
|
||||
double committed_fp_instructions;
|
||||
double load_instructions;
|
||||
double store_instructions;
|
||||
double total_cycles;
|
||||
double idle_cycles;
|
||||
double busy_cycles;
|
||||
double instruction_buffer_reads;
|
||||
double instruction_buffer_write;
|
||||
double ROB_reads;
|
||||
double ROB_writes;
|
||||
double rename_accesses;
|
||||
double fp_rename_accesses;
|
||||
double rename_reads;
|
||||
double rename_writes;
|
||||
double fp_rename_reads;
|
||||
double fp_rename_writes;
|
||||
double inst_window_reads;
|
||||
double inst_window_writes;
|
||||
double inst_window_wakeup_accesses;
|
||||
double inst_window_selections;
|
||||
double fp_inst_window_reads;
|
||||
double fp_inst_window_writes;
|
||||
double fp_inst_window_wakeup_accesses;
|
||||
double fp_inst_window_selections;
|
||||
double archi_int_regfile_reads;
|
||||
double archi_float_regfile_reads;
|
||||
double phy_int_regfile_reads;
|
||||
double phy_float_regfile_reads;
|
||||
double phy_int_regfile_writes;
|
||||
double phy_float_regfile_writes;
|
||||
double archi_int_regfile_writes;
|
||||
double archi_float_regfile_writes;
|
||||
double int_regfile_reads;
|
||||
double float_regfile_reads;
|
||||
double int_regfile_writes;
|
||||
double float_regfile_writes;
|
||||
double windowed_reg_accesses;
|
||||
double windowed_reg_transports;
|
||||
double function_calls;
|
||||
double context_switches;
|
||||
double ialu_accesses;
|
||||
double fpu_accesses;
|
||||
double mul_accesses;
|
||||
double cdb_alu_accesses;
|
||||
double cdb_mul_accesses;
|
||||
double cdb_fpu_accesses;
|
||||
double load_buffer_reads;
|
||||
double load_buffer_writes;
|
||||
double load_buffer_cams;
|
||||
double store_buffer_reads;
|
||||
double store_buffer_writes;
|
||||
double store_buffer_cams;
|
||||
double store_buffer_forwards;
|
||||
double main_memory_access;
|
||||
double main_memory_read;
|
||||
double main_memory_write;
|
||||
double pipeline_duty_cycle;
|
||||
|
||||
// Per-unit duty cycles.
double IFU_duty_cycle ;
|
||||
double BR_duty_cycle ;
|
||||
double LSU_duty_cycle ;
|
||||
double MemManU_I_duty_cycle;
|
||||
double MemManU_D_duty_cycle ;
|
||||
double ALU_duty_cycle ;
|
||||
double MUL_duty_cycle ;
|
||||
double FPU_duty_cycle ;
|
||||
double ALU_cdb_duty_cycle ;
|
||||
double MUL_cdb_duty_cycle ;
|
||||
double FPU_cdb_duty_cycle ;
|
||||
|
||||
//all subnodes at the level of system.core(0-n)
|
||||
predictor_systemcore predictor;
|
||||
itlb_systemcore itlb;
|
||||
icache_systemcore icache;
|
||||
dtlb_systemcore dtlb;
|
||||
dcache_systemcore dcache;
|
||||
BTB_systemcore BTB;
|
||||
|
||||
} system_core;
|
||||
// One L1 directory record; root_system holds an array of these (L1Directory[64]).
// Declares the same fields as system_L2Directory.
typedef struct{
|
||||
//params
|
||||
int Directory_type;
|
||||
double Dir_config[20];
|
||||
int buffer_sizes[20];
|
||||
int clockrate;
|
||||
int ports[20];
|
||||
int device_type;
|
||||
int cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate
|
||||
char threeD_stack[20];
|
||||
//stats
|
||||
double total_accesses;
|
||||
double read_accesses;
|
||||
double write_accesses;
|
||||
double read_misses;
|
||||
double write_misses;
|
||||
double conflicts;
|
||||
double duty_cycle;
|
||||
} system_L1Directory;
|
||||
// One L2 directory record; root_system holds an array of these (L2Directory[64]).
// Declares the same fields as system_L1Directory.
typedef struct{
|
||||
//params
|
||||
int Directory_type;
|
||||
double Dir_config[20];
|
||||
int buffer_sizes[20];
|
||||
int clockrate;
|
||||
int ports[20];
|
||||
int device_type;
|
||||
int cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate
|
||||
char threeD_stack[20];
|
||||
//stats
|
||||
double total_accesses;
|
||||
double read_accesses;
|
||||
double write_accesses;
|
||||
double read_misses;
|
||||
double write_misses;
|
||||
double conflicts;
|
||||
double duty_cycle;
|
||||
} system_L2Directory;
|
||||
// One L2 cache record; root_system holds an array of these (L2[64]).
// After the cache counters comes a second group (merged_dir, homenode_*, dir_duty_cycle).
typedef struct{
|
||||
//params
|
||||
double L2_config[20];
|
||||
int clockrate;
|
||||
int ports[20];
|
||||
int device_type;
|
||||
int cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate
|
||||
char threeD_stack[20];
|
||||
int buffer_sizes[20];
|
||||
//stats
|
||||
double total_accesses;
|
||||
double read_accesses;
|
||||
double write_accesses;
|
||||
double total_hits;
|
||||
double total_misses;
|
||||
double read_hits;
|
||||
double write_hits;
|
||||
double read_misses;
|
||||
double write_misses;
|
||||
double replacements;
|
||||
double write_backs;
|
||||
double miss_buffer_accesses;
|
||||
double fill_buffer_accesses;
|
||||
double prefetch_buffer_accesses;
|
||||
double prefetch_buffer_writes;
|
||||
double prefetch_buffer_reads;
|
||||
double prefetch_buffer_hits;
|
||||
double wbb_writes;
|
||||
double wbb_reads;
|
||||
double conflicts;
|
||||
double duty_cycle;
|
||||
|
||||
// NOTE(review): presumably the homenode_* / dir_duty_cycle fields apply when merged_dir is set -- confirm in XML_Parse.cc.
bool merged_dir;
|
||||
double homenode_read_accesses;
|
||||
double homenode_write_accesses;
|
||||
double homenode_read_hits;
|
||||
double homenode_write_hits;
|
||||
double homenode_read_misses;
|
||||
double homenode_write_misses;
|
||||
double dir_duty_cycle;
|
||||
} system_L2;
|
||||
// One L3 cache record; root_system holds an array of these (L3[64]).
// Field set parallels system_L2, with L3_config in place of L2_config.
typedef struct{
|
||||
//params
|
||||
double L3_config[20];
|
||||
int clockrate;
|
||||
int ports[20];
|
||||
int device_type;
|
||||
int cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate
|
||||
char threeD_stack[20];
|
||||
int buffer_sizes[20];
|
||||
//stats
|
||||
double total_accesses;
|
||||
double read_accesses;
|
||||
double write_accesses;
|
||||
double total_hits;
|
||||
double total_misses;
|
||||
double read_hits;
|
||||
double write_hits;
|
||||
double read_misses;
|
||||
double write_misses;
|
||||
double replacements;
|
||||
double write_backs;
|
||||
double miss_buffer_accesses;
|
||||
double fill_buffer_accesses;
|
||||
double prefetch_buffer_accesses;
|
||||
double prefetch_buffer_writes;
|
||||
double prefetch_buffer_reads;
|
||||
double prefetch_buffer_hits;
|
||||
double wbb_writes;
|
||||
double wbb_reads;
|
||||
double conflicts;
|
||||
double duty_cycle;
|
||||
|
||||
// NOTE(review): presumably the homenode_* / dir_duty_cycle fields apply when merged_dir is set -- confirm in XML_Parse.cc.
bool merged_dir;
|
||||
double homenode_read_accesses;
|
||||
double homenode_write_accesses;
|
||||
double homenode_read_hits;
|
||||
double homenode_write_hits;
|
||||
double homenode_read_misses;
|
||||
double homenode_write_misses;
|
||||
double dir_duty_cycle;
|
||||
} system_L3;
|
||||
// Crossbar sub-node of a NoC; instantiated as system_NoC::xbar0.
typedef struct{
|
||||
//params
|
||||
int number_of_inputs_of_crossbars;
|
||||
int number_of_outputs_of_crossbars;
|
||||
int flit_bits;
|
||||
int input_buffer_entries_per_port;
|
||||
int ports_of_input_buffer[20];
|
||||
//stats
|
||||
double crossbar_accesses;
|
||||
} xbar0_systemNoC;
|
||||
// One NoC record; root_system holds an array of these (NoC[64]).
// Embeds a single crossbar sub-node (xbar0) among its parameters.
typedef struct{
|
||||
//params
|
||||
int clockrate;
|
||||
bool type;
|
||||
bool has_global_link;
|
||||
char topology[20];
|
||||
int horizontal_nodes;
|
||||
int vertical_nodes;
|
||||
int link_throughput;
|
||||
int link_latency;
|
||||
int input_ports;
|
||||
int output_ports;
|
||||
int virtual_channel_per_port;
|
||||
int flit_bits;
|
||||
int input_buffer_entries_per_vc;
|
||||
int ports_of_input_buffer[20];
|
||||
int dual_pump;
|
||||
int number_of_crossbars;
|
||||
char crossbar_type[20];
|
||||
char crosspoint_type[20];
|
||||
xbar0_systemNoC xbar0;
|
||||
int arbiter_type;
|
||||
double chip_coverage;
|
||||
//stats
|
||||
double total_accesses;
|
||||
double duty_cycle;
|
||||
double route_over_perc;
|
||||
} system_NoC;
|
||||
// Memory record; root_system holds exactly one (mem).
typedef struct{
|
||||
//params
|
||||
int mem_tech_node;
|
||||
int device_clock;
|
||||
int peak_transfer_rate;
|
||||
int internal_prefetch_of_DRAM_chip;
|
||||
int capacity_per_channel;
|
||||
int number_ranks;
|
||||
int num_banks_of_DRAM_chip;
|
||||
int Block_width_of_DRAM_chip;
|
||||
int output_width_of_DRAM_chip;
|
||||
int page_size_of_DRAM_chip;
|
||||
int burstlength_of_DRAM_chip;
|
||||
//stats
|
||||
double memory_accesses;
|
||||
double memory_reads;
|
||||
double memory_writes;
|
||||
} system_mem;
|
||||
// Controller record used for two members of root_system: mc and flashc.
// Fields are grouped as common params, "FC" stats, "Mc" params, then memory stats.
// Note: peak_transfer_rate is a double here but an int in system_mem.
typedef struct{
|
||||
//params
|
||||
//Common Param for mc and fc
|
||||
double peak_transfer_rate;
|
||||
int number_mcs;
|
||||
bool withPHY;
|
||||
int type;
|
||||
|
||||
//FCParam
|
||||
//stats
|
||||
double duty_cycle;
|
||||
double total_load_perc;
|
||||
|
||||
//McParam
|
||||
int mc_clock;
|
||||
int llc_line_length;
|
||||
int memory_channels_per_mc;
|
||||
int number_ranks;
|
||||
int req_window_size_per_channel;
|
||||
int IO_buffer_size_per_channel;
|
||||
int databus_width;
|
||||
int addressbus_width;
|
||||
bool LVDS;
|
||||
|
||||
//stats
|
||||
double memory_accesses;
|
||||
double memory_reads;
|
||||
double memory_writes;
|
||||
} system_mc;
|
||||
|
||||
// NIU record; root_system holds exactly one (niu).
typedef struct{
|
||||
//params
|
||||
int clockrate;
|
||||
int number_units;
|
||||
int type;
|
||||
//stats
|
||||
double duty_cycle;
|
||||
double total_load_perc;
|
||||
} system_niu;
|
||||
|
||||
// PCIe record; root_system holds exactly one (pcie).
// Same fields as system_niu plus num_channels and withPHY.
typedef struct{
|
||||
//params
|
||||
int clockrate;
|
||||
int number_units;
|
||||
int num_channels;
|
||||
int type;
|
||||
bool withPHY;
|
||||
//stats
|
||||
double duty_cycle;
|
||||
double total_load_perc;
|
||||
} system_pcie;
|
||||
|
||||
// Top-level 'system' record: component counts, chip-wide parameters, and the
// per-component records. Per-component arrays have a fixed capacity of 64 entries;
// NOTE(review): presumably the number_of_* counts must not exceed 64 -- confirm where they are read.
typedef struct{
|
||||
//All number_of_* at the level of 'system' Ying 03/21/2009
|
||||
int number_of_cores;
|
||||
int number_of_L1Directories;
|
||||
int number_of_L2Directories;
|
||||
int number_of_L2s;
|
||||
bool Private_L2;
|
||||
int number_of_L3s;
|
||||
int number_of_NoCs;
|
||||
int number_of_dir_levels;
|
||||
int domain_size;
|
||||
int first_level_dir;
|
||||
// All params at the level of 'system'
|
||||
int homogeneous_cores;
|
||||
int homogeneous_L1Directories;
|
||||
int homogeneous_L2Directories;
|
||||
double core_tech_node;
|
||||
int target_core_clockrate;
|
||||
int target_chip_area;
|
||||
int temperature;
|
||||
int number_cache_levels;
|
||||
int L1_property;
|
||||
int L2_property;
|
||||
int homogeneous_L2s;
|
||||
int L3_property;
|
||||
int homogeneous_L3s;
|
||||
int homogeneous_NoCs;
|
||||
int homogeneous_ccs;
|
||||
int Max_area_deviation;
|
||||
int Max_power_deviation;
|
||||
int device_type;
|
||||
bool longer_channel_device;
|
||||
bool Embedded;
|
||||
bool opt_dynamic_power;
|
||||
// NOTE(review): "lakage" looks like a misspelling of "leakage"; the name is kept because it is part of the public layout.
bool opt_lakage_power;
|
||||
bool opt_clockrate;
|
||||
bool opt_area;
|
||||
int interconnect_projection_type;
|
||||
int machine_bits;
|
||||
int virtual_address_width;
|
||||
int physical_address_width;
|
||||
int virtual_memory_page_size;
|
||||
double total_cycles;
|
||||
//system.core(0-n):3rd level
|
||||
system_core core[64];
|
||||
system_L1Directory L1Directory[64];
|
||||
system_L2Directory L2Directory[64];
|
||||
system_L2 L2[64];
|
||||
system_L3 L3[64];
|
||||
system_NoC NoC[64];
|
||||
system_mem mem;
|
||||
// mc and flashc share the system_mc record type.
system_mc mc;
|
||||
system_mc flashc;
|
||||
system_niu niu;
|
||||
system_pcie pcie;
|
||||
} root_system;
|
||||
|
||||
// Holder for the configuration tree: exposes one public root_system record (sys)
// and two member functions whose definitions are not in this header.
// NOTE(review): presumably parse() fills sys from the XML file at filepath and
// initialize() resets sys to defaults -- confirm against XML_Parse.cc.
class ParseXML
|
||||
{
|
||||
public:
|
||||
void parse(char* filepath);
|
||||
void initialize();
|
||||
public:
|
||||
// The whole record is stored by value inside the object.
root_system sys;
|
||||
};
|
||||
|
||||
|
||||
#endif /* XML_PARSE_H_ */
|
||||
|
||||
|
||||
|
||||
|
455
ext/mcpat/Xeon.xml
Normal file
455
ext/mcpat/Xeon.xml
Normal file
|
@ -0,0 +1,455 @@
|
|||
<?xml version="1.0" ?>
|
||||
<component id="root" name="root">
|
||||
<component id="system" name="system">
|
||||
<!--McPAT will skip the components if number is set to 0 -->
|
||||
<param name="number_of_cores" value="2"/>
|
||||
<param name="number_of_L1Directories" value="0"/>
|
||||
<param name="number_of_L2Directories" value="0"/>
|
||||
<param name="number_of_L2s" value="1"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports -->
|
||||
<param name="Private_L2" value="1"/><!--1 Private, 0 shared/coherent -->
|
||||
<param name="number_of_L3s" value="1"/> <!-- This number means how many L3 clusters -->
|
||||
<param name="number_of_NoCs" value="1"/>
|
||||
<param name="homogeneous_cores" value="1"/><!--1 means homo -->
|
||||
<param name="homogeneous_L2s" value="1"/>
|
||||
<param name="homogeneous_L1Directorys" value="1"/>
|
||||
<param name="homogeneous_L2Directorys" value="1"/>
|
||||
<param name="homogeneous_L3s" value="1"/>
|
||||
<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
|
||||
<param name="homogeneous_NoCs" value="1"/>
|
||||
<param name="core_tech_node" value="65"/><!-- nm -->
|
||||
<param name="target_core_clockrate" value="3400"/><!--MHz -->
|
||||
<param name="temperature" value="380"/> <!-- Kelvin -->
|
||||
<param name="number_cache_levels" value="3"/>
|
||||
<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
|
||||
<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) -->
|
||||
<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when appropriate -->
|
||||
<param name="machine_bits" value="64"/>
|
||||
<param name="virtual_address_width" value="64"/>
|
||||
<param name="physical_address_width" value="52"/>
|
||||
<param name="virtual_memory_page_size" value="4096"/>
|
||||
<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller
|
||||
default value is machine_bits, if not set -->
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!--This page size (B) is completely different from the page size in the main memory section. This page size is the size of
|
||||
virtual memory from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank -->
|
||||
<!-- *********************** cores ******************* -->
|
||||
<component id="system.core0" name="core0">
|
||||
<!-- Core property -->
|
||||
<param name="clock_rate" value="3400"/>
|
||||
<!-- for cores with unknown timing, set to 0 to force off the opt flag -->
|
||||
<param name="opt_local" value="0"/>
|
||||
<param name="instruction_length" value="32"/>
|
||||
<param name="opcode_width" value="16"/>
|
||||
<param name="x86" value="1"/>
|
||||
<param name="micro_opcode_width" value="8"/>
|
||||
<param name="machine_type" value="0"/>
|
||||
<!-- inorder/OoO; 1 inorder; 0 OOO-->
|
||||
<param name="number_hardware_threads" value="2"/>
|
||||
<!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor,
|
||||
it only may be more than one in SMT processors. BTB ports always equals to fetch ports since
|
||||
branch information in consecutive branch instructions in the same fetch group can be read out from BTB once.-->
|
||||
<param name="fetch_width" value="4"/>
|
||||
<!-- fetch_width determines the size of cachelines of L1 cache block -->
|
||||
<param name="number_instruction_fetch_ports" value="1"/>
|
||||
<param name="decode_width" value="4"/>
|
||||
<!-- decode_width determines the number of ports of the
|
||||
renaming table (both RAM and CAM) scheme -->
|
||||
<param name="issue_width" value="4"/>
|
||||
<param name="peak_issue_width" value="6"/>
|
||||
<!-- issue_width determines the number of ports of Issue window and other logic
|
||||
as in the complexity effective processors paper; issue_width==dispatch_width -->
|
||||
<param name="commit_width" value="4"/>
|
||||
<!-- commit_width determines the number of ports of register files -->
|
||||
<param name="fp_issue_width" value="2"/>
|
||||
<param name="prediction_width" value="1"/>
|
||||
<!-- number of branch instructions that can be predicted simultaneously -->
|
||||
<!-- Current version of McPAT does not distinguish int and floating point pipelines
|
||||
These parameters are reserved for future use.-->
|
||||
<param name="pipelines_per_core" value="1,1"/>
|
||||
<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
|
||||
<param name="pipeline_depth" value="31,31"/>
|
||||
<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
|
||||
<!-- issue and exe unit-->
|
||||
<param name="ALU_per_core" value="6"/>
|
||||
<!-- contains an adder, a shifter, and a logical unit -->
|
||||
<param name="MUL_per_core" value="1"/>
|
||||
<!-- For MUL and Div -->
|
||||
<param name="FPU_per_core" value="2"/>
|
||||
<!-- buffer between IF and ID stage -->
|
||||
<param name="instruction_buffer_size" value="32"/>
|
||||
<!-- buffer between ID and sche/exe stage -->
|
||||
<param name="decoded_stream_buffer_size" value="16"/>
|
||||
<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
|
||||
<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
|
||||
<param name="instruction_window_size" value="64"/>
|
||||
<param name="fp_instruction_window_size" value="64"/>
|
||||
<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
|
||||
<param name="ROB_size" value="128"/>
|
||||
<!-- each in-flight instruction has an entry in ROB -->
|
||||
<!-- registers -->
|
||||
<param name="archi_Regs_IRF_size" value="16"/><!-- X86-64 has 16GPR -->
|
||||
<param name="archi_Regs_FRF_size" value="32"/><!-- MMX + XMM -->
|
||||
<!-- if OoO processor, phy_reg number is needed for renaming logic,
|
||||
renaming logic is for both integer and floating point insts. -->
|
||||
<param name="phy_Regs_IRF_size" value="256"/>
|
||||
<param name="phy_Regs_FRF_size" value="256"/>
|
||||
<!-- rename logic -->
|
||||
<param name="rename_scheme" value="0"/>
|
||||
<!-- can be RAM based(0) or CAM based(1) rename scheme
|
||||
RAM-based scheme will have free list, status table;
|
||||
CAM-based scheme have the valid bit in the data field of the CAM
|
||||
both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
|
||||
Detailed RAT Implementation see TR -->
|
||||
<param name="register_windows_size" value="0"/>
|
||||
<!-- how many windows in the windowed register file, sun processors;
|
||||
no register windowing is used when this number is 0 -->
|
||||
<!-- In OoO cores, loads and stores can be issued whether inorder(Pentium Pro) or (OoO)out-of-order(Alpha),
|
||||
They will always try to execute out-of-order though. -->
|
||||
<param name="LSU_order" value="inorder"/>
|
||||
<param name="store_buffer_size" value="96"/>
|
||||
<!-- By default, in-order cores do not have load buffers -->
|
||||
<param name="load_buffer_size" value="48"/>
|
||||
<!-- number of ports refer to sustainable concurrent memory accesses -->
|
||||
<param name="memory_ports" value="2"/>
|
||||
<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
|
||||
as well as the ports of Dcache which is connected to LSU -->
|
||||
<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
|
||||
<param name="RAS_size" value="64"/>
|
||||
<!-- general stats, defines simulation periods; require total, idle, and busy cycles for sanity check -->
|
||||
<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
|
||||
<stat name="total_instructions" value="400000"/>
|
||||
<stat name="int_instructions" value="200000"/>
|
||||
<stat name="fp_instructions" value="100000"/>
|
||||
<stat name="branch_instructions" value="100000"/>
|
||||
<stat name="branch_mispredictions" value="0"/>
|
||||
<stat name="load_instructions" value="0"/>
|
||||
<stat name="store_instructions" value="50000"/>
|
||||
<stat name="committed_instructions" value="400000"/>
|
||||
<stat name="committed_int_instructions" value="200000"/>
|
||||
<stat name="committed_fp_instructions" value="100000"/>
|
||||
<stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
|
||||
<!-- the following cycle stats are used for heterogeneous cores only,
|
||||
please ignore them if homogeneous cores -->
|
||||
<stat name="total_cycles" value="100000"/>
|
||||
<stat name="idle_cycles" value="0"/>
|
||||
<stat name="busy_cycles" value="100000"/>
|
||||
<!-- instruction buffer stats -->
|
||||
<!-- ROB stats, both RS and Phy based OoOs have ROB
|
||||
performance simulator should capture the difference on accesses,
|
||||
otherwise, McPAT has to guess based on number of committed instructions. -->
|
||||
<stat name="ROB_reads" value="400000"/>
|
||||
<stat name="ROB_writes" value="400000"/>
|
||||
<!-- RAT accesses -->
|
||||
<stat name="rename_reads" value="800000"/> <!--lookup in renaming logic -->
|
||||
<stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic -->
|
||||
<stat name="fp_rename_reads" value="200000"/>
|
||||
<stat name="fp_rename_writes" value="100000"/>
|
||||
<!-- decode and rename stage use this, should be total ic - nop -->
|
||||
<!-- Inst window stats -->
|
||||
<stat name="inst_window_reads" value="400000"/>
|
||||
<stat name="inst_window_writes" value="400000"/>
|
||||
<stat name="inst_window_wakeup_accesses" value="800000"/>
|
||||
<stat name="fp_inst_window_reads" value="200000"/>
|
||||
<stat name="fp_inst_window_writes" value="200000"/>
|
||||
<stat name="fp_inst_window_wakeup_accesses" value="400000"/>
|
||||
<!-- RF accesses -->
|
||||
<stat name="int_regfile_reads" value="600000"/>
|
||||
<stat name="float_regfile_reads" value="100000"/>
|
||||
<stat name="int_regfile_writes" value="300000"/>
|
||||
<stat name="float_regfile_writes" value="50000"/>
|
||||
<!-- accesses to the working reg -->
|
||||
<stat name="function_calls" value="5"/>
|
||||
<stat name="context_switches" value="260343"/>
|
||||
<!-- Number of window switches (number of function calls and returns)-->
|
||||
<!-- Alu stats by default, the processor has one FPU that includes the divider and
|
||||
multiplier. The fpu accesses should include accesses to multiplier and divider -->
|
||||
<stat name="ialu_accesses" value="300000"/>
|
||||
<stat name="fpu_accesses" value="100000"/>
|
||||
<stat name="mul_accesses" value="200000"/>
|
||||
<stat name="cdb_alu_accesses" value="300000"/>
|
||||
<stat name="cdb_mul_accesses" value="200000"/>
|
||||
<stat name="cdb_fpu_accesses" value="100000"/>
|
||||
<!-- multiple cycle accesses should be counted multiple times,
|
||||
otherwise, McPAT can use internal counter for different floating point instructions
|
||||
to get final accesses. But that needs detailed info for floating point inst mix -->
|
||||
<!-- currently the performance simulator should
|
||||
make sure all the numbers are final numbers,
|
||||
including the explicit read/write accesses,
|
||||
and the implicit accesses such as replacements, etc.
|
||||
Future versions of McPAT may be able to reason about the implicit accesses
|
||||
based on param and stats of last level cache
|
||||
The same rule applies to all cache access stats too! -->
|
||||
<!-- following is AF for max power computation.
|
||||
Do not change them, unless you understand them-->
|
||||
<stat name="IFU_duty_cycle" value="1"/>
|
||||
<stat name="LSU_duty_cycle" value="0.5"/>
|
||||
<stat name="MemManU_I_duty_cycle" value="1"/>
|
||||
<stat name="MemManU_D_duty_cycle" value="0.5"/>
|
||||
<stat name="ALU_duty_cycle" value="1"/>
|
||||
<stat name="MUL_duty_cycle" value="0.3"/>
|
||||
<stat name="FPU_duty_cycle" value="0.3"/>
|
||||
<stat name="ALU_cdb_duty_cycle" value="1"/>
|
||||
<stat name="MUL_cdb_duty_cycle" value="0.3"/>
|
||||
<stat name="FPU_cdb_duty_cycle" value="0.3"/>
|
||||
<param name="number_of_BPT" value="2"/>
|
||||
<component id="system.core0.predictor" name="PBT">
|
||||
<!-- branch predictor; tournament predictor see Alpha implementation -->
|
||||
<param name="local_predictor_size" value="10,3"/>
|
||||
<param name="local_predictor_entries" value="1024"/>
|
||||
<param name="global_predictor_entries" value="4096"/>
|
||||
<param name="global_predictor_bits" value="2"/>
|
||||
<param name="chooser_predictor_entries" value="4096"/>
|
||||
<param name="chooser_predictor_bits" value="2"/>
|
||||
<!-- These parameters can be combined like below in next version
|
||||
<param name="load_predictor" value="10,3,1024"/>
|
||||
<param name="global_predictor" value="4096,2"/>
|
||||
<param name="predictor_chooser" value="4096,2"/>
|
||||
-->
|
||||
</component>
|
||||
<component id="system.core0.itlb" name="itlb">
|
||||
<param name="number_entries" value="128"/>
|
||||
<stat name="total_accesses" value="200000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<!-- there are no write requests to the itlb although writes happen to the itlb after a miss,
|
||||
which is actually a replacement -->
|
||||
</component>
|
||||
<component id="system.core0.icache" name="icache">
|
||||
<!-- there are no write requests to the icache although writes happen to it after a miss,
|
||||
which is actually a replacement -->
|
||||
<param name="icache_config" value="131072,32,8,1,8,3,32,0"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy, -->
|
||||
<!-- cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate -->
|
||||
<param name="buffer_sizes" value="16, 16, 16,0"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="read_misses" value="0"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dtlb" name="dtlb">
|
||||
<param name="number_entries" value="128"/><!--dual threads-->
|
||||
<stat name="total_accesses" value="400000"/>
|
||||
<stat name="total_misses" value="4"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<component id="system.core0.dcache" name="dcache">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="dcache_config" value="16384,16,4,1, 3,3, 16,1 "/>
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="800000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
</component>
|
||||
<param name="number_of_BTB" value="2"/>
|
||||
<component id="system.core0.BTB" name="BTB">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="BTB_config" value="5120,4,2,1, 1,3"/> <!--should be 4096 + 1024 -->
|
||||
<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<stat name="read_accesses" value="400000"/> <!--See IFU code for guideline -->
|
||||
<stat name="write_accesses" value="0"/>
|
||||
</component>
|
||||
</component>
|
||||
<component id="system.L1Directory0" name="L1Directory0">
|
||||
<param name="Directory_type" value="0"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache -->
|
||||
<param name="Dir_config" value="4096,2,0,1,100,100, 8"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="8, 8, 8, 8"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="3400"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="800000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="20"/>
|
||||
</component>
|
||||
<component id="system.L2Directory0" name="L2Directory0">
|
||||
<param name="Directory_type" value="1"/>
|
||||
<!--0 cam based shadowed tag. 1 directory cache -->
|
||||
<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="buffer_sizes" value="8, 8, 8, 8"/>
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="clockrate" value="3400"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw search ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<!-- although there are multiple access types,
|
||||
Performance simulator needs to cast them into reads or writes
|
||||
e.g. the invalidates can be considered as writes -->
|
||||
<stat name="read_accesses" value="58824"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="100"/>
|
||||
</component>
|
||||
<component id="system.L20" name="L20">
|
||||
<!-- all the buffer related are optional -->
|
||||
<param name="L2_config" value="1048576,32, 8, 8, 8, 23, 32, 1"/>
|
||||
<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<param name="clockrate" value="3400"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<stat name="read_accesses" value="200000"/>
|
||||
<stat name="write_accesses" value="27276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="1.0"/>
|
||||
</component>
|
||||
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.L30" name="L30">
|
||||
<param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/>
|
||||
<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
|
||||
<param name="clockrate" value="850"/>
|
||||
<param name="ports" value="1,1,1"/>
|
||||
<!-- number of r, w, and rw ports -->
|
||||
<param name="device_type" value="0"/>
|
||||
<param name="buffer_sizes" value="16, 16, 16, 16"/>
|
||||
<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
|
||||
<stat name="read_accesses" value="11824"/>
|
||||
<stat name="write_accesses" value="11276"/>
|
||||
<stat name="read_misses" value="1632"/>
|
||||
<stat name="write_misses" value="183"/>
|
||||
<stat name="conflicts" value="0"/>
|
||||
<stat name="duty_cycle" value="1.0"/>
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.NoC0" name="noc0">
|
||||
<param name="clockrate" value="3400"/>
|
||||
<param name="type" value="0"/>
|
||||
<!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus
|
||||
at each time only one node can send req -->
|
||||
<param name="horizontal_nodes" value="1"/>
|
||||
<param name="vertical_nodes" value="1"/>
|
||||
<param name="has_global_link" value="0"/>
|
||||
<!-- 1 has global link, 0 does not have global link -->
|
||||
<param name="link_throughput" value="1"/><!--w.r.t clock -->
|
||||
<param name="link_latency" value="1"/><!--w.r.t clock -->
|
||||
<!-- throughput >= latency -->
|
||||
<!-- Router architecture -->
|
||||
<param name="input_ports" value="1"/>
|
||||
<param name="output_ports" value="1"/>
|
||||
<!-- For bus the I/O ports should be 1 -->
|
||||
<param name="flit_bits" value="256"/>
|
||||
<param name="chip_coverage" value="1"/>
|
||||
<!-- When multiple NOC present, one NOC will cover part of the whole chip.
|
||||
chip_coverage <=1 -->
|
||||
<param name="link_routing_over_percentage" value="0.5"/>
|
||||
<!-- Links can route over other components or occupy whole area.
|
||||
by default, 50% of the NoC global links routes over other
|
||||
components -->
|
||||
<stat name="total_accesses" value="100000"/>
|
||||
<!-- This is the number of total accesses within the whole network not for each router -->
|
||||
<stat name="duty_cycle" value="1"/>
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.mem" name="mem">
|
||||
<!-- Main memory property -->
|
||||
<param name="mem_tech_node" value="32"/>
|
||||
<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
|
||||
<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
|
||||
<param name="internal_prefetch_of_DRAM_chip" value="4"/>
|
||||
<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
|
||||
<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
|
||||
<!-- above numbers can be easily found from Wikipedia -->
|
||||
<param name="capacity_per_channel" value="4096"/> <!-- MB -->
|
||||
<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
|
||||
Current McPAT assumes single DIMMs are used.-->
|
||||
<param name="number_ranks" value="2"/>
|
||||
<param name="num_banks_of_DRAM_chip" value="8"/>
|
||||
<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
|
||||
<param name="output_width_of_DRAM_chip" value="8"/>
|
||||
<!-- number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip -->
|
||||
<!-- number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip -->
|
||||
<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
|
||||
<param name="burstlength_of_DRAM_chip" value="8"/>
|
||||
<stat name="memory_accesses" value="1052"/>
|
||||
<stat name="memory_reads" value="1052"/>
|
||||
<stat name="memory_writes" value="1052"/>
|
||||
</component>
|
||||
<component id="system.mc" name="mc">
|
||||
<!-- Memory controllers are for DDR(2,3...) DIMMs -->
|
||||
<!-- current version of McPAT uses published values for base parameters of memory controller
|
||||
improvements on MC will be added in later versions. -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1-->
|
||||
<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
|
||||
<param name="block_size" value="64"/><!--B-->
|
||||
<param name="number_mcs" value="0"/>
|
||||
<!-- current McPAT only supports homogeneous memory controllers -->
|
||||
<param name="memory_channels_per_mc" value="1"/>
|
||||
<param name="number_ranks" value="2"/>
|
||||
<param name="withPHY" value="0"/>
|
||||
<!-- # of ranks of each channel-->
|
||||
<param name="req_window_size_per_channel" value="32"/>
|
||||
<param name="IO_buffer_size_per_channel" value="32"/>
|
||||
<param name="databus_width" value="128"/>
|
||||
<param name="addressbus_width" value="51"/>
|
||||
<!-- McPAT will add the control bus width to the addressbus width automatically -->
|
||||
<stat name="memory_accesses" value="33333"/>
|
||||
<stat name="memory_reads" value="16667"/>
|
||||
<stat name="memory_writes" value="16667"/>
|
||||
<!-- McPAT does not track individual mc, instead, it takes the total accesses and calculate
|
||||
the average power per MC or per channel. This is sufficient for most applications.
|
||||
Further trackdown can be easily added in later versions. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.niu" name="niu">
|
||||
<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller -->
|
||||
<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns.
|
||||
the low bound of clock rate of a 10Gb MAC is 150Mhz -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="clockrate" value="350"/>
|
||||
<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only have one port -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual nic, instead, it takes the total accesses and calculate
|
||||
the average power per nic or per channel. This is sufficient for most applications. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.pcie" name="pcie">
|
||||
<!-- On chip PCIe controller, including Phy-->
|
||||
<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns.
|
||||
the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
|
||||
<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="withPHY" value="1"/>
|
||||
<param name="clockrate" value="350"/>
|
||||
<param name="number_units" value="0"/>
|
||||
<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual pcie controllers, instead, it takes the total accesses and calculate
|
||||
the average power per pcie controller or per channel. This is sufficient for most applications. -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
<component id="system.flashc" name="flashc">
|
||||
<param name="number_flashcs" value="0"/>
|
||||
<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
|
||||
<param name="withPHY" value="1"/>
|
||||
<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
|
||||
<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
|
||||
<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
|
||||
<!-- McPAT does not track individual flash controller, instead, it takes the total accesses and calculate
|
||||
the average power per fc or per channel. This is sufficient for most applications -->
|
||||
</component>
|
||||
<!--**********************************************************************-->
|
||||
|
||||
</component>
|
||||
</component>
|
||||
|
276
ext/mcpat/arch_const.h
Normal file
276
ext/mcpat/arch_const.h
Normal file
|
@ -0,0 +1,276 @@
|
|||
/*****************************************************************************
|
||||
* McPAT
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef ARCH_CONST_H_
|
||||
#define ARCH_CONST_H_
|
||||
|
||||
typedef struct{ // Geometry of one array structure; presumably pairs with the <name>size / <name>assoc / <name>blocksize constant triples defined in this header - TODO confirm
|
||||
unsigned int capacity;
|
||||
unsigned int assoc;// original note: "fully". NOTE(review): the constants in this header use assoc 0 where they are marked "fully" (associative); presumably the same convention applies here
|
||||
unsigned int blocksize;
|
||||
} array_inputs;
|
||||
|
||||
//Do not change these values unless you want to bypass the XML interface and do not care about the default values.
|
||||
//Global parameters (compiled-in defaults; the XML configs carry their own values for the same names)
|
||||
const int number_of_cores = 8;
|
||||
const int number_of_L2s = 1;
|
||||
const int number_of_L3s = 1;
|
||||
const int number_of_NoCs = 1;
|
||||
|
||||
const double archi_F_sz_nm = 90.0; // presumably the feature size in nm, going by the _nm suffix - TODO confirm
|
||||
const unsigned int dev_type = 0;
|
||||
const double CLOCKRATE = 1.2*1e9; // 1.2e9; presumably Hz - TODO confirm
|
||||
const double AF = 0.5; // NOTE(review): presumably an activity factor - confirm
|
||||
//const bool inorder = true;   (disabled definition, kept for reference)
|
||||
const bool embedded = false; //NEW
|
||||
|
||||
const bool homogeneous_cores = true;
|
||||
const int temperature = 360; // default temperature (presumably Kelvin, as in the XML configs); declared int because a bool would collapse 360 to true (1)
|
||||
const int number_cache_levels = 3;
|
||||
const int L1_property = 0; //private 0; coherent 1, shared 2.
|
||||
const int L2_property = 2; // 2 = shared, using the encoding noted on L1_property
|
||||
const bool homogeneous_L2s = true;
|
||||
const int L3_property = 2; // private 0; coherent 1; shared 2 (same encoding as L1_property/L2_property); declared int because a bool would collapse 2 to true (1)
|
||||
const bool homogeneous_L3s = true;
|
||||
const double Max_area_deviation = 50; // NOTE(review): units not stated here; presumably percent - confirm
|
||||
const double Max_dynamic_deviation =50; //New. NOTE(review): units not stated here; presumably percent - confirm
|
||||
const int opt_dynamic_power = 1; // optimisation-target flags; presumably 1 = enabled, 0 = disabled - TODO confirm
|
||||
const int opt_lakage_power = 0; // sic ("lakage"): presumably leakage power
|
||||
const int opt_area = 0;
|
||||
const int interconnect_projection_type = 0; // per the XML configs: 0 = aggressive wire technology, 1 = conservative
|
||||
|
||||
//******************************Core Parameters
|
||||
#if (inorder) // NOTE(review): no macro named `inorder` is defined in the visible part of this header (only a commented-out const bool, which the preprocessor would not see anyway); unless one is defined before inclusion this evaluates to 0 and the #else branch is compiled
|
||||
const int opcode_length = 8;//Niagara
|
||||
const int reg_length = 5;//Niagara
|
||||
const int instruction_length = 32;//Niagara
|
||||
const int data_width = 64;
|
||||
#else // the two branches differ only in reg_length (5 vs 7)
|
||||
const int opcode_length = 8;//16;//Niagara
|
||||
const int reg_length = 7;//Niagara
|
||||
const int instruction_length = 32;//Niagara
|
||||
const int data_width = 64;
|
||||
#endif
|
||||
|
||||
|
||||
//Caches: default geometry given as size / associativity / block size triples (units not stated here; presumably bytes - TODO confirm)
|
||||
//itlb
|
||||
const int itlbsize=512;
|
||||
const int itlbassoc=0;// fully associative, encoded as 0
|
||||
const int itlbblocksize=8;
|
||||
//icache
|
||||
const int icachesize=32768; // 32768 = 32 * 1024
|
||||
const int icacheassoc=4;
|
||||
const int icacheblocksize=32;
|
||||
//dtlb
|
||||
const int dtlbsize=512;
|
||||
const int dtlbassoc=0;// fully associative, encoded as 0
|
||||
const int dtlbblocksize=8;
|
||||
//dcache
|
||||
const int dcachesize=32768; // 32768 = 32 * 1024
|
||||
const int dcacheassoc=4;
|
||||
const int dcacheblocksize=32;
|
||||
const int dcache_write_buffers=8;
|
||||
|
||||
//cache controllers
|
||||
//IB,
|
||||
const int numIBEntries = 64;
|
||||
const int IBsize = 64;//2*4*instruction_length/8*2;
|
||||
const int IBassoc = 0;//In Niagara it is still fully associ
|
||||
const int IBblocksize = 4;
|
||||
|
||||
//IFB and MIL should have the same parameters CAM
|
||||
const int IFBsize=128;//
|
||||
const int IFBassoc=0;//In Niagara it is still fully associ
|
||||
const int IFBblocksize=4;
|
||||
|
||||
|
||||
|
||||
|
||||
const int icache_write_buffers=8;
|
||||
|
||||
//register file RAM
|
||||
const int regfilesize=5760;
|
||||
const int regfileassoc=1;
|
||||
const int regfileblocksize=18;
|
||||
//regwin RAM
|
||||
const int regwinsize=256;
|
||||
const int regwinassoc=1;
|
||||
const int regwinblocksize=8;
|
||||
|
||||
|
||||
|
||||
//store buffer, lsq
|
||||
const int lsqsize=512;
|
||||
const int lsqassoc=0;
|
||||
const int lsqblocksize=8;
|
||||
|
||||
//data fill queue RAM
|
||||
const int dfqsize=1024;
|
||||
const int dfqassoc=1;
|
||||
const int dfqblocksize=16;
|
||||
|
||||
//outside the cores
|
||||
//L2 cache bank
|
||||
const int l2cachesize=262144;
|
||||
const int l2cacheassoc=16;
|
||||
const int l2cacheblocksize=64;
|
||||
|
||||
//L2 directory
|
||||
const int l2dirsize=1024;
|
||||
const int l2dirassoc=0;
|
||||
const int l2dirblocksize=2;
|
||||
|
||||
//crossbar
|
||||
//PCX
|
||||
const int PCX_NUMBER_INPUT_PORTS_CROSSBAR = 8;
|
||||
const int PCX_NUMBER_OUTPUT_PORTS_CROSSBAR = 9;
|
||||
const int PCX_NUMBER_SIGNALS_PER_PORT_CROSSBAR =144;
|
||||
//PCX buffer RAM
|
||||
const int pcx_buffersize=1024;
|
||||
const int pcx_bufferassoc=1;
|
||||
const int pcx_bufferblocksize=32;
|
||||
const int pcx_numbuffer=5;
|
||||
//pcx arbiter
|
||||
const int pcx_arbsize=128;
|
||||
const int pcx_arbassoc=1;
|
||||
const int pcx_arbblocksize=2;
|
||||
const int pcx_numarb=5;
|
||||
|
||||
//CPX
|
||||
const int CPX_NUMBER_INPUT_PORTS_CROSSBAR = 5;
|
||||
const int CPX_NUMBER_OUTPUT_PORTS_CROSSBAR = 8;
|
||||
const int CPX_NUMBER_SIGNALS_PER_PORT_CROSSBAR =150;
|
||||
//CPX buffer RAM
|
||||
const int cpx_buffersize=1024;
|
||||
const int cpx_bufferassoc=1;
|
||||
const int cpx_bufferblocksize=32;
|
||||
const int cpx_numbuffer=8;
|
||||
//cpx arbiter
|
||||
const int cpx_arbsize=128;
|
||||
const int cpx_arbassoc=1;
|
||||
const int cpx_arbblocksize=2;
|
||||
const int cpx_numarb=8;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
const int numPhysFloatRegs=256;
|
||||
const int numPhysIntRegs=32;
|
||||
const int numROBEntries=192;
|
||||
const int umRobs=1;
|
||||
|
||||
const int BTBEntries=4096;
|
||||
const int BTBTagSize=16;
|
||||
const int LFSTSize=1024;
|
||||
const int LQEntries=32;
|
||||
const int RASSize=16;
|
||||
const int SQEntries=32;
|
||||
const int SSITSize=1024;
|
||||
const int activity=0;
|
||||
const int backComSize=5;
|
||||
const int cachePorts=200;
|
||||
const int choiceCtrBits=2;
|
||||
const int choicePredictorSize=8192;
|
||||
|
||||
|
||||
const int commitWidth=8;
|
||||
const int decodeWidth=8;
|
||||
const int dispatchWidth=8;
|
||||
const int fetchWidth=8;
|
||||
const int issueWidth=1;
|
||||
const int renameWidth=8;
|
||||
//what is this forwardComSize=5??
|
||||
|
||||
const int globalCtrBits=2;
|
||||
const int globalHistoryBits=13;
|
||||
const int globalPredictorSize=8192;
|
||||
|
||||
|
||||
|
||||
const int localCtrBits=2;
|
||||
const int localHistoryBits=11;
|
||||
const int localHistoryTableSize=2048;
|
||||
const int localPredictorSize=2048;
|
||||
|
||||
const double Woutdrvnandn =30 *0.09;//(24.0 * LSCALE)
|
||||
const double Woutdrvnandp =12.5 *0.09;//(10.0 * LSCALE)
|
||||
const double Woutdrvnorn =7.5*0.09;//(6.0 * LSCALE)
|
||||
const double Woutdrvnorp =50 * 0.09;// (40.0 * LSCALE)
|
||||
const double Woutdrivern =60*0.09;//(48.0 * LSCALE)
|
||||
const double Woutdriverp =100 * 0.09;//(80.0 * LSCALE)
|
||||
|
||||
/*
|
||||
smtCommitPolicy=RoundRobin
|
||||
smtFetchPolicy=SingleThread
|
||||
smtIQPolicy=Partitioned
|
||||
smtIQThreshold=100
|
||||
smtLSQPolicy=Partitioned
|
||||
smtLSQThreshold=100
|
||||
smtNumFetchingThreads=1
|
||||
smtROBPolicy=Partitioned
|
||||
smtROBThreshold=100
|
||||
squashWidth=8
|
||||
*/
|
||||
|
||||
/*
|
||||
prefetch_access=false
|
||||
prefetch_cache_check_push=true
|
||||
prefetch_data_accesses_only=false
|
||||
prefetch_degree=1
|
||||
prefetch_latency=10000
|
||||
prefetch_miss=false
|
||||
prefetch_past_page=false
|
||||
prefetch_policy=none
|
||||
prefetch_serial_squash=false
|
||||
prefetch_use_cpu_id=true
|
||||
prefetcher_size=100
|
||||
prioritizeRequests=false
|
||||
repl=Null
|
||||
|
||||
|
||||
split=false
|
||||
split_size=0
|
||||
subblock_size=0
|
||||
tgts_per_mshr=20
|
||||
trace_addr=0
|
||||
two_queue=false
|
||||
|
||||
cpu_side=system.cpu0.dcache_port
|
||||
mem_side=system.tol2bus.port[2]
|
||||
*/
|
||||
|
||||
//[system.cpu0.dtb]
|
||||
//type=AlphaDT
|
||||
|
||||
|
||||
#endif /* ARCH_CONST_H_ */
|
302
ext/mcpat/array.cc
Normal file
302
ext/mcpat/array.cc
Normal file
|
@ -0,0 +1,302 @@
|
|||
/*****************************************************************************
|
||||
* McPAT
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
#define GLOBALVAR
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
|
||||
#include "area.h"
|
||||
#include "array.h"
|
||||
#include "decoder.h"
|
||||
#include "globalvar.h"
|
||||
#include "parameter.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/*
 * Construct an array model from a copy of *configure_interface and
 * immediately run optimize_array() on it.
 *
 * The copy is kept in l_ip, so the caller's InputParameter is not modified.
 * optimize_array() is virtual, but a call made from this constructor always
 * runs the ArrayST version.
 */
ArrayST::ArrayST(const InputParameter *configure_interface,
                 string _name,
                 enum Device_ty device_ty_,
                 bool opt_local_,
                 enum Core_type core_ty_,
                 bool _is_default)
    : l_ip(*configure_interface),
      name(_name),
      device_ty(device_ty_),
      opt_local(opt_local_),
      core_ty(core_ty_),
      is_default(_is_default)
{
    // Raise the requested size to a floor of 64 before validation.
    if (l_ip.cache_sz < 64) {
        l_ip.cache_sz = 64;
    }

    // Besides checking for errors this also fills in some missing parameters.
    l_ip.error_checking();

    this->optimize_array();
}
|
||||
|
||||
|
||||
void ArrayST::compute_base_power()
|
||||
{
|
||||
//l_ip.out_w =l_ip.line_sz*8;
|
||||
local_result=cacti_interface(&l_ip);
|
||||
|
||||
}
|
||||
|
||||
void ArrayST::optimize_array()
|
||||
{
|
||||
list<uca_org_t > candidate_solutions(0);
|
||||
list<uca_org_t >::iterator candidate_iter, min_dynamic_energy_iter;
|
||||
|
||||
uca_org_t * temp_res = 0;
|
||||
local_result.valid=false;
|
||||
|
||||
double throughput=l_ip.throughput, latency=l_ip.latency;
|
||||
double area_efficiency_threshold = 20.0;
|
||||
bool throughput_overflow=true, latency_overflow=true;
|
||||
compute_base_power();
|
||||
|
||||
if ((local_result.cycle_time - throughput) <= 1e-10 )
|
||||
throughput_overflow=false;
|
||||
if ((local_result.access_time - latency)<= 1e-10)
|
||||
latency_overflow=false;
|
||||
|
||||
if (opt_for_clk && opt_local)
|
||||
{
|
||||
if (throughput_overflow || latency_overflow)
|
||||
{
|
||||
l_ip.ed=0;
|
||||
|
||||
l_ip.delay_wt = 100;//Fixed number, make sure timing can be satisfied.
|
||||
l_ip.cycle_time_wt = 1000;
|
||||
|
||||
l_ip.area_wt = 10;//Fixed number, This is used to exhaustive search for individual components.
|
||||
l_ip.dynamic_power_wt = 10;//Fixed number, This is used to exhaustive search for individual components.
|
||||
l_ip.leakage_power_wt = 10;
|
||||
|
||||
l_ip.delay_dev = 1000000;//Fixed number, make sure timing can be satisfied.
|
||||
l_ip.cycle_time_dev = 100;
|
||||
|
||||
l_ip.area_dev = 1000000;//Fixed number, This is used to exhaustive search for individual components.
|
||||
l_ip.dynamic_power_dev = 1000000;//Fixed number, This is used to exhaustive search for individual components.
|
||||
l_ip.leakage_power_dev = 1000000;
|
||||
|
||||
throughput_overflow=true; //Reset overflow flag before start optimization iterations
|
||||
latency_overflow=true;
|
||||
|
||||
temp_res = &local_result; //Clean up the result for optimized for ED^2P
|
||||
temp_res->cleanup();
|
||||
}
|
||||
|
||||
|
||||
while ((throughput_overflow || latency_overflow)&&l_ip.cycle_time_dev > 10)// && l_ip.delay_dev > 10
|
||||
{
|
||||
compute_base_power();
|
||||
|
||||
l_ip.cycle_time_dev-=10;//This is the time_dev to be used for next iteration
|
||||
|
||||
// from best area to worst area -->worst timing to best timing
|
||||
if ((((local_result.cycle_time - throughput) <= 1e-10 ) && (local_result.access_time - latency)<= 1e-10)||
|
||||
(local_result.data_array2->area_efficiency < area_efficiency_threshold && l_ip.assoc == 0))
|
||||
{ //if no satisfiable solution is found,the most aggressive one is left
|
||||
candidate_solutions.push_back(local_result);
|
||||
//output_data_csv(candidate_solutions.back());
|
||||
if (((local_result.cycle_time - throughput) <= 1e-10) && ((local_result.access_time - latency)<= 1e-10))
|
||||
//ensure stop opt not because of cam
|
||||
{
|
||||
throughput_overflow=false;
|
||||
latency_overflow=false;
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
//TODO: whether checking the partial satisfied results too, or just change the mark???
|
||||
if ((local_result.cycle_time - throughput) <= 1e-10)
|
||||
throughput_overflow=false;
|
||||
if ((local_result.access_time - latency)<= 1e-10)
|
||||
latency_overflow=false;
|
||||
|
||||
if (l_ip.cycle_time_dev > 10)
|
||||
{ //if not >10 local_result is the last result, it cannot be cleaned up
|
||||
temp_res = &local_result; //Only solutions not saved in the list need to be cleaned up
|
||||
temp_res->cleanup();
|
||||
}
|
||||
}
|
||||
// l_ip.cycle_time_dev-=10;
|
||||
// l_ip.delay_dev-=10;
|
||||
|
||||
}
|
||||
|
||||
|
||||
if (l_ip.assoc > 0)
|
||||
{
|
||||
//For array structures except CAM and FA, Give warning but still provide a result with best timing found
|
||||
if (throughput_overflow==true)
|
||||
cout<< "Warning: " << name<<" array structure cannot satisfy throughput constraint." << endl;
|
||||
if (latency_overflow==true)
|
||||
cout<< "Warning: " << name<<" array structure cannot satisfy latency constraint." << endl;
|
||||
}
|
||||
|
||||
// else
|
||||
// {
|
||||
// /*According to "Content-Addressable Memory (CAM) Circuits and
|
||||
// Architectures": A Tutorial and Survey
|
||||
// by Kostas Pagiamtzis et al.
|
||||
// CAM structures can be heavily pipelined and use look-ahead techniques,
|
||||
// therefore timing can be relaxed. But McPAT does not model the advanced
|
||||
// techniques. If continue optimizing, the area efficiency will be too low
|
||||
// */
|
||||
// //For CAM and FA, stop opt if area efficiency is too low
|
||||
// if (throughput_overflow==true)
|
||||
// cout<< "Warning: " <<" McPAT stopped optimization on throughput for "<< name
|
||||
// <<" array structure because its area efficiency is below "<<area_efficiency_threshold<<"% " << endl;
|
||||
// if (latency_overflow==true)
|
||||
// cout<< "Warning: " <<" McPAT stopped optimization on latency for "<< name
|
||||
// <<" array structure because its area efficiency is below "<<area_efficiency_threshold<<"% " << endl;
|
||||
// }
|
||||
|
||||
//double min_dynamic_energy, min_dynamic_power, min_leakage_power, min_cycle_time;
|
||||
double min_dynamic_energy=BIGNUM;
|
||||
if (candidate_solutions.empty()==false)
|
||||
{
|
||||
local_result.valid=true;
|
||||
for (candidate_iter = candidate_solutions.begin(); candidate_iter != candidate_solutions.end(); ++candidate_iter)
|
||||
|
||||
{
|
||||
if (min_dynamic_energy > (candidate_iter)->power.readOp.dynamic)
|
||||
{
|
||||
min_dynamic_energy = (candidate_iter)->power.readOp.dynamic;
|
||||
min_dynamic_energy_iter = candidate_iter;
|
||||
local_result = *(min_dynamic_energy_iter);
|
||||
//TODO: since results are reordered results and l_ip may miss match. Therefore, the final output spread sheets may show the miss match.
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
candidate_iter->cleanup() ;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
candidate_solutions.clear();
|
||||
}
|
||||
|
||||
double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
|
||||
|
||||
double macro_layout_overhead = g_tp.macro_layout_overhead;
|
||||
double chip_PR_overhead = g_tp.chip_layout_overhead;
|
||||
double total_overhead = macro_layout_overhead*chip_PR_overhead;
|
||||
local_result.area *= total_overhead;
|
||||
|
||||
//maintain constant power density
|
||||
double pppm_t[4] = {total_overhead,1,1,total_overhead};
|
||||
|
||||
double sckRation = g_tp.sckt_co_eff;
|
||||
local_result.power.readOp.dynamic *= sckRation;
|
||||
local_result.power.writeOp.dynamic *= sckRation;
|
||||
local_result.power.searchOp.dynamic *= sckRation;
|
||||
local_result.power.readOp.leakage *= l_ip.nbanks;
|
||||
local_result.power.readOp.longer_channel_leakage =
|
||||
local_result.power.readOp.leakage*long_channel_device_reduction;
|
||||
local_result.power = local_result.power* pppm_t;
|
||||
|
||||
local_result.data_array2->power.readOp.dynamic *= sckRation;
|
||||
local_result.data_array2->power.writeOp.dynamic *= sckRation;
|
||||
local_result.data_array2->power.searchOp.dynamic *= sckRation;
|
||||
local_result.data_array2->power.readOp.leakage *= l_ip.nbanks;
|
||||
local_result.data_array2->power.readOp.longer_channel_leakage =
|
||||
local_result.data_array2->power.readOp.leakage*long_channel_device_reduction;
|
||||
local_result.data_array2->power = local_result.data_array2->power* pppm_t;
|
||||
|
||||
|
||||
if (!(l_ip.pure_cam || l_ip.pure_ram || l_ip.fully_assoc) && l_ip.is_cache)
|
||||
{
|
||||
local_result.tag_array2->power.readOp.dynamic *= sckRation;
|
||||
local_result.tag_array2->power.writeOp.dynamic *= sckRation;
|
||||
local_result.tag_array2->power.searchOp.dynamic *= sckRation;
|
||||
local_result.tag_array2->power.readOp.leakage *= l_ip.nbanks;
|
||||
local_result.tag_array2->power.readOp.longer_channel_leakage =
|
||||
local_result.tag_array2->power.readOp.leakage*long_channel_device_reduction;
|
||||
local_result.tag_array2->power = local_result.tag_array2->power* pppm_t;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
void ArrayST::leakage_feedback(double temperature)
|
||||
{
|
||||
// Update the temperature. l_ip is already set and error-checked in the creator function.
|
||||
l_ip.temp = (unsigned int)round(temperature/10.0)*10;
|
||||
|
||||
// This corresponds to cacti_interface() in the initialization process. Leakage power is updated here.
|
||||
reconfigure(&l_ip,&local_result);
|
||||
|
||||
// Scale the power values. This is part of ArrayST::optimize_array().
|
||||
double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
|
||||
|
||||
double macro_layout_overhead = g_tp.macro_layout_overhead;
|
||||
double chip_PR_overhead = g_tp.chip_layout_overhead;
|
||||
double total_overhead = macro_layout_overhead*chip_PR_overhead;
|
||||
|
||||
double pppm_t[4] = {total_overhead,1,1,total_overhead};
|
||||
|
||||
double sckRation = g_tp.sckt_co_eff;
|
||||
local_result.power.readOp.dynamic *= sckRation;
|
||||
local_result.power.writeOp.dynamic *= sckRation;
|
||||
local_result.power.searchOp.dynamic *= sckRation;
|
||||
local_result.power.readOp.leakage *= l_ip.nbanks;
|
||||
local_result.power.readOp.longer_channel_leakage = local_result.power.readOp.leakage*long_channel_device_reduction;
|
||||
local_result.power = local_result.power* pppm_t;
|
||||
|
||||
local_result.data_array2->power.readOp.dynamic *= sckRation;
|
||||
local_result.data_array2->power.writeOp.dynamic *= sckRation;
|
||||
local_result.data_array2->power.searchOp.dynamic *= sckRation;
|
||||
local_result.data_array2->power.readOp.leakage *= l_ip.nbanks;
|
||||
local_result.data_array2->power.readOp.longer_channel_leakage = local_result.data_array2->power.readOp.leakage*long_channel_device_reduction;
|
||||
local_result.data_array2->power = local_result.data_array2->power* pppm_t;
|
||||
|
||||
if (!(l_ip.pure_cam || l_ip.pure_ram || l_ip.fully_assoc) && l_ip.is_cache)
|
||||
{
|
||||
local_result.tag_array2->power.readOp.dynamic *= sckRation;
|
||||
local_result.tag_array2->power.writeOp.dynamic *= sckRation;
|
||||
local_result.tag_array2->power.searchOp.dynamic *= sckRation;
|
||||
local_result.tag_array2->power.readOp.leakage *= l_ip.nbanks;
|
||||
local_result.tag_array2->power.readOp.longer_channel_leakage = local_result.tag_array2->power.readOp.leakage*long_channel_device_reduction;
|
||||
local_result.tag_array2->power = local_result.tag_array2->power* pppm_t;
|
||||
}
|
||||
}
|
||||
|
||||
// Calls cleanup() on the stored result when the array model is destroyed.
ArrayST::~ArrayST()
{
    this->local_result.cleanup();
}
|
101
ext/mcpat/array.h
Normal file
101
ext/mcpat/array.h
Normal file
|
@ -0,0 +1,101 @@
|
|||
/*****************************************************************************
|
||||
* McPAT
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef ARRAY_H_
|
||||
#define ARRAY_H_
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include "basic_components.h"
|
||||
#include "cacti_interface.h"
|
||||
#include "component.h"
|
||||
#include "const.h"
|
||||
#include "parameter.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
class ArrayST :public Component{
|
||||
public:
|
||||
ArrayST(){};
|
||||
ArrayST(const InputParameter *configure_interface, string _name, enum Device_ty device_ty_, bool opt_local_=true, enum Core_type core_ty_=Inorder, bool _is_default=true);
|
||||
|
||||
InputParameter l_ip;
|
||||
string name;
|
||||
enum Device_ty device_ty;
|
||||
bool opt_local;
|
||||
enum Core_type core_ty;
|
||||
bool is_default;
|
||||
uca_org_t local_result;
|
||||
|
||||
statsDef tdp_stats;
|
||||
statsDef rtp_stats;
|
||||
statsDef stats_t;
|
||||
powerDef power_t;
|
||||
|
||||
virtual void optimize_array();
|
||||
virtual void compute_base_power();
|
||||
virtual ~ArrayST();
|
||||
|
||||
void leakage_feedback(double temperature);
|
||||
};
|
||||
|
||||
class InstCache :public Component{
|
||||
public:
|
||||
ArrayST* caches;
|
||||
ArrayST* missb;
|
||||
ArrayST* ifb;
|
||||
ArrayST* prefetchb;
|
||||
powerDef power_t;//temp value holder for both (max) power and runtime power
|
||||
InstCache(){caches=0;missb=0;ifb=0;prefetchb=0;};
|
||||
~InstCache(){
|
||||
if (caches) {//caches->local_result.cleanup();
|
||||
delete caches; caches=0;}
|
||||
if (missb) {//missb->local_result.cleanup();
|
||||
delete missb; missb=0;}
|
||||
if (ifb) {//ifb->local_result.cleanup();
|
||||
delete ifb; ifb=0;}
|
||||
if (prefetchb) {//prefetchb->local_result.cleanup();
|
||||
delete prefetchb; prefetchb=0;}
|
||||
};
|
||||
};
|
||||
|
||||
class DataCache :public InstCache{
|
||||
public:
|
||||
ArrayST* wbb;
|
||||
DataCache(){wbb=0;};
|
||||
~DataCache(){
|
||||
if (wbb) {//wbb->local_result.cleanup();
|
||||
delete wbb; wbb=0;}
|
||||
};
|
||||
};
|
||||
|
||||
#endif /* ARRAY_H_ */
|
127
ext/mcpat/basic_components.cc
Normal file
127
ext/mcpat/basic_components.cc
Normal file
|
@ -0,0 +1,127 @@
|
|||
/*****************************************************************************
|
||||
* McPAT
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
|
||||
#include "basic_components.h"
|
||||
|
||||
double longer_channel_device_reduction(
|
||||
enum Device_ty device_ty,
|
||||
enum Core_type core_ty)
|
||||
{
|
||||
|
||||
double longer_channel_device_percentage_core;
|
||||
double longer_channel_device_percentage_uncore;
|
||||
double longer_channel_device_percentage_llc;
|
||||
|
||||
double long_channel_device_reduction;
|
||||
|
||||
longer_channel_device_percentage_llc = 1.0;
|
||||
longer_channel_device_percentage_uncore = 0.82;
|
||||
if (core_ty==OOO)
|
||||
{
|
||||
longer_channel_device_percentage_core = 0.56;//0.54 Xeon Tulsa //0.58 Nehelam
|
||||
//longer_channel_device_percentage_uncore = 0.76;//0.85 Nehelam
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
longer_channel_device_percentage_core = 0.8;//0.8;//Niagara
|
||||
//longer_channel_device_percentage_uncore = 0.9;//Niagara
|
||||
}
|
||||
|
||||
if (device_ty==Core_device)
|
||||
{
|
||||
long_channel_device_reduction = (1- longer_channel_device_percentage_core)
|
||||
+ longer_channel_device_percentage_core * g_tp.peri_global.long_channel_leakage_reduction;
|
||||
}
|
||||
else if (device_ty==Uncore_device)
|
||||
{
|
||||
long_channel_device_reduction = (1- longer_channel_device_percentage_uncore)
|
||||
+ longer_channel_device_percentage_uncore * g_tp.peri_global.long_channel_leakage_reduction;
|
||||
}
|
||||
else if (device_ty==LLC_device)
|
||||
{
|
||||
long_channel_device_reduction = (1- longer_channel_device_percentage_llc)
|
||||
+ longer_channel_device_percentage_llc * g_tp.peri_global.long_channel_leakage_reduction;
|
||||
}
|
||||
else
|
||||
{
|
||||
cout<<"unknown device category"<<endl;
|
||||
exit(0);
|
||||
}
|
||||
|
||||
return long_channel_device_reduction;
|
||||
}
|
||||
|
||||
// Field-wise sum of two access/hit/miss counters.
statsComponents operator+(const statsComponents & x, const statsComponents & y)
{
    statsComponents sum(x);

    sum.access += y.access;
    sum.hit += y.hit;
    sum.miss += y.miss;

    return sum;
}
|
||||
|
||||
// Scale each counter by its own factor: access by y[0], hit by y[1] and
// miss by y[2]. y must point to at least three doubles.
statsComponents operator*(const statsComponents & x, double const * const y)
{
    statsComponents scaled(x);

    scaled.access *= y[0];
    scaled.hit *= y[1];
    scaled.miss *= y[2];

    return scaled;
}
|
||||
|
||||
// Sum the read, write and search counters of two statsDef objects.
statsDef operator+(const statsDef & x, const statsDef & y)
{
    statsDef total;

    total.searchAc = x.searchAc + y.searchAc;
    total.writeAc = x.writeAc + y.writeAc;
    total.readAc = x.readAc + y.readAc;

    return total;
}
|
||||
|
||||
// Scale the read, write and search counters with the same factor array y,
// using the statsComponents operator* (so y[0..2] are read).
statsDef operator*(const statsDef & x, double const * const y)
{
    statsDef scaled;

    scaled.searchAc = x.searchAc * y;
    scaled.writeAc = x.writeAc * y;
    scaled.readAc = x.readAc * y;

    return scaled;
}
|
265
ext/mcpat/basic_components.h
Normal file
265
ext/mcpat/basic_components.h
Normal file
|
@ -0,0 +1,265 @@
|
|||
/*****************************************************************************
|
||||
* McPAT
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef BASIC_COMPONENTS_H_
|
||||
#define BASIC_COMPONENTS_H_
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "XML_Parse.h"
|
||||
#include "parameter.h"
|
||||
|
||||
const double cdb_overhead = 1.1;
|
||||
|
||||
// Values are the implicit ones, spelled out.
enum FU_type {
    FPU = 0,
    ALU = 1,
    MUL = 2
};
|
||||
|
||||
// Core kind; longer_channel_device_reduction() uses it to pick the core
// percentage (OOO versus everything else).
enum Core_type {
    OOO = 0,
    Inorder = 1
};
|
||||
|
||||
// Values are the implicit ones, spelled out.
enum Renaming_type {
    RAMbased = 0,
    CAMbased = 1
};
|
||||
|
||||
// Values are the implicit ones, spelled out.
enum Scheduler_type {
    PhysicalRegFile = 0,
    ReservationStation = 1
};
|
||||
|
||||
// Values are the implicit ones, spelled out.
enum cache_level {
    L2 = 0,
    L3 = 1,
    L1Directory = 2,
    L2Directory = 3
};
|
||||
|
||||
// Values are the implicit ones, spelled out.
enum MemoryCtrl_type {
    MC = 0,     // memory controller
    FLASHC = 1  // flash controller
};
|
||||
|
||||
// Values are the implicit ones, spelled out.
enum Dir_type {
    ST = 0,     // shadowed tag
    DC = 1,     // directory cache
    SBT = 2,    // static bank tag
    NonDir = 3
};
|
||||
|
||||
// Values are the implicit ones, spelled out.
enum Cache_policy {
    Write_through = 0,
    Write_back = 1
};
|
||||
|
||||
// Device category; selects which longer-channel percentage
// longer_channel_device_reduction() applies.
enum Device_ty {
    Core_device = 0,
    Uncore_device = 1,
    LLC_device = 2
};
|
||||
|
||||
// Three counters (access, hit, miss) stored as doubles. All start at zero;
// copies are field-wise.
class statsComponents
{
  public:
    double access;
    double hit;
    double miss;

    statsComponents() : access(0), hit(0), miss(0) {}

    statsComponents(const statsComponents & obj)
        : access(obj.access), hit(obj.hit), miss(obj.miss) {}

    statsComponents & operator=(const statsComponents & rhs)
    {
        access = rhs.access;
        hit = rhs.hit;
        miss = rhs.miss;
        return *this;
    }

    // Set all three counters back to zero.
    void reset()
    {
        access = 0;
        hit = 0;
        miss = 0;
    }

    // Field-wise sum.
    friend statsComponents operator+(const statsComponents & x, const statsComponents & y);
    // Field-wise scaling by a factor array.
    friend statsComponents operator*(const statsComponents & x, double const * const y);
};
|
||||
|
||||
class statsDef
|
||||
{
|
||||
public:
|
||||
statsComponents readAc;
|
||||
statsComponents writeAc;
|
||||
statsComponents searchAc;
|
||||
|
||||
statsDef() : readAc(), writeAc(),searchAc() { }
|
||||
void reset() { readAc.reset(); writeAc.reset();searchAc.reset();}
|
||||
|
||||
friend statsDef operator+(const statsDef & x, const statsDef & y);
|
||||
friend statsDef operator*(const statsDef & x, double const * const y);
|
||||
};
|
||||
|
||||
double longer_channel_device_reduction(
|
||||
enum Device_ty device_ty=Core_device,
|
||||
enum Core_type core_ty=Inorder);
|
||||
|
||||
class CoreDynParam {
|
||||
public:
|
||||
CoreDynParam(){};
|
||||
CoreDynParam(ParseXML *XML_interface, int ithCore_);
|
||||
// :XML(XML_interface),
|
||||
// ithCore(ithCore_)
|
||||
// core_ty(inorder),
|
||||
// rm_ty(CAMbased),
|
||||
// scheu_ty(PhysicalRegFile),
|
||||
// clockRate(1e9),//1GHz
|
||||
// arch_ireg_width(32),
|
||||
// arch_freg_width(32),
|
||||
// phy_ireg_width(128),
|
||||
// phy_freg_width(128),
|
||||
// perThreadState(8),
|
||||
// globalCheckpoint(32),
|
||||
// instructionLength(32){};
|
||||
//ParseXML * XML;
|
||||
bool opt_local;
|
||||
bool x86;
|
||||
bool Embedded;
|
||||
enum Core_type core_ty;
|
||||
enum Renaming_type rm_ty;
|
||||
enum Scheduler_type scheu_ty;
|
||||
double clockRate,executionTime;
|
||||
int arch_ireg_width, arch_freg_width, phy_ireg_width, phy_freg_width;
|
||||
int num_IRF_entry, num_FRF_entry, num_ifreelist_entries, num_ffreelist_entries;
|
||||
int fetchW, decodeW,issueW,peak_issueW, commitW,peak_commitW, predictionW, fp_issueW, fp_decodeW;
|
||||
int perThreadState, globalCheckpoint, instruction_length, pc_width, opcode_length, micro_opcode_length;
|
||||
int num_hthreads, pipeline_stages, fp_pipeline_stages, num_pipelines, num_fp_pipelines;
|
||||
int num_alus, num_muls;
|
||||
double num_fpus;
|
||||
int int_data_width, fp_data_width,v_address_width, p_address_width;
|
||||
double pipeline_duty_cycle, total_cycles, busy_cycles, idle_cycles;
|
||||
bool regWindowing,multithreaded;
|
||||
double pppm_lkg_multhread[4];
|
||||
double IFU_duty_cycle,BR_duty_cycle,LSU_duty_cycle,MemManU_I_duty_cycle,
|
||||
MemManU_D_duty_cycle, ALU_duty_cycle,MUL_duty_cycle,
|
||||
FPU_duty_cycle, ALU_cdb_duty_cycle,MUL_cdb_duty_cycle,
|
||||
FPU_cdb_duty_cycle;
|
||||
~CoreDynParam(){};
|
||||
};
|
||||
|
||||
class CacheDynParam {
|
||||
public:
|
||||
CacheDynParam(){};
|
||||
CacheDynParam(ParseXML *XML_interface, int ithCache_);
|
||||
string name;
|
||||
enum Dir_type dir_ty;
|
||||
double clockRate,executionTime;
|
||||
double capacity, blockW, assoc, nbanks;
|
||||
double throughput, latency;
|
||||
double duty_cycle, dir_duty_cycle;
|
||||
//double duty_cycle;
|
||||
int missb_size, fu_size, prefetchb_size, wbb_size;
|
||||
~CacheDynParam(){};
|
||||
};
|
||||
|
||||
class MCParam {
|
||||
public:
|
||||
MCParam(){};
|
||||
MCParam(ParseXML *XML_interface, int ithCache_);
|
||||
string name;
|
||||
double clockRate,num_mcs, peakDataTransferRate, num_channels;
|
||||
// double mcTEPowerperGhz;
|
||||
// double mcPHYperGbit;
|
||||
// double area;
|
||||
int llcBlockSize, dataBusWidth, addressBusWidth;
|
||||
int opcodeW;
|
||||
int memAccesses;
|
||||
int memRank;
|
||||
int type;
|
||||
double frontend_duty_cycle, duty_cycle, perc_load;
|
||||
double executionTime, reads, writes;
|
||||
bool LVDS, withPHY;
|
||||
|
||||
~MCParam(){};
|
||||
};
|
||||
|
||||
class NoCParam {
|
||||
public:
|
||||
NoCParam(){};
|
||||
NoCParam(ParseXML *XML_interface, int ithCache_);
|
||||
string name;
|
||||
double clockRate;
|
||||
int flit_size;
|
||||
int input_ports, output_ports, min_ports, global_linked_ports;
|
||||
int virtual_channel_per_port,input_buffer_entries_per_vc;
|
||||
int horizontal_nodes,vertical_nodes, total_nodes;
|
||||
double executionTime, total_access, link_throughput,link_latency,
|
||||
duty_cycle, chip_coverage, route_over_perc;
|
||||
bool has_global_link, type;
|
||||
|
||||
~NoCParam(){};
|
||||
};
|
||||
|
||||
class ProcParam {
|
||||
public:
|
||||
ProcParam(){};
|
||||
ProcParam(ParseXML *XML_interface, int ithCache_);
|
||||
string name;
|
||||
int numCore, numL2, numL3, numNOC, numL1Dir, numL2Dir,numMC, numMCChannel;
|
||||
bool homoCore, homoL2, homoL3, homoNOC, homoL1Dir, homoL2Dir;
|
||||
|
||||
~ProcParam(){};
|
||||
};
|
||||
|
||||
class NIUParam {
|
||||
public:
|
||||
NIUParam(){};
|
||||
NIUParam(ParseXML *XML_interface, int ithCache_);
|
||||
string name;
|
||||
double clockRate;
|
||||
int num_units;
|
||||
int type;
|
||||
double duty_cycle, perc_load;
|
||||
~NIUParam(){};
|
||||
};
|
||||
|
||||
class PCIeParam {
|
||||
public:
|
||||
PCIeParam(){};
|
||||
PCIeParam(ParseXML *XML_interface, int ithCache_);
|
||||
string name;
|
||||
double clockRate;
|
||||
int num_channels, num_units;
|
||||
bool withPHY;
|
||||
int type;
|
||||
double duty_cycle, perc_load;
|
||||
~PCIeParam(){};
|
||||
};
|
||||
#endif /* BASIC_COMPONENTS_H_ */
|
94
ext/mcpat/cacti/README
Normal file
94
ext/mcpat/cacti/README
Normal file
|
@ -0,0 +1,94 @@
|
|||
-----------------------------------------------------------
|
||||
____ _ ____ _____ ___ __ ____
|
||||
/ ___| / \ / ___|_ _|_ _| / /_ | ___|
|
||||
| | / _ \| | | | | | | '_ \ |___ \
|
||||
| |___ / ___ \ |___ | | | | | (_) | ___) |
|
||||
\____/_/ \_\____| |_| |___| \___(_)____/
|
||||
|
||||
|
||||
A Tool to Model Caches/Memories
|
||||
-----------------------------------------------------------
|
||||
|
||||
CACTI is an analytical tool that takes a set of cache/memory para-
|
||||
meters as input and calculates its access time, power, cycle
|
||||
time, and area.
|
||||
CACTI was originally developed by Dr. Jouppi and Dr. Wilton
|
||||
in 1993 and since then it has undergone five major
|
||||
revisions.
|
||||
|
||||
List of features (version 1-6.5):
|
||||
===============================
|
||||
The following is the list of features supported by the tool.
|
||||
|
||||
* Power, delay, area, and cycle time model for
|
||||
direct mapped caches
|
||||
set-associative caches
|
||||
fully associative caches
|
||||
Embedded DRAM memories
|
||||
Commodity DRAM memories
|
||||
|
||||
* Support for modeling multi-ported uniform cache access (UCA)
|
||||
and multi-banked, multi-ported non-uniform cache access (NUCA).
|
||||
|
||||
* Leakage power calculation that also considers the operating
|
||||
temperature of the cache.
|
||||
|
||||
* Router power model.
|
||||
|
||||
* Interconnect model with different delay, power, and area
|
||||
properties including low-swing wire model.
|
||||
|
||||
* An interface to perform trade-off analysis involving power, delay,
|
||||
area, and bandwidth.
|
||||
|
||||
* All process specific values used by the tool are obtained
|
||||
from ITRS and currently, the tool supports 90nm, 65nm, 45nm,
|
||||
and 32nm technology nodes.
|
||||
|
||||
Version 6.5 has a new C++ code base and includes numerous bug fixes.
|
||||
CACTI 5.3 and 6.0 activate an entire row of mats to read/write a single
|
||||
block of data. This technique improves reliability at the cost of
|
||||
power. CACTI 6.5 activates only the minimum number of mats needed to retrieve
|
||||
a block to minimize power.
|
||||
|
||||
How to use the tool?
|
||||
====================
|
||||
Prior versions of CACTI take input parameters such as cache
|
||||
size and technology node as a set of command line arguments.
|
||||
To avoid a long list of command line arguments,
|
||||
CACTI 6.5 lets users specify their cache model in a more
|
||||
detailed manner by using a config file (cache.cfg).
|
||||
|
||||
-> define the cache model using cache.cfg
|
||||
-> run the "cacti" binary <./cacti -infile cache.cfg>
|
||||
|
||||
CACTI6.5 also provides a command line interface similar to earlier versions
|
||||
of CACTI. The command line interface can be used as
|
||||
|
||||
./cacti cache_size line_size associativity rw_ports excl_read_ports excl_write_ports
|
||||
single_ended_read_ports search_ports banks tech_node output_width specific_tag tag_width
|
||||
access_mode cache main_mem obj_func_delay obj_func_dynamic_power obj_func_leakage_power
|
||||
obj_func_cycle_time obj_func_area dev_func_delay dev_func_dynamic_power dev_func_leakage_power
|
||||
dev_func_area dev_func_cycle_time ed_ed2_none temp wt data_arr_ram_cell_tech_flavor_in
|
||||
data_arr_peri_global_tech_flavor_in tag_arr_ram_cell_tech_flavor_in tag_arr_peri_global_tech_flavor_in
|
||||
interconnect_projection_type_in wire_inside_mat_type_in wire_outside_mat_type_in
|
||||
REPEATERS_IN_HTREE_SEGMENTS_in VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in
|
||||
BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in PAGE_SIZE_BITS_in BURST_LENGTH_in
|
||||
INTERNAL_PREFETCH_WIDTH_in force_wiretype wiretype force_config ndwl ndbl nspd ndcm
|
||||
ndsam1 ndsam2 ecc
|
||||
|
||||
For complete documentation of the tool, please refer to the CACTI-5.3 and 6.0
|
||||
technical reports and the following paper,
|
||||
"Optimizing NUCA Organizations and Wiring Alternatives for
|
||||
Large Caches With CACTI 6.0", that appears in MICRO 2007.
|
||||
|
||||
We are still improving the tool and refining the code. If you
|
||||
have any comments, questions, or suggestions, please write to
|
||||
us.
|
||||
|
||||
Naveen Muralimanohar Jung Ho Ahn Sheng Li
|
||||
naveen.muralimanohar@hp.com gajh@snu.ac.kr sheng.li@hp.com
|
||||
|
||||
|
||||
|
||||
|
916
ext/mcpat/cacti/Ucache.cc
Normal file
916
ext/mcpat/cacti/Ucache.cc
Normal file
|
@ -0,0 +1,916 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
#include <pthread.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <ctime>
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
|
||||
#include "Ucache.h"
|
||||
#include "area.h"
|
||||
#include "bank.h"
|
||||
#include "basic_circuit.h"
|
||||
#include "component.h"
|
||||
#include "const.h"
|
||||
#include "decoder.h"
|
||||
#include "parameter.h"
|
||||
#include "subarray.h"
|
||||
#include "uca.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Thread count taken from the NTHREADS macro; calc_time_mt_wrapper() uses it
// as the stride of its partition-search loop.
const uint32_t nthreads = NTHREADS;
|
||||
|
||||
|
||||
void min_values_t::update_min_values(const min_values_t * val)
|
||||
{
|
||||
min_delay = (min_delay > val->min_delay) ? val->min_delay : min_delay;
|
||||
min_dyn = (min_dyn > val->min_dyn) ? val->min_dyn : min_dyn;
|
||||
min_leakage = (min_leakage > val->min_leakage) ? val->min_leakage : min_leakage;
|
||||
min_area = (min_area > val->min_area) ? val->min_area : min_area;
|
||||
min_cyc = (min_cyc > val->min_cyc) ? val->min_cyc : min_cyc;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void min_values_t::update_min_values(const uca_org_t & res)
|
||||
{
|
||||
min_delay = (min_delay > res.access_time) ? res.access_time : min_delay;
|
||||
min_dyn = (min_dyn > res.power.readOp.dynamic) ? res.power.readOp.dynamic : min_dyn;
|
||||
min_leakage = (min_leakage > res.power.readOp.leakage) ? res.power.readOp.leakage : min_leakage;
|
||||
min_area = (min_area > res.area) ? res.area : min_area;
|
||||
min_cyc = (min_cyc > res.cycle_time) ? res.cycle_time : min_cyc;
|
||||
}
|
||||
|
||||
void min_values_t::update_min_values(const nuca_org_t * res)
|
||||
{
|
||||
min_delay = (min_delay > res->nuca_pda.delay) ? res->nuca_pda.delay : min_delay;
|
||||
min_dyn = (min_dyn > res->nuca_pda.power.readOp.dynamic) ? res->nuca_pda.power.readOp.dynamic : min_dyn;
|
||||
min_leakage = (min_leakage > res->nuca_pda.power.readOp.leakage) ? res->nuca_pda.power.readOp.leakage : min_leakage;
|
||||
min_area = (min_area > res->nuca_pda.area.get_area()) ? res->nuca_pda.area.get_area() : min_area;
|
||||
min_cyc = (min_cyc > res->nuca_pda.cycle_time) ? res->nuca_pda.cycle_time : min_cyc;
|
||||
}
|
||||
|
||||
void min_values_t::update_min_values(const mem_array * res)
|
||||
{
|
||||
min_delay = (min_delay > res->access_time) ? res->access_time : min_delay;
|
||||
min_dyn = (min_dyn > res->power.readOp.dynamic) ? res->power.readOp.dynamic : min_dyn;
|
||||
min_leakage = (min_leakage > res->power.readOp.leakage) ? res->power.readOp.leakage : min_leakage;
|
||||
min_area = (min_area > res->area) ? res->area : min_area;
|
||||
min_cyc = (min_cyc > res->cycle_time) ? res->cycle_time : min_cyc;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void * calc_time_mt_wrapper(void * void_obj)
|
||||
{
|
||||
calc_time_mt_wrapper_struct * calc_obj = (calc_time_mt_wrapper_struct *) void_obj;
|
||||
uint32_t tid = calc_obj->tid;
|
||||
list<mem_array *> & data_arr = calc_obj->data_arr;
|
||||
list<mem_array *> & tag_arr = calc_obj->tag_arr;
|
||||
bool is_tag = calc_obj->is_tag;
|
||||
bool pure_ram = calc_obj->pure_ram;
|
||||
bool pure_cam = calc_obj->pure_cam;
|
||||
bool is_main_mem = calc_obj->is_main_mem;
|
||||
double Nspd_min = calc_obj->Nspd_min;
|
||||
min_values_t * data_res = calc_obj->data_res;
|
||||
min_values_t * tag_res = calc_obj->tag_res;
|
||||
|
||||
data_arr.clear();
|
||||
data_arr.push_back(new mem_array);
|
||||
tag_arr.clear();
|
||||
tag_arr.push_back(new mem_array);
|
||||
|
||||
uint32_t Ndwl_niter = _log2(MAXDATAN) + 1;
|
||||
uint32_t Ndbl_niter = _log2(MAXDATAN) + 1;
|
||||
uint32_t Ndcm_niter = _log2(MAX_COL_MUX) + 1;
|
||||
uint32_t niter = Ndwl_niter * Ndbl_niter * Ndcm_niter;
|
||||
|
||||
|
||||
bool is_valid_partition;
|
||||
int wt_min, wt_max;
|
||||
|
||||
if (g_ip->force_wiretype) {
|
||||
if (g_ip->wt == 0) {
|
||||
wt_min = Low_swing;
|
||||
wt_max = Low_swing;
|
||||
}
|
||||
else {
|
||||
wt_min = Global;
|
||||
wt_max = Low_swing-1;
|
||||
}
|
||||
}
|
||||
else {
|
||||
wt_min = Global;
|
||||
wt_max = Low_swing;
|
||||
}
|
||||
|
||||
for (double Nspd = Nspd_min; Nspd <= MAXDATASPD; Nspd *= 2)
|
||||
{
|
||||
for (int wr = wt_min; wr <= wt_max; wr++)
|
||||
{
|
||||
for (uint32_t iter = tid; iter < niter; iter += nthreads)
|
||||
{
|
||||
// reconstruct Ndwl, Ndbl, Ndcm
|
||||
unsigned int Ndwl = 1 << (iter / (Ndbl_niter * Ndcm_niter));
|
||||
unsigned int Ndbl = 1 << ((iter / (Ndcm_niter))%Ndbl_niter);
|
||||
unsigned int Ndcm = 1 << (iter % Ndcm_niter);
|
||||
for(unsigned int Ndsam_lev_1 = 1; Ndsam_lev_1 <= MAX_COL_MUX; Ndsam_lev_1 *= 2)
|
||||
{
|
||||
for(unsigned int Ndsam_lev_2 = 1; Ndsam_lev_2 <= MAX_COL_MUX; Ndsam_lev_2 *= 2)
|
||||
{
|
||||
//for debuging
|
||||
if (g_ip->force_cache_config && is_tag == false)
|
||||
{
|
||||
wr = g_ip->wt;
|
||||
Ndwl = g_ip->ndwl;
|
||||
Ndbl = g_ip->ndbl;
|
||||
Ndcm = g_ip->ndcm;
|
||||
if(g_ip->nspd != 0) {
|
||||
Nspd = g_ip->nspd;
|
||||
}
|
||||
if(g_ip->ndsam1 != 0) {
|
||||
Ndsam_lev_1 = g_ip->ndsam1;
|
||||
Ndsam_lev_2 = g_ip->ndsam2;
|
||||
}
|
||||
}
|
||||
|
||||
if (is_tag == true)
|
||||
{
|
||||
is_valid_partition = calculate_time(is_tag, pure_ram, pure_cam, Nspd, Ndwl,
|
||||
Ndbl, Ndcm, Ndsam_lev_1, Ndsam_lev_2,
|
||||
tag_arr.back(), 0, NULL, NULL,
|
||||
is_main_mem);
|
||||
}
|
||||
// If it's a fully-associative cache, the data array partition parameters are identical to that of
|
||||
// the tag array, so compute data array partition properties also here.
|
||||
if (is_tag == false || g_ip->fully_assoc)
|
||||
{
|
||||
is_valid_partition = calculate_time(is_tag/*false*/, pure_ram, pure_cam, Nspd, Ndwl,
|
||||
Ndbl, Ndcm, Ndsam_lev_1, Ndsam_lev_2,
|
||||
data_arr.back(), 0, NULL, NULL,
|
||||
is_main_mem);
|
||||
}
|
||||
|
||||
if (is_valid_partition)
|
||||
{
|
||||
if (is_tag == true)
|
||||
{
|
||||
tag_arr.back()->wt = (enum Wire_type) wr;
|
||||
tag_res->update_min_values(tag_arr.back());
|
||||
tag_arr.push_back(new mem_array);
|
||||
}
|
||||
if (is_tag == false || g_ip->fully_assoc)
|
||||
{
|
||||
data_arr.back()->wt = (enum Wire_type) wr;
|
||||
data_res->update_min_values(data_arr.back());
|
||||
data_arr.push_back(new mem_array);
|
||||
}
|
||||
}
|
||||
|
||||
if (g_ip->force_cache_config && is_tag == false)
|
||||
{
|
||||
wr = wt_max;
|
||||
iter = niter;
|
||||
if(g_ip->nspd != 0) {
|
||||
Nspd = MAXDATASPD;
|
||||
}
|
||||
if (g_ip->ndsam1 != 0) {
|
||||
Ndsam_lev_1 = MAX_COL_MUX+1;
|
||||
Ndsam_lev_2 = MAX_COL_MUX+1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
delete data_arr.back();
|
||||
delete tag_arr.back();
|
||||
data_arr.pop_back();
|
||||
tag_arr.pop_back();
|
||||
|
||||
pthread_exit(NULL);
|
||||
}
|
||||
|
||||
|
||||
|
||||
bool calculate_time(
|
||||
bool is_tag,
|
||||
int pure_ram,
|
||||
bool pure_cam,
|
||||
double Nspd,
|
||||
unsigned int Ndwl,
|
||||
unsigned int Ndbl,
|
||||
unsigned int Ndcm,
|
||||
unsigned int Ndsam_lev_1,
|
||||
unsigned int Ndsam_lev_2,
|
||||
mem_array *ptr_array,
|
||||
int flag_results_populate,
|
||||
results_mem_array *ptr_results,
|
||||
uca_org_t *ptr_fin_res,
|
||||
bool is_main_mem)
|
||||
{
|
||||
DynamicParameter dyn_p(is_tag, pure_ram, pure_cam, Nspd, Ndwl, Ndbl, Ndcm, Ndsam_lev_1, Ndsam_lev_2, is_main_mem);
|
||||
|
||||
if (dyn_p.is_valid == false)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
UCA * uca = new UCA(dyn_p);
|
||||
|
||||
|
||||
if (flag_results_populate)
|
||||
{ //For the final solution, populate the ptr_results data structure -- TODO: copy only necessary variables
|
||||
}
|
||||
else
|
||||
{
|
||||
int num_act_mats_hor_dir = uca->bank.dp.num_act_mats_hor_dir;
|
||||
int num_mats = uca->bank.dp.num_mats;
|
||||
bool is_fa = uca->bank.dp.fully_assoc;
|
||||
bool pure_cam = uca->bank.dp.pure_cam;
|
||||
ptr_array->Ndwl = Ndwl;
|
||||
ptr_array->Ndbl = Ndbl;
|
||||
ptr_array->Nspd = Nspd;
|
||||
ptr_array->deg_bl_muxing = dyn_p.deg_bl_muxing;
|
||||
ptr_array->Ndsam_lev_1 = Ndsam_lev_1;
|
||||
ptr_array->Ndsam_lev_2 = Ndsam_lev_2;
|
||||
ptr_array->access_time = uca->access_time;
|
||||
ptr_array->cycle_time = uca->cycle_time;
|
||||
ptr_array->multisubbank_interleave_cycle_time = uca->multisubbank_interleave_cycle_time;
|
||||
ptr_array->area_ram_cells = uca->area_all_dataramcells;
|
||||
ptr_array->area = uca->area.get_area();
|
||||
ptr_array->height = uca->area.h;
|
||||
ptr_array->width = uca->area.w;
|
||||
ptr_array->mat_height = uca->bank.mat.area.h;
|
||||
ptr_array->mat_length = uca->bank.mat.area.w;
|
||||
ptr_array->subarray_height = uca->bank.mat.subarray.area.h;
|
||||
ptr_array->subarray_length = uca->bank.mat.subarray.area.w;
|
||||
ptr_array->power = uca->power;
|
||||
ptr_array->delay_senseamp_mux_decoder =
|
||||
MAX(uca->delay_array_to_sa_mux_lev_1_decoder,
|
||||
uca->delay_array_to_sa_mux_lev_2_decoder);
|
||||
ptr_array->delay_before_subarray_output_driver = uca->delay_before_subarray_output_driver;
|
||||
ptr_array->delay_from_subarray_output_driver_to_output = uca->delay_from_subarray_out_drv_to_out;
|
||||
|
||||
ptr_array->delay_route_to_bank = uca->htree_in_add->delay;
|
||||
ptr_array->delay_input_htree = uca->bank.htree_in_add->delay;
|
||||
ptr_array->delay_row_predecode_driver_and_block = uca->bank.mat.r_predec->delay;
|
||||
ptr_array->delay_row_decoder = uca->bank.mat.row_dec->delay;
|
||||
ptr_array->delay_bitlines = uca->bank.mat.delay_bitline;
|
||||
ptr_array->delay_matchlines = uca->bank.mat.delay_matchchline;
|
||||
ptr_array->delay_sense_amp = uca->bank.mat.delay_sa;
|
||||
ptr_array->delay_subarray_output_driver = uca->bank.mat.delay_subarray_out_drv_htree;
|
||||
ptr_array->delay_dout_htree = uca->bank.htree_out_data->delay;
|
||||
ptr_array->delay_comparator = uca->bank.mat.delay_comparator;
|
||||
|
||||
ptr_array->all_banks_height = uca->area.h;
|
||||
ptr_array->all_banks_width = uca->area.w;
|
||||
ptr_array->area_efficiency = uca->area_all_dataramcells * 100 / (uca->area.get_area());
|
||||
|
||||
ptr_array->power_routing_to_bank = uca->power_routing_to_bank;
|
||||
ptr_array->power_addr_input_htree = uca->bank.htree_in_add->power;
|
||||
ptr_array->power_data_input_htree = uca->bank.htree_in_data->power;
|
||||
// cout<<"power_data_input_htree"<<uca->bank.htree_in_data->power.readOp.leakage<<endl;
|
||||
ptr_array->power_data_output_htree = uca->bank.htree_out_data->power;
|
||||
// cout<<"power_data_output_htree"<<uca->bank.htree_out_data->power.readOp.leakage<<endl;
|
||||
ptr_array->power_row_predecoder_drivers = uca->bank.mat.r_predec->driver_power;
|
||||
ptr_array->power_row_predecoder_drivers.readOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_row_predecoder_drivers.writeOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_row_predecoder_drivers.searchOp.dynamic *= num_act_mats_hor_dir;
|
||||
|
||||
ptr_array->power_row_predecoder_blocks = uca->bank.mat.r_predec->block_power;
|
||||
ptr_array->power_row_predecoder_blocks.readOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_row_predecoder_blocks.writeOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_row_predecoder_blocks.searchOp.dynamic *= num_act_mats_hor_dir;
|
||||
|
||||
ptr_array->power_row_decoders = uca->bank.mat.power_row_decoders;
|
||||
ptr_array->power_row_decoders.readOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_row_decoders.writeOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_row_decoders.searchOp.dynamic *= num_act_mats_hor_dir;
|
||||
|
||||
ptr_array->power_bit_mux_predecoder_drivers = uca->bank.mat.b_mux_predec->driver_power;
|
||||
ptr_array->power_bit_mux_predecoder_drivers.readOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_bit_mux_predecoder_drivers.writeOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_bit_mux_predecoder_drivers.searchOp.dynamic *= num_act_mats_hor_dir;
|
||||
|
||||
ptr_array->power_bit_mux_predecoder_blocks = uca->bank.mat.b_mux_predec->block_power;
|
||||
ptr_array->power_bit_mux_predecoder_blocks.readOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_bit_mux_predecoder_blocks.writeOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_bit_mux_predecoder_blocks.searchOp.dynamic *= num_act_mats_hor_dir;
|
||||
|
||||
ptr_array->power_bit_mux_decoders = uca->bank.mat.power_bit_mux_decoders;
|
||||
ptr_array->power_bit_mux_decoders.readOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_bit_mux_decoders.writeOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_bit_mux_decoders.searchOp.dynamic *= num_act_mats_hor_dir;
|
||||
|
||||
ptr_array->power_senseamp_mux_lev_1_predecoder_drivers = uca->bank.mat.sa_mux_lev_1_predec->driver_power;
|
||||
ptr_array->power_senseamp_mux_lev_1_predecoder_drivers .readOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_senseamp_mux_lev_1_predecoder_drivers .writeOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_senseamp_mux_lev_1_predecoder_drivers .searchOp.dynamic *= num_act_mats_hor_dir;
|
||||
|
||||
ptr_array->power_senseamp_mux_lev_1_predecoder_blocks = uca->bank.mat.sa_mux_lev_1_predec->block_power;
|
||||
ptr_array->power_senseamp_mux_lev_1_predecoder_blocks.readOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_senseamp_mux_lev_1_predecoder_blocks.writeOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_senseamp_mux_lev_1_predecoder_blocks.searchOp.dynamic *= num_act_mats_hor_dir;
|
||||
|
||||
ptr_array->power_senseamp_mux_lev_1_decoders = uca->bank.mat.power_sa_mux_lev_1_decoders;
|
||||
ptr_array->power_senseamp_mux_lev_1_decoders.readOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_senseamp_mux_lev_1_decoders.writeOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_senseamp_mux_lev_1_decoders.searchOp.dynamic *= num_act_mats_hor_dir;
|
||||
|
||||
ptr_array->power_senseamp_mux_lev_2_predecoder_drivers = uca->bank.mat.sa_mux_lev_2_predec->driver_power;
|
||||
ptr_array->power_senseamp_mux_lev_2_predecoder_drivers.readOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_senseamp_mux_lev_2_predecoder_drivers.writeOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_senseamp_mux_lev_2_predecoder_drivers.searchOp.dynamic *= num_act_mats_hor_dir;
|
||||
|
||||
ptr_array->power_senseamp_mux_lev_2_predecoder_blocks = uca->bank.mat.sa_mux_lev_2_predec->block_power;
|
||||
ptr_array->power_senseamp_mux_lev_2_predecoder_blocks.readOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_senseamp_mux_lev_2_predecoder_blocks.writeOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_senseamp_mux_lev_2_predecoder_blocks.searchOp.dynamic *= num_act_mats_hor_dir;
|
||||
|
||||
ptr_array->power_senseamp_mux_lev_2_decoders = uca->bank.mat.power_sa_mux_lev_2_decoders;
|
||||
ptr_array->power_senseamp_mux_lev_2_decoders .readOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_senseamp_mux_lev_2_decoders .writeOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_senseamp_mux_lev_2_decoders .searchOp.dynamic *= num_act_mats_hor_dir;
|
||||
|
||||
ptr_array->power_bitlines = uca->bank.mat.power_bitline;
|
||||
ptr_array->power_bitlines.readOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_bitlines.writeOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_bitlines.searchOp.dynamic *= num_act_mats_hor_dir;
|
||||
|
||||
ptr_array->power_sense_amps = uca->bank.mat.power_sa;
|
||||
ptr_array->power_sense_amps.readOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_sense_amps.writeOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_sense_amps.searchOp.dynamic *= num_act_mats_hor_dir;
|
||||
|
||||
ptr_array->power_prechg_eq_drivers = uca->bank.mat.power_bl_precharge_eq_drv;
|
||||
ptr_array->power_prechg_eq_drivers.readOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_prechg_eq_drivers.writeOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_prechg_eq_drivers.searchOp.dynamic *= num_act_mats_hor_dir;
|
||||
|
||||
ptr_array->power_output_drivers_at_subarray = uca->bank.mat.power_subarray_out_drv;
|
||||
ptr_array->power_output_drivers_at_subarray.readOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_output_drivers_at_subarray.writeOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_output_drivers_at_subarray.searchOp.dynamic *= num_act_mats_hor_dir;
|
||||
|
||||
ptr_array->power_comparators = uca->bank.mat.power_comparator;
|
||||
ptr_array->power_comparators.readOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_comparators.writeOp.dynamic *= num_act_mats_hor_dir;
|
||||
ptr_array->power_comparators.searchOp.dynamic *= num_act_mats_hor_dir;
|
||||
|
||||
// cout << " num of mats: " << dyn_p.num_mats << endl;
|
||||
if (is_fa || pure_cam)
|
||||
{
|
||||
ptr_array->power_htree_in_search = uca->bank.htree_in_search->power;
|
||||
// cout<<"power_htree_in_search"<<uca->bank.htree_in_search->power.readOp.leakage<<endl;
|
||||
ptr_array->power_htree_out_search = uca->bank.htree_out_search->power;
|
||||
// cout<<"power_htree_out_search"<<uca->bank.htree_out_search->power.readOp.leakage<<endl;
|
||||
ptr_array->power_searchline = uca->bank.mat.power_searchline;
|
||||
// cout<<"power_searchlineh"<<uca->bank.mat.power_searchline.readOp.leakage<<endl;
|
||||
ptr_array->power_searchline.searchOp.dynamic *= num_mats;
|
||||
ptr_array->power_searchline_precharge = uca->bank.mat.power_searchline_precharge;
|
||||
ptr_array->power_searchline_precharge.searchOp.dynamic *= num_mats;
|
||||
ptr_array->power_matchlines = uca->bank.mat.power_matchline;
|
||||
ptr_array->power_matchlines.searchOp.dynamic *= num_mats;
|
||||
ptr_array->power_matchline_precharge = uca->bank.mat.power_matchline_precharge;
|
||||
ptr_array->power_matchline_precharge.searchOp.dynamic *= num_mats;
|
||||
ptr_array->power_matchline_to_wordline_drv = uca->bank.mat.power_ml_to_ram_wl_drv;
|
||||
// cout<<"power_matchline.searchOp.leakage"<<uca->bank.mat.power_matchline.searchOp.leakage<<endl;
|
||||
}
|
||||
|
||||
ptr_array->activate_energy = uca->activate_energy;
|
||||
ptr_array->read_energy = uca->read_energy;
|
||||
ptr_array->write_energy = uca->write_energy;
|
||||
ptr_array->precharge_energy = uca->precharge_energy;
|
||||
ptr_array->refresh_power = uca->refresh_power;
|
||||
ptr_array->leak_power_subbank_closed_page = uca->leak_power_subbank_closed_page;
|
||||
ptr_array->leak_power_subbank_open_page = uca->leak_power_subbank_open_page;
|
||||
ptr_array->leak_power_request_and_reply_networks = uca->leak_power_request_and_reply_networks;
|
||||
|
||||
ptr_array->precharge_delay = uca->precharge_delay;
|
||||
|
||||
|
||||
// cout<<"power_matchline.searchOp.leakage"<<uca->bank.mat.<<endl;
|
||||
//
|
||||
// if (!(is_fa || pure_cam))
|
||||
// {
|
||||
// cout << " num of cols: " << dyn_p.num_c_subarray << endl;
|
||||
// }
|
||||
// else if (is_fa)
|
||||
// {
|
||||
// cout << " num of cols: " << dyn_p.tag_num_c_subarray+ dyn_p.data_num_c_subarray<< endl;
|
||||
// } else
|
||||
// cout << " num of cols: " << dyn_p.tag_num_c_subarray<< endl;
|
||||
// cout << uca->bank.mat.subarray.get_total_cell_area()<<endl;
|
||||
}
|
||||
|
||||
|
||||
delete uca;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
 * Returns true when organization `u` stays within the configured deviation
 * limits (g_ip->*_dev) relative to the minima in *minval, for access time,
 * read dynamic power, read leakage power, cycle time and area, checked in
 * that order. Each deviation is the percentage by which the value exceeds
 * the minimum.
 *
 * NOTE(review): every term divides by a minimum with no guard for zero.
 */
bool check_uca_org(uca_org_t & u, min_values_t *minval)
{
    // Short-circuit evaluation keeps the original check order. The
    // negated '>' form is kept so NaN comparisons behave as before.
    return !(((u.access_time - minval->min_delay)*100/minval->min_delay) > g_ip->delay_dev)
        && !(((u.power.readOp.dynamic - minval->min_dyn)/minval->min_dyn)*100 >
             g_ip->dynamic_power_dev)
        && !(((u.power.readOp.leakage - minval->min_leakage)/minval->min_leakage)*100 >
             g_ip->leakage_power_dev)
        && !(((u.cycle_time - minval->min_cyc)/minval->min_cyc)*100 >
             g_ip->cycle_time_dev)
        && !(((u.area - minval->min_area)/minval->min_area)*100 >
             g_ip->area_dev);
}
|
||||
|
||||
/*
 * Returns true when array `u` stays within the configured deviation limits
 * (g_ip->*_dev) relative to the minima in *minval, for access time, read
 * dynamic power, read leakage power, cycle time and area, checked in that
 * order. Each deviation is the percentage by which the value exceeds the
 * minimum.
 *
 * NOTE(review): every term divides by a minimum with no guard for zero.
 */
bool check_mem_org(mem_array & u, const min_values_t *minval)
{
    // Short-circuit evaluation keeps the original check order. The
    // negated '>' form is kept so NaN comparisons behave as before.
    return !(((u.access_time - minval->min_delay)*100/minval->min_delay) > g_ip->delay_dev)
        && !(((u.power.readOp.dynamic - minval->min_dyn)/minval->min_dyn)*100 >
             g_ip->dynamic_power_dev)
        && !(((u.power.readOp.leakage - minval->min_leakage)/minval->min_leakage)*100 >
             g_ip->leakage_power_dev)
        && !(((u.cycle_time - minval->min_cyc)/minval->min_cyc)*100 >
             g_ip->cycle_time_dev)
        && !(((u.area - minval->min_area)/minval->min_area)*100 >
             g_ip->area_dev);
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*
 * Picks the lowest-cost organization from `ulist` and copies it to *res.
 *
 * The cost function depends on g_ip->ed:
 *   ed == 1 : normalized access time * normalized read dynamic value
 *   ed == 2 : normalized access time squared * normalized read dynamic value
 *   other   : weighted sum of normalized delay, cycle time, dynamic,
 *             leakage and area (weights g_ip->*_wt), evaluated only for
 *             entries accepted by check_uca_org().
 * All normalizations divide by the minima stored in `minval`.
 *
 * In the weighted mode the list is modified: entries rejected by
 * check_uca_org() are erased, and so is every entry that becomes the
 * current best (it has already been copied to *res at that point).
 *
 * Terminates the process with a non-zero status when the list is empty or
 * no entry produced a cost below BIGNUM.
 */
void find_optimal_uca(uca_org_t *res, min_values_t * minval, list<uca_org_t> & ulist)
{
    double cost = 0;
    double min_cost = BIGNUM;

    const float dp = g_ip->dynamic_power_wt;
    const float lp = g_ip->leakage_power_wt;
    const float a  = g_ip->area_wt;
    const float d  = g_ip->delay_wt;
    const float c  = g_ip->cycle_time_wt;

    if (ulist.empty())
    {
        cout << "ERROR: no valid cache organizations found" << endl;
        exit(1);
    }

    // The iterator is advanced explicitly: erase() already returns the next
    // element, so combining it with an unconditional increment would skip
    // an entry (or step past end() when the list becomes empty).
    list<uca_org_t>::iterator niter = ulist.begin();
    while (niter != ulist.end())
    {
        if (g_ip->ed == 1)
        {
            cost = (niter->access_time / minval->min_delay) *
                   (niter->power.readOp.dynamic / minval->min_dyn);
            if (min_cost > cost)
            {
                min_cost = cost;
                *res = *niter;
            }
            ++niter;
        }
        else if (g_ip->ed == 2)
        {
            cost = (niter->access_time / minval->min_delay) *
                   (niter->access_time / minval->min_delay) *
                   (niter->power.readOp.dynamic / minval->min_dyn);
            if (min_cost > cost)
            {
                min_cost = cost;
                *res = *niter;
            }
            ++niter;
        }
        else
        {
            // Drop organizations that violate the input deviation constraints.
            if (!check_uca_org(*niter, minval))
            {
                niter = ulist.erase(niter);
                continue;
            }

            cost = (d  * (niter->access_time / minval->min_delay) +
                    c  * (niter->cycle_time / minval->min_cyc) +
                    dp * (niter->power.readOp.dynamic / minval->min_dyn) +
                    lp * (niter->power.readOp.leakage / minval->min_leakage) +
                    a  * (niter->area / minval->min_area));

            if (min_cost > cost)
            {
                min_cost = cost;
                *res = *niter;
                niter = ulist.erase(niter);
            }
            else
            {
                ++niter;
            }
        }
    }

    if (min_cost == BIGNUM)
    {
        cout << "ERROR: no cache organizations met optimization criteria" << endl;
        exit(1);
    }
}
|
||||
|
||||
|
||||
|
||||
void filter_tag_arr(const min_values_t * min, list<mem_array *> & list)
|
||||
{
|
||||
double cost = BIGNUM;
|
||||
double cur_cost;
|
||||
double wt_delay = g_ip->delay_wt, wt_dyn = g_ip->dynamic_power_wt, wt_leakage = g_ip->leakage_power_wt, wt_cyc = g_ip->cycle_time_wt, wt_area = g_ip->area_wt;
|
||||
mem_array * res = NULL;
|
||||
|
||||
if (list.empty() == true)
|
||||
{
|
||||
cout << "ERROR: no valid tag organizations found" << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
while (list.empty() != true)
|
||||
{
|
||||
bool v = check_mem_org(*list.back(), min);
|
||||
if (v)
|
||||
{
|
||||
cur_cost = wt_delay * (list.back()->access_time/min->min_delay) +
|
||||
wt_dyn * (list.back()->power.readOp.dynamic/min->min_dyn) +
|
||||
wt_leakage * (list.back()->power.readOp.leakage/min->min_leakage) +
|
||||
wt_area * (list.back()->area/min->min_area) +
|
||||
wt_cyc * (list.back()->cycle_time/min->min_cyc);
|
||||
}
|
||||
else
|
||||
{
|
||||
cur_cost = BIGNUM;
|
||||
}
|
||||
if (cur_cost < cost)
|
||||
{
|
||||
if (res != NULL)
|
||||
{
|
||||
delete res;
|
||||
}
|
||||
cost = cur_cost;
|
||||
res = list.back();
|
||||
}
|
||||
else
|
||||
{
|
||||
delete list.back();
|
||||
}
|
||||
list.pop_back();
|
||||
}
|
||||
if(!res)
|
||||
{
|
||||
cout << "ERROR: no valid tag organizations found" << endl;
|
||||
exit(0);
|
||||
}
|
||||
|
||||
list.push_back(res);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
 * Prunes data-array candidates that are clearly dominated: an entry is
 * deleted and removed from `curr_list` when BOTH its access time and its
 * read dynamic value exceed the minima stored in its own arr_min by more
 * than 50%.
 *
 * Terminates the process with status 1 when `curr_list` is empty on entry
 * or contains a NULL pointer.
 */
void filter_data_arr(list<mem_array *> & curr_list)
{
    if (curr_list.empty())
    {
        cout << "ERROR: no valid data array organizations found" << endl;
        exit(1);
    }

    // erase() returns the successor of the removed element, so the iterator
    // is advanced only when nothing was erased. (Stepping back from the
    // returned iterator is invalid when the first element was removed.)
    list<mem_array *>::iterator iter = curr_list.begin();
    while (iter != curr_list.end())
    {
        mem_array * m = *iter;

        if (m == NULL) exit(1);

        if (((m->access_time - m->arr_min->min_delay) / m->arr_min->min_delay > 0.5) &&
            ((m->power.readOp.dynamic - m->arr_min->min_dyn) / m->arr_min->min_dyn > 0.5))
        {
            delete m;
            iter = curr_list.erase(iter);
        }
        else
        {
            ++iter;
        }
    }
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Performs exhaustive search across different sub-array sizes,
|
||||
* wire types and aspect ratios to find an optimal UCA organization
|
||||
* 1. First different valid tag array organizations are calculated
|
||||
* and stored in tag_arr array
|
||||
* 2. The exhaustive search is repeated to find valid data array
|
||||
* organizations and stored in data_arr array
|
||||
* 3. Cache area, delay, power, and cycle time for different
|
||||
* cache organizations are calculated based on the
|
||||
* above results
|
||||
* 4. Cache model with least cost is picked from sol_list
|
||||
*/
|
||||
void solve(uca_org_t *fin_res)
|
||||
{
|
||||
bool is_dram = false;
|
||||
int pure_ram = g_ip->pure_ram;
|
||||
bool pure_cam = g_ip->pure_cam;
|
||||
|
||||
init_tech_params(g_ip->F_sz_um, false);
|
||||
|
||||
|
||||
list<mem_array *> tag_arr (0);
|
||||
list<mem_array *> data_arr(0);
|
||||
list<mem_array *>::iterator miter;
|
||||
list<uca_org_t> sol_list(1, uca_org_t());
|
||||
|
||||
fin_res->tag_array.access_time = 0;
|
||||
fin_res->tag_array.Ndwl = 0;
|
||||
fin_res->tag_array.Ndbl = 0;
|
||||
fin_res->tag_array.Nspd = 0;
|
||||
fin_res->tag_array.deg_bl_muxing = 0;
|
||||
fin_res->tag_array.Ndsam_lev_1 = 0;
|
||||
fin_res->tag_array.Ndsam_lev_2 = 0;
|
||||
|
||||
|
||||
// distribute calculate_time() execution to multiple threads
|
||||
calc_time_mt_wrapper_struct * calc_array = new calc_time_mt_wrapper_struct[nthreads];
|
||||
pthread_t threads[nthreads];
|
||||
|
||||
for (uint32_t t = 0; t < nthreads; t++)
|
||||
{
|
||||
calc_array[t].tid = t;
|
||||
calc_array[t].pure_ram = pure_ram;
|
||||
calc_array[t].pure_cam = pure_cam;
|
||||
calc_array[t].data_res = new min_values_t();
|
||||
calc_array[t].tag_res = new min_values_t();
|
||||
}
|
||||
|
||||
bool is_tag;
|
||||
uint32_t ram_cell_tech_type;
|
||||
|
||||
// If it's a cache, first calculate the area, delay and power for all tag array partitions.
|
||||
if (!(pure_ram||pure_cam||g_ip->fully_assoc))
|
||||
{ //cache
|
||||
is_tag = true;
|
||||
ram_cell_tech_type = g_ip->tag_arr_ram_cell_tech_type;
|
||||
is_dram = ((ram_cell_tech_type == lp_dram) || (ram_cell_tech_type == comm_dram));
|
||||
init_tech_params(g_ip->F_sz_um, is_tag);
|
||||
|
||||
for (uint32_t t = 0; t < nthreads; t++)
|
||||
{
|
||||
calc_array[t].is_tag = is_tag;
|
||||
calc_array[t].is_main_mem = false;
|
||||
calc_array[t].Nspd_min = 0.125;
|
||||
pthread_create(&threads[t], NULL, calc_time_mt_wrapper, (void *)(&(calc_array[t])));
|
||||
}
|
||||
|
||||
for (uint32_t t = 0; t < nthreads; t++)
|
||||
{
|
||||
pthread_join(threads[t], NULL);
|
||||
}
|
||||
|
||||
for (uint32_t t = 0; t < nthreads; t++)
|
||||
{
|
||||
calc_array[t].data_arr.sort(mem_array::lt);
|
||||
data_arr.merge(calc_array[t].data_arr, mem_array::lt);
|
||||
calc_array[t].tag_arr.sort(mem_array::lt);
|
||||
tag_arr.merge(calc_array[t].tag_arr, mem_array::lt);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// calculate the area, delay and power for all data array partitions (for cache or plain RAM).
|
||||
// if (!g_ip->fully_assoc)
|
||||
// {//in the new cacti, cam, fully_associative cache are processed as single array in the data portion
|
||||
is_tag = false;
|
||||
ram_cell_tech_type = g_ip->data_arr_ram_cell_tech_type;
|
||||
is_dram = ((ram_cell_tech_type == lp_dram) || (ram_cell_tech_type == comm_dram));
|
||||
init_tech_params(g_ip->F_sz_um, is_tag);
|
||||
|
||||
for (uint32_t t = 0; t < nthreads; t++)
|
||||
{
|
||||
calc_array[t].is_tag = is_tag;
|
||||
calc_array[t].is_main_mem = g_ip->is_main_mem;
|
||||
if (!(pure_cam||g_ip->fully_assoc))
|
||||
{
|
||||
calc_array[t].Nspd_min = (double)(g_ip->out_w)/(double)(g_ip->block_sz*8);
|
||||
}
|
||||
else
|
||||
{
|
||||
calc_array[t].Nspd_min = 1;
|
||||
}
|
||||
|
||||
pthread_create(&threads[t], NULL, calc_time_mt_wrapper, (void *)(&(calc_array[t])));
|
||||
}
|
||||
|
||||
for (uint32_t t = 0; t < nthreads; t++)
|
||||
{
|
||||
pthread_join(threads[t], NULL);
|
||||
}
|
||||
|
||||
data_arr.clear();
|
||||
for (uint32_t t = 0; t < nthreads; t++)
|
||||
{
|
||||
calc_array[t].data_arr.sort(mem_array::lt);
|
||||
data_arr.merge(calc_array[t].data_arr, mem_array::lt);
|
||||
}
|
||||
// }
|
||||
|
||||
|
||||
min_values_t * d_min = new min_values_t();
|
||||
min_values_t * t_min = new min_values_t();
|
||||
min_values_t * cache_min = new min_values_t();
|
||||
|
||||
for (uint32_t t = 0; t < nthreads; t++)
|
||||
{
|
||||
d_min->update_min_values(calc_array[t].data_res);
|
||||
t_min->update_min_values(calc_array[t].tag_res);
|
||||
}
|
||||
|
||||
for (miter = data_arr.begin(); miter != data_arr.end(); miter++)
|
||||
{
|
||||
(*miter)->arr_min = d_min;
|
||||
}
|
||||
|
||||
|
||||
//cout << data_arr.size() << "\t" << tag_arr.size() <<" before\n";
|
||||
filter_data_arr(data_arr);
|
||||
if(!(pure_ram||pure_cam||g_ip->fully_assoc))
|
||||
{
|
||||
filter_tag_arr(t_min, tag_arr);
|
||||
}
|
||||
//cout << data_arr.size() << "\t" << tag_arr.size() <<" after\n";
|
||||
|
||||
|
||||
if (pure_ram||pure_cam||g_ip->fully_assoc)
|
||||
{
|
||||
for (miter = data_arr.begin(); miter != data_arr.end(); miter++)
|
||||
{
|
||||
uca_org_t & curr_org = sol_list.back();
|
||||
curr_org.tag_array2 = NULL;
|
||||
curr_org.data_array2 = (*miter);
|
||||
|
||||
curr_org.find_delay();
|
||||
curr_org.find_energy();
|
||||
curr_org.find_area();
|
||||
curr_org.find_cyc();
|
||||
|
||||
//update min values for the entire cache
|
||||
cache_min->update_min_values(curr_org);
|
||||
|
||||
sol_list.push_back(uca_org_t());
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
while (tag_arr.empty() != true)
|
||||
{
|
||||
mem_array * arr_temp = (tag_arr.back());
|
||||
//delete tag_arr.back();
|
||||
tag_arr.pop_back();
|
||||
|
||||
for (miter = data_arr.begin(); miter != data_arr.end(); miter++)
|
||||
{
|
||||
uca_org_t & curr_org = sol_list.back();
|
||||
curr_org.tag_array2 = arr_temp;
|
||||
curr_org.data_array2 = (*miter);
|
||||
|
||||
curr_org.find_delay();
|
||||
curr_org.find_energy();
|
||||
curr_org.find_area();
|
||||
curr_org.find_cyc();
|
||||
|
||||
//update min values for the entire cache
|
||||
cache_min->update_min_values(curr_org);
|
||||
|
||||
sol_list.push_back(uca_org_t());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sol_list.pop_back();
|
||||
|
||||
find_optimal_uca(fin_res, cache_min, sol_list);
|
||||
|
||||
sol_list.clear();
|
||||
|
||||
for (miter = data_arr.begin(); miter != data_arr.end(); ++miter)
|
||||
{
|
||||
if (*miter != fin_res->data_array2)
|
||||
{
|
||||
delete *miter;
|
||||
}
|
||||
}
|
||||
data_arr.clear();
|
||||
|
||||
for (uint32_t t = 0; t < nthreads; t++)
|
||||
{
|
||||
delete calc_array[t].data_res;
|
||||
delete calc_array[t].tag_res;
|
||||
}
|
||||
|
||||
delete [] calc_array;
|
||||
delete cache_min;
|
||||
delete d_min;
|
||||
delete t_min;
|
||||
}
|
||||
|
||||
/*
 * Recomputes the power figures of the arrays referenced by *fin_res.
 *
 * For the tag array (when fin_res->tag_array2 is set) and then for the
 * data array, the technology parameters are re-initialized, a UCA model is
 * rebuilt from the array's stored partitioning (Nspd, Ndwl, Ndbl, Ndcm,
 * Ndsam_lev_1, Ndsam_lev_2) and its power is copied into the array.
 * fin_res->find_energy() is called afterwards.
 *
 * Terminates the process with status 1 when a stored partitioning no
 * longer yields valid dynamic parameters.
 */
void update(uca_org_t *fin_res)
{
    if (fin_res->tag_array2)
    {
        init_tech_params(g_ip->F_sz_um, true);
        DynamicParameter tag_arr_dyn_p(true, g_ip->pure_ram, g_ip->pure_cam,
                                       fin_res->tag_array2->Nspd,
                                       fin_res->tag_array2->Ndwl,
                                       fin_res->tag_array2->Ndbl,
                                       fin_res->tag_array2->Ndcm,
                                       fin_res->tag_array2->Ndsam_lev_1,
                                       fin_res->tag_array2->Ndsam_lev_2,
                                       g_ip->is_main_mem);
        if (!tag_arr_dyn_p.is_valid)
        {
            cout << "ERROR: Cannot retrieve array structure for leakage feedback" << endl;
            exit(1);
        }

        UCA * tag_arr = new UCA(tag_arr_dyn_p);
        fin_res->tag_array2->power = tag_arr->power;
        delete tag_arr;     // only the power numbers are needed
    }

    init_tech_params(g_ip->F_sz_um, false);
    DynamicParameter data_arr_dyn_p(false, g_ip->pure_ram, g_ip->pure_cam,
                                    fin_res->data_array2->Nspd,
                                    fin_res->data_array2->Ndwl,
                                    fin_res->data_array2->Ndbl,
                                    fin_res->data_array2->Ndcm,
                                    fin_res->data_array2->Ndsam_lev_1,
                                    fin_res->data_array2->Ndsam_lev_2,
                                    g_ip->is_main_mem);
    if (!data_arr_dyn_p.is_valid)
    {
        cout << "ERROR: Cannot retrieve array structure for leakage feedback" << endl;
        exit(1);
    }

    UCA * data_arr = new UCA(data_arr_dyn_p);
    fin_res->data_array2->power = data_arr->power;
    delete data_arr;        // only the power numbers are needed

    fin_res->find_energy();
}
|
||||
|
115
ext/mcpat/cacti/Ucache.h
Normal file
115
ext/mcpat/cacti/Ucache.h
Normal file
|
@ -0,0 +1,115 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
#ifndef __UCACHE_H__
|
||||
#define __UCACHE_H__
|
||||
|
||||
#include <list>
|
||||
|
||||
#include "area.h"
|
||||
#include "nuca.h"
|
||||
#include "router.h"
|
||||
|
||||
class min_values_t
|
||||
{
|
||||
public:
|
||||
double min_delay;
|
||||
double min_dyn;
|
||||
double min_leakage;
|
||||
double min_area;
|
||||
double min_cyc;
|
||||
|
||||
min_values_t() : min_delay(BIGNUM), min_dyn(BIGNUM), min_leakage(BIGNUM), min_area(BIGNUM), min_cyc(BIGNUM) { }
|
||||
|
||||
void update_min_values(const min_values_t * val);
|
||||
void update_min_values(const uca_org_t & res);
|
||||
void update_min_values(const nuca_org_t * res);
|
||||
void update_min_values(const mem_array * res);
|
||||
};
|
||||
|
||||
|
||||
|
||||
struct solution
|
||||
{
|
||||
int tag_array_index;
|
||||
int data_array_index;
|
||||
list<mem_array *>::iterator tag_array_iter;
|
||||
list<mem_array *>::iterator data_array_iter;
|
||||
double access_time;
|
||||
double cycle_time;
|
||||
double area;
|
||||
double efficiency;
|
||||
powerDef total_power;
|
||||
};
|
||||
|
||||
|
||||
|
||||
bool calculate_time(
|
||||
bool is_tag,
|
||||
int pure_ram,
|
||||
bool pure_cam,
|
||||
double Nspd,
|
||||
unsigned int Ndwl,
|
||||
unsigned int Ndbl,
|
||||
unsigned int Ndcm,
|
||||
unsigned int Ndsam_lev_1,
|
||||
unsigned int Ndsam_lev_2,
|
||||
mem_array *ptr_array,
|
||||
int flag_results_populate,
|
||||
results_mem_array *ptr_results,
|
||||
uca_org_t *ptr_fin_res,
|
||||
bool is_main_mem);
|
||||
void update(uca_org_t *fin_res);
|
||||
|
||||
void solve(uca_org_t *fin_res);
|
||||
void init_tech_params(double tech, bool is_tag);
|
||||
|
||||
|
||||
struct calc_time_mt_wrapper_struct
|
||||
{
|
||||
uint32_t tid;
|
||||
bool is_tag;
|
||||
bool pure_ram;
|
||||
bool pure_cam;
|
||||
bool is_main_mem;
|
||||
double Nspd_min;
|
||||
|
||||
min_values_t * data_res;
|
||||
min_values_t * tag_res;
|
||||
|
||||
list<mem_array *> data_arr;
|
||||
list<mem_array *> tag_arr;
|
||||
};
|
||||
|
||||
void *calc_time_mt_wrapper(void * void_obj);
|
||||
|
||||
#endif
|
130
ext/mcpat/cacti/arbiter.cc
Normal file
130
ext/mcpat/cacti/arbiter.cc
Normal file
|
@ -0,0 +1,130 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
#include "arbiter.h"
|
||||
|
||||
/*
 * Builds an arbiter model for `n_req` requesters.
 *
 * n_req      - number of requesters (stored in R)
 * flit_size_ - flit size, reported in bits by print_arbiter()
 * output_len - length fed to the control-line wire model in
 *              crossbar_ctrline(), where it is scaled by 1e-6
 * dt         - device type supplying Vdd and the N-to-P drive ratio
 *
 * All transistor sizes are derived from the feature size g_ip->F_sz_um.
 */
Arbiter::Arbiter(
    double n_req,
    double flit_size_,
    double output_len,
    TechnologyParameter::DeviceType *dt)
    : R(n_req),
      flit_size(flit_size_),
      o_len(output_len),
      deviceType(dt)
{
    const double feature = g_ip->F_sz_um;

    Vdd = deviceType->Vdd;
    min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio * g_tp.min_w_nmos_;

    // Both NOR stages use identical device sizes.
    NTn1 = 13.5 * feature / 2;
    PTn1 = 76 * feature / 2;
    NTn2 = 13.5 * feature / 2;
    PTn2 = 76 * feature / 2;

    // Inverter
    NTi = 12.5 * feature / 2;
    PTi = 25 * feature / 2;

    // Transmission gate (NMOS, then PMOS)
    NTtr = 10 * feature / 2;
    PTtr = 20 * feature / 2;
}

Arbiter::~Arbiter()
{
}
|
||||
|
||||
double
|
||||
Arbiter::arb_req() {
|
||||
double temp = ((R-1)*(2*gate_C(NTn1, 0)+gate_C(PTn1, 0)) + 2*gate_C(NTn2, 0) +
|
||||
gate_C(PTn2, 0) + gate_C(NTi, 0) + gate_C(PTi, 0) +
|
||||
drain_C_(NTi, 0, 1, 1, g_tp.cell_h_def) + drain_C_(PTi, 1, 1, 1, g_tp.cell_h_def));
|
||||
return temp;
|
||||
}
|
||||
|
||||
double
|
||||
Arbiter::arb_pri() {
|
||||
double temp = 2*(2*gate_C(NTn1, 0)+gate_C(PTn1, 0)); /* switching capacitance
|
||||
of flip-flop is ignored */
|
||||
return temp;
|
||||
}
|
||||
|
||||
|
||||
double
|
||||
Arbiter::arb_grant() {
|
||||
double temp = drain_C_(NTn1, 0, 1, 1, g_tp.cell_h_def)*2 + drain_C_(PTn1, 1, 1, 1, g_tp.cell_h_def) + crossbar_ctrline();
|
||||
return temp;
|
||||
}
|
||||
|
||||
double
|
||||
Arbiter::arb_int() {
|
||||
double temp = (drain_C_(NTn1, 0, 1, 1, g_tp.cell_h_def)*2 + drain_C_(PTn1, 1, 1, 1, g_tp.cell_h_def) +
|
||||
2*gate_C(NTn2, 0) + gate_C(PTn2, 0));
|
||||
return temp;
|
||||
}
|
||||
|
||||
void
|
||||
Arbiter::compute_power() {
|
||||
power.readOp.dynamic = (R*arb_req()*Vdd*Vdd/2 + R*arb_pri()*Vdd*Vdd/2 +
|
||||
arb_grant()*Vdd*Vdd + arb_int()*0.5*Vdd*Vdd);
|
||||
double nor1_leak = cmos_Isub_leakage(g_tp.min_w_nmos_*NTn1*2, min_w_pmos * PTn1*2, 2, nor);
|
||||
double nor2_leak = cmos_Isub_leakage(g_tp.min_w_nmos_*NTn2*R, min_w_pmos * PTn2*R, 2, nor);
|
||||
double not_leak = cmos_Isub_leakage(g_tp.min_w_nmos_*NTi, min_w_pmos * PTi, 1, inv);
|
||||
double nor1_leak_gate = cmos_Ig_leakage(g_tp.min_w_nmos_*NTn1*2, min_w_pmos * PTn1*2, 2, nor);
|
||||
double nor2_leak_gate = cmos_Ig_leakage(g_tp.min_w_nmos_*NTn2*R, min_w_pmos * PTn2*R, 2, nor);
|
||||
double not_leak_gate = cmos_Ig_leakage(g_tp.min_w_nmos_*NTi, min_w_pmos * PTi, 1, inv);
|
||||
power.readOp.leakage = (nor1_leak + nor2_leak + not_leak)*Vdd; //FIXME include priority table leakage
|
||||
power.readOp.gate_leakage = nor1_leak_gate*Vdd + nor2_leak_gate*Vdd + not_leak_gate*Vdd;
|
||||
}
|
||||
|
||||
double //wire cap with triple spacing
|
||||
Arbiter::Cw3(double length) {
|
||||
Wire wc(g_ip->wt, length, 1, 3, 3);
|
||||
double temp = (wc.wire_cap(length,true));
|
||||
return temp;
|
||||
}
|
||||
|
||||
double
|
||||
Arbiter::crossbar_ctrline() {
|
||||
double temp = (Cw3(o_len * 1e-6 /* m */) +
|
||||
drain_C_(NTi, 0, 1, 1, g_tp.cell_h_def) + drain_C_(PTi, 1, 1, 1, g_tp.cell_h_def) +
|
||||
gate_C(NTi, 0) + gate_C(PTi, 0));
|
||||
return temp;
|
||||
}
|
||||
|
||||
double
|
||||
Arbiter::transmission_buf_ctrcap() {
|
||||
double temp = gate_C(NTtr, 0)+gate_C(PTtr, 0);
|
||||
return temp;
|
||||
}
|
||||
|
||||
|
||||
void Arbiter::print_arbiter()
|
||||
{
|
||||
cout << "\nArbiter Stats (" << R << " input arbiter" << ")\n\n";
|
||||
cout << "Flit size : " << flit_size << " bits" << endl;
|
||||
cout << "Dynamic Power : " << power.readOp.dynamic*1e9 << " (nJ)" << endl;
|
||||
cout << "Leakage Power : " << power.readOp.leakage*1e3 << " (mW)" << endl;
|
||||
}
|
||||
|
||||
|
79
ext/mcpat/cacti/arbiter.h
Normal file
79
ext/mcpat/cacti/arbiter.h
Normal file
|
@ -0,0 +1,79 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef __ARBITER__
|
||||
#define __ARBITER__
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "basic_circuit.h"
|
||||
#include "cacti_interface.h"
|
||||
#include "component.h"
|
||||
#include "mat.h"
|
||||
#include "parameter.h"
|
||||
#include "wire.h"
|
||||
|
||||
class Arbiter : public Component
|
||||
{
|
||||
public:
|
||||
Arbiter(
|
||||
double Req,
|
||||
double flit_sz,
|
||||
double output_len,
|
||||
TechnologyParameter::DeviceType *dt = &(g_tp.peri_global));
|
||||
~Arbiter();
|
||||
|
||||
void print_arbiter();
|
||||
double arb_req();
|
||||
double arb_pri();
|
||||
double arb_grant();
|
||||
double arb_int();
|
||||
void compute_power();
|
||||
double Cw3(double len);
|
||||
double crossbar_ctrline();
|
||||
double transmission_buf_ctrcap();
|
||||
|
||||
|
||||
|
||||
private:
|
||||
double NTn1, PTn1, NTn2, PTn2, R, PTi, NTi;
|
||||
double flit_size;
|
||||
double NTtr, PTtr;
|
||||
double o_len;
|
||||
TechnologyParameter::DeviceType *deviceType;
|
||||
double TriS1, TriS2;
|
||||
double min_w_pmos, Vdd;
|
||||
|
||||
};
|
||||
|
||||
#endif
|
47
ext/mcpat/cacti/area.cc
Normal file
47
ext/mcpat/cacti/area.cc
Normal file
|
@ -0,0 +1,47 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
|
||||
#include "area.h"
|
||||
#include "basic_circuit.h"
|
||||
#include "component.h"
|
||||
#include "decoder.h"
|
||||
#include "parameter.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
|
71
ext/mcpat/cacti/area.h
Normal file
71
ext/mcpat/cacti/area.h
Normal file
|
@ -0,0 +1,71 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
|
||||
#ifndef __AREA_H__
|
||||
#define __AREA_H__
|
||||
|
||||
#include "basic_circuit.h"
|
||||
#include "cacti_interface.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
/*
 * Rectangle-or-scalar area holder. The area is reported as w*h unless
 * both dimensions are zero, in which case the explicitly stored value
 * (set_area) is returned instead.
 */
class Area
{
  public:
    double w;
    double h;

    Area() : w(0), h(0), area(0) { }

    double get_w() const { return w; }
    double get_h() const { return h; }

    // Stored scalar when no dimensions were given, otherwise w*h.
    double get_area() const
    {
        const bool no_dimensions = (w == 0 && h == 0);
        return no_dimensions ? area : w*h;
    }

    void set_w(double w_) { w = w_; }
    void set_h(double h_) { h = h_; }
    void set_area(double a_) { area = a_; }

  private:
    double area;
};
|
||||
|
||||
#endif
|
||||
|
198
ext/mcpat/cacti/bank.cc
Executable file
198
ext/mcpat/cacti/bank.cc
Executable file
|
@ -0,0 +1,198 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "bank.h"
|
||||
|
||||
/*
 * Builds a bank from the given dynamic parameters: constructs the mat and
 * the H-tree networks that connect to it, then derives the bank area and the
 * address-bit split between row decode and read/write.
 *
 * The address, data-in and data-out H-trees are always built. The search-in
 * and search-out H-trees are only built for fully-associative or pure-CAM
 * banks; for every other bank those two pointers stay null.
 *
 * Port counts come from dyn_p when dyn_p.use_inp_params is set, otherwise
 * from the global input parameters (g_ip).
 */
Bank::Bank(const DynamicParameter & dyn_p):
    dp(dyn_p), mat(dp),
    // All H-tree pointers start out null so that no member is ever left
    // indeterminate (the search trees are not allocated for plain RAM banks).
    htree_in_add(0), htree_in_data(0), htree_out_data(0),
    htree_in_search(0), htree_out_search(0),
    num_addr_b_mat(dyn_p.number_addr_bits_mat),
    num_mats_hor_dir(dyn_p.num_mats_h_dir), num_mats_ver_dir(dyn_p.num_mats_v_dir)
{
    int RWP;
    int ERP;
    int EWP;
    int SCHP;

    if (dp.use_inp_params)
    {
        RWP  = dp.num_rw_ports;
        ERP  = dp.num_rd_ports;
        EWP  = dp.num_wr_ports;
        SCHP = dp.num_search_ports;
    }
    else
    {
        RWP  = g_ip->num_rw_ports;
        ERP  = g_ip->num_rd_ports;
        EWP  = g_ip->num_wr_ports;
        SCHP = g_ip->num_search_ports;
    }

    const bool has_search_path = dp.fully_assoc || dp.pure_cam;

    int total_addrbits = (dp.number_addr_bits_mat + dp.number_subbanks_decode)*(RWP+ERP+EWP);
    int datainbits     = dp.num_di_b_bank_per_port * (RWP + EWP);
    int dataoutbits    = dp.num_do_b_bank_per_port * (RWP + ERP);
    // Search widths are zero unless the bank has a search path.
    int searchinbits   = 0;
    int searchoutbits  = 0;

    if (has_search_path)
    {
        searchinbits  = dp.num_si_b_bank_per_port * SCHP;
        searchoutbits = dp.num_so_b_bank_per_port * SCHP;
    }
    else if (g_ip->fast_access && dp.is_tag == false)
    {
        // Fast-access data arrays route data_assoc times as many output bits.
        dataoutbits *= g_ip->data_assoc;
    }

    // The three H-trees every bank has. For banks without a search path the
    // search widths passed here are 0.
    htree_in_add   = new Htree2 (g_ip->wt, (double) mat.area.w, (double) mat.area.h,
                                 total_addrbits, datainbits, searchinbits, dataoutbits, searchoutbits,
                                 num_mats_ver_dir*2, num_mats_hor_dir*2, Add_htree);
    htree_in_data  = new Htree2 (g_ip->wt, (double) mat.area.w, (double) mat.area.h,
                                 total_addrbits, datainbits, searchinbits, dataoutbits, searchoutbits,
                                 num_mats_ver_dir*2, num_mats_hor_dir*2, Data_in_htree);
    htree_out_data = new Htree2 (g_ip->wt, (double) mat.area.w, (double) mat.area.h,
                                 total_addrbits, datainbits, searchinbits, dataoutbits, searchoutbits,
                                 num_mats_ver_dir*2, num_mats_hor_dir*2, Data_out_htree);

    if (has_search_path)
    {
        htree_in_search  = new Htree2 (g_ip->wt, (double) mat.area.w, (double) mat.area.h,
                                       total_addrbits, datainbits, searchinbits, dataoutbits, searchoutbits,
                                       num_mats_ver_dir*2, num_mats_hor_dir*2, Data_in_htree, true, true);
        htree_out_search = new Htree2 (g_ip->wt, (double) mat.area.w, (double) mat.area.h,
                                       total_addrbits, datainbits, searchinbits, dataoutbits, searchoutbits,
                                       num_mats_ver_dir*2, num_mats_hor_dir*2, Data_out_htree, true);
    }

    // The bank footprint is taken from the data-in H-tree.
    area.w = htree_in_data->area.w;
    area.h = htree_in_data->area.h;

    num_addr_b_row_dec                    = _log2(mat.subarray.num_rows);
    num_addr_b_routed_to_mat_for_act      = num_addr_b_row_dec;
    num_addr_b_routed_to_mat_for_rd_or_wr = num_addr_b_mat - num_addr_b_row_dec;
}
|
||||
|
||||
|
||||
|
||||
/*
 * Releases the H-trees allocated by the constructor. The search H-trees are
 * only allocated for fully-associative / pure-CAM banks, so they are only
 * released in that case.
 */
Bank::~Bank()
{
    delete htree_in_add;
    delete htree_out_data;
    delete htree_in_data;

    const bool owns_search_trees = dp.fully_assoc || dp.pure_cam;
    if (!owns_search_trees)
    {
        return;
    }

    delete htree_in_search;
    delete htree_out_search;
}
|
||||
|
||||
|
||||
|
||||
double Bank::compute_delays(double inrisetime)
|
||||
{
|
||||
return mat.compute_delays(inrisetime);
|
||||
}
|
||||
|
||||
|
||||
|
||||
void Bank::compute_power_energy()
|
||||
{
|
||||
mat.compute_power_energy();
|
||||
|
||||
if (!(dp.fully_assoc || dp.pure_cam))
|
||||
{
|
||||
power.readOp.dynamic += mat.power.readOp.dynamic * dp.num_act_mats_hor_dir;
|
||||
power.readOp.leakage += mat.power.readOp.leakage * dp.num_mats;
|
||||
power.readOp.gate_leakage += mat.power.readOp.gate_leakage * dp.num_mats;
|
||||
|
||||
power.readOp.dynamic += htree_in_add->power.readOp.dynamic;
|
||||
power.readOp.dynamic += htree_out_data->power.readOp.dynamic;
|
||||
|
||||
power.readOp.leakage += htree_in_add->power.readOp.leakage;
|
||||
power.readOp.leakage += htree_in_data->power.readOp.leakage;
|
||||
power.readOp.leakage += htree_out_data->power.readOp.leakage;
|
||||
power.readOp.gate_leakage += htree_in_add->power.readOp.gate_leakage;
|
||||
power.readOp.gate_leakage += htree_in_data->power.readOp.gate_leakage;
|
||||
power.readOp.gate_leakage += htree_out_data->power.readOp.gate_leakage;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
power.readOp.dynamic += mat.power.readOp.dynamic ;//for fa and cam num_act_mats_hor_dir is 1 for plain r/w
|
||||
power.readOp.leakage += mat.power.readOp.leakage * dp.num_mats;
|
||||
power.readOp.gate_leakage += mat.power.readOp.gate_leakage * dp.num_mats;
|
||||
|
||||
power.searchOp.dynamic += mat.power.searchOp.dynamic * dp.num_mats;
|
||||
power.searchOp.dynamic += mat.power_bl_precharge_eq_drv.searchOp.dynamic +
|
||||
mat.power_sa.searchOp.dynamic +
|
||||
mat.power_bitline.searchOp.dynamic +
|
||||
mat.power_subarray_out_drv.searchOp.dynamic+
|
||||
mat.ml_to_ram_wl_drv->power.readOp.dynamic;
|
||||
|
||||
power.readOp.dynamic += htree_in_add->power.readOp.dynamic;
|
||||
power.readOp.dynamic += htree_out_data->power.readOp.dynamic;
|
||||
|
||||
power.searchOp.dynamic += htree_in_search->power.searchOp.dynamic;
|
||||
power.searchOp.dynamic += htree_out_search->power.searchOp.dynamic;
|
||||
|
||||
power.readOp.leakage += htree_in_add->power.readOp.leakage;
|
||||
power.readOp.leakage += htree_in_data->power.readOp.leakage;
|
||||
power.readOp.leakage += htree_out_data->power.readOp.leakage;
|
||||
power.readOp.leakage += htree_in_search->power.readOp.leakage;
|
||||
power.readOp.leakage += htree_out_search->power.readOp.leakage;
|
||||
|
||||
|
||||
power.readOp.gate_leakage += htree_in_add->power.readOp.gate_leakage;
|
||||
power.readOp.gate_leakage += htree_in_data->power.readOp.gate_leakage;
|
||||
power.readOp.gate_leakage += htree_out_data->power.readOp.gate_leakage;
|
||||
power.readOp.gate_leakage += htree_in_search->power.readOp.gate_leakage;
|
||||
power.readOp.gate_leakage += htree_out_search->power.readOp.gate_leakage;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
69
ext/mcpat/cacti/bank.h
Executable file
69
ext/mcpat/cacti/bank.h
Executable file
|
@ -0,0 +1,69 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
|
||||
#ifndef __BANK_H__
|
||||
#define __BANK_H__
|
||||
|
||||
#include "component.h"
|
||||
#include "decoder.h"
|
||||
#include "htree2.h"
|
||||
#include "mat.h"
|
||||
|
||||
class Bank : public Component
|
||||
{
|
||||
public:
|
||||
Bank(const DynamicParameter & dyn_p);
|
||||
~Bank();
|
||||
double compute_delays(double inrisetime); // return outrisetime
|
||||
void compute_power_energy();
|
||||
|
||||
const DynamicParameter & dp;
|
||||
Mat mat;
|
||||
Htree2 *htree_in_add;
|
||||
Htree2 *htree_in_data;
|
||||
Htree2 *htree_out_data;
|
||||
Htree2 *htree_in_search;
|
||||
Htree2 *htree_out_search;
|
||||
|
||||
int num_addr_b_mat;
|
||||
int num_mats_hor_dir;
|
||||
int num_mats_ver_dir;
|
||||
|
||||
int num_addr_b_row_dec;
|
||||
int num_addr_b_routed_to_mat_for_act;
|
||||
int num_addr_b_routed_to_mat_for_rd_or_wr;
|
||||
};
|
||||
|
||||
|
||||
|
||||
#endif
|
829
ext/mcpat/cacti/basic_circuit.cc
Normal file
829
ext/mcpat/cacti/basic_circuit.cc
Normal file
|
@ -0,0 +1,829 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
|
||||
#include "basic_circuit.h"
|
||||
#include "parameter.h"
|
||||
|
||||
/*
 * Integer base-2 logarithm, rounded down: the number of right shifts needed
 * to reduce num to 1.
 *
 * A zero argument is treated as fatal: a message is written to stderr and
 * the process exits with status 1.
 */
uint32_t _log2(uint64_t num)
{
    if (num == 0)
    {
        std::cerr << "log0?" << std::endl;
        exit(1);
    }

    uint32_t shifts = 0;
    for (uint64_t remaining = num; remaining > 1; remaining >>= 1)
    {
        ++shifts;
    }
    return shifts;
}
|
||||
|
||||
|
||||
/*
 * Returns true when val is a positive power of two (1, 2, 4, ...), false for
 * zero, negative values and everything else.
 *
 * A positive power of two has exactly one bit set, so clearing its lowest set
 * bit (val & (val - 1)) leaves zero.
 */
bool is_pow2(int64_t val)
{
    if (val <= 0)
    {
        return false;
    }
    return (val & (val - 1)) == 0;
}
|
||||
|
||||
|
||||
/*
 * Integer power: base multiplied by itself n times.
 * Returns 1 when n is zero or negative.
 */
int powers (int base, int n)
{
    int result = 1;
    for (int remaining = n; remaining > 0; --remaining)
    {
        result *= base;
    }
    return result;
}
|
||||
|
||||
/*----------------------------------------------------------------------*/
|
||||
|
||||
/*
 * Base-2 logarithm of x, computed as ln(x) / ln(2).
 * x must be strictly positive (checked with assert).
 */
double logtwo (double x)
{
    assert(x > 0);
    const double ln_x   = log (x);
    const double ln_two = log (2.0);
    return ((double) (ln_x / ln_two));
}
|
||||
|
||||
/*----------------------------------------------------------------------*/
|
||||
|
||||
|
||||
double gate_C(
|
||||
double width,
|
||||
double wirelength,
|
||||
bool _is_dram,
|
||||
bool _is_cell,
|
||||
bool _is_wl_tr)
|
||||
{
|
||||
const TechnologyParameter::DeviceType * dt;
|
||||
|
||||
if (_is_dram && _is_cell)
|
||||
{
|
||||
dt = &g_tp.dram_acc; //DRAM cell access transistor
|
||||
}
|
||||
else if (_is_dram && _is_wl_tr)
|
||||
{
|
||||
dt = &g_tp.dram_wl; //DRAM wordline transistor
|
||||
}
|
||||
else if (!_is_dram && _is_cell)
|
||||
{
|
||||
dt = &g_tp.sram_cell; // SRAM cell access transistor
|
||||
}
|
||||
else
|
||||
{
|
||||
dt = &g_tp.peri_global;
|
||||
}
|
||||
|
||||
return (dt->C_g_ideal + dt->C_overlap + 3*dt->C_fringe)*width + dt->l_phy*Cpolywire;
|
||||
}
|
||||
|
||||
|
||||
// returns gate capacitance in Farads
|
||||
// actually this function is the same as gate_C() now
|
||||
double gate_C_pass(
|
||||
double width, // gate width in um (length is Lphy_periph_global)
|
||||
double wirelength, // poly wire length going to gate in lambda
|
||||
bool _is_dram,
|
||||
bool _is_cell,
|
||||
bool _is_wl_tr)
|
||||
{
|
||||
// v5.0
|
||||
const TechnologyParameter::DeviceType * dt;
|
||||
|
||||
if ((_is_dram) && (_is_cell))
|
||||
{
|
||||
dt = &g_tp.dram_acc; //DRAM cell access transistor
|
||||
}
|
||||
else if ((_is_dram) && (_is_wl_tr))
|
||||
{
|
||||
dt = &g_tp.dram_wl; //DRAM wordline transistor
|
||||
}
|
||||
else if ((!_is_dram) && _is_cell)
|
||||
{
|
||||
dt = &g_tp.sram_cell; // SRAM cell access transistor
|
||||
}
|
||||
else
|
||||
{
|
||||
dt = &g_tp.peri_global;
|
||||
}
|
||||
|
||||
return (dt->C_g_ideal + dt->C_overlap + 3*dt->C_fringe)*width + dt->l_phy*Cpolywire;
|
||||
}
|
||||
|
||||
|
||||
|
||||
double drain_C_(
|
||||
double width,
|
||||
int nchannel,
|
||||
int stack,
|
||||
int next_arg_thresh_folding_width_or_height_cell,
|
||||
double fold_dimension,
|
||||
bool _is_dram,
|
||||
bool _is_cell,
|
||||
bool _is_wl_tr)
|
||||
{
|
||||
double w_folded_tr;
|
||||
const TechnologyParameter::DeviceType * dt;
|
||||
|
||||
if ((_is_dram) && (_is_cell))
|
||||
{
|
||||
dt = &g_tp.dram_acc; // DRAM cell access transistor
|
||||
}
|
||||
else if ((_is_dram) && (_is_wl_tr))
|
||||
{
|
||||
dt = &g_tp.dram_wl; // DRAM wordline transistor
|
||||
}
|
||||
else if ((!_is_dram) && _is_cell)
|
||||
{
|
||||
dt = &g_tp.sram_cell; // SRAM cell access transistor
|
||||
}
|
||||
else
|
||||
{
|
||||
dt = &g_tp.peri_global;
|
||||
}
|
||||
|
||||
double c_junc_area = dt->C_junc;
|
||||
double c_junc_sidewall = dt->C_junc_sidewall;
|
||||
double c_fringe = 2*dt->C_fringe;
|
||||
double c_overlap = 2*dt->C_overlap;
|
||||
double drain_C_metal_connecting_folded_tr = 0;
|
||||
|
||||
// determine the width of the transistor after folding (if it is getting folded)
|
||||
if (next_arg_thresh_folding_width_or_height_cell == 0)
|
||||
{ // interpret fold_dimension as the the folding width threshold
|
||||
// i.e. the value of transistor width above which the transistor gets folded
|
||||
w_folded_tr = fold_dimension;
|
||||
}
|
||||
else
|
||||
{ // interpret fold_dimension as the height of the cell that this transistor is part of.
|
||||
double h_tr_region = fold_dimension - 2 * g_tp.HPOWERRAIL;
|
||||
// TODO : w_folded_tr must come from Component::compute_gate_area()
|
||||
double ratio_p_to_n = 2.0 / (2.0 + 1.0);
|
||||
if (nchannel)
|
||||
{
|
||||
w_folded_tr = (1 - ratio_p_to_n) * (h_tr_region - g_tp.MIN_GAP_BET_P_AND_N_DIFFS);
|
||||
}
|
||||
else
|
||||
{
|
||||
w_folded_tr = ratio_p_to_n * (h_tr_region - g_tp.MIN_GAP_BET_P_AND_N_DIFFS);
|
||||
}
|
||||
}
|
||||
int num_folded_tr = (int) (ceil(width / w_folded_tr));
|
||||
|
||||
if (num_folded_tr < 2)
|
||||
{
|
||||
w_folded_tr = width;
|
||||
}
|
||||
|
||||
double total_drain_w = (g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact) + // only for drain
|
||||
(stack - 1) * g_tp.spacing_poly_to_poly;
|
||||
double drain_h_for_sidewall = w_folded_tr;
|
||||
double total_drain_height_for_cap_wrt_gate = w_folded_tr + 2 * w_folded_tr * (stack - 1);
|
||||
if (num_folded_tr > 1)
|
||||
{
|
||||
total_drain_w += (num_folded_tr - 2) * (g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact) +
|
||||
(num_folded_tr - 1) * ((stack - 1) * g_tp.spacing_poly_to_poly);
|
||||
|
||||
if (num_folded_tr%2 == 0)
|
||||
{
|
||||
drain_h_for_sidewall = 0;
|
||||
}
|
||||
total_drain_height_for_cap_wrt_gate *= num_folded_tr;
|
||||
drain_C_metal_connecting_folded_tr = g_tp.wire_local.C_per_um * total_drain_w;
|
||||
}
|
||||
|
||||
double drain_C_area = c_junc_area * total_drain_w * w_folded_tr;
|
||||
double drain_C_sidewall = c_junc_sidewall * (drain_h_for_sidewall + 2 * total_drain_w);
|
||||
double drain_C_wrt_gate = (c_fringe + c_overlap) * total_drain_height_for_cap_wrt_gate;
|
||||
|
||||
return (drain_C_area + drain_C_sidewall + drain_C_wrt_gate + drain_C_metal_connecting_folded_tr);
|
||||
}
|
||||
|
||||
|
||||
double tr_R_on(
|
||||
double width,
|
||||
int nchannel,
|
||||
int stack,
|
||||
bool _is_dram,
|
||||
bool _is_cell,
|
||||
bool _is_wl_tr)
|
||||
{
|
||||
const TechnologyParameter::DeviceType * dt;
|
||||
|
||||
if ((_is_dram) && (_is_cell))
|
||||
{
|
||||
dt = &g_tp.dram_acc; //DRAM cell access transistor
|
||||
}
|
||||
else if ((_is_dram) && (_is_wl_tr))
|
||||
{
|
||||
dt = &g_tp.dram_wl; //DRAM wordline transistor
|
||||
}
|
||||
else if ((!_is_dram) && _is_cell)
|
||||
{
|
||||
dt = &g_tp.sram_cell; // SRAM cell access transistor
|
||||
}
|
||||
else
|
||||
{
|
||||
dt = &g_tp.peri_global;
|
||||
}
|
||||
|
||||
double restrans = (nchannel) ? dt->R_nch_on : dt->R_pch_on;
|
||||
return (stack * restrans / width);
|
||||
}
|
||||
|
||||
|
||||
/* This routine operates in reverse: given a resistance, it finds
|
||||
* the transistor width that would have this R. It is used in the
|
||||
* data wordline to estimate the wordline driver size. */
|
||||
|
||||
// returns width in um
|
||||
double R_to_w(
|
||||
double res,
|
||||
int nchannel,
|
||||
bool _is_dram,
|
||||
bool _is_cell,
|
||||
bool _is_wl_tr)
|
||||
{
|
||||
const TechnologyParameter::DeviceType * dt;
|
||||
|
||||
if ((_is_dram) && (_is_cell))
|
||||
{
|
||||
dt = &g_tp.dram_acc; //DRAM cell access transistor
|
||||
}
|
||||
else if ((_is_dram) && (_is_wl_tr))
|
||||
{
|
||||
dt = &g_tp.dram_wl; //DRAM wordline transistor
|
||||
}
|
||||
else if ((!_is_dram) && (_is_cell))
|
||||
{
|
||||
dt = &g_tp.sram_cell; // SRAM cell access transistor
|
||||
}
|
||||
else
|
||||
{
|
||||
dt = &g_tp.peri_global;
|
||||
}
|
||||
|
||||
double restrans = (nchannel) ? dt->R_nch_on : dt->R_pch_on;
|
||||
return (restrans / res);
|
||||
}
|
||||
|
||||
|
||||
double pmos_to_nmos_sz_ratio(
|
||||
bool _is_dram,
|
||||
bool _is_wl_tr)
|
||||
{
|
||||
double p_to_n_sizing_ratio;
|
||||
if ((_is_dram) && (_is_wl_tr))
|
||||
{ //DRAM wordline transistor
|
||||
p_to_n_sizing_ratio = g_tp.dram_wl.n_to_p_eff_curr_drv_ratio;
|
||||
}
|
||||
else
|
||||
{ //DRAM or SRAM all other transistors
|
||||
p_to_n_sizing_ratio = g_tp.peri_global.n_to_p_eff_curr_drv_ratio;
|
||||
}
|
||||
return p_to_n_sizing_ratio;
|
||||
}
|
||||
|
||||
|
||||
// "Timing Models for MOS Circuits" by Mark Horowitz, 1984
|
||||
double horowitz(
|
||||
double inputramptime, // input rise time
|
||||
double tf, // time constant of gate
|
||||
double vs1, // threshold voltage
|
||||
double vs2, // threshold voltage
|
||||
int rise) // whether input rises or fall
|
||||
{
|
||||
if (inputramptime == 0 && vs1 == vs2)
|
||||
{
|
||||
return tf * (vs1 < 1 ? -log(vs1) : log(vs1));
|
||||
}
|
||||
double a, b, td;
|
||||
|
||||
a = inputramptime / tf;
|
||||
if (rise == RISE)
|
||||
{
|
||||
b = 0.5;
|
||||
td = tf * sqrt(log(vs1)*log(vs1) + 2*a*b*(1.0 - vs1)) + tf*(log(vs1) - log(vs2));
|
||||
}
|
||||
else
|
||||
{
|
||||
b = 0.4;
|
||||
td = tf * sqrt(log(1.0 - vs1)*log(1.0 - vs1) + 2*a*b*(vs1)) + tf*(log(1.0 - vs1) - log(1.0 - vs2));
|
||||
}
|
||||
return (td);
|
||||
}
|
||||
|
||||
double cmos_Ileak(
|
||||
double nWidth,
|
||||
double pWidth,
|
||||
bool _is_dram,
|
||||
bool _is_cell,
|
||||
bool _is_wl_tr)
|
||||
{
|
||||
TechnologyParameter::DeviceType * dt;
|
||||
|
||||
if ((!_is_dram)&&(_is_cell))
|
||||
{ //SRAM cell access transistor
|
||||
dt = &(g_tp.sram_cell);
|
||||
}
|
||||
else if ((_is_dram)&&(_is_wl_tr))
|
||||
{ //DRAM wordline transistor
|
||||
dt = &(g_tp.dram_wl);
|
||||
}
|
||||
else
|
||||
{ //DRAM or SRAM all other transistors
|
||||
dt = &(g_tp.peri_global);
|
||||
}
|
||||
return nWidth*dt->I_off_n + pWidth*dt->I_off_p;
|
||||
}
|
||||
|
||||
|
||||
double simplified_nmos_leakage(
|
||||
double nwidth,
|
||||
bool _is_dram,
|
||||
bool _is_cell,
|
||||
bool _is_wl_tr)
|
||||
{
|
||||
TechnologyParameter::DeviceType * dt;
|
||||
|
||||
if ((!_is_dram)&&(_is_cell))
|
||||
{ //SRAM cell access transistor
|
||||
dt = &(g_tp.sram_cell);
|
||||
}
|
||||
else if ((_is_dram)&&(_is_wl_tr))
|
||||
{ //DRAM wordline transistor
|
||||
dt = &(g_tp.dram_wl);
|
||||
}
|
||||
else
|
||||
{ //DRAM or SRAM all other transistors
|
||||
dt = &(g_tp.peri_global);
|
||||
}
|
||||
return nwidth * dt->I_off_n;
|
||||
}
|
||||
|
||||
/*
 * Product of the consecutive integers m, m+1, ..., n.
 * When n <= m no multiplication happens and m itself is returned.
 */
int factorial(int n, int m)
{
    int product = m;
    int next    = m + 1;
    while (next <= n)
    {
        product *= next;
        ++next;
    }
    return product;
}
|
||||
|
||||
/*
 * Binomial coefficient C(n, m): the number of ways to choose m items out of n.
 *
 * Returns 0 when m is negative or larger than n. In particular
 * C(n, n) == C(n, 0) == 1.
 *
 * The value is built multiplicatively (C(n-k+i, i) after step i), so every
 * intermediate division is exact and the intermediates stay far smaller than
 * the full factorials.
 */
int combination(int n, int m)
{
    if (m < 0 || m > n)
    {
        return 0;
    }

    // C(n, m) == C(n, n - m); iterate over the smaller of the two.
    const int k = (m < n - m) ? m : (n - m);

    long long result = 1;
    for (int i = 1; i <= k; ++i)
    {
        result = result * (n - k + i) / i;
    }
    return (int) result;
}
|
||||
|
||||
double simplified_pmos_leakage(
|
||||
double pwidth,
|
||||
bool _is_dram,
|
||||
bool _is_cell,
|
||||
bool _is_wl_tr)
|
||||
{
|
||||
TechnologyParameter::DeviceType * dt;
|
||||
|
||||
if ((!_is_dram)&&(_is_cell))
|
||||
{ //SRAM cell access transistor
|
||||
dt = &(g_tp.sram_cell);
|
||||
}
|
||||
else if ((_is_dram)&&(_is_wl_tr))
|
||||
{ //DRAM wordline transistor
|
||||
dt = &(g_tp.dram_wl);
|
||||
}
|
||||
else
|
||||
{ //DRAM or SRAM all other transistors
|
||||
dt = &(g_tp.peri_global);
|
||||
}
|
||||
return pwidth * dt->I_off_p;
|
||||
}
|
||||
|
||||
double cmos_Ig_n(
|
||||
double nWidth,
|
||||
bool _is_dram,
|
||||
bool _is_cell,
|
||||
bool _is_wl_tr)
|
||||
{
|
||||
TechnologyParameter::DeviceType * dt;
|
||||
|
||||
if ((!_is_dram)&&(_is_cell))
|
||||
{ //SRAM cell access transistor
|
||||
dt = &(g_tp.sram_cell);
|
||||
}
|
||||
else if ((_is_dram)&&(_is_wl_tr))
|
||||
{ //DRAM wordline transistor
|
||||
dt = &(g_tp.dram_wl);
|
||||
}
|
||||
else
|
||||
{ //DRAM or SRAM all other transistors
|
||||
dt = &(g_tp.peri_global);
|
||||
}
|
||||
return nWidth*dt->I_g_on_n;
|
||||
}
|
||||
|
||||
double cmos_Ig_p(
|
||||
double pWidth,
|
||||
bool _is_dram,
|
||||
bool _is_cell,
|
||||
bool _is_wl_tr)
|
||||
{
|
||||
TechnologyParameter::DeviceType * dt;
|
||||
|
||||
if ((!_is_dram)&&(_is_cell))
|
||||
{ //SRAM cell access transistor
|
||||
dt = &(g_tp.sram_cell);
|
||||
}
|
||||
else if ((_is_dram)&&(_is_wl_tr))
|
||||
{ //DRAM wordline transistor
|
||||
dt = &(g_tp.dram_wl);
|
||||
}
|
||||
else
|
||||
{ //DRAM or SRAM all other transistors
|
||||
dt = &(g_tp.peri_global);
|
||||
}
|
||||
return pWidth*dt->I_g_on_p;
|
||||
}
|
||||
|
||||
double cmos_Isub_leakage(
|
||||
double nWidth,
|
||||
double pWidth,
|
||||
int fanin,
|
||||
enum Gate_type g_type,
|
||||
bool _is_dram,
|
||||
bool _is_cell,
|
||||
bool _is_wl_tr,
|
||||
enum Half_net_topology topo)
|
||||
{
|
||||
assert (fanin>=1);
|
||||
double nmos_leak = simplified_nmos_leakage(nWidth, _is_dram, _is_cell, _is_wl_tr);
|
||||
double pmos_leak = simplified_pmos_leakage(pWidth, _is_dram, _is_cell, _is_wl_tr);
|
||||
double Isub=0;
|
||||
int num_states;
|
||||
int num_off_tx;
|
||||
|
||||
num_states = int(pow(2.0, fanin));
|
||||
|
||||
switch (g_type)
|
||||
{
|
||||
case nmos:
|
||||
if (fanin==1)
|
||||
{
|
||||
Isub = nmos_leak/num_states;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (topo==parallel)
|
||||
{
|
||||
Isub=nmos_leak*fanin/num_states; //only when all tx are off, leakage power is non-zero. The possibility of this state is 1/num_states
|
||||
}
|
||||
else
|
||||
{
|
||||
for (num_off_tx=1; num_off_tx<=fanin; num_off_tx++) //when num_off_tx ==0 there is no leakage power
|
||||
{
|
||||
//Isub += nmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*(factorial(fanin)/(factorial(fanin, num_off_tx)*factorial(num_off_tx)));
|
||||
Isub += nmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*combination(fanin, num_off_tx);
|
||||
}
|
||||
Isub /=num_states;
|
||||
}
|
||||
|
||||
}
|
||||
break;
|
||||
case pmos:
|
||||
if (fanin==1)
|
||||
{
|
||||
Isub = pmos_leak/num_states;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (topo==parallel)
|
||||
{
|
||||
Isub=pmos_leak*fanin/num_states; //only when all tx are off, leakage power is non-zero. The possibility of this state is 1/num_states
|
||||
}
|
||||
else
|
||||
{
|
||||
for (num_off_tx=1; num_off_tx<=fanin; num_off_tx++) //when num_off_tx ==0 there is no leakage power
|
||||
{
|
||||
//Isub += pmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*(factorial(fanin)/(factorial(fanin, num_off_tx)*factorial(num_off_tx)));
|
||||
Isub += pmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*combination(fanin, num_off_tx);
|
||||
}
|
||||
Isub /=num_states;
|
||||
}
|
||||
|
||||
}
|
||||
break;
|
||||
case inv:
|
||||
Isub = (nmos_leak + pmos_leak)/2;
|
||||
break;
|
||||
case nand:
|
||||
Isub += fanin*pmos_leak;//the pullup network
|
||||
for (num_off_tx=1; num_off_tx<=fanin; num_off_tx++) // the pulldown network
|
||||
{
|
||||
//Isub += nmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*(factorial(fanin)/(factorial(fanin, num_off_tx)*factorial(num_off_tx)));
|
||||
Isub += nmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*combination(fanin, num_off_tx);
|
||||
}
|
||||
Isub /=num_states;
|
||||
break;
|
||||
case nor:
|
||||
for (num_off_tx=1; num_off_tx<=fanin; num_off_tx++) // the pullup network
|
||||
{
|
||||
//Isub += pmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*(factorial(fanin)/(factorial(fanin, num_off_tx)*factorial(num_off_tx)));
|
||||
Isub += pmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*combination(fanin, num_off_tx);
|
||||
}
|
||||
Isub += fanin*nmos_leak;//the pulldown network
|
||||
Isub /=num_states;
|
||||
break;
|
||||
case tri:
|
||||
Isub += (nmos_leak + pmos_leak)/2;//enabled
|
||||
Isub += nmos_leak*UNI_LEAK_STACK_FACTOR; //disabled upper bound of leakage power
|
||||
Isub /=2;
|
||||
break;
|
||||
case tg:
|
||||
Isub = (nmos_leak + pmos_leak)/2;
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
break;
|
||||
}
|
||||
|
||||
return Isub;
|
||||
}
|
||||
|
||||
|
||||
double cmos_Ig_leakage(
|
||||
double nWidth,
|
||||
double pWidth,
|
||||
int fanin,
|
||||
enum Gate_type g_type,
|
||||
bool _is_dram,
|
||||
bool _is_cell,
|
||||
bool _is_wl_tr,
|
||||
enum Half_net_topology topo)
|
||||
{
|
||||
assert (fanin>=1);
|
||||
double nmos_leak = cmos_Ig_n(nWidth, _is_dram, _is_cell, _is_wl_tr);
|
||||
double pmos_leak = cmos_Ig_p(pWidth, _is_dram, _is_cell, _is_wl_tr);
|
||||
double Ig_on=0;
|
||||
int num_states;
|
||||
int num_on_tx;
|
||||
|
||||
num_states = int(pow(2.0, fanin));
|
||||
|
||||
switch (g_type)
|
||||
{
|
||||
case nmos:
|
||||
if (fanin==1)
|
||||
{
|
||||
Ig_on = nmos_leak/num_states;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (topo==parallel)
|
||||
{
|
||||
for (num_on_tx=1; num_on_tx<=fanin; num_on_tx++)
|
||||
{
|
||||
Ig_on += nmos_leak*combination(fanin, num_on_tx)*num_on_tx;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
Ig_on += nmos_leak * fanin;//pull down network when all TXs are on.
|
||||
//num_on_tx is the number of on tx
|
||||
for (num_on_tx=1; num_on_tx<fanin; num_on_tx++)//when num_on_tx=[1,n-1]
|
||||
{
|
||||
Ig_on += nmos_leak*combination(fanin, num_on_tx)*num_on_tx/2;//TODO: this is a approximation now, a precise computation will be very complicated.
|
||||
}
|
||||
Ig_on /=num_states;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case pmos:
|
||||
if (fanin==1)
|
||||
{
|
||||
Ig_on = pmos_leak/num_states;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (topo==parallel)
|
||||
{
|
||||
for (num_on_tx=1; num_on_tx<=fanin; num_on_tx++)
|
||||
{
|
||||
Ig_on += pmos_leak*combination(fanin, num_on_tx)*num_on_tx;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
Ig_on += pmos_leak * fanin;//pull down network when all TXs are on.
|
||||
//num_on_tx is the number of on tx
|
||||
for (num_on_tx=1; num_on_tx<fanin; num_on_tx++)//when num_on_tx=[1,n-1]
|
||||
{
|
||||
Ig_on += pmos_leak*combination(fanin, num_on_tx)*num_on_tx/2;//TODO: this is a approximation now, a precise computation will be very complicated.
|
||||
}
|
||||
Ig_on /=num_states;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case inv:
|
||||
Ig_on = (nmos_leak + pmos_leak)/2;
|
||||
break;
|
||||
case nand:
|
||||
//pull up network
|
||||
for (num_on_tx=1; num_on_tx<=fanin; num_on_tx++)//when num_on_tx=[1,n]
|
||||
{
|
||||
Ig_on += pmos_leak*combination(fanin, num_on_tx)*num_on_tx;
|
||||
}
|
||||
|
||||
//pull down network
|
||||
Ig_on += nmos_leak * fanin;//pull down network when all TXs are on.
|
||||
//num_on_tx is the number of on tx
|
||||
for (num_on_tx=1; num_on_tx<fanin; num_on_tx++)//when num_on_tx=[1,n-1]
|
||||
{
|
||||
Ig_on += nmos_leak*combination(fanin, num_on_tx)*num_on_tx/2;//TODO: this is a approximation now, a precise computation will be very complicated.
|
||||
}
|
||||
Ig_on /=num_states;
|
||||
break;
|
||||
case nor:
|
||||
// num_on_tx is the number of on tx in pull up network
|
||||
Ig_on += pmos_leak * fanin;//pull up network when all TXs are on.
|
||||
for (num_on_tx=1; num_on_tx<fanin; num_on_tx++)
|
||||
{
|
||||
Ig_on += pmos_leak*combination(fanin, num_on_tx)*num_on_tx/2;
|
||||
|
||||
}
|
||||
//pull down network
|
||||
for (num_on_tx=1; num_on_tx<=fanin; num_on_tx++)//when num_on_tx=[1,n]
|
||||
{
|
||||
Ig_on += nmos_leak*combination(fanin, num_on_tx)*num_on_tx;
|
||||
}
|
||||
Ig_on /=num_states;
|
||||
break;
|
||||
case tri:
|
||||
Ig_on += (2*nmos_leak + 2*pmos_leak)/2;//enabled
|
||||
Ig_on += (nmos_leak + pmos_leak)/2; //disabled upper bound of leakage power
|
||||
Ig_on /=2;
|
||||
break;
|
||||
case tg:
|
||||
Ig_on = (nmos_leak + pmos_leak)/2;
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
break;
|
||||
}
|
||||
|
||||
return Ig_on;
|
||||
}
|
||||
|
||||
// Simplified short-circuit energy estimate (the result is an energy, not a
// power, despite the historical "p_" naming used in this file).
//
// Parameters:
//   vt             threshold voltage
//   velocity_index velocity-saturation index used as the divisor exponent base
//   c_in, c_out    input and output capacitance; their ratio is the fanout
//   w_nmos, w_pmos accepted for interface compatibility; not used here
//   i_on_n, i_on_p on-currents of this stage
//   i_on_n_in, i_on_p_in  on-currents of the driving stage
//   vdd            supply voltage
//
// Returns the mean of the discharge and charge estimates.
//
// The leading coefficient is the rational 10/3. It used to be written as the
// integer expression 10/3, which truncates to 3 before the multiplication and
// therefore under-scaled both terms by 10%.
//
// An alternative estimate (exponent 1.5, divided by 2^(3*vt/vdd + 2*index))
// was previously computed here as well but never contributed to the result,
// because combining the two by harmonic mean had been disabled; it is omitted.
double shortcircuit_simple(
    double vt,
    double velocity_index,
    double c_in,
    double c_out,
    double w_nmos,
    double w_pmos,
    double i_on_n,
    double i_on_p,
    double i_on_n_in,
    double i_on_p_in,
    double vdd)
{
    // Transistor widths are part of the signature but do not enter the model.
    (void)w_nmos;
    (void)w_pmos;

    const double fo_n = i_on_n / i_on_n_in;
    const double fo_p = i_on_p / i_on_p_in;
    const double fanout = c_out / c_in;
    const double beta_ratio = i_on_p / i_on_n;
    const double vt_to_vdd_ratio = vt / vdd;

    // Factor shared by the charge and discharge terms. The multiplication
    // order matches the original left-to-right evaluation.
    const double shared =
        10.0 / 3.0
        * (pow(((vdd - vt) - vt_to_vdd_ratio), 3.0)
           / pow(velocity_index, 2.0)
           / pow(2.0, 3 * vt_to_vdd_ratio * vt_to_vdd_ratio))
        * c_in * vdd * vdd;

    const double discharge = shared * fo_p * fo_p / fanout / beta_ratio;
    const double charge = shared * fo_n * fo_n / fanout * beta_ratio;

    return (discharge + charge) / 2;
}
|
||||
|
||||
// Detailed short-circuit energy model.
//
// NOTE(review): this function always returns 0. The discharge term is
// evaluated but never copied into the variable that is returned. Whether the
// intent was to return the discharge term cannot be determined from this
// file, so the behaviour is left exactly as it was -- confirm with the model
// owner before changing it.
//
// c_out, w_nmos and w_pmos are accepted but not used; fanout is fixed at 1.
double shortcircuit(
    double vt,
    double velocity_index,
    double c_in,
    double c_out,
    double w_nmos,
    double w_pmos,
    double i_on_n,
    double i_on_p,
    double i_on_n_in,
    double i_on_p_in,
    double vdd)
{
    double energy = 0; // the returned value; nothing below updates it

    const double alpha = velocity_index;
    const double overdrive = vdd - vt;
    const double euler = 2.71828;
    const double fanout = 1;

    const double fo_n = i_on_n / i_on_n_in;
    const double fo_p = i_on_p / i_on_p_in;
    const double beta_ratio = i_on_p / i_on_n;
    const double vt_to_vdd_ratio = vt / vdd;

    const double f_alpha =
        1 / (alpha + 2)
        - alpha / (2 * (alpha + 3))
        + alpha / (alpha + 4) * (alpha / 2 - 1);

    const double k_v = 0.9 / 0.8 + overdrive / 0.8 * log(10 * overdrive / euler);

    const double g_v_alpha =
        (alpha + 1)
        * pow((1 - alpha), alpha)
        * pow((1 - alpha), alpha / 2)
        / f_alpha
        / pow((1 - alpha - alpha), (alpha / 2 + alpha + 2));

    const double h_v_alpha =
        pow(2, alpha) * (alpha + 1) * pow((1 - alpha), alpha)
        / pow((1 - alpha - alpha), (alpha + 1));

    // Evaluated for parity with the original code; the result is discarded.
    const double discharge =
        k_v * vdd * vdd * c_in * fo_p * fo_p
        / (overdrive * g_v_alpha * fanout * beta_ratio / 2 / k_v + h_v_alpha * fo_p);

    (void)fo_n;
    (void)vt_to_vdd_ratio;
    (void)discharge;

    return (energy);
}
|
248
ext/mcpat/cacti/basic_circuit.h
Normal file
248
ext/mcpat/cacti/basic_circuit.h
Normal file
|
@ -0,0 +1,248 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
|
||||
#ifndef __BASIC_CIRCUIT_H__
|
||||
#define __BASIC_CIRCUIT_H__
|
||||
|
||||
#include "cacti_interface.h"
|
||||
#include "const.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
#define UNI_LEAK_STACK_FACTOR 0.43
|
||||
|
||||
int powers (int base, int n);
|
||||
bool is_pow2(int64_t val);
|
||||
uint32_t _log2(uint64_t num);
|
||||
int factorial(int n, int m = 1);
|
||||
int combination(int n, int m);
|
||||
|
||||
// Debug hook. Uncomment the next line (or define DBG when compiling) to make
// PRINTDW(stmt) execute stmt.
//#define DBG
#ifdef DBG
// Expands to an empty statement followed by the argument used as a statement.
#define PRINTDW(a) ; a;
#else
// Expands to a lone empty statement; the argument is dropped unevaluated.
#define PRINTDW(a) ;
#endif
|
||||
|
||||
|
||||
// Wire placement categories. Values are spelled out; they match the implicit
// numbering the enumerators have always had.
enum Wire_placement {
    outside_mat = 0,
    inside_mat  = 1,
    local_wires = 2
};
|
||||
|
||||
|
||||
|
||||
// H-tree kinds. Values are spelled out; they match the implicit numbering the
// enumerators have always had.
enum Htree_type {
    Add_htree        = 0,
    Data_in_htree    = 1,
    Data_out_htree   = 2,
    Search_in_htree  = 3,
    Search_out_htree = 4
};
|
||||
|
||||
// Gate/device kinds accepted by the leakage routines declared below. Values
// are spelled out; they match the implicit numbering used so far.
enum Gate_type {
    nmos = 0,
    pmos = 1,
    inv  = 2,
    nand = 3,
    nor  = 4,
    tri  = 5,
    tg   = 6
};
|
||||
|
||||
// Topology selector used as the last argument of the leakage routines
// declared below (their default is `series`).
enum Half_net_topology {
    parallel = 0,
    series   = 1
};
|
||||
|
||||
double logtwo (double x);
|
||||
|
||||
double gate_C(
|
||||
double width,
|
||||
double wirelength,
|
||||
bool _is_dram = false,
|
||||
bool _is_sram = false,
|
||||
bool _is_wl_tr = false);
|
||||
|
||||
double gate_C_pass(
|
||||
double width,
|
||||
double wirelength,
|
||||
bool _is_dram = false,
|
||||
bool _is_sram = false,
|
||||
bool _is_wl_tr = false);
|
||||
|
||||
double drain_C_(
|
||||
double width,
|
||||
int nchannel,
|
||||
int stack,
|
||||
int next_arg_thresh_folding_width_or_height_cell,
|
||||
double fold_dimension,
|
||||
bool _is_dram = false,
|
||||
bool _is_sram = false,
|
||||
bool _is_wl_tr = false);
|
||||
|
||||
double tr_R_on(
|
||||
double width,
|
||||
int nchannel,
|
||||
int stack,
|
||||
bool _is_dram = false,
|
||||
bool _is_sram = false,
|
||||
bool _is_wl_tr = false);
|
||||
|
||||
double R_to_w(
|
||||
double res,
|
||||
int nchannel,
|
||||
bool _is_dram = false,
|
||||
bool _is_sram = false,
|
||||
bool _is_wl_tr = false);
|
||||
|
||||
double horowitz (
|
||||
double inputramptime,
|
||||
double tf,
|
||||
double vs1,
|
||||
double vs2,
|
||||
int rise);
|
||||
|
||||
double pmos_to_nmos_sz_ratio(
|
||||
bool _is_dram = false,
|
||||
bool _is_wl_tr = false);
|
||||
|
||||
double simplified_nmos_leakage(
|
||||
double nwidth,
|
||||
bool _is_dram = false,
|
||||
bool _is_cell = false,
|
||||
bool _is_wl_tr = false);
|
||||
|
||||
double simplified_pmos_leakage(
|
||||
double pwidth,
|
||||
bool _is_dram = false,
|
||||
bool _is_cell = false,
|
||||
bool _is_wl_tr = false);
|
||||
|
||||
|
||||
double cmos_Ileak(
|
||||
double nWidth,
|
||||
double pWidth,
|
||||
bool _is_dram = false,
|
||||
bool _is_cell = false,
|
||||
bool _is_wl_tr = false);
|
||||
|
||||
double cmos_Ig_n(
|
||||
double nWidth,
|
||||
bool _is_dram = false,
|
||||
bool _is_cell = false,
|
||||
bool _is_wl_tr= false);
|
||||
|
||||
double cmos_Ig_p(
|
||||
double pWidth,
|
||||
bool _is_dram = false,
|
||||
bool _is_cell = false,
|
||||
bool _is_wl_tr= false);
|
||||
|
||||
|
||||
double cmos_Isub_leakage(
|
||||
double nWidth,
|
||||
double pWidth,
|
||||
int fanin,
|
||||
enum Gate_type g_type,
|
||||
bool _is_dram = false,
|
||||
bool _is_cell = false,
|
||||
bool _is_wl_tr = false,
|
||||
enum Half_net_topology topo = series);
|
||||
|
||||
double cmos_Ig_leakage(
|
||||
double nWidth,
|
||||
double pWidth,
|
||||
int fanin,
|
||||
enum Gate_type g_type,
|
||||
bool _is_dram = false,
|
||||
bool _is_cell = false,
|
||||
bool _is_wl_tr = false,
|
||||
enum Half_net_topology topo = series);
|
||||
|
||||
double shortcircuit(
|
||||
double vt,
|
||||
double velocity_index,
|
||||
double c_in,
|
||||
double c_out,
|
||||
double w_nmos,
|
||||
double w_pmos,
|
||||
double i_on_n,
|
||||
double i_on_p,
|
||||
double i_on_n_in,
|
||||
double i_on_p_in,
|
||||
double vdd);
|
||||
|
||||
double shortcircuit_simple(
|
||||
double vt,
|
||||
double velocity_index,
|
||||
double c_in,
|
||||
double c_out,
|
||||
double w_nmos,
|
||||
double w_pmos,
|
||||
double i_on_n,
|
||||
double i_on_p,
|
||||
double i_on_n_in,
|
||||
double i_on_p_in,
|
||||
double vdd);
|
||||
// Set a power point-product mask (strictly speaking this is not a real point
// product). Writes a, b, c and d into pppv[0] through pppv[3]; any argument
// that is omitted defaults to 1. pppv must have room for four doubles.
inline void set_pppm(
    double * pppv,
    double a=1,
    double b=1,
    double c=1,
    double d=1)
{
    const double coeffs[4] = { a, b, c, d };
    for (int slot = 0; slot < 4; ++slot) {
        pppv[slot] = coeffs[slot];
    }
}
|
||||
|
||||
// Writes a, b and c into sppv[0] through sppv[2]; omitted arguments default
// to 1.
//
// NOTE(review): d is accepted but never stored, and sppv[3] is left
// untouched. The size of the caller's array is not visible from this header,
// so this is kept as-is -- confirm whether a fourth entry was intended.
inline void set_sppm(
    double * sppv,
    double a=1,
    double b=1,
    double c=1,
    double d=1)
{
    double * out = sppv;
    *out++ = a;
    *out++ = b;
    *out = c;
    (void)d;
}
|
||||
|
||||
#endif
|
41
ext/mcpat/cacti/batch_tests
Executable file
41
ext/mcpat/cacti/batch_tests
Executable file
|
@ -0,0 +1,41 @@
|
|||
rm -rf ./out.csv

# Batch driver: removes any stale out.csv (above), then runs ./cacti over a
# range of sizes for four fixed argument sets. The first line of this file is
# deliberately a command rather than a comment, as it has always been.

# Sizes passed as the first cacti argument: 8192 up to 16777216, doubling.
all_sizes="8192 16384 32768 65536 131072 262144 524288 1048576 2097152 4194304 8388608 16777216"
# The last sweep only covers the four largest sizes.
large_sizes="2097152 4194304 8388608 16777216"

# run_sweep "<size list>" <remaining cacti arguments...>
# Invokes ./cacti once per size, in list order, with the given arguments.
run_sweep() {
    sweep_sizes=$1
    shift
    for size in $sweep_sizes; do
        ./cacti "$size" "$@"
    done
}

run_sweep "$all_sizes"   64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
run_sweep "$all_sizes"   64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
run_sweep "$all_sizes"   64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
run_sweep "$large_sizes" 64 1 1 0 0 0 1 65 512 0 0 0 1 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
|
175
ext/mcpat/cacti/cache.cfg
Executable file
175
ext/mcpat/cacti/cache.cfg
Executable file
|
@ -0,0 +1,175 @@
|
|||
# Cache size
|
||||
//-size (bytes) 2048
|
||||
//-size (bytes) 4096
|
||||
//-size (bytes) 32768
|
||||
//-size (bytes) 262144
|
||||
//-size (bytes) 1048576
|
||||
//-size (bytes) 2097152
|
||||
//-size (bytes) 4194304
|
||||
//-size (bytes) 8388608
|
||||
//-size (bytes) 16777216
|
||||
//-size (bytes) 33554432
|
||||
//-size (bytes) 134217728
|
||||
//-size (bytes) 67108864
|
||||
-size (bytes) 1073741824
|
||||
|
||||
# Line size
|
||||
//-block size (bytes) 8
|
||||
-block size (bytes) 64
|
||||
|
||||
# To model Fully Associative cache, set associativity to zero
|
||||
//-associativity 0
|
||||
//-associativity 2
|
||||
//-associativity 4
|
||||
-associativity 8
|
||||
//-associativity 16
|
||||
|
||||
-read-write port 1
|
||||
-exclusive read port 0
|
||||
-exclusive write port 0
|
||||
-single ended read ports 0
|
||||
|
||||
# Multiple banks connected using a bus
|
||||
-UCA bank count 1
|
||||
-technology (u) 0.022
|
||||
//-technology (u) 0.040
|
||||
//-technology (u) 0.032
|
||||
//-technology (u) 0.090
|
||||
|
||||
# following three parameters are meaningful only for main memories
|
||||
|
||||
-page size (bits) 8192
|
||||
-burst length 8
|
||||
-internal prefetch width 8
|
||||
|
||||
# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
|
||||
-Data array cell type - "itrs-hp"
|
||||
//-Data array cell type - "itrs-lstp"
|
||||
//-Data array cell type - "itrs-lop"
|
||||
|
||||
# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
|
||||
-Data array peripheral type - "itrs-hp"
|
||||
//-Data array peripheral type - "itrs-lstp"
|
||||
//-Data array peripheral type - "itrs-lop"
|
||||
|
||||
# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
|
||||
-Tag array cell type - "itrs-hp"
|
||||
//-Tag array cell type - "itrs-lstp"
|
||||
//-Tag array cell type - "itrs-lop"
|
||||
|
||||
# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
|
||||
-Tag array peripheral type - "itrs-hp"
|
||||
//-Tag array peripheral type - "itrs-lstp"
|
||||
//-Tag array peripheral type - "itrs-lop"
|
||||
|
||||
# Bus width include data bits and address bits required by the decoder
|
||||
//-output/input bus width 16
|
||||
-output/input bus width 512
|
||||
|
||||
// 300-400 in steps of 10
|
||||
-operating temperature (K) 360
|
||||
|
||||
# Type of memory - cache (with a tag array) or ram (scratch ram similar to a register file)
|
||||
# or main memory (no tag array; every access happens at page granularity. Ref: CACTI 5.3 report)
|
||||
-cache type "cache"
|
||||
//-cache type "ram"
|
||||
//-cache type "main memory"
|
||||
|
||||
# to model special structure like branch target buffers, directory, etc.
|
||||
# change the tag size parameter
|
||||
# if you want cacti to calculate the tagbits, set the tag size to "default"
|
||||
-tag size (b) "default"
|
||||
//-tag size (b) 22
|
||||
|
||||
# fast - data and tag access happen in parallel
|
||||
# sequential - data array is accessed after accessing the tag array
|
||||
# normal - data array lookup and tag access happen in parallel
|
||||
# final data block is broadcasted in data array h-tree
|
||||
# after getting the signal from the tag array
|
||||
//-access mode (normal, sequential, fast) - "fast"
|
||||
-access mode (normal, sequential, fast) - "normal"
|
||||
//-access mode (normal, sequential, fast) - "sequential"
|
||||
|
||||
|
||||
# DESIGN OBJECTIVE for UCA (or banks in NUCA)
|
||||
-design objective (weight delay, dynamic power, leakage power, cycle time, area) 0:0:0:100:0
|
||||
|
||||
# Percentage deviation from the minimum value
|
||||
# Ex: A deviation value of 10:1000:1000:1000:1000 will try to find an organization
|
||||
# that compromises at most 10% delay.
|
||||
# NOTE: Try reasonable values for % deviation. Inconsistent deviation
|
||||
# percentage values will not produce any valid organizations. For example,
|
||||
# 0:0:100:100:100 will try to identify an organization that has both
|
||||
# least delay and dynamic power. Since such an organization is not possible, CACTI will
|
||||
# throw an error. Refer to the CACTI-6 technical report for more details.
|
||||
-deviate (delay, dynamic power, leakage power, cycle time, area) 20:100000:100000:100000:100000
|
||||
|
||||
# Objective for NUCA
|
||||
-NUCAdesign objective (weight delay, dynamic power, leakage power, cycle time, area) 100:100:0:0:100
|
||||
-NUCAdeviate (delay, dynamic power, leakage power, cycle time, area) 10:10000:10000:10000:10000
|
||||
|
||||
# Set optimize tag to ED or ED^2 to obtain a cache configuration optimized for
|
||||
# energy-delay or energy-delay sq. product
|
||||
# Note: Optimize tag will disable weight or deviate values mentioned above
|
||||
# Set it to NONE to let weight and deviate values determine the
|
||||
# appropriate cache configuration
|
||||
//-Optimize ED or ED^2 (ED, ED^2, NONE): "ED"
|
||||
-Optimize ED or ED^2 (ED, ED^2, NONE): "ED^2"
|
||||
//-Optimize ED or ED^2 (ED, ED^2, NONE): "NONE"
|
||||
|
||||
-Cache model (NUCA, UCA) - "UCA"
|
||||
//-Cache model (NUCA, UCA) - "NUCA"
|
||||
|
||||
# In order for CACTI to find the optimal NUCA bank value the following
|
||||
# variable should be assigned 0.
|
||||
-NUCA bank count 0
|
||||
|
||||
# NOTE: for nuca network frequency is set to a default value of
|
||||
# 5GHz in time.c. CACTI automatically
|
||||
# calculates the maximum possible frequency and downgrades this value if necessary
|
||||
|
||||
# By default CACTI considers both full-swing and low-swing
|
||||
# wires to find an optimal configuration. However, it is possible to
|
||||
# restrict the search space by changing the signalling from "default" to
|
||||
# "fullswing" or "lowswing" type.
|
||||
//-Wire signalling (fullswing, lowswing, default) - "Global_10"
|
||||
-Wire signalling (fullswing, lowswing, default) - "default"
|
||||
//-Wire signalling (fullswing, lowswing, default) - "lowswing"
|
||||
|
||||
//-Wire inside mat - "global"
|
||||
-Wire inside mat - "semi-global"
|
||||
//-Wire outside mat - "global"
|
||||
-Wire outside mat - "semi-global"
|
||||
|
||||
//-Interconnect projection - "conservative"
|
||||
-Interconnect projection - "aggressive"
|
||||
|
||||
# Contention in network (which is a function of core count and cache level) is one of
|
||||
# the critical factors used for deciding the optimal bank count value
|
||||
# core count can be 4, 8, or 16
|
||||
//-Core count 4
|
||||
-Core count 8
|
||||
//-Core count 16
|
||||
-Cache level (L2/L3) - "L3"
|
||||
|
||||
-Add ECC - "true"
|
||||
|
||||
//-Print level (DETAILED, CONCISE) - "CONCISE"
|
||||
-Print level (DETAILED, CONCISE) - "DETAILED"
|
||||
|
||||
# for debugging
|
||||
//-Print input parameters - "true"
|
||||
-Print input parameters - "false"
|
||||
# force CACTI to model the cache with the
|
||||
# following Ndbl, Ndwl, Nspd, Ndsam,
|
||||
# and Ndcm values
|
||||
//-Force cache config - "true"
|
||||
-Force cache config - "false"
|
||||
-Ndwl 1
|
||||
-Ndbl 1
|
||||
-Nspd 0
|
||||
-Ndcm 1
|
||||
-Ndsam1 0
|
||||
-Ndsam2 0
|
||||
|
||||
|
8
ext/mcpat/cacti/cacti.i
Normal file
8
ext/mcpat/cacti/cacti.i
Normal file
|
@ -0,0 +1,8 @@
|
|||
%module cacti
|
||||
%{
|
||||
/* Includes the header in the wrapper code */
|
||||
#include "cacti_interface.h"
|
||||
%}
|
||||
|
||||
/* Parse the header file to generate wrappers */
|
||||
%include "cacti_interface.h"
|
51
ext/mcpat/cacti/cacti.mk
Normal file
51
ext/mcpat/cacti/cacti.mk
Normal file
|
@ -0,0 +1,51 @@
|
|||
# Build rules for the cacti binary. Objects are written to obj_$(TAG)/ and the
# linked binary is copied next to this file. TAG=dbg selects the debug flags;
# any other value selects the optimized flags.

TARGET = cacti
SHELL = /bin/sh
.PHONY: all depend clean
.SUFFIXES: .cc .o

# Thread count baked into optimized builds unless the caller sets NTHREADS.
ifndef NTHREADS
  NTHREADS = 8
endif

# ---- toolchain and flags ---------------------------------------------------

CXX = g++ -m32
CC = gcc -m32

LIBS =
# Despite its name, INCS carries the math library for the link step.
INCS = -lm

ifeq ($(TAG),dbg)
  DBG = -Wall
  OPT = -ggdb -g -O0 -DNTHREADS=1 -gstabs+
else
  DBG =
  OPT = -O3 -msse2 -mfpmath=sse -DNTHREADS=$(NTHREADS)
endif

#CXXFLAGS = -Wall -Wno-unknown-pragmas -Winline $(DBG) $(OPT)
CXXFLAGS = -Wno-unknown-pragmas $(DBG) $(OPT)

INCLUDES = -I /usr/include/python2.4 -I /usr/lib/python2.4/config

# ---- sources and derived lists ---------------------------------------------

SRCS = area.cc bank.cc mat.cc main.cc Ucache.cc io.cc technology.cc \
       basic_circuit.cc parameter.cc decoder.cc component.cc uca.cc \
       subarray.cc wire.cc htree2.cc cacti_interface.cc router.cc nuca.cc \
       crossbar.cc arbiter.cc

OBJS = $(patsubst %.cc,obj_$(TAG)/%.o,$(SRCS))

# Python wrapper sources: everything except main.cc, plus the generated wrapper.
PYTHONLIB_SRCS = $(filter-out main.cc,$(SRCS)) obj_$(TAG)/cacti_wrap.cc
PYTHONLIB_OBJS = $(PYTHONLIB_SRCS:.cc=.o)

# ---- rules -----------------------------------------------------------------

all: obj_$(TAG)/$(TARGET)
	cp -f obj_$(TAG)/$(TARGET) $(TARGET)

obj_$(TAG)/$(TARGET) : $(OBJS)
	$(CXX) $(OBJS) -o $@ $(INCS) $(CXXFLAGS) $(LIBS) -pthread

#obj_$(TAG)/%.o : %.cc
#	$(CXX) -c $(CXXFLAGS) $(INCS) -o $@ $<

obj_$(TAG)/%.o : %.cc
	$(CXX) $(CXXFLAGS) -c $< -o $@

# Only removes artifacts in this directory; obj_$(TAG)/ is not touched here.
clean:
	-rm -f *.o _cacti.so cacti.py $(TARGET)
|
||||
|
||||
|
173
ext/mcpat/cacti/cacti_interface.cc
Normal file
173
ext/mcpat/cacti/cacti_interface.cc
Normal file
|
@ -0,0 +1,173 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
#include <pthread.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <ctime>
|
||||
#include <iostream>
|
||||
|
||||
#include "Ucache.h"
|
||||
#include "area.h"
|
||||
#include "basic_circuit.h"
|
||||
#include "cacti_interface.h"
|
||||
#include "component.h"
|
||||
#include "const.h"
|
||||
#include "parameter.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
bool mem_array::lt(const mem_array * m1, const mem_array * m2)
|
||||
{
|
||||
if (m1->Nspd < m2->Nspd) return true;
|
||||
else if (m1->Nspd > m2->Nspd) return false;
|
||||
else if (m1->Ndwl < m2->Ndwl) return true;
|
||||
else if (m1->Ndwl > m2->Ndwl) return false;
|
||||
else if (m1->Ndbl < m2->Ndbl) return true;
|
||||
else if (m1->Ndbl > m2->Ndbl) return false;
|
||||
else if (m1->deg_bl_muxing < m2->deg_bl_muxing) return true;
|
||||
else if (m1->deg_bl_muxing > m2->deg_bl_muxing) return false;
|
||||
else if (m1->Ndsam_lev_1 < m2->Ndsam_lev_1) return true;
|
||||
else if (m1->Ndsam_lev_1 > m2->Ndsam_lev_1) return false;
|
||||
else if (m1->Ndsam_lev_2 < m2->Ndsam_lev_2) return true;
|
||||
else return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void uca_org_t::find_delay()
|
||||
{
|
||||
mem_array * data_arr = data_array2;
|
||||
mem_array * tag_arr = tag_array2;
|
||||
|
||||
// check whether it is a regular cache or scratch ram
|
||||
if (g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)
|
||||
{
|
||||
access_time = data_arr->access_time;
|
||||
}
|
||||
// Both tag and data lookup happen in parallel
|
||||
// and the entire set is sent over the data array h-tree without
|
||||
// waiting for the way-select signal --TODO add the corresponding
|
||||
// power overhead Nav
|
||||
else if (g_ip->fast_access == true)
|
||||
{
|
||||
access_time = MAX(tag_arr->access_time, data_arr->access_time);
|
||||
}
|
||||
// Tag is accessed first. On a hit, way-select signal along with the
|
||||
// address is sent to read/write the appropriate block in the data
|
||||
// array
|
||||
else if (g_ip->is_seq_acc == true)
|
||||
{
|
||||
access_time = tag_arr->access_time + data_arr->access_time;
|
||||
}
|
||||
// Normal access: tag array access and data array access happen in parallel.
|
||||
// But, the data array will wait for the way-select and transfer only the
|
||||
// appropriate block over the h-tree.
|
||||
else
|
||||
{
|
||||
access_time = MAX(tag_arr->access_time + data_arr->delay_senseamp_mux_decoder,
|
||||
data_arr->delay_before_subarray_output_driver) +
|
||||
data_arr->delay_from_subarray_output_driver_to_output;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
void uca_org_t::find_energy()
|
||||
{
|
||||
if (!(g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc))//(g_ip->is_cache)
|
||||
power = data_array2->power + tag_array2->power;
|
||||
else
|
||||
power = data_array2->power;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void uca_org_t::find_area()
|
||||
{
|
||||
if (g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)//(g_ip->is_cache == false)
|
||||
{
|
||||
cache_ht = data_array2->height;
|
||||
cache_len = data_array2->width;
|
||||
}
|
||||
else
|
||||
{
|
||||
cache_ht = MAX(tag_array2->height, data_array2->height);
|
||||
cache_len = tag_array2->width + data_array2->width;
|
||||
}
|
||||
area = cache_ht * cache_len;
|
||||
}
|
||||
|
||||
void uca_org_t::adjust_area()
|
||||
{
|
||||
double area_adjust;
|
||||
if (g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)
|
||||
{
|
||||
if (data_array2->area_efficiency/100.0<0.2)
|
||||
{
|
||||
//area_adjust = sqrt(area/(area*(data_array2->area_efficiency/100.0)/0.2));
|
||||
area_adjust = sqrt(0.2/(data_array2->area_efficiency/100.0));
|
||||
cache_ht = cache_ht/area_adjust;
|
||||
cache_len = cache_len/area_adjust;
|
||||
}
|
||||
}
|
||||
area = cache_ht * cache_len;
|
||||
}
|
||||
|
||||
void uca_org_t::find_cyc()
|
||||
{
|
||||
if ((g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc))//(g_ip->is_cache == false)
|
||||
{
|
||||
cycle_time = data_array2->cycle_time;
|
||||
}
|
||||
else
|
||||
{
|
||||
cycle_time = MAX(tag_array2->cycle_time,
|
||||
data_array2->cycle_time);
|
||||
}
|
||||
}
|
||||
|
||||
// Builds an empty result record.
// Every member is given a defined value (null pointers, zeros, valid ==
// false, empty file name) so that an object copied or inspected before the
// find_*() methods have run never exposes indeterminate data.
// The initialiser list follows the member declaration order of the class.
uca_org_t::uca_org_t()
  : tag_array2(0),
    data_array2(0),
    access_time(0),
    cycle_time(0),
    area(0),
    area_efficiency(0),
    power(),
    leak_power_with_sleep_transistors_in_mats(0),
    cache_ht(0),
    cache_len(0),
    vdd_periph_global(0),
    valid(false),
    tag_array(),    // value-initialised: scalar fields become zero
    data_array()
{
  file_n[0] = '\0';
}
|
||||
|
||||
void uca_org_t :: cleanup()
|
||||
{
|
||||
if (data_array2!=0)
|
||||
delete data_array2;
|
||||
if (tag_array2!=0)
|
||||
delete tag_array2;
|
||||
}
|
633
ext/mcpat/cacti/cacti_interface.h
Normal file
633
ext/mcpat/cacti/cacti_interface.h
Normal file
|
@ -0,0 +1,633 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
|
||||
#ifndef __CACTI_INTERFACE_H__
|
||||
#define __CACTI_INTERFACE_H__
|
||||
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "const.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
class min_values_t;
|
||||
class mem_array;
|
||||
class uca_org_t;
|
||||
|
||||
|
||||
// Power figures split into five components.  All members are public plain
// doubles that start at zero; this header does not state their units.
class powerComponents
{
  public:
    double dynamic;
    double leakage;
    double gate_leakage;
    double short_circuit;
    double longer_channel_leakage;

    // Default construction zeroes every component.
    powerComponents()
      : dynamic(0),
        leakage(0),
        gate_leakage(0),
        short_circuit(0),
        longer_channel_leakage(0)
    {
    }

    // Member-wise copy of all five components.
    powerComponents(const powerComponents & obj)
      : dynamic(obj.dynamic),
        leakage(obj.leakage),
        gate_leakage(obj.gate_leakage),
        short_circuit(obj.short_circuit),
        longer_channel_leakage(obj.longer_channel_leakage)
    {
    }

    // Member-wise assignment; returns *this to allow chaining.
    powerComponents & operator=(const powerComponents & rhs)
    {
      dynamic                = rhs.dynamic;
      leakage                = rhs.leakage;
      gate_leakage           = rhs.gate_leakage;
      short_circuit          = rhs.short_circuit;
      longer_channel_leakage = rhs.longer_channel_leakage;
      return *this;
    }

    // Puts every component back to zero.
    void reset()
    {
      dynamic = leakage = gate_leakage = short_circuit = longer_channel_leakage = 0;
    }

    // Arithmetic helpers; defined outside this class.
    friend powerComponents operator+(const powerComponents & x, const powerComponents & y);
    friend powerComponents operator*(const powerComponents & x, double const * const y);
};
|
||||
|
||||
|
||||
|
||||
class powerDef
|
||||
{
|
||||
public:
|
||||
powerComponents readOp;
|
||||
powerComponents writeOp;
|
||||
powerComponents searchOp;//Sheng: for CAM and FA
|
||||
|
||||
powerDef() : readOp(), writeOp(), searchOp() { }
|
||||
void reset() { readOp.reset(); writeOp.reset(); searchOp.reset();}
|
||||
|
||||
friend powerDef operator+(const powerDef & x, const powerDef & y);
|
||||
friend powerDef operator*(const powerDef & x, double const * const y);
|
||||
};
|
||||
|
||||
// Wire flavours.  Values are spelled out explicitly; they are the same
// consecutive values (starting at 0) the enumerators had implicitly.
enum Wire_type
{
  Global        = 0,  // global wires with repeaters
  Global_5      = 1,  // 5% delay penalty
  Global_10     = 2,  // 10% delay penalty
  Global_20     = 3,  // 20% delay penalty
  Global_30     = 4,  // 30% delay penalty
  Low_swing     = 5,  // differential low power wires with high area overhead
  Semi_global   = 6,  // mid-level wires with repeaters
  Transmission  = 7,  // transmission lines with high area overhead
  Optical       = 8,  // optical wires
  Invalid_wtype = 9
};
|
||||
|
||||
|
||||
|
||||
class InputParameter
|
||||
{
|
||||
public:
|
||||
void parse_cfg(const string & infile);
|
||||
|
||||
bool error_checking(); // return false if the input parameters are problematic
|
||||
void display_ip();
|
||||
|
||||
unsigned int cache_sz; // in bytes
|
||||
unsigned int line_sz;
|
||||
unsigned int assoc;
|
||||
unsigned int nbanks;
|
||||
unsigned int out_w;// == nr_bits_out
|
||||
bool specific_tag;
|
||||
unsigned int tag_w;
|
||||
unsigned int access_mode;
|
||||
unsigned int obj_func_dyn_energy;
|
||||
unsigned int obj_func_dyn_power;
|
||||
unsigned int obj_func_leak_power;
|
||||
unsigned int obj_func_cycle_t;
|
||||
|
||||
double F_sz_nm; // feature size in nm
|
||||
double F_sz_um; // feature size in um
|
||||
unsigned int num_rw_ports;
|
||||
unsigned int num_rd_ports;
|
||||
unsigned int num_wr_ports;
|
||||
unsigned int num_se_rd_ports; // number of single ended read ports
|
||||
unsigned int num_search_ports; // Sheng: number of search ports for CAM
|
||||
bool is_main_mem;
|
||||
bool is_cache;
|
||||
bool pure_ram;
|
||||
bool pure_cam;
|
||||
bool rpters_in_htree; // if there are repeaters in htree segment
|
||||
unsigned int ver_htree_wires_over_array;
|
||||
unsigned int broadcast_addr_din_over_ver_htrees;
|
||||
unsigned int temp;
|
||||
|
||||
unsigned int ram_cell_tech_type;
|
||||
unsigned int peri_global_tech_type;
|
||||
unsigned int data_arr_ram_cell_tech_type;
|
||||
unsigned int data_arr_peri_global_tech_type;
|
||||
unsigned int tag_arr_ram_cell_tech_type;
|
||||
unsigned int tag_arr_peri_global_tech_type;
|
||||
|
||||
unsigned int burst_len;
|
||||
unsigned int int_prefetch_w;
|
||||
unsigned int page_sz_bits;
|
||||
|
||||
unsigned int ic_proj_type; // interconnect_projection_type
|
||||
unsigned int wire_is_mat_type; // wire_inside_mat_type
|
||||
unsigned int wire_os_mat_type; // wire_outside_mat_type
|
||||
enum Wire_type wt;
|
||||
int force_wiretype;
|
||||
bool print_input_args;
|
||||
unsigned int nuca_cache_sz; // TODO
|
||||
int ndbl, ndwl, nspd, ndsam1, ndsam2, ndcm;
|
||||
bool force_cache_config;
|
||||
|
||||
int cache_level;
|
||||
int cores;
|
||||
int nuca_bank_count;
|
||||
int force_nuca_bank;
|
||||
|
||||
int delay_wt, dynamic_power_wt, leakage_power_wt,
|
||||
cycle_time_wt, area_wt;
|
||||
int delay_wt_nuca, dynamic_power_wt_nuca, leakage_power_wt_nuca,
|
||||
cycle_time_wt_nuca, area_wt_nuca;
|
||||
|
||||
int delay_dev, dynamic_power_dev, leakage_power_dev,
|
||||
cycle_time_dev, area_dev;
|
||||
int delay_dev_nuca, dynamic_power_dev_nuca, leakage_power_dev_nuca,
|
||||
cycle_time_dev_nuca, area_dev_nuca;
|
||||
int ed; //ED or ED2 optimization
|
||||
int nuca;
|
||||
|
||||
bool fast_access;
|
||||
unsigned int block_sz; // bytes
|
||||
unsigned int tag_assoc;
|
||||
unsigned int data_assoc;
|
||||
bool is_seq_acc;
|
||||
bool fully_assoc;
|
||||
unsigned int nsets; // == number_of_sets
|
||||
int print_detail;
|
||||
|
||||
|
||||
bool add_ecc_b_;
|
||||
//parameters for design constraint
|
||||
double throughput;
|
||||
double latency;
|
||||
bool pipelinable;
|
||||
int pipeline_stages;
|
||||
int per_stage_vector;
|
||||
bool with_clock_grid;
|
||||
};
|
||||
|
||||
|
||||
typedef struct{
|
||||
int Ndwl;
|
||||
int Ndbl;
|
||||
double Nspd;
|
||||
int deg_bl_muxing;
|
||||
int Ndsam_lev_1;
|
||||
int Ndsam_lev_2;
|
||||
int number_activated_mats_horizontal_direction;
|
||||
int number_subbanks;
|
||||
int page_size_in_bits;
|
||||
double delay_route_to_bank;
|
||||
double delay_crossbar;
|
||||
double delay_addr_din_horizontal_htree;
|
||||
double delay_addr_din_vertical_htree;
|
||||
double delay_row_predecode_driver_and_block;
|
||||
double delay_row_decoder;
|
||||
double delay_bitlines;
|
||||
double delay_sense_amp;
|
||||
double delay_subarray_output_driver;
|
||||
double delay_bit_mux_predecode_driver_and_block;
|
||||
double delay_bit_mux_decoder;
|
||||
double delay_senseamp_mux_lev_1_predecode_driver_and_block;
|
||||
double delay_senseamp_mux_lev_1_decoder;
|
||||
double delay_senseamp_mux_lev_2_predecode_driver_and_block;
|
||||
double delay_senseamp_mux_lev_2_decoder;
|
||||
double delay_input_htree;
|
||||
double delay_output_htree;
|
||||
double delay_dout_vertical_htree;
|
||||
double delay_dout_horizontal_htree;
|
||||
double delay_comparator;
|
||||
double access_time;
|
||||
double cycle_time;
|
||||
double multisubbank_interleave_cycle_time;
|
||||
double delay_request_network;
|
||||
double delay_inside_mat;
|
||||
double delay_reply_network;
|
||||
double trcd;
|
||||
double cas_latency;
|
||||
double precharge_delay;
|
||||
powerDef power_routing_to_bank;
|
||||
powerDef power_addr_input_htree;
|
||||
powerDef power_data_input_htree;
|
||||
powerDef power_data_output_htree;
|
||||
powerDef power_addr_horizontal_htree;
|
||||
powerDef power_datain_horizontal_htree;
|
||||
powerDef power_dataout_horizontal_htree;
|
||||
powerDef power_addr_vertical_htree;
|
||||
powerDef power_datain_vertical_htree;
|
||||
powerDef power_row_predecoder_drivers;
|
||||
powerDef power_row_predecoder_blocks;
|
||||
powerDef power_row_decoders;
|
||||
powerDef power_bit_mux_predecoder_drivers;
|
||||
powerDef power_bit_mux_predecoder_blocks;
|
||||
powerDef power_bit_mux_decoders;
|
||||
powerDef power_senseamp_mux_lev_1_predecoder_drivers;
|
||||
powerDef power_senseamp_mux_lev_1_predecoder_blocks;
|
||||
powerDef power_senseamp_mux_lev_1_decoders;
|
||||
powerDef power_senseamp_mux_lev_2_predecoder_drivers;
|
||||
powerDef power_senseamp_mux_lev_2_predecoder_blocks;
|
||||
powerDef power_senseamp_mux_lev_2_decoders;
|
||||
powerDef power_bitlines;
|
||||
powerDef power_sense_amps;
|
||||
powerDef power_prechg_eq_drivers;
|
||||
powerDef power_output_drivers_at_subarray;
|
||||
powerDef power_dataout_vertical_htree;
|
||||
powerDef power_comparators;
|
||||
powerDef power_crossbar;
|
||||
powerDef total_power;
|
||||
double area;
|
||||
double all_banks_height;
|
||||
double all_banks_width;
|
||||
double bank_height;
|
||||
double bank_width;
|
||||
double subarray_memory_cell_area_height;
|
||||
double subarray_memory_cell_area_width;
|
||||
double mat_height;
|
||||
double mat_width;
|
||||
double routing_area_height_within_bank;
|
||||
double routing_area_width_within_bank;
|
||||
double area_efficiency;
|
||||
// double perc_power_dyn_routing_to_bank;
|
||||
// double perc_power_dyn_addr_horizontal_htree;
|
||||
// double perc_power_dyn_datain_horizontal_htree;
|
||||
// double perc_power_dyn_dataout_horizontal_htree;
|
||||
// double perc_power_dyn_addr_vertical_htree;
|
||||
// double perc_power_dyn_datain_vertical_htree;
|
||||
// double perc_power_dyn_row_predecoder_drivers;
|
||||
// double perc_power_dyn_row_predecoder_blocks;
|
||||
// double perc_power_dyn_row_decoders;
|
||||
// double perc_power_dyn_bit_mux_predecoder_drivers;
|
||||
// double perc_power_dyn_bit_mux_predecoder_blocks;
|
||||
// double perc_power_dyn_bit_mux_decoders;
|
||||
// double perc_power_dyn_senseamp_mux_lev_1_predecoder_drivers;
|
||||
// double perc_power_dyn_senseamp_mux_lev_1_predecoder_blocks;
|
||||
// double perc_power_dyn_senseamp_mux_lev_1_decoders;
|
||||
// double perc_power_dyn_senseamp_mux_lev_2_predecoder_drivers;
|
||||
// double perc_power_dyn_senseamp_mux_lev_2_predecoder_blocks;
|
||||
// double perc_power_dyn_senseamp_mux_lev_2_decoders;
|
||||
// double perc_power_dyn_bitlines;
|
||||
// double perc_power_dyn_sense_amps;
|
||||
// double perc_power_dyn_prechg_eq_drivers;
|
||||
// double perc_power_dyn_subarray_output_drivers;
|
||||
// double perc_power_dyn_dataout_vertical_htree;
|
||||
// double perc_power_dyn_comparators;
|
||||
// double perc_power_dyn_crossbar;
|
||||
// double perc_power_dyn_spent_outside_mats;
|
||||
// double perc_power_leak_routing_to_bank;
|
||||
// double perc_power_leak_addr_horizontal_htree;
|
||||
// double perc_power_leak_datain_horizontal_htree;
|
||||
// double perc_power_leak_dataout_horizontal_htree;
|
||||
// double perc_power_leak_addr_vertical_htree;
|
||||
// double perc_power_leak_datain_vertical_htree;
|
||||
// double perc_power_leak_row_predecoder_drivers;
|
||||
// double perc_power_leak_row_predecoder_blocks;
|
||||
// double perc_power_leak_row_decoders;
|
||||
// double perc_power_leak_bit_mux_predecoder_drivers;
|
||||
// double perc_power_leak_bit_mux_predecoder_blocks;
|
||||
// double perc_power_leak_bit_mux_decoders;
|
||||
// double perc_power_leak_senseamp_mux_lev_1_predecoder_drivers;
|
||||
// double perc_power_leak_senseamp_mux_lev_1_predecoder_blocks;
|
||||
// double perc_power_leak_senseamp_mux_lev_1_decoders;
|
||||
// double perc_power_leak_senseamp_mux_lev_2_predecoder_drivers;
|
||||
// double perc_power_leak_senseamp_mux_lev_2_predecoder_blocks;
|
||||
// double perc_power_leak_senseamp_mux_lev_2_decoders;
|
||||
// double perc_power_leak_bitlines;
|
||||
// double perc_power_leak_sense_amps;
|
||||
// double perc_power_leak_prechg_eq_drivers;
|
||||
// double perc_power_leak_subarray_output_drivers;
|
||||
// double perc_power_leak_dataout_vertical_htree;
|
||||
// double perc_power_leak_comparators;
|
||||
// double perc_power_leak_crossbar;
|
||||
// double perc_leak_mats;
|
||||
// double perc_active_mats;
|
||||
double refresh_power;
|
||||
double dram_refresh_period;
|
||||
double dram_array_availability;
|
||||
double dyn_read_energy_from_closed_page;
|
||||
double dyn_read_energy_from_open_page;
|
||||
double leak_power_subbank_closed_page;
|
||||
double leak_power_subbank_open_page;
|
||||
double leak_power_request_and_reply_networks;
|
||||
double activate_energy;
|
||||
double read_energy;
|
||||
double write_energy;
|
||||
double precharge_energy;
|
||||
} results_mem_array;
|
||||
|
||||
|
||||
class uca_org_t
|
||||
{
|
||||
public:
|
||||
mem_array * tag_array2;
|
||||
mem_array * data_array2;
|
||||
double access_time;
|
||||
double cycle_time;
|
||||
double area;
|
||||
double area_efficiency;
|
||||
powerDef power;
|
||||
double leak_power_with_sleep_transistors_in_mats;
|
||||
double cache_ht;
|
||||
double cache_len;
|
||||
char file_n[100];
|
||||
double vdd_periph_global;
|
||||
bool valid;
|
||||
results_mem_array tag_array;
|
||||
results_mem_array data_array;
|
||||
|
||||
uca_org_t();
|
||||
void find_delay();
|
||||
void find_energy();
|
||||
void find_area();
|
||||
void find_cyc();
|
||||
void adjust_area();//for McPAT only to adjust routing overhead
|
||||
void cleanup();
|
||||
~uca_org_t(){};
|
||||
};
|
||||
|
||||
void reconfigure(InputParameter *local_interface, uca_org_t *fin_res);
|
||||
|
||||
uca_org_t cacti_interface(const string & infile_name);
|
||||
//McPAT's plain interface, please keep !!!
|
||||
uca_org_t cacti_interface(InputParameter * const local_interface);
|
||||
//McPAT's plain interface, please keep !!!
|
||||
uca_org_t init_interface(InputParameter * const local_interface);
|
||||
//McPAT's plain interface, please keep !!!
|
||||
uca_org_t cacti_interface(
|
||||
int cache_size,
|
||||
int line_size,
|
||||
int associativity,
|
||||
int rw_ports,
|
||||
int excl_read_ports,
|
||||
int excl_write_ports,
|
||||
int single_ended_read_ports,
|
||||
int search_ports,
|
||||
int banks,
|
||||
double tech_node,
|
||||
int output_width,
|
||||
int specific_tag,
|
||||
int tag_width,
|
||||
int access_mode,
|
||||
int cache,
|
||||
int main_mem,
|
||||
int obj_func_delay,
|
||||
int obj_func_dynamic_power,
|
||||
int obj_func_leakage_power,
|
||||
int obj_func_cycle_time,
|
||||
int obj_func_area,
|
||||
int dev_func_delay,
|
||||
int dev_func_dynamic_power,
|
||||
int dev_func_leakage_power,
|
||||
int dev_func_area,
|
||||
int dev_func_cycle_time,
|
||||
int ed_ed2_none, // 0 - ED, 1 - ED^2, 2 - use weight and deviate
|
||||
int temp,
|
||||
int wt, //0 - default(search across everything), 1 - global, 2 - 5% delay penalty, 3 - 10%, 4 - 20 %, 5 - 30%, 6 - low-swing
|
||||
int data_arr_ram_cell_tech_flavor_in,
|
||||
int data_arr_peri_global_tech_flavor_in,
|
||||
int tag_arr_ram_cell_tech_flavor_in,
|
||||
int tag_arr_peri_global_tech_flavor_in,
|
||||
int interconnect_projection_type_in,
|
||||
int wire_inside_mat_type_in,
|
||||
int wire_outside_mat_type_in,
|
||||
int REPEATERS_IN_HTREE_SEGMENTS_in,
|
||||
int VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in,
|
||||
int BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in,
|
||||
int PAGE_SIZE_BITS_in,
|
||||
int BURST_LENGTH_in,
|
||||
int INTERNAL_PREFETCH_WIDTH_in,
|
||||
int force_wiretype,
|
||||
int wiretype,
|
||||
int force_config,
|
||||
int ndwl,
|
||||
int ndbl,
|
||||
int nspd,
|
||||
int ndcm,
|
||||
int ndsam1,
|
||||
int ndsam2,
|
||||
int ecc);
|
||||
// int cache_size,
|
||||
// int line_size,
|
||||
// int associativity,
|
||||
// int rw_ports,
|
||||
// int excl_read_ports,
|
||||
// int excl_write_ports,
|
||||
// int single_ended_read_ports,
|
||||
// int banks,
|
||||
// double tech_node,
|
||||
// int output_width,
|
||||
// int specific_tag,
|
||||
// int tag_width,
|
||||
// int access_mode,
|
||||
// int cache,
|
||||
// int main_mem,
|
||||
// int obj_func_delay,
|
||||
// int obj_func_dynamic_power,
|
||||
// int obj_func_leakage_power,
|
||||
// int obj_func_area,
|
||||
// int obj_func_cycle_time,
|
||||
// int dev_func_delay,
|
||||
// int dev_func_dynamic_power,
|
||||
// int dev_func_leakage_power,
|
||||
// int dev_func_area,
|
||||
// int dev_func_cycle_time,
|
||||
// int temp,
|
||||
// int data_arr_ram_cell_tech_flavor_in,
|
||||
// int data_arr_peri_global_tech_flavor_in,
|
||||
// int tag_arr_ram_cell_tech_flavor_in,
|
||||
// int tag_arr_peri_global_tech_flavor_in,
|
||||
// int interconnect_projection_type_in,
|
||||
// int wire_inside_mat_type_in,
|
||||
// int wire_outside_mat_type_in,
|
||||
// int REPEATERS_IN_HTREE_SEGMENTS_in,
|
||||
// int VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in,
|
||||
// int BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in,
|
||||
//// double MAXAREACONSTRAINT_PERC_in,
|
||||
//// double MAXACCTIMECONSTRAINT_PERC_in,
|
||||
//// double MAX_PERC_DIFF_IN_DELAY_FROM_BEST_DELAY_REPEATER_SOLUTION_in,
|
||||
// int PAGE_SIZE_BITS_in,
|
||||
// int BURST_LENGTH_in,
|
||||
// int INTERNAL_PREFETCH_WIDTH_in);
|
||||
|
||||
//Naveen's interface
|
||||
uca_org_t cacti_interface(
|
||||
int cache_size,
|
||||
int line_size,
|
||||
int associativity,
|
||||
int rw_ports,
|
||||
int excl_read_ports,
|
||||
int excl_write_ports,
|
||||
int single_ended_read_ports,
|
||||
int banks,
|
||||
double tech_node,
|
||||
int page_sz,
|
||||
int burst_length,
|
||||
int pre_width,
|
||||
int output_width,
|
||||
int specific_tag,
|
||||
int tag_width,
|
||||
int access_mode, //0 normal, 1 seq, 2 fast
|
||||
int cache, //scratch ram or cache
|
||||
int main_mem,
|
||||
int obj_func_delay,
|
||||
int obj_func_dynamic_power,
|
||||
int obj_func_leakage_power,
|
||||
int obj_func_area,
|
||||
int obj_func_cycle_time,
|
||||
int dev_func_delay,
|
||||
int dev_func_dynamic_power,
|
||||
int dev_func_leakage_power,
|
||||
int dev_func_area,
|
||||
int dev_func_cycle_time,
|
||||
int ed_ed2_none, // 0 - ED, 1 - ED^2, 2 - use weight and deviate
|
||||
int temp,
|
||||
int wt, //0 - default(search across everything), 1 - global, 2 - 5% delay penalty, 3 - 10%, 4 - 20 %, 5 - 30%, 6 - low-swing
|
||||
int data_arr_ram_cell_tech_flavor_in,
|
||||
int data_arr_peri_global_tech_flavor_in,
|
||||
int tag_arr_ram_cell_tech_flavor_in,
|
||||
int tag_arr_peri_global_tech_flavor_in,
|
||||
int interconnect_projection_type_in, // 0 - aggressive, 1 - normal
|
||||
int wire_inside_mat_type_in,
|
||||
int wire_outside_mat_type_in,
|
||||
int is_nuca, // 0 - UCA, 1 - NUCA
|
||||
int core_count,
|
||||
int cache_level, // 0 - L2, 1 - L3
|
||||
int nuca_bank_count,
|
||||
int nuca_obj_func_delay,
|
||||
int nuca_obj_func_dynamic_power,
|
||||
int nuca_obj_func_leakage_power,
|
||||
int nuca_obj_func_area,
|
||||
int nuca_obj_func_cycle_time,
|
||||
int nuca_dev_func_delay,
|
||||
int nuca_dev_func_dynamic_power,
|
||||
int nuca_dev_func_leakage_power,
|
||||
int nuca_dev_func_area,
|
||||
int nuca_dev_func_cycle_time,
|
||||
int REPEATERS_IN_HTREE_SEGMENTS_in,//TODO for now only wires with repeaters are supported
|
||||
int p_input);
|
||||
|
||||
class mem_array
|
||||
{
|
||||
public:
|
||||
int Ndcm;
|
||||
int Ndwl;
|
||||
int Ndbl;
|
||||
double Nspd;
|
||||
int deg_bl_muxing;
|
||||
int Ndsam_lev_1;
|
||||
int Ndsam_lev_2;
|
||||
double access_time;
|
||||
double cycle_time;
|
||||
double multisubbank_interleave_cycle_time;
|
||||
double area_ram_cells;
|
||||
double area;
|
||||
powerDef power;
|
||||
double delay_senseamp_mux_decoder;
|
||||
double delay_before_subarray_output_driver;
|
||||
double delay_from_subarray_output_driver_to_output;
|
||||
double height;
|
||||
double width;
|
||||
|
||||
double mat_height;
|
||||
double mat_length;
|
||||
double subarray_length;
|
||||
double subarray_height;
|
||||
|
||||
double delay_route_to_bank,
|
||||
delay_input_htree,
|
||||
delay_row_predecode_driver_and_block,
|
||||
delay_row_decoder,
|
||||
delay_bitlines,
|
||||
delay_sense_amp,
|
||||
delay_subarray_output_driver,
|
||||
delay_dout_htree,
|
||||
delay_comparator,
|
||||
delay_matchlines;
|
||||
|
||||
double all_banks_height,
|
||||
all_banks_width,
|
||||
area_efficiency;
|
||||
|
||||
powerDef power_routing_to_bank;
|
||||
powerDef power_addr_input_htree;
|
||||
powerDef power_data_input_htree;
|
||||
powerDef power_data_output_htree;
|
||||
powerDef power_htree_in_search;
|
||||
powerDef power_htree_out_search;
|
||||
powerDef power_row_predecoder_drivers;
|
||||
powerDef power_row_predecoder_blocks;
|
||||
powerDef power_row_decoders;
|
||||
powerDef power_bit_mux_predecoder_drivers;
|
||||
powerDef power_bit_mux_predecoder_blocks;
|
||||
powerDef power_bit_mux_decoders;
|
||||
powerDef power_senseamp_mux_lev_1_predecoder_drivers;
|
||||
powerDef power_senseamp_mux_lev_1_predecoder_blocks;
|
||||
powerDef power_senseamp_mux_lev_1_decoders;
|
||||
powerDef power_senseamp_mux_lev_2_predecoder_drivers;
|
||||
powerDef power_senseamp_mux_lev_2_predecoder_blocks;
|
||||
powerDef power_senseamp_mux_lev_2_decoders;
|
||||
powerDef power_bitlines;
|
||||
powerDef power_sense_amps;
|
||||
powerDef power_prechg_eq_drivers;
|
||||
powerDef power_output_drivers_at_subarray;
|
||||
powerDef power_dataout_vertical_htree;
|
||||
powerDef power_comparators;
|
||||
|
||||
powerDef power_cam_bitline_precharge_eq_drv;
|
||||
powerDef power_searchline;
|
||||
powerDef power_searchline_precharge;
|
||||
powerDef power_matchlines;
|
||||
powerDef power_matchline_precharge;
|
||||
powerDef power_matchline_to_wordline_drv;
|
||||
|
||||
min_values_t *arr_min;
|
||||
enum Wire_type wt;
|
||||
|
||||
// dram stats
|
||||
double activate_energy, read_energy, write_energy, precharge_energy,
|
||||
refresh_power, leak_power_subbank_closed_page, leak_power_subbank_open_page,
|
||||
leak_power_request_and_reply_networks;
|
||||
|
||||
double precharge_delay;
|
||||
|
||||
static bool lt(const mem_array * m1, const mem_array * m2);
|
||||
};
|
||||
|
||||
|
||||
#endif
|
236
ext/mcpat/cacti/component.cc
Normal file
236
ext/mcpat/cacti/component.cc
Normal file
|
@ -0,0 +1,236 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
|
||||
#include "bank.h"
|
||||
#include "component.h"
|
||||
#include "decoder.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
|
||||
// A fresh component: area, power and rt_power are value-initialised and the
// delay starts at zero.
Component::Component()
  : area(),
    power(),
    rt_power(),
    delay(0)
{
}
|
||||
|
||||
|
||||
|
||||
// Nothing to do here: the destructor body is empty.
Component::~Component() {}
|
||||
|
||||
|
||||
|
||||
double Component::compute_diffusion_width(int num_stacked_in, int num_folded_tr)
|
||||
{
|
||||
double w_poly = g_ip->F_sz_um;
|
||||
double spacing_poly_to_poly = g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact;
|
||||
double total_diff_w = 2 * spacing_poly_to_poly + // for both source and drain
|
||||
num_stacked_in * w_poly +
|
||||
(num_stacked_in - 1) * g_tp.spacing_poly_to_poly;
|
||||
|
||||
if (num_folded_tr > 1)
|
||||
{
|
||||
total_diff_w += (num_folded_tr - 2) * 2 * spacing_poly_to_poly +
|
||||
(num_folded_tr - 1) * num_stacked_in * w_poly +
|
||||
(num_folded_tr - 1) * (num_stacked_in - 1) * g_tp.spacing_poly_to_poly;
|
||||
}
|
||||
|
||||
return total_diff_w;
|
||||
}
|
||||
|
||||
|
||||
|
||||
double Component::compute_gate_area(
|
||||
int gate_type,
|
||||
int num_inputs,
|
||||
double w_pmos,
|
||||
double w_nmos,
|
||||
double h_gate)
|
||||
{
|
||||
if (w_pmos <= 0.0 || w_nmos <= 0.0)
|
||||
{
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
double w_folded_pmos, w_folded_nmos;
|
||||
int num_folded_pmos, num_folded_nmos;
|
||||
double total_ndiff_w, total_pdiff_w;
|
||||
Area gate;
|
||||
|
||||
double h_tr_region = h_gate - 2 * g_tp.HPOWERRAIL;
|
||||
double ratio_p_to_n = w_pmos / (w_pmos + w_nmos);
|
||||
|
||||
if (ratio_p_to_n >= 1 || ratio_p_to_n <= 0)
|
||||
{
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
w_folded_pmos = (h_tr_region - g_tp.MIN_GAP_BET_P_AND_N_DIFFS) * ratio_p_to_n;
|
||||
w_folded_nmos = (h_tr_region - g_tp.MIN_GAP_BET_P_AND_N_DIFFS) * (1 - ratio_p_to_n);
|
||||
assert(w_folded_pmos > 0);
|
||||
|
||||
num_folded_pmos = (int) (ceil(w_pmos / w_folded_pmos));
|
||||
num_folded_nmos = (int) (ceil(w_nmos / w_folded_nmos));
|
||||
|
||||
switch (gate_type)
|
||||
{
|
||||
case INV:
|
||||
total_ndiff_w = compute_diffusion_width(1, num_folded_nmos);
|
||||
total_pdiff_w = compute_diffusion_width(1, num_folded_pmos);
|
||||
break;
|
||||
|
||||
case NOR:
|
||||
total_ndiff_w = compute_diffusion_width(1, num_inputs * num_folded_nmos);
|
||||
total_pdiff_w = compute_diffusion_width(num_inputs, num_folded_pmos);
|
||||
break;
|
||||
|
||||
case NAND:
|
||||
total_ndiff_w = compute_diffusion_width(num_inputs, num_folded_nmos);
|
||||
total_pdiff_w = compute_diffusion_width(1, num_inputs * num_folded_pmos);
|
||||
break;
|
||||
default:
|
||||
cout << "Unknown gate type: " << gate_type << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
gate.w = MAX(total_ndiff_w, total_pdiff_w);
|
||||
|
||||
if (w_folded_nmos > w_nmos)
|
||||
{
|
||||
//means that the height of the gate can
|
||||
//be made smaller than the input height specified, so calculate the height of the gate.
|
||||
gate.h = w_nmos + w_pmos + g_tp.MIN_GAP_BET_P_AND_N_DIFFS + 2 * g_tp.HPOWERRAIL;
|
||||
}
|
||||
else
|
||||
{
|
||||
gate.h = h_gate;
|
||||
}
|
||||
return gate.get_area();
|
||||
}
|
||||
|
||||
|
||||
|
||||
double Component::compute_tr_width_after_folding(
|
||||
double input_width,
|
||||
double threshold_folding_width)
|
||||
{//This is actually the width of the cell not the width of a device.
|
||||
//The width of a cell and the width of a device is orthogonal.
|
||||
if (input_width <= 0)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
int num_folded_tr = (int) (ceil(input_width / threshold_folding_width));
|
||||
double spacing_poly_to_poly = g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact;
|
||||
double width_poly = g_ip->F_sz_um;
|
||||
double total_diff_width = num_folded_tr * width_poly + (num_folded_tr + 1) * spacing_poly_to_poly;
|
||||
|
||||
return total_diff_width;
|
||||
}
|
||||
|
||||
|
||||
|
||||
double Component::height_sense_amplifier(double pitch_sense_amp)
|
||||
{
|
||||
// compute the height occupied by all PMOS transistors
|
||||
double h_pmos_tr = compute_tr_width_after_folding(g_tp.w_sense_p, pitch_sense_amp) * 2 +
|
||||
compute_tr_width_after_folding(g_tp.w_iso, pitch_sense_amp) +
|
||||
2 * g_tp.MIN_GAP_BET_SAME_TYPE_DIFFS;
|
||||
|
||||
// compute the height occupied by all NMOS transistors
|
||||
double h_nmos_tr = compute_tr_width_after_folding(g_tp.w_sense_n, pitch_sense_amp) * 2 +
|
||||
compute_tr_width_after_folding(g_tp.w_sense_en, pitch_sense_amp) +
|
||||
2 * g_tp.MIN_GAP_BET_SAME_TYPE_DIFFS;
|
||||
|
||||
// compute total height by considering gap between the p and n diffusion areas
|
||||
return h_pmos_tr + h_nmos_tr + g_tp.MIN_GAP_BET_P_AND_N_DIFFS;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int Component::logical_effort(
|
||||
int num_gates_min,
|
||||
double g,
|
||||
double F,
|
||||
double * w_n,
|
||||
double * w_p,
|
||||
double C_load,
|
||||
double p_to_n_sz_ratio,
|
||||
bool is_dram_,
|
||||
bool is_wl_tr_,
|
||||
double max_w_nmos)
|
||||
{
|
||||
int num_gates = (int) (log(F) / log(fopt));
|
||||
|
||||
// check if num_gates is odd. if so, add 1 to make it even
|
||||
num_gates+= (num_gates % 2) ? 1 : 0;
|
||||
num_gates = MAX(num_gates, num_gates_min);
|
||||
|
||||
// recalculate the effective fanout of each stage
|
||||
double f = pow(F, 1.0 / num_gates);
|
||||
int i = num_gates - 1;
|
||||
double C_in = C_load / f;
|
||||
w_n[i] = (1.0 / (1.0 + p_to_n_sz_ratio)) * C_in / gate_C(1, 0, is_dram_, false, is_wl_tr_);
|
||||
w_n[i] = MAX(w_n[i], g_tp.min_w_nmos_);
|
||||
w_p[i] = p_to_n_sz_ratio * w_n[i];
|
||||
|
||||
if (w_n[i] > max_w_nmos)
|
||||
{
|
||||
double C_ld = gate_C((1 + p_to_n_sz_ratio) * max_w_nmos, 0, is_dram_, false, is_wl_tr_);
|
||||
F = g * C_ld / gate_C(w_n[0] + w_p[0], 0, is_dram_, false, is_wl_tr_);
|
||||
num_gates = (int) (log(F) / log(fopt)) + 1;
|
||||
num_gates+= (num_gates % 2) ? 1 : 0;
|
||||
num_gates = MAX(num_gates, num_gates_min);
|
||||
f = pow(F, 1.0 / (num_gates - 1));
|
||||
i = num_gates - 1;
|
||||
w_n[i] = max_w_nmos;
|
||||
w_p[i] = p_to_n_sz_ratio * w_n[i];
|
||||
}
|
||||
|
||||
for (i = num_gates - 2; i >= 1; i--)
|
||||
{
|
||||
w_n[i] = MAX(w_n[i+1] / f, g_tp.min_w_nmos_);
|
||||
w_p[i] = p_to_n_sz_ratio * w_n[i];
|
||||
}
|
||||
|
||||
assert(num_gates <= MAX_NUMBER_GATES_STAGE);
|
||||
return num_gates;
|
||||
}
|
||||
|
84
ext/mcpat/cacti/component.h
Normal file
84
ext/mcpat/cacti/component.h
Normal file
|
@ -0,0 +1,84 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
|
||||
#ifndef __COMPONENT_H__
|
||||
#define __COMPONENT_H__
|
||||
|
||||
#include "area.h"
|
||||
#include "parameter.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
class Crossbar;
|
||||
class Bank;
|
||||
|
||||
class Component
|
||||
{
|
||||
public:
|
||||
Component();
|
||||
~Component();
|
||||
|
||||
Area area;
|
||||
powerDef power,rt_power;
|
||||
double delay;
|
||||
double cycle_time;
|
||||
|
||||
double compute_gate_area(
|
||||
int gate_type,
|
||||
int num_inputs,
|
||||
double w_pmos,
|
||||
double w_nmos,
|
||||
double h_gate);
|
||||
|
||||
double compute_tr_width_after_folding(double input_width, double threshold_folding_width);
|
||||
double height_sense_amplifier(double pitch_sense_amp);
|
||||
|
||||
protected:
|
||||
int logical_effort(
|
||||
int num_gates_min,
|
||||
double g,
|
||||
double F,
|
||||
double * w_n,
|
||||
double * w_p,
|
||||
double C_load,
|
||||
double p_to_n_sz_ratio,
|
||||
bool is_dram_,
|
||||
bool is_wl_tr_,
|
||||
double max_w_nmos);
|
||||
|
||||
private:
|
||||
double compute_diffusion_width(int num_stacked_in, int num_folded_tr);
|
||||
};
|
||||
|
||||
#endif
|
||||
|
270
ext/mcpat/cacti/const.h
Normal file
270
ext/mcpat/cacti/const.h
Normal file
|
@ -0,0 +1,270 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef __CONST_H__
|
||||
#define __CONST_H__
|
||||
|
||||
#include <math.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/* The following are things you might want to change
|
||||
* when compiling
|
||||
*/
|
||||
|
||||
/*
|
||||
* Address bits in a word, and number of output bits from the cache
|
||||
*/
|
||||
|
||||
/*
|
||||
was: #define ADDRESS_BITS 32
|
||||
now: I'm using 42 bits as in the Power4,
|
||||
since that's bigger than the 36 bits on the Pentium 4
|
||||
and 40 bits on the Opteron
|
||||
*/
|
||||
const int ADDRESS_BITS = 42;
|
||||
|
||||
/*dt: In addition to the tag bits, the tags also include 1 valid bit, 1 dirty bit, 2 bits for a 4-state
|
||||
cache coherency protocol (MESI), 1 bit for MRU (change this to log(ways) for full LRU).
|
||||
So in total we have 1 + 1 + 2 + 1 = 5 */
|
||||
const int EXTRA_TAG_BITS = 5;
|
||||
|
||||
/* limits on the various N parameters */
|
||||
|
||||
const unsigned int MAXDATAN = 512; // maximum for Ndwl and Ndbl
|
||||
const unsigned int MAXSUBARRAYS = 1048576; // maximum subarrays for data and tag arrays
|
||||
const unsigned int MAXDATASPD = 256; // maximum for Nspd
|
||||
const unsigned int MAX_COL_MUX = 256;
|
||||
|
||||
|
||||
|
||||
#define ROUTER_TYPES 3
|
||||
#define WIRE_TYPES 6
|
||||
|
||||
const double Cpolywire = 0;
|
||||
|
||||
|
||||
/* Threshold voltages (as a proportion of Vdd)
|
||||
If you don't know them, set all values to 0.5 */
|
||||
#define VTHFA1 0.452
|
||||
#define VTHFA2 0.304
|
||||
#define VTHFA3 0.420
|
||||
#define VTHFA4 0.413
|
||||
#define VTHFA5 0.405
|
||||
#define VTHFA6 0.452
|
||||
#define VSINV 0.452
|
||||
#define VTHCOMPINV 0.437
|
||||
#define VTHMUXNAND 0.548 // TODO : this constant must be revisited
|
||||
#define VTHEVALINV 0.452
|
||||
#define VTHSENSEEXTDRV 0.438
|
||||
|
||||
|
||||
//WmuxdrvNANDn and WmuxdrvNANDp are no longer being used but they are part of the old
|
||||
//delay_comparator function which we are using exactly as it used to be, so just setting these to 0
|
||||
const double WmuxdrvNANDn = 0;
|
||||
const double WmuxdrvNANDp = 0;
|
||||
|
||||
|
||||
/*===================================================================*/
|
||||
/*
|
||||
* The following are things you probably wouldn't want to change.
|
||||
*/
|
||||
|
||||
#define BIGNUM 1e30
|
||||
#define INF 9999999
|
||||
#define MAX(a,b) (((a)>(b))?(a):(b))
|
||||
#define MIN(a,b) (((a)<(b))?(a):(b))
|
||||
|
||||
/* Used to communicate with the horowitz model */
|
||||
#define RISE 1
|
||||
#define FALL 0
|
||||
#define NCH 1
|
||||
#define PCH 0
|
||||
|
||||
|
||||
#define EPSILON 0.5 //v4.1: This constant is being used in order to fix floating point -> integer
|
||||
//conversion problems that were occurring within CACTI. The typical problem that was occurring was
|
||||
//that with different compilers a floating point number like 3.0 would get represented as either
|
||||
//2.9999....or 3.00000001 and then the integer part of the floating point number (3.0) would
|
||||
//be computed differently depending on the compiler. What we are doing now is to replace
|
||||
//int (x) with (int) (x+EPSILON) where EPSILON is 0.5. This would fix such problems. Note that
|
||||
//this works only when x is an integer >= 0.
|
||||
/*
|
||||
* Sheng thinks this is more a solution to solve the simple truncate problem
|
||||
* (http://www.cs.tut.fi/~jkorpela/round.html) rather than the problem mentioned above.
|
||||
* Unfortunately, this solution causes nasty bugs (different results when using O0 and O3).
|
||||
* Moreover, round is not correct in CACTI since when an extra fraction of bit/line is needed,
|
||||
* we need to provide a complete bit/line even if the fraction is just 0.01.
|
||||
* So, in versions later than 6.5 we use (int)ceil() for the double to int conversion.
|
||||
*/
|
||||
|
||||
#define EPSILON2 0.1
|
||||
#define EPSILON3 0.6
|
||||
|
||||
|
||||
#define MINSUBARRAYROWS 16 //For simplicity in modeling, for the row decoding structure, we assume
|
||||
//that each row predecode block is composed of at least one 2-4 decoder. When the outputs from the
|
||||
//row predecode blocks are combined this means that there are at least 4*4=16 row decode outputs
|
||||
#define MAXSUBARRAYROWS 262144 //Each row predecode block produces a max of 2^9 outputs. So
|
||||
//the maximum number of row decode outputs will be 2^9*2^9
|
||||
#define MINSUBARRAYCOLS 2
|
||||
#define MAXSUBARRAYCOLS 262144
|
||||
|
||||
|
||||
#define INV 0
|
||||
#define NOR 1
|
||||
#define NAND 2
|
||||
|
||||
|
||||
#define NUMBER_TECH_FLAVORS 4
|
||||
|
||||
#define NUMBER_INTERCONNECT_PROJECTION_TYPES 2 //aggressive and conservative
|
||||
//0 = Aggressive projections, 1 = Conservative projections
|
||||
#define NUMBER_WIRE_TYPES 4 //local, semi-global and global
|
||||
//1 = 'Semi-global' wire type, 2 = 'Global' wire type
|
||||
|
||||
|
||||
const int dram_cell_tech_flavor = 3;
|
||||
|
||||
|
||||
#define VBITSENSEMIN 0.08 //minimum bitline sense voltage is fixed to be 80 mV.
|
||||
|
||||
#define fopt 4.0
|
||||
|
||||
#define INPUT_WIRE_TO_INPUT_GATE_CAP_RATIO 0
|
||||
#define BUFFER_SEPARATION_LENGTH_MULTIPLIER 1
|
||||
#define NUMBER_MATS_PER_REDUNDANT_MAT 8
|
||||
|
||||
#define NUMBER_STACKED_DIE_LAYERS 1
|
||||
|
||||
// this variable can be set to carry out solution optimization for
|
||||
// a maximum area allocation.
|
||||
#define STACKED_DIE_LAYER_ALLOTED_AREA_mm2 0 //6.24 //6.21//71.5
|
||||
|
||||
// this variable can also be employed when solution optimization
|
||||
// with maximum area allocation is carried out.
|
||||
#define MAX_PERCENT_AWAY_FROM_ALLOTED_AREA 50
|
||||
|
||||
// this variable can also be employed when solution optimization
|
||||
// with maximum area allocation is carried out.
|
||||
#define MIN_AREA_EFFICIENCY 20
|
||||
|
||||
// this variable can be employed when solution with a desired
|
||||
// aspect ratio is required.
|
||||
#define STACKED_DIE_LAYER_ASPECT_RATIO 1
|
||||
|
||||
// this variable can be employed when solution with a desired
|
||||
// aspect ratio is required.
|
||||
#define MAX_PERCENT_AWAY_FROM_ASPECT_RATIO 101
|
||||
|
||||
// this variable can be employed to carry out solution optimization
|
||||
// for a certain target random cycle time.
|
||||
#define TARGET_CYCLE_TIME_ns 1000000000
|
||||
|
||||
#define NUMBER_PIPELINE_STAGES 4
|
||||
|
||||
// this can be used to model the length of interconnect
|
||||
// between a bank and a crossbar
|
||||
#define LENGTH_INTERCONNECT_FROM_BANK_TO_CROSSBAR 0 //3791 // 2880//micron
|
||||
|
||||
#define IS_CROSSBAR 0
|
||||
#define NUMBER_INPUT_PORTS_CROSSBAR 8
|
||||
#define NUMBER_OUTPUT_PORTS_CROSSBAR 8
|
||||
#define NUMBER_SIGNALS_PER_PORT_CROSSBAR 256
|
||||
|
||||
|
||||
#define MAT_LEAKAGE_REDUCTION_DUE_TO_SLEEP_TRANSISTORS_FACTOR 1
|
||||
#define LEAKAGE_REDUCTION_DUE_TO_LONG_CHANNEL_HP_TRANSISTORS_FACTOR 1
|
||||
|
||||
#define PAGE_MODE 0
|
||||
|
||||
#define MAIN_MEM_PER_CHIP_STANDBY_CURRENT_mA 60
|
||||
// We are actually not using this variable in the CACTI code. We just want to acknowledge that
|
||||
// this current should be multiplied by the DDR(n) system VDD value to compute the standby power
|
||||
// consumed during precharge.
|
||||
|
||||
|
||||
const double VDD_STORAGE_LOSS_FRACTION_WORST = 0.125;
|
||||
const double CU_RESISTIVITY = 0.022; //ohm-micron
|
||||
const double BULK_CU_RESISTIVITY = 0.018; //ohm-micron
|
||||
const double PERMITTIVITY_FREE_SPACE = 8.854e-18; //F/micron
|
||||
|
||||
const static uint32_t sram_num_cells_wl_stitching_ = 16;
|
||||
const static uint32_t dram_num_cells_wl_stitching_ = 64;
|
||||
const static uint32_t comm_dram_num_cells_wl_stitching_ = 256;
|
||||
const static double num_bits_per_ecc_b_ = 8.0;
|
||||
|
||||
const double bit_to_byte = 8.0;
|
||||
|
||||
#define MAX_NUMBER_GATES_STAGE 20
|
||||
#define MAX_NUMBER_HTREE_NODES 20
|
||||
#define NAND2_LEAK_STACK_FACTOR 0.2
|
||||
#define NAND3_LEAK_STACK_FACTOR 0.2
|
||||
#define NOR2_LEAK_STACK_FACTOR 0.2
|
||||
#define INV_LEAK_STACK_FACTOR 0.5
|
||||
#define MAX_NUMBER_ARRAY_PARTITIONS 1000000
|
||||
|
||||
// abbreviations used in this project
|
||||
// ----------------------------------
|
||||
//
|
||||
// num : number
|
||||
// rw : read/write
|
||||
// rd : read
|
||||
// wr : write
|
||||
// se : single-ended
|
||||
// sz : size
|
||||
// F : feature
|
||||
// w : width
|
||||
// h : height or horizontal
|
||||
// v : vertical or velocity
|
||||
|
||||
|
||||
// Numeric identifiers for the RAM cell technology types. The values are
// spelled out explicitly because they are used as plain integers elsewhere.
enum ram_cell_tech_type_num
{
    itrs_hp   = 0,  // high performance
    itrs_lstp = 1,  // low standby power
    itrs_lop  = 2,  // low operating power
    lp_dram   = 3,  // presumably low-power DRAM - confirm
    comm_dram = 4   // presumably commodity DRAM - confirm
};
|
||||
|
||||
const double pppm[4] = {1,1,1,1};
|
||||
const double pppm_lkg[4] = {0,1,1,0};
|
||||
const double pppm_dyn[4] = {1,0,0,0};
|
||||
const double pppm_Isub[4] = {0,1,0,0};
|
||||
const double pppm_Ig[4] = {0,0,1,0};
|
||||
const double pppm_sc[4] = {0,0,0,1};
|
||||
|
||||
|
||||
|
||||
#endif
|
126
ext/mcpat/cacti/contention.dat
Executable file
126
ext/mcpat/cacti/contention.dat
Executable file
|
@ -0,0 +1,126 @@
|
|||
l34c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l34c64l2b: 9 11 19 29 43 62 81 102
|
||||
l34c64l4b: 6 8 12 17 24 29 39 47
|
||||
l34c64l8b: 7 8 10 14 18 22 25 30
|
||||
l34c64l16b: 7 7 9 12 14 17 20 24
|
||||
l34c64l32b: 7 7 9 12 14 17 20 24 -r
|
||||
l34c64l64b: 7 7 9 12 14 17 20 24 -r
|
||||
l34c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l34c128l2b: 4 10 19 30 44 64 82 103
|
||||
l34c128l4b: 3 6 11 17 24 31 38 47
|
||||
l34c128l8b: 3 5 9 13 17 21 25 29
|
||||
l34c128l16b: 4 5 7 10 13 16 19 22
|
||||
l34c128l32b: 4 5 7 10 13 16 19 22 -r
|
||||
l34c128l64b: 4 5 7 10 13 16 19 22 -r
|
||||
l34c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l34c256l2b: 3 10 19 30 44 63 82 103
|
||||
l34c256l4b: 3 6 11 17 24 31 38 47
|
||||
l34c256l8b: 2 5 8 12 16 20 24 29
|
||||
l34c256l16b: 2 4 7 9 12 15 18 21
|
||||
l34c256l32b: 2 4 7 9 12 15 18 21 -r
|
||||
l34c256l64b: 2 4 7 9 12 15 18 21 -r
|
||||
l38c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l38c64l2b: 57 59 77 90 137 187 219 245
|
||||
l38c64l4b: 35 40 48 56 43 61 80 101
|
||||
l38c64l8b: 18 27 41 45 52 58 58 58 -r
|
||||
l38c64l16b: 16 17 19 35 40 49 53 53 -r
|
||||
l38c64l32b: 15 15 17 19 22 25 30 30 -r
|
||||
l38c64l64b: 15 15 17 19 22 25 30 30 -r
|
||||
l38c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l38c128l2b: 38 50 78 93 139 188 220 245
|
||||
l38c128l4b: 29 37 46 56 43 61 81 102
|
||||
l38c128l8b: 16 30 39 44 50 57 57 57 -r
|
||||
l38c128l16b: 14 16 19 33 40 47 52 52 -r
|
||||
l38c128l32b: 14 15 17 20 23 27 31 31 -r
|
||||
l38c128l64b: 14 15 17 20 23 27 31 31 -r
|
||||
l38c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l38c256l2b: 35 50 78 94 139 188 220 246
|
||||
l38c256l4b: 28 36 45 55 55 61 81 102
|
||||
l38c256l8b: 17 30 38 43 50 57 57 57 -r
|
||||
l38c256l16b: 15 17 21 32 40 47 51 51
|
||||
l38c256l32b: 15 17 19 21 24 29 33 33
|
||||
l38c256l64b: 15 17 19 21 24 29 33 33 -r
|
||||
l316c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l316c64l2b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l316c64l4b: 34 35 78 126 178 220 252 274
|
||||
l316c64l8b: 9 11 23 43 62 87 105 130
|
||||
l316c64l16b: 7 9 13 23 33 45 56 67
|
||||
l316c64l32b: 5 6 7 10 13 19 25 30
|
||||
l316c64l64b: 4 5 6 8 10 14 18 21
|
||||
l316c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l316c128l2b: 25 131 243 1000 1000 1000 1000 1000
|
||||
l316c128l4b: 8 28 79 127 179 221 253 274
|
||||
l316c128l8b: 4 9 22 43 62 88 106 131
|
||||
l316c128l16b: 4 6 11 21 32 44 55 67
|
||||
l316c128l32b: 4 6 11 12 12 18 24 29
|
||||
l316c128l64b: 2 3 5 7 9 13 17 21
|
||||
l316c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l316c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l316c256l4b: 5 28 80 128 180 221 253 274
|
||||
l316c256l8b: 3 8 22 43 63 88 107 131
|
||||
l316c256l16b: 2 5 11 21 32 44 55 67
|
||||
l316c256l32b: 2 3 5 8 12 18 24 29
|
||||
l316c256l64b: 2 3 4 6 9 13 17 21
|
||||
l24c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l24c64l2b: 10 12 24 41 60 86 105 122
|
||||
l24c64l4b: 5 7 13 20 29 38 47 56
|
||||
l24c64l8b: 5 6 9 14 18 24 29 35
|
||||
l24c64l16b: 4 5 7 10 12 16 19 22
|
||||
l24c64l32b: 5 5 6 8 10 12 14 17
|
||||
l24c64l64b: 5 5 6 8 10 12 14 16
|
||||
l24c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l24c128l2b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l24c128l4b: 3 7 13 20 29 38 47 57
|
||||
l24c128l8b: 3 5 9 13 18 23 29 35
|
||||
l24c128l16b: 3 4 6 9 12 15 19 22
|
||||
l24c128l32b: 3 4 5 7 9 11 14 16
|
||||
l24c128l64b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l24c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l24c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l24c256l4b: 2 6 13 20 29 38 47 57
|
||||
l24c256l8b: 2 4 8 13 18 23 28 35
|
||||
l24c256l16b: 2 3 6 8 11 15 18 22
|
||||
l24c256l32b: 2 3 5 6 8 11 14 16
|
||||
l24c256l64b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l28c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l28c64l2b: 46 52 117 157 188 225 246 261
|
||||
l28c64l4b: 19 25 39 54 96 107 120 150
|
||||
l28c64l8b: 9 12 21 30 39 47 58 79
|
||||
l28c64l16b: 8 9 11 16 25 32 37 42
|
||||
l28c64l32b: 7 8 9 11 14 19 23 28
|
||||
l28c64l64b: 7 7 8 10 12 14 18 22
|
||||
l28c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l28c128l2b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l28c128l4b: 12 22 39 54 98 108 130 151
|
||||
l28c128l8b: 7 12 21 30 39 48 59 80
|
||||
l28c128l16b: 6 8 11 16 24 31 37 42
|
||||
l28c128l32b: 6 7 9 11 14 19 24 28
|
||||
l28c128l64b: 6 7 9 11 14 19 24 28
|
||||
l28c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l28c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l28c256l4b: 12 22 39 54 100 108 130 152
|
||||
l28c256l8b: 7 12 21 30 39 48 59 81
|
||||
l28c256l16b: 6 8 11 16 24 31 37 42
|
||||
l28c256l32b: 6 7 9 11 14 19 24 28
|
||||
l28c256l64b: 6 7 9 11 14 19 24 28
|
||||
l216c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l216c64l2b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l216c64l4b: 34 35 78 126 178 220 252 274
|
||||
l216c64l8b: 9 11 23 43 62 87 105 130
|
||||
l216c64l16b: 7 9 13 23 33 45 56 67
|
||||
l216c64l32b: 5 6 7 10 13 19 25 30
|
||||
l216c64l64b: 4 5 6 8 10 14 18 21
|
||||
l216c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l216c128l2b: 25 131 243 1000 1000 1000 1000 1000
|
||||
l216c128l4b: 8 28 79 127 179 221 253 274
|
||||
l216c128l8b: 4 9 22 43 62 88 106 131
|
||||
l216c128l16b: 4 6 11 21 32 44 55 67
|
||||
l216c128l32b: 4 6 11 12 12 18 24 29
|
||||
l216c128l64b: 2 3 5 7 9 13 17 21
|
||||
l216c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l216c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000
|
||||
l216c256l4b: 5 28 80 128 180 221 253 274
|
||||
l216c256l8b: 3 8 22 43 63 88 107 131
|
||||
l216c256l16b: 2 5 11 21 32 44 55 67
|
||||
l216c256l32b: 2 3 5 8 12 18 24 29
|
||||
l216c256l64b: 2 3 4 6 9 13 17 21
|
161
ext/mcpat/cacti/crossbar.cc
Normal file
161
ext/mcpat/cacti/crossbar.cc
Normal file
|
@ -0,0 +1,161 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
#include "crossbar.h"
|
||||
|
||||
#define ASPECT_THRESHOLD .8
|
||||
#define ADJ 1
|
||||
|
||||
// Builds a crossbar with n_inp_ inputs, n_out_ outputs and flit_size_ bits
// per flit, using the device parameters in dt. The minimum PMOS width is
// derived from the device's n-to-p drive ratio and the minimum NMOS width;
// the cross-point height adjustment factor CB_ADJ starts at 1.
// The tri-state sizes and capacitances are not initialized here; they are
// filled in by output_buffer().
Crossbar::Crossbar(
    double n_inp_,
    double n_out_,
    double flit_size_,
    TechnologyParameter::DeviceType *dt)
    : n_inp(n_inp_),
      n_out(n_out_),
      flit_size(flit_size_),
      CB_ADJ(1),
      deviceType(dt),
      min_w_pmos(dt->n_to_p_eff_curr_drv_ratio*g_tp.min_w_nmos_),
      Vdd(dt->Vdd)
{
}
|
||||
|
||||
// Empty destructor: the deviceType pointer is not deleted here.
Crossbar::~Crossbar()
{
}
|
||||
|
||||
double Crossbar::output_buffer()
|
||||
{
|
||||
|
||||
//Wire winit(4, 4);
|
||||
double l_eff = n_inp*flit_size*g_tp.wire_outside_mat.pitch;
|
||||
Wire w1(g_ip->wt, l_eff);
|
||||
//double s1 = w1.repeater_size *l_eff*ADJ/w1.repeater_spacing;
|
||||
double s1 = w1.repeater_size * (l_eff <w1.repeater_spacing? l_eff *ADJ/w1.repeater_spacing : ADJ);
|
||||
double pton_size = deviceType->n_to_p_eff_curr_drv_ratio;
|
||||
// the model assumes input capacitance of the wire driver = input capacitance of nand + nor = input cap of the driver transistor
|
||||
TriS1 = s1*(1 + pton_size)/(2 + pton_size + 1 + 2*pton_size);
|
||||
TriS2 = s1; //driver transistor
|
||||
|
||||
if (TriS1 < 1)
|
||||
TriS1 = 1;
|
||||
|
||||
double input_cap = gate_C(TriS1*(2*min_w_pmos + g_tp.min_w_nmos_), 0) +
|
||||
gate_C(TriS1*(min_w_pmos + 2*g_tp.min_w_nmos_), 0);
|
||||
// input_cap += drain_C_(TriS1*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
|
||||
// drain_C_(TriS1*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)*2 +
|
||||
// gate_C(TriS2*g_tp.min_w_nmos_, 0)+
|
||||
// drain_C_(TriS1*min_w_pmos, NCH, 1, 1, g_tp.cell_h_def)*2 +
|
||||
// drain_C_(TriS1*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
|
||||
// gate_C(TriS2*min_w_pmos, 0);
|
||||
tri_int_cap = drain_C_(TriS1*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
|
||||
drain_C_(TriS1*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)*2 +
|
||||
gate_C(TriS2*g_tp.min_w_nmos_, 0)+
|
||||
drain_C_(TriS1*min_w_pmos, NCH, 1, 1, g_tp.cell_h_def)*2 +
|
||||
drain_C_(TriS1*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
|
||||
gate_C(TriS2*min_w_pmos, 0);
|
||||
double output_cap = drain_C_(TriS2*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
|
||||
drain_C_(TriS2*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def);
|
||||
double ctr_cap = gate_C(TriS2 *(min_w_pmos + g_tp.min_w_nmos_), 0);
|
||||
|
||||
tri_inp_cap = input_cap;
|
||||
tri_out_cap = output_cap;
|
||||
tri_ctr_cap = ctr_cap;
|
||||
return input_cap + output_cap + ctr_cap;
|
||||
}
|
||||
|
||||
// Computes the crossbar's area (area.w, area.h), dynamic / leakage /
// gate-leakage power (power.readOp) and delay.
//
// While the aspect ratio is below ASPECT_THRESHOLD and both port counts
// exceed 2, CB_ADJ is increased by 0.2 and, as long as it stays below 4,
// the function calls itself again to redo the computation.
//
// NOTE(review): after a recursive call returns, this frame keeps running
// and overwrites power.readOp and delay using its own w1/w2 locals (built
// with the previous CB_ADJ), while area.w/area.h hold the values written by
// the innermost call - confirm this mix is intended.
void Crossbar::compute_power()
|
||||
{
|
||||
|
||||
// Local is not referenced afterwards; presumably constructed for its side
// effects on shared Wire state - confirm against the Wire constructor.
Wire winit(4, 4);
|
||||
// Sizes the tri-state buffer; fills TriS1/TriS2 and the tri_*_cap members.
double tri_cap = output_buffer();
|
||||
assert(tri_cap > 0);
|
||||
//area of a tristate logic
|
||||
// NOTE(review): compute_gate_area() is declared as (gate_type, num_inputs,
// w_pmos, w_nmos, h_gate) in component.h, but the three calls below pass the
// NMOS-based width first and the PMOS-based width second - confirm.
double g_area = compute_gate_area(INV, 1, TriS2*g_tp.min_w_nmos_, TriS2*min_w_pmos, g_tp.cell_h_def);
|
||||
g_area *= 2; // to model area of output transistors
|
||||
g_area += compute_gate_area (NAND, 2, TriS1*2*g_tp.min_w_nmos_, TriS1*min_w_pmos, g_tp.cell_h_def);
|
||||
g_area += compute_gate_area (NOR, 2, TriS1*g_tp.min_w_nmos_, TriS1*2*min_w_pmos, g_tp.cell_h_def);
|
||||
// A larger CB_ADJ makes each cross-point cell taller and therefore narrower.
double width /*per tristate*/ = g_area/(CB_ADJ * g_tp.cell_h_def);
|
||||
// effective no. of tristate buffers that need to be laid side by side
|
||||
int ntri = (int)ceil(g_tp.cell_h_def/(g_tp.wire_outside_mat.pitch));
|
||||
// Crossbar width: the larger of the buffer-limited and the wire-pitch-limited
// extent across all outputs.
double wire_len = MAX(width*ntri*n_out, flit_size*g_tp.wire_outside_mat.pitch*n_out);
|
||||
Wire w1(g_ip->wt, wire_len);
|
||||
|
||||
area.w = wire_len;
|
||||
area.h = g_tp.wire_outside_mat.pitch*n_inp*flit_size * CB_ADJ;
|
||||
Wire w2(g_ip->wt, area.h);
|
||||
|
||||
// Port-count-normalized aspect ratio, folded into (0, 1].
double aspect_ratio_cb = (area.h/area.w)*(n_out/n_inp);
|
||||
if (aspect_ratio_cb > 1) aspect_ratio_cb = 1/aspect_ratio_cb;
|
||||
|
||||
if (aspect_ratio_cb < ASPECT_THRESHOLD) {
|
||||
if (n_out > 2 && n_inp > 2) {
|
||||
CB_ADJ+=0.2;
|
||||
//cout << "CB ADJ " << CB_ADJ << endl;
|
||||
if (CB_ADJ < 4) {
|
||||
this->compute_power();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Dynamic energy: both wires plus the tri-state capacitances switched at
// Vdd, scaled by the flit size.
power.readOp.dynamic = (w1.power.readOp.dynamic + w2.power.readOp.dynamic + (tri_inp_cap * n_out + tri_out_cap * n_inp + tri_ctr_cap + tri_int_cap) * Vdd*Vdd)*flit_size;
|
||||
// Subthreshold leakage of the inverter/nand/nor stages and both wires,
// multiplied by n_inp * n_out * flit_size.
power.readOp.leakage = n_inp * n_out * flit_size * (
|
||||
cmos_Isub_leakage(g_tp.min_w_nmos_*TriS2*2, min_w_pmos*TriS2*2, 1, inv) *Vdd+
|
||||
cmos_Isub_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nand)*Vdd+
|
||||
cmos_Isub_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nor) *Vdd+
|
||||
w1.power.readOp.leakage + w2.power.readOp.leakage);
|
||||
// Gate leakage, same structure as the subthreshold term above.
power.readOp.gate_leakage = n_inp * n_out * flit_size * (
|
||||
cmos_Ig_leakage(g_tp.min_w_nmos_*TriS2*2, min_w_pmos*TriS2*2, 1, inv) *Vdd+
|
||||
cmos_Ig_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nand)*Vdd+
|
||||
cmos_Ig_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nor) *Vdd+
|
||||
w1.power.readOp.gate_leakage + w2.power.readOp.gate_leakage);
|
||||
|
||||
// delay calculation
|
||||
double l_eff = n_inp*flit_size*g_tp.wire_outside_mat.pitch;
|
||||
Wire wdriver(g_ip->wt, l_eff);
|
||||
// RC of the wire over area.w + area.h, driven by the repeater-sized NMOS and
// loaded by the tri-state input/output capacitances.
double res = g_tp.wire_outside_mat.R_per_um * (area.w+area.h) + tr_R_on(g_tp.min_w_nmos_*wdriver.repeater_size, NCH, 1);
|
||||
double cap = g_tp.wire_outside_mat.C_per_um * (area.w + area.h) + n_out*tri_inp_cap + n_inp*tri_out_cap;
|
||||
delay = horowitz(w1.signal_rise_time(), res*cap, deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, RISE);
|
||||
|
||||
// NOTE(review): this statement is parsed as the declaration of a function
// named wreset returning Wire (most vexing parse); no Wire object is
// constructed. If a reset through Wire's default constructor was intended,
// it does not happen - confirm.
Wire wreset();
|
||||
}
|
||||
|
||||
void Crossbar::print_crossbar()
|
||||
{
|
||||
cout << "\nCrossbar Stats (" << n_inp << "x" << n_out << ")\n\n";
|
||||
cout << "Flit size : " << flit_size << " bits" << endl;
|
||||
cout << "Width : " << area.w << " u" << endl;
|
||||
cout << "Height : " << area.h << " u" << endl;
|
||||
cout << "Dynamic Power : " << power.readOp.dynamic*1e9 * MIN(n_inp, n_out) << " (nJ)" << endl;
|
||||
cout << "Leakage Power : " << power.readOp.leakage*1e3 << " (mW)" << endl;
|
||||
cout << "Gate Leakage Power : " << power.readOp.gate_leakage*1e3 << " (mW)" << endl;
|
||||
cout << "Crossbar Delay : " << delay*1e12 << " ps\n";
|
||||
}
|
||||
|
||||
|
85
ext/mcpat/cacti/crossbar.h
Normal file
85
ext/mcpat/cacti/crossbar.h
Normal file
|
@ -0,0 +1,85 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
#ifndef __CROSSBAR__
|
||||
#define __CROSSBAR__
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "basic_circuit.h"
|
||||
#include "cacti_interface.h"
|
||||
#include "component.h"
|
||||
#include "mat.h"
|
||||
#include "parameter.h"
|
||||
#include "wire.h"
|
||||
|
||||
class Crossbar : public Component
|
||||
{
|
||||
public:
|
||||
Crossbar(
|
||||
double in,
|
||||
double out,
|
||||
double flit_sz,
|
||||
TechnologyParameter::DeviceType *dt = &(g_tp.peri_global));
|
||||
~Crossbar();
|
||||
|
||||
void print_crossbar();
|
||||
double output_buffer();
|
||||
void compute_power();
|
||||
|
||||
double n_inp, n_out;
|
||||
double flit_size;
|
||||
double tri_inp_cap, tri_out_cap, tri_ctr_cap, tri_int_cap;
|
||||
|
||||
private:
|
||||
double CB_ADJ;
|
||||
/*
|
||||
* Adjust factor of the height of the cross-point (tri-state buffer) cell (layout) in crossbar
|
||||
* buffer is adjusted to get an aspect ratio of whole cross bar close to one;
|
||||
* when adjust the ratio, the number of wires route over the tri-state buffers does not change,
|
||||
* however, the effective wiring pitch changes. Specifically, since CB_ADJ will increase
|
||||
* during the adjust, the tri-state buffer will become taller and thiner, and the effective wiring pitch
|
||||
* will increase. As a result, the height of the crossbar (area.h) will increase.
|
||||
*/
|
||||
|
||||
TechnologyParameter::DeviceType *deviceType;
|
||||
double TriS1, TriS2;
|
||||
double min_w_pmos, Vdd;
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
1577
ext/mcpat/cacti/decoder.cc
Normal file
1577
ext/mcpat/cacti/decoder.cc
Normal file
File diff suppressed because it is too large
Load diff
247
ext/mcpat/cacti/decoder.h
Normal file
247
ext/mcpat/cacti/decoder.h
Normal file
|
@ -0,0 +1,247 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
#ifndef __DECODER_H__
|
||||
#define __DECODER_H__
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "area.h"
|
||||
#include "component.h"
|
||||
#include "parameter.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
class Decoder : public Component
|
||||
{
|
||||
public:
|
||||
Decoder(
|
||||
int _num_dec_signals,
|
||||
bool flag_way_select,
|
||||
double _C_ld_dec_out,
|
||||
double _R_wire_dec_out,
|
||||
bool fully_assoc_,
|
||||
bool is_dram_,
|
||||
bool is_wl_tr_,
|
||||
const Area & cell_);
|
||||
|
||||
bool exist;
|
||||
int num_in_signals;
|
||||
double C_ld_dec_out;
|
||||
double R_wire_dec_out;
|
||||
int num_gates;
|
||||
int num_gates_min;
|
||||
double w_dec_n[MAX_NUMBER_GATES_STAGE];
|
||||
double w_dec_p[MAX_NUMBER_GATES_STAGE];
|
||||
double delay;
|
||||
//powerDef power;
|
||||
bool fully_assoc;
|
||||
bool is_dram;
|
||||
bool is_wl_tr;
|
||||
const Area & cell;
|
||||
|
||||
|
||||
void compute_widths();
|
||||
void compute_area();
|
||||
double compute_delays(double inrisetime); // return outrisetime
|
||||
|
||||
void leakage_feedback(double temperature);
|
||||
};
|
||||
|
||||
|
||||
|
||||
class PredecBlk : public Component
|
||||
{
|
||||
public:
|
||||
PredecBlk(
|
||||
int num_dec_signals,
|
||||
Decoder * dec,
|
||||
double C_wire_predec_blk_out,
|
||||
double R_wire_predec_blk_out,
|
||||
int num_dec_per_predec,
|
||||
bool is_dram_,
|
||||
bool is_blk1);
|
||||
|
||||
Decoder * dec;
|
||||
bool exist;
|
||||
int number_input_addr_bits;
|
||||
double C_ld_predec_blk_out;
|
||||
double R_wire_predec_blk_out;
|
||||
int branch_effort_nand2_gate_output;
|
||||
int branch_effort_nand3_gate_output;
|
||||
bool flag_two_unique_paths;
|
||||
int flag_L2_gate;
|
||||
int number_inputs_L1_gate;
|
||||
int number_gates_L1_nand2_path;
|
||||
int number_gates_L1_nand3_path;
|
||||
int number_gates_L2;
|
||||
int min_number_gates_L1;
|
||||
int min_number_gates_L2;
|
||||
int num_L1_active_nand2_path;
|
||||
int num_L1_active_nand3_path;
|
||||
double w_L1_nand2_n[MAX_NUMBER_GATES_STAGE];
|
||||
double w_L1_nand2_p[MAX_NUMBER_GATES_STAGE];
|
||||
double w_L1_nand3_n[MAX_NUMBER_GATES_STAGE];
|
||||
double w_L1_nand3_p[MAX_NUMBER_GATES_STAGE];
|
||||
double w_L2_n[MAX_NUMBER_GATES_STAGE];
|
||||
double w_L2_p[MAX_NUMBER_GATES_STAGE];
|
||||
double delay_nand2_path;
|
||||
double delay_nand3_path;
|
||||
powerDef power_nand2_path;
|
||||
powerDef power_nand3_path;
|
||||
powerDef power_L2;
|
||||
|
||||
bool is_dram_;
|
||||
|
||||
void compute_widths();
|
||||
void compute_area();
|
||||
|
||||
void leakage_feedback(double temperature);
|
||||
|
||||
pair<double, double> compute_delays(pair<double, double> inrisetime); // <nand2, nand3>
|
||||
// return <outrise_nand2, outrise_nand3>
|
||||
};
|
||||
|
||||
|
||||
class PredecBlkDrv : public Component
|
||||
{
|
||||
public:
|
||||
PredecBlkDrv(
|
||||
int way_select,
|
||||
PredecBlk * blk_,
|
||||
bool is_dram);
|
||||
|
||||
int flag_driver_exists;
|
||||
int number_input_addr_bits;
|
||||
int number_gates_nand2_path;
|
||||
int number_gates_nand3_path;
|
||||
int min_number_gates;
|
||||
int num_buffers_driving_1_nand2_load;
|
||||
int num_buffers_driving_2_nand2_load;
|
||||
int num_buffers_driving_4_nand2_load;
|
||||
int num_buffers_driving_2_nand3_load;
|
||||
int num_buffers_driving_8_nand3_load;
|
||||
int num_buffers_nand3_path;
|
||||
double c_load_nand2_path_out;
|
||||
double c_load_nand3_path_out;
|
||||
double r_load_nand2_path_out;
|
||||
double r_load_nand3_path_out;
|
||||
double width_nand2_path_n[MAX_NUMBER_GATES_STAGE];
|
||||
double width_nand2_path_p[MAX_NUMBER_GATES_STAGE];
|
||||
double width_nand3_path_n[MAX_NUMBER_GATES_STAGE];
|
||||
double width_nand3_path_p[MAX_NUMBER_GATES_STAGE];
|
||||
double delay_nand2_path;
|
||||
double delay_nand3_path;
|
||||
powerDef power_nand2_path;
|
||||
powerDef power_nand3_path;
|
||||
|
||||
PredecBlk * blk;
|
||||
Decoder * dec;
|
||||
bool is_dram_;
|
||||
int way_select;
|
||||
|
||||
void compute_widths();
|
||||
void compute_area();
|
||||
|
||||
void leakage_feedback(double temperature);
|
||||
|
||||
|
||||
pair<double, double> compute_delays(
|
||||
double inrisetime_nand2_path,
|
||||
double inrisetime_nand3_path); // return <outrise_nand2, outrise_nand3>
|
||||
|
||||
inline int num_addr_bits_nand2_path()
|
||||
{
|
||||
return num_buffers_driving_1_nand2_load +
|
||||
num_buffers_driving_2_nand2_load +
|
||||
num_buffers_driving_4_nand2_load;
|
||||
}
|
||||
inline int num_addr_bits_nand3_path()
|
||||
{
|
||||
return num_buffers_driving_2_nand3_load +
|
||||
num_buffers_driving_8_nand3_load;
|
||||
}
|
||||
double get_rdOp_dynamic_E(int num_act_mats_hor_dir);
|
||||
};
|
||||
|
||||
|
||||
|
||||
class Predec : public Component
|
||||
{
|
||||
public:
|
||||
Predec(
|
||||
PredecBlkDrv * drv1,
|
||||
PredecBlkDrv * drv2);
|
||||
|
||||
double compute_delays(double inrisetime); // return outrisetime
|
||||
|
||||
void leakage_feedback(double temperature);
|
||||
PredecBlk * blk1;
|
||||
PredecBlk * blk2;
|
||||
PredecBlkDrv * drv1;
|
||||
PredecBlkDrv * drv2;
|
||||
|
||||
powerDef block_power;
|
||||
powerDef driver_power;
|
||||
|
||||
private:
|
||||
// returns <delay, risetime>
|
||||
pair<double, double> get_max_delay_before_decoder(
|
||||
pair<double, double> input_pair1,
|
||||
pair<double, double> input_pair2);
|
||||
};
|
||||
|
||||
|
||||
|
||||
class Driver : public Component
|
||||
{
|
||||
public:
|
||||
Driver(double c_gate_load_, double c_wire_load_, double r_wire_load_, bool is_dram);
|
||||
|
||||
int number_gates;
|
||||
int min_number_gates;
|
||||
double width_n[MAX_NUMBER_GATES_STAGE];
|
||||
double width_p[MAX_NUMBER_GATES_STAGE];
|
||||
double c_gate_load;
|
||||
double c_wire_load;
|
||||
double r_wire_load;
|
||||
double delay;
|
||||
powerDef power;
|
||||
bool is_dram_;
|
||||
|
||||
void compute_widths();
|
||||
double compute_delay(double inrisetime);
|
||||
};
|
||||
|
||||
|
||||
#endif
|
641
ext/mcpat/cacti/htree2.cc
Normal file
641
ext/mcpat/cacti/htree2.cc
Normal file
|
@ -0,0 +1,641 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
#include "htree2.h"
|
||||
#include "wire.h"
|
||||
|
||||
/*
 * Builds one H-tree of the requested kind.
 *
 * The constructor records the geometry and bus widths, derives the minimum
 * transistor widths from the device type, and then runs in_htree() or
 * out_htree() depending on htree_type, with the wire bandwidth taken from
 * the bit count that matches that tree type. Requires ndbl >= 2 and
 * ndwl >= 2 (asserted).
 *
 * After the tree has been evaluated, the power computed so far is saved in
 * power_bit and the read dynamic term of the tree is then multiplied by
 * the initial wire bandwidth.
 */
Htree2::Htree2(
    enum Wire_type wire_model, double mat_w, double mat_h,
    int a_bits, int d_inbits, int search_data_in, int d_outbits,
    int search_data_out, int bl, int wl, enum Htree_type htree_type,
    bool uca_tree_, bool search_tree_, TechnologyParameter::DeviceType *dt)
    : in_rise_time(0),
      out_rise_time(0),
      tree_type(htree_type),
      mat_width(mat_w),
      mat_height(mat_h),
      add_bits(a_bits),
      data_in_bits(d_inbits),
      search_data_in_bits(search_data_in),
      data_out_bits(d_outbits),
      search_data_out_bits(search_data_out),
      ndbl(bl),
      ndwl(wl),
      uca_tree(uca_tree_),
      search_tree(search_tree_),
      wt(wire_model),
      deviceType(dt)
{
    // The degenerate single-mat case (ndbl == 1 or ndwl == 1) is not
    // handled here; callers must not construct a tree for it.
    assert(ndbl >= 2 && ndwl >= 2);

    max_unpipelined_link_delay = 0; //TODO
    min_w_nmos = g_tp.min_w_nmos_;
    min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio * min_w_nmos;

    switch (htree_type) {
      case Add_htree:
        init_wire_bw = add_bits;
        wire_bw = init_wire_bw;
        in_htree();
        break;

      case Data_in_htree:
        init_wire_bw = data_in_bits;
        wire_bw = init_wire_bw;
        in_htree();
        break;

      case Data_out_htree:
        init_wire_bw = data_out_bits;
        wire_bw = init_wire_bw;
        out_htree();
        break;

      case Search_in_htree:
        // in_search_tree is broadcast, out_htree is not.
        init_wire_bw = search_data_in_bits;
        wire_bw = init_wire_bw;
        in_htree();
        break;

      case Search_out_htree:
        init_wire_bw = search_data_out_bits;
        wire_bw = init_wire_bw;
        out_htree();
        break;

      default:
        assert(0);
        break;
    }

    // Keep a copy of the power before the read dynamic term is scaled by
    // the bus width below.
    power_bit = power;
    power.readOp.dynamic *= init_wire_bw;

    assert(power.readOp.dynamic >= 0);
    assert(power.readOp.leakage >= 0);
}
|
||||
|
||||
|
||||
|
||||
// nand gate sizing calculation
|
||||
void Htree2::input_nand(double s1, double s2, double l_eff)
|
||||
{
|
||||
Wire w1(wt, l_eff);
|
||||
double pton_size = deviceType->n_to_p_eff_curr_drv_ratio;
|
||||
// input capacitance of a repeater = input capacitance of nand.
|
||||
double nsize = s1*(1 + pton_size)/(2 + pton_size);
|
||||
nsize = (nsize < 1) ? 1 : nsize;
|
||||
|
||||
double tc = 2*tr_R_on(nsize*min_w_nmos, NCH, 1) *
|
||||
(drain_C_(nsize*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)*2 +
|
||||
2 * gate_C(s2*(min_w_nmos + min_w_pmos), 0));
|
||||
delay+= horowitz (w1.out_rise_time, tc,
|
||||
deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, RISE);
|
||||
power.readOp.dynamic += 0.5 *
|
||||
(2*drain_C_(pton_size * nsize*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
|
||||
+ drain_C_(nsize*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
|
||||
+ 2*gate_C(s2*(min_w_nmos + min_w_pmos), 0)) *
|
||||
deviceType->Vdd * deviceType->Vdd;
|
||||
|
||||
power.searchOp.dynamic += 0.5 *
|
||||
(2*drain_C_(pton_size * nsize*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
|
||||
+ drain_C_(nsize*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
|
||||
+ 2*gate_C(s2*(min_w_nmos + min_w_pmos), 0)) *
|
||||
deviceType->Vdd * deviceType->Vdd * wire_bw ;
|
||||
power.readOp.leakage += (wire_bw*cmos_Isub_leakage(min_w_nmos*(nsize*2), min_w_pmos * nsize * 2, 2, nand))*deviceType->Vdd;
|
||||
power.readOp.gate_leakage += (wire_bw*cmos_Ig_leakage(min_w_nmos*(nsize*2), min_w_pmos * nsize * 2, 2, nand))*deviceType->Vdd;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// tristate buffer model consisting of not, nand, nor, and driver transistors
|
||||
void Htree2::output_buffer(double s1, double s2, double l_eff)
|
||||
{
|
||||
Wire w1(wt, l_eff);
|
||||
double pton_size = deviceType->n_to_p_eff_curr_drv_ratio;
|
||||
// input capacitance of repeater = input capacitance of nand + nor.
|
||||
double size = s1*(1 + pton_size)/(2 + pton_size + 1 + 2*pton_size);
|
||||
double s_eff = //stage eff of a repeater in a wire
|
||||
(gate_C(s2*(min_w_nmos + min_w_pmos), 0) + w1.wire_cap(l_eff*1e-6,true))/
|
||||
gate_C(s2*(min_w_nmos + min_w_pmos), 0);
|
||||
double tr_size = gate_C(s1*(min_w_nmos + min_w_pmos), 0) * 1/2/(s_eff*gate_C(min_w_pmos, 0));
|
||||
size = (size < 1) ? 1 : size;
|
||||
|
||||
double res_nor = 2*tr_R_on(size*min_w_pmos, PCH, 1);
|
||||
double res_ptrans = tr_R_on(tr_size*min_w_nmos, NCH, 1);
|
||||
double cap_nand_out = drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def) +
|
||||
drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)*2 +
|
||||
gate_C(tr_size*min_w_pmos, 0);
|
||||
double cap_ptrans_out = 2 *(drain_C_(tr_size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
|
||||
drain_C_(tr_size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)) +
|
||||
gate_C(s1*(min_w_nmos + min_w_pmos), 0);
|
||||
|
||||
double tc = res_nor * cap_nand_out + (res_nor + res_ptrans) * cap_ptrans_out;
|
||||
|
||||
|
||||
delay += horowitz (w1.out_rise_time, tc,
|
||||
deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, RISE);
|
||||
|
||||
//nand
|
||||
power.readOp.dynamic += 0.5 *
|
||||
(2*drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
|
||||
drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def) +
|
||||
gate_C(tr_size*(min_w_pmos), 0)) *
|
||||
deviceType->Vdd * deviceType->Vdd;
|
||||
|
||||
power.searchOp.dynamic += 0.5 *
|
||||
(2*drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
|
||||
drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def) +
|
||||
gate_C(tr_size*(min_w_pmos), 0)) *
|
||||
deviceType->Vdd * deviceType->Vdd*init_wire_bw;
|
||||
|
||||
//not
|
||||
power.readOp.dynamic += 0.5 *
|
||||
(drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
|
||||
+drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
|
||||
+gate_C(size*(min_w_nmos + min_w_pmos), 0)) *
|
||||
deviceType->Vdd * deviceType->Vdd;
|
||||
|
||||
power.searchOp.dynamic += 0.5 *
|
||||
(drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
|
||||
+drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
|
||||
+gate_C(size*(min_w_nmos + min_w_pmos), 0)) *
|
||||
deviceType->Vdd * deviceType->Vdd*init_wire_bw;
|
||||
|
||||
//nor
|
||||
power.readOp.dynamic += 0.5 *
|
||||
(drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
|
||||
+ 2*drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
|
||||
+gate_C(tr_size*(min_w_nmos + min_w_pmos), 0)) *
|
||||
deviceType->Vdd * deviceType->Vdd;
|
||||
|
||||
power.searchOp.dynamic += 0.5 *
|
||||
(drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
|
||||
+ 2*drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
|
||||
+gate_C(tr_size*(min_w_nmos + min_w_pmos), 0)) *
|
||||
deviceType->Vdd * deviceType->Vdd*init_wire_bw;
|
||||
|
||||
//output transistor
|
||||
power.readOp.dynamic += 0.5 *
|
||||
((drain_C_(tr_size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
|
||||
+drain_C_(tr_size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def))*2
|
||||
+ gate_C(s1*(min_w_nmos + min_w_pmos), 0)) *
|
||||
deviceType->Vdd * deviceType->Vdd;
|
||||
|
||||
power.searchOp.dynamic += 0.5 *
|
||||
((drain_C_(tr_size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
|
||||
+drain_C_(tr_size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def))*2
|
||||
+ gate_C(s1*(min_w_nmos + min_w_pmos), 0)) *
|
||||
deviceType->Vdd * deviceType->Vdd*init_wire_bw;
|
||||
|
||||
if(uca_tree) {
|
||||
power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*tr_size*2, min_w_pmos*tr_size*2, 1, inv)*deviceType->Vdd*wire_bw;/*inverter + output tr*/
|
||||
power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nand)*deviceType->Vdd*wire_bw;//nand
|
||||
power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nor)*deviceType->Vdd*wire_bw;//nor
|
||||
|
||||
power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*tr_size*2, min_w_pmos*tr_size*2, 1, inv)*deviceType->Vdd*wire_bw;/*inverter + output tr*/
|
||||
power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nand)*deviceType->Vdd*wire_bw;//nand
|
||||
power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nor)*deviceType->Vdd*wire_bw;//nor
|
||||
//power.readOp.gate_leakage *=;
|
||||
}
|
||||
else {
|
||||
power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*tr_size*2, min_w_pmos*tr_size*2, 1, inv)*deviceType->Vdd*wire_bw;/*inverter + output tr*/
|
||||
power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nand)*deviceType->Vdd*wire_bw;//nand
|
||||
power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nor)*deviceType->Vdd*wire_bw;//nor
|
||||
|
||||
power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*tr_size*2, min_w_pmos*tr_size*2, 1, inv)*deviceType->Vdd*wire_bw;/*inverter + output tr*/
|
||||
power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nand)*deviceType->Vdd*wire_bw;//nand
|
||||
power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nor)*deviceType->Vdd*wire_bw;//nor
|
||||
//power.readOp.gate_leakage *=deviceType->Vdd*wire_bw;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* calculates the input h-tree delay/power
|
||||
* A nand gate is used at each node to
|
||||
* limit the signal
|
||||
* The area of an unbalanced htree (rows != columns)
|
||||
* depends on how data is traversed.
|
||||
* In the following function, if ( no. of rows < no. of columns),
|
||||
* then data first traverse in excess hor. links until vertical
|
||||
* and horizontal nodes are same.
|
||||
* If no. of rows is bigger, then data traverse in
|
||||
* a hor. link followed by a ver. link in a repeated
|
||||
* fashion (similar to a balanced tree) until there are no
|
||||
* hor. links left. After this it goes through the remaining vertical
|
||||
* links.
|
||||
*/
|
||||
void
|
||||
Htree2::in_htree()
|
||||
{
|
||||
//temp var
|
||||
double s1 = 0, s2 = 0, s3 = 0;
|
||||
double l_eff = 0;
|
||||
Wire *wtemp1 = 0, *wtemp2 = 0, *wtemp3 = 0;
|
||||
double len = 0, ht = 0;
|
||||
int option = 0;
|
||||
|
||||
int h = (int) _log2(ndwl/2); // horizontal nodes
|
||||
int v = (int) _log2(ndbl/2); // vertical nodes
|
||||
double len_temp;
|
||||
double ht_temp;
|
||||
if (uca_tree)
|
||||
{//Sheng: this computation do not consider the wires that route from edge to middle.
|
||||
ht_temp = (mat_height*ndbl/2 +/* since uca_tree models interbank tree, mat_height => bank height */
|
||||
((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
|
||||
2 * (1-pow(0.5,h))))/2;
|
||||
len_temp = (mat_width*ndwl/2 +
|
||||
((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
|
||||
2 * (1-pow(0.5,v))))/2;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (ndwl == ndbl) {
|
||||
ht_temp = ((mat_height*ndbl/2) +
|
||||
((add_bits + (search_data_in_bits + search_data_out_bits))* (ndbl/2-1) * g_tp.wire_outside_mat.pitch) +
|
||||
((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
|
||||
)/2;
|
||||
len_temp = (mat_width*ndwl/2 +
|
||||
((add_bits + (search_data_in_bits + search_data_out_bits)) * (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
|
||||
((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
|
||||
}
|
||||
else if (ndwl > ndbl) {
|
||||
double excess_part = (_log2(ndwl/2) - _log2(ndbl/2));
|
||||
ht_temp = ((mat_height*ndbl/2) +
|
||||
((add_bits + + (search_data_in_bits + search_data_out_bits)) * ((ndbl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
|
||||
(data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch *
|
||||
(2*(1 - pow(0.5, h-v)) + pow(0.5, v-h) * v))/2;
|
||||
len_temp = (mat_width*ndwl/2 +
|
||||
((add_bits + (search_data_in_bits + search_data_out_bits))* (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
|
||||
((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
|
||||
}
|
||||
else {
|
||||
double excess_part = (_log2(ndbl/2) - _log2(ndwl/2));
|
||||
ht_temp = ((mat_height*ndbl/2) +
|
||||
((add_bits + (search_data_in_bits + search_data_out_bits))* ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
|
||||
((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
|
||||
)/2;
|
||||
len_temp = (mat_width*ndwl/2 +
|
||||
((add_bits + (search_data_in_bits + search_data_out_bits)) * ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
|
||||
(data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * (h + 2*(1-pow(0.5, v-h))))/2;
|
||||
}
|
||||
}
|
||||
|
||||
area.h = ht_temp * 2;
|
||||
area.w = len_temp * 2;
|
||||
delay = 0;
|
||||
power.readOp.dynamic = 0;
|
||||
power.readOp.leakage = 0;
|
||||
power.searchOp.dynamic =0;
|
||||
len = len_temp;
|
||||
ht = ht_temp/2;
|
||||
|
||||
while (v > 0 || h > 0)
|
||||
{
|
||||
if (wtemp1) delete wtemp1;
|
||||
if (wtemp2) delete wtemp2;
|
||||
if (wtemp3) delete wtemp3;
|
||||
|
||||
if (h > v)
|
||||
{
|
||||
//the iteration considers only one horizontal link
|
||||
wtemp1 = new Wire(wt, len); // hor
|
||||
wtemp2 = new Wire(wt, len/2); // ver
|
||||
len_temp = len;
|
||||
len /= 2;
|
||||
wtemp3 = 0;
|
||||
h--;
|
||||
option = 0;
|
||||
}
|
||||
else if (v>0 && h>0)
|
||||
{
|
||||
//considers one horizontal link and one vertical link
|
||||
wtemp1 = new Wire(wt, len); // hor
|
||||
wtemp2 = new Wire(wt, ht); // ver
|
||||
wtemp3 = new Wire(wt, len/2); // next hor
|
||||
len_temp = len;
|
||||
ht_temp = ht;
|
||||
len /= 2;
|
||||
ht /= 2;
|
||||
v--;
|
||||
h--;
|
||||
option = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
// considers only one vertical link
|
||||
assert(h == 0);
|
||||
wtemp1 = new Wire(wt, ht); // ver
|
||||
wtemp2 = new Wire(wt, ht/2); // hor
|
||||
ht_temp = ht;
|
||||
ht /= 2;
|
||||
wtemp3 = 0;
|
||||
v--;
|
||||
option = 2;
|
||||
}
|
||||
|
||||
delay += wtemp1->delay;
|
||||
power.readOp.dynamic += wtemp1->power.readOp.dynamic;
|
||||
power.searchOp.dynamic += wtemp1->power.readOp.dynamic*wire_bw;
|
||||
power.readOp.leakage += wtemp1->power.readOp.leakage*wire_bw;
|
||||
power.readOp.gate_leakage += wtemp1->power.readOp.gate_leakage*wire_bw;
|
||||
if ((uca_tree == false && option == 2) || search_tree==true)
|
||||
{
|
||||
wire_bw*=2; // wire bandwidth doubles only for vertical branches
|
||||
}
|
||||
|
||||
if (uca_tree == false)
|
||||
{
|
||||
if (len_temp > wtemp1->repeater_spacing)
|
||||
{
|
||||
s1 = wtemp1->repeater_size;
|
||||
l_eff = wtemp1->repeater_spacing;
|
||||
}
|
||||
else
|
||||
{
|
||||
s1 = (len_temp/wtemp1->repeater_spacing) * wtemp1->repeater_size;
|
||||
l_eff = len_temp;
|
||||
}
|
||||
|
||||
if (ht_temp > wtemp2->repeater_spacing)
|
||||
{
|
||||
s2 = wtemp2->repeater_size;
|
||||
}
|
||||
else
|
||||
{
|
||||
s2 = (len_temp/wtemp2->repeater_spacing) * wtemp2->repeater_size;
|
||||
}
|
||||
// first level
|
||||
input_nand(s1, s2, l_eff);
|
||||
}
|
||||
|
||||
|
||||
if (option != 1)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// second level
|
||||
delay += wtemp2->delay;
|
||||
power.readOp.dynamic += wtemp2->power.readOp.dynamic;
|
||||
power.searchOp.dynamic += wtemp2->power.readOp.dynamic*wire_bw;
|
||||
power.readOp.leakage += wtemp2->power.readOp.leakage*wire_bw;
|
||||
power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
|
||||
|
||||
if (uca_tree)
|
||||
{
|
||||
power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw);
|
||||
power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
|
||||
}
|
||||
else
|
||||
{
|
||||
power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw);
|
||||
power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
|
||||
wire_bw*=2;
|
||||
|
||||
if (ht_temp > wtemp3->repeater_spacing)
|
||||
{
|
||||
s3 = wtemp3->repeater_size;
|
||||
l_eff = wtemp3->repeater_spacing;
|
||||
}
|
||||
else
|
||||
{
|
||||
s3 = (len_temp/wtemp3->repeater_spacing) * wtemp3->repeater_size;
|
||||
l_eff = ht_temp;
|
||||
}
|
||||
|
||||
input_nand(s2, s3, l_eff);
|
||||
}
|
||||
}
|
||||
|
||||
if (wtemp1) delete wtemp1;
|
||||
if (wtemp2) delete wtemp2;
|
||||
if (wtemp3) delete wtemp3;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* Calculates the output H-tree; a tristate buffer is used to handle
 * fan-ins.
 * The area of an unbalanced H-tree (rows != columns) depends on how the
 * data is routed.
 * In the following function, if the number of rows is smaller than the
 * number of columns, the data first travels over the excess horizontal
 * links until the numbers of vertical and horizontal nodes are equal.
 * If the number of rows is larger, the data travels over a horizontal
 * link followed by a vertical link, repeatedly (as in a balanced tree),
 * until no horizontal links are left. After that it goes through the
 * remaining vertical links.
 */
|
||||
void Htree2::out_htree()
|
||||
{
|
||||
//temp var
|
||||
double s1 = 0, s2 = 0, s3 = 0;
|
||||
double l_eff = 0;
|
||||
Wire *wtemp1 = 0, *wtemp2 = 0, *wtemp3 = 0;
|
||||
double len = 0, ht = 0;
|
||||
int option = 0;
|
||||
|
||||
int h = (int) _log2(ndwl/2);
|
||||
int v = (int) _log2(ndbl/2);
|
||||
double len_temp;
|
||||
double ht_temp;
|
||||
if (uca_tree)
|
||||
{
|
||||
ht_temp = (mat_height*ndbl/2 +/* since uca_tree models interbank tree, mat_height => bank height */
|
||||
((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
|
||||
2 * (1-pow(0.5,h))))/2;
|
||||
len_temp = (mat_width*ndwl/2 +
|
||||
((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
|
||||
2 * (1-pow(0.5,v))))/2;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (ndwl == ndbl) {
|
||||
ht_temp = ((mat_height*ndbl/2) +
|
||||
((add_bits+ (search_data_in_bits + search_data_out_bits)) * (ndbl/2-1) * g_tp.wire_outside_mat.pitch) +
|
||||
((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
|
||||
)/2;
|
||||
len_temp = (mat_width*ndwl/2 +
|
||||
((add_bits + (search_data_in_bits + search_data_out_bits)) * (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
|
||||
((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
|
||||
|
||||
}
|
||||
else if (ndwl > ndbl) {
|
||||
double excess_part = (_log2(ndwl/2) - _log2(ndbl/2));
|
||||
ht_temp = ((mat_height*ndbl/2) +
|
||||
((add_bits + (search_data_in_bits + search_data_out_bits)) * ((ndbl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
|
||||
(data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch *
|
||||
(2*(1 - pow(0.5, h-v)) + pow(0.5, v-h) * v))/2;
|
||||
len_temp = (mat_width*ndwl/2 +
|
||||
((add_bits + (search_data_in_bits + search_data_out_bits))* (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
|
||||
((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
|
||||
}
|
||||
else {
|
||||
double excess_part = (_log2(ndbl/2) - _log2(ndwl/2));
|
||||
ht_temp = ((mat_height*ndbl/2) +
|
||||
((add_bits + (search_data_in_bits + search_data_out_bits))* ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
|
||||
((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
|
||||
)/2;
|
||||
len_temp = (mat_width*ndwl/2 +
|
||||
((add_bits + (search_data_in_bits + search_data_out_bits))* ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
|
||||
(data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * (h + 2*(1-pow(0.5, v-h))))/2;
|
||||
}
|
||||
}
|
||||
area.h = ht_temp * 2;
|
||||
area.w = len_temp * 2;
|
||||
delay = 0;
|
||||
power.readOp.dynamic = 0;
|
||||
power.readOp.leakage = 0;
|
||||
power.readOp.gate_leakage = 0;
|
||||
//cout<<"power.readOp.gate_leakage"<<power.readOp.gate_leakage<<endl;
|
||||
len = len_temp;
|
||||
ht = ht_temp/2;
|
||||
|
||||
while (v > 0 || h > 0)
|
||||
{ //finds delay/power of each link in the tree
|
||||
if (wtemp1) delete wtemp1;
|
||||
if (wtemp2) delete wtemp2;
|
||||
if (wtemp3) delete wtemp3;
|
||||
|
||||
if(h > v) {
|
||||
//the iteration considers only one horizontal link
|
||||
wtemp1 = new Wire(wt, len); // hor
|
||||
wtemp2 = new Wire(wt, len/2); // ver
|
||||
len_temp = len;
|
||||
len /= 2;
|
||||
wtemp3 = 0;
|
||||
h--;
|
||||
option = 0;
|
||||
}
|
||||
else if (v>0 && h>0) {
|
||||
//considers one horizontal link and one vertical link
|
||||
wtemp1 = new Wire(wt, len); // hor
|
||||
wtemp2 = new Wire(wt, ht); // ver
|
||||
wtemp3 = new Wire(wt, len/2); // next hor
|
||||
len_temp = len;
|
||||
ht_temp = ht;
|
||||
len /= 2;
|
||||
ht /= 2;
|
||||
v--;
|
||||
h--;
|
||||
option = 1;
|
||||
}
|
||||
else {
|
||||
// considers only one vertical link
|
||||
assert(h == 0);
|
||||
wtemp1 = new Wire(wt, ht); // hor
|
||||
wtemp2 = new Wire(wt, ht/2); // ver
|
||||
ht_temp = ht;
|
||||
ht /= 2;
|
||||
wtemp3 = 0;
|
||||
v--;
|
||||
option = 2;
|
||||
}
|
||||
delay += wtemp1->delay;
|
||||
power.readOp.dynamic += wtemp1->power.readOp.dynamic;
|
||||
power.searchOp.dynamic += wtemp1->power.readOp.dynamic*init_wire_bw;
|
||||
power.readOp.leakage += wtemp1->power.readOp.leakage*wire_bw;
|
||||
power.readOp.gate_leakage += wtemp1->power.readOp.gate_leakage*wire_bw;
|
||||
//cout<<"power.readOp.gate_leakage"<<power.readOp.gate_leakage<<endl;
|
||||
if ((uca_tree == false && option == 2) || search_tree==true)
|
||||
{
|
||||
wire_bw*=2;
|
||||
}
|
||||
|
||||
if (uca_tree == false)
|
||||
{
|
||||
if (len_temp > wtemp1->repeater_spacing)
|
||||
{
|
||||
s1 = wtemp1->repeater_size;
|
||||
l_eff = wtemp1->repeater_spacing;
|
||||
}
|
||||
else
|
||||
{
|
||||
s1 = (len_temp/wtemp1->repeater_spacing) * wtemp1->repeater_size;
|
||||
l_eff = len_temp;
|
||||
}
|
||||
if (ht_temp > wtemp2->repeater_spacing)
|
||||
{
|
||||
s2 = wtemp2->repeater_size;
|
||||
}
|
||||
else
|
||||
{
|
||||
s2 = (len_temp/wtemp2->repeater_spacing) * wtemp2->repeater_size;
|
||||
}
|
||||
// first level
|
||||
output_buffer(s1, s2, l_eff);
|
||||
}
|
||||
|
||||
|
||||
if (option != 1)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// second level
|
||||
delay += wtemp2->delay;
|
||||
power.readOp.dynamic += wtemp2->power.readOp.dynamic;
|
||||
power.searchOp.dynamic += wtemp2->power.readOp.dynamic*init_wire_bw;
|
||||
power.readOp.leakage += wtemp2->power.readOp.leakage*wire_bw;
|
||||
power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
|
||||
//cout<<"power.readOp.gate_leakage"<<power.readOp.gate_leakage<<endl;
|
||||
if (uca_tree)
|
||||
{
|
||||
power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw);
|
||||
power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
|
||||
}
|
||||
else
|
||||
{
|
||||
power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw);
|
||||
power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
|
||||
wire_bw*=2;
|
||||
|
||||
if (ht_temp > wtemp3->repeater_spacing)
|
||||
{
|
||||
s3 = wtemp3->repeater_size;
|
||||
l_eff = wtemp3->repeater_spacing;
|
||||
}
|
||||
else
|
||||
{
|
||||
s3 = (len_temp/wtemp3->repeater_spacing) * wtemp3->repeater_size;
|
||||
l_eff = ht_temp;
|
||||
}
|
||||
|
||||
output_buffer(s2, s3, l_eff);
|
||||
}
|
||||
//cout<<"power.readOp.leakage"<<power.readOp.leakage<<endl;
|
||||
//cout<<"power.readOp.gate_leakage"<<power.readOp.gate_leakage<<endl;
|
||||
//cout<<"wtemp2->power.readOp.gate_leakage"<<wtemp2->power.readOp.gate_leakage<<endl;
|
||||
}
|
||||
|
||||
if (wtemp1) delete wtemp1;
|
||||
if (wtemp2) delete wtemp2;
|
||||
if (wtemp3) delete wtemp3;
|
||||
}
|
||||
|
97
ext/mcpat/cacti/htree2.h
Normal file
97
ext/mcpat/cacti/htree2.h
Normal file
|
@ -0,0 +1,97 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
#ifndef __HTREE2_H__
|
||||
#define __HTREE2_H__
|
||||
|
||||
#include "assert.h"
|
||||
#include "basic_circuit.h"
|
||||
#include "cacti_interface.h"
|
||||
#include "component.h"
|
||||
#include "parameter.h"
|
||||
#include "subarray.h"
|
||||
#include "wire.h"
|
||||
|
||||
// leakage power includes entire htree in a bank (when uca_tree == false)
|
||||
// leakage power includes only part to one bank when uca_tree == true
|
||||
|
||||
class Htree2 : public Component
|
||||
{
|
||||
public:
|
||||
Htree2(enum Wire_type wire_model,
|
||||
double mat_w, double mat_h, int add, int data_in, int search_data_in, int data_out, int search_data_out, int bl, int wl,
|
||||
enum Htree_type h_type, bool uca_tree_ = false, bool search_tree_ = false,
|
||||
TechnologyParameter::DeviceType *dt = &(g_tp.peri_global));
|
||||
~Htree2() {};
|
||||
|
||||
void in_htree();
|
||||
void out_htree();
|
||||
|
||||
// repeaters only at h-tree nodes
|
||||
void limited_in_htree();
|
||||
void limited_out_htree();
|
||||
void input_nand(double s1, double s2, double l);
|
||||
void output_buffer(double s1, double s2, double l);
|
||||
|
||||
double in_rise_time, out_rise_time;
|
||||
|
||||
void set_in_rise_time(double rt)
|
||||
{
|
||||
in_rise_time = rt;
|
||||
}
|
||||
|
||||
double max_unpipelined_link_delay;
|
||||
powerDef power_bit;
|
||||
|
||||
|
||||
private:
|
||||
double wire_bw;
|
||||
double init_wire_bw; // bus width at root
|
||||
enum Htree_type tree_type;
|
||||
double htree_hnodes;
|
||||
double htree_vnodes;
|
||||
double mat_width;
|
||||
double mat_height;
|
||||
int add_bits, data_in_bits,search_data_in_bits,data_out_bits, search_data_out_bits;
|
||||
int ndbl, ndwl;
|
||||
bool uca_tree; // should have full bandwidth to access all banks in the array simultaneously
|
||||
bool search_tree;
|
||||
|
||||
enum Wire_type wt;
|
||||
double min_w_nmos;
|
||||
double min_w_pmos;
|
||||
|
||||
TechnologyParameter::DeviceType *deviceType;
|
||||
|
||||
};
|
||||
|
||||
#endif
|
2350
ext/mcpat/cacti/io.cc
Normal file
2350
ext/mcpat/cacti/io.cc
Normal file
File diff suppressed because it is too large
Load diff
44
ext/mcpat/cacti/io.h
Normal file
44
ext/mcpat/cacti/io.h
Normal file
|
@ -0,0 +1,44 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
#ifndef __IO_H__
|
||||
#define __IO_H__
|
||||
|
||||
|
||||
#include "cacti_interface.h"
|
||||
#include "const.h"
|
||||
|
||||
void output_data_csv(const uca_org_t & fin_res);
|
||||
void output_UCA(uca_org_t * fin_res);
|
||||
|
||||
|
||||
#endif
|
191
ext/mcpat/cacti/main.cc
Normal file
191
ext/mcpat/cacti/main.cc
Normal file
|
@ -0,0 +1,191 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "io.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
int main(int argc,char *argv[])
|
||||
{
|
||||
|
||||
uca_org_t result;
|
||||
if (argc != 53 && argc != 55)
|
||||
{
|
||||
bool infile_specified = false;
|
||||
string infile_name("");
|
||||
|
||||
for (int32_t i = 0; i < argc; i++)
|
||||
{
|
||||
if (argv[i] == string("-infile"))
|
||||
{
|
||||
infile_specified = true;
|
||||
i++;
|
||||
infile_name = argv[i];
|
||||
}
|
||||
}
|
||||
|
||||
if (infile_specified == false)
|
||||
{
|
||||
cerr << " Invalid arguments -- how to use CACTI:" << endl;
|
||||
cerr << " 1) cacti -infile <input file name>" << endl;
|
||||
cerr << " 2) cacti arg1 ... arg52 -- please refer to the README file" << endl;
|
||||
cerr << " No. of arguments input - " << argc << endl;
|
||||
exit(1);
|
||||
}
|
||||
else
|
||||
{
|
||||
result = cacti_interface(infile_name);
|
||||
}
|
||||
}
|
||||
else if (argc == 53)
|
||||
{
|
||||
result = cacti_interface(atoi(argv[ 1]),
|
||||
atoi(argv[ 2]),
|
||||
atoi(argv[ 3]),
|
||||
atoi(argv[ 4]),
|
||||
atoi(argv[ 5]),
|
||||
atoi(argv[ 6]),
|
||||
atoi(argv[ 7]),
|
||||
atoi(argv[ 8]),
|
||||
atoi(argv[ 9]),
|
||||
atof(argv[10]),
|
||||
atoi(argv[11]),
|
||||
atoi(argv[12]),
|
||||
atoi(argv[13]),
|
||||
atoi(argv[14]),
|
||||
atoi(argv[15]),
|
||||
atoi(argv[16]),
|
||||
atoi(argv[17]),
|
||||
atoi(argv[18]),
|
||||
atoi(argv[19]),
|
||||
atoi(argv[20]),
|
||||
atoi(argv[21]),
|
||||
atoi(argv[22]),
|
||||
atoi(argv[23]),
|
||||
atoi(argv[24]),
|
||||
atoi(argv[25]),
|
||||
atoi(argv[26]),
|
||||
atoi(argv[27]),
|
||||
atoi(argv[28]),
|
||||
atoi(argv[29]),
|
||||
atoi(argv[30]),
|
||||
atoi(argv[31]),
|
||||
atoi(argv[32]),
|
||||
atoi(argv[33]),
|
||||
atoi(argv[34]),
|
||||
atoi(argv[35]),
|
||||
atoi(argv[36]),
|
||||
atoi(argv[37]),
|
||||
atoi(argv[38]),
|
||||
atoi(argv[39]),
|
||||
atoi(argv[40]),
|
||||
atoi(argv[41]),
|
||||
atoi(argv[42]),
|
||||
atoi(argv[43]),
|
||||
atoi(argv[44]),
|
||||
atoi(argv[45]),
|
||||
atoi(argv[46]),
|
||||
atoi(argv[47]),
|
||||
atoi(argv[48]),
|
||||
atoi(argv[49]),
|
||||
atoi(argv[50]),
|
||||
atoi(argv[51]),
|
||||
atoi(argv[52]));
|
||||
}
|
||||
else
|
||||
{
|
||||
result = cacti_interface(atoi(argv[ 1]),
|
||||
atoi(argv[ 2]),
|
||||
atoi(argv[ 3]),
|
||||
atoi(argv[ 4]),
|
||||
atoi(argv[ 5]),
|
||||
atoi(argv[ 6]),
|
||||
atoi(argv[ 7]),
|
||||
atoi(argv[ 8]),
|
||||
atof(argv[ 9]),
|
||||
atoi(argv[10]),
|
||||
atoi(argv[11]),
|
||||
atoi(argv[12]),
|
||||
atoi(argv[13]),
|
||||
atoi(argv[14]),
|
||||
atoi(argv[15]),
|
||||
atoi(argv[16]),
|
||||
atoi(argv[17]),
|
||||
atoi(argv[18]),
|
||||
atoi(argv[19]),
|
||||
atoi(argv[20]),
|
||||
atoi(argv[21]),
|
||||
atoi(argv[22]),
|
||||
atoi(argv[23]),
|
||||
atoi(argv[24]),
|
||||
atoi(argv[25]),
|
||||
atoi(argv[26]),
|
||||
atoi(argv[27]),
|
||||
atoi(argv[28]),
|
||||
atoi(argv[29]),
|
||||
atoi(argv[30]),
|
||||
atoi(argv[31]),
|
||||
atoi(argv[32]),
|
||||
atoi(argv[33]),
|
||||
atoi(argv[34]),
|
||||
atoi(argv[35]),
|
||||
atoi(argv[36]),
|
||||
atoi(argv[37]),
|
||||
atoi(argv[38]),
|
||||
atoi(argv[39]),
|
||||
atoi(argv[40]),
|
||||
atoi(argv[41]),
|
||||
atoi(argv[42]),
|
||||
atoi(argv[43]),
|
||||
atoi(argv[44]),
|
||||
atoi(argv[45]),
|
||||
atoi(argv[46]),
|
||||
atoi(argv[47]),
|
||||
atoi(argv[48]),
|
||||
atoi(argv[49]),
|
||||
atoi(argv[50]),
|
||||
atoi(argv[51]),
|
||||
atoi(argv[52]),
|
||||
atoi(argv[53]),
|
||||
atoi(argv[54]));
|
||||
}
|
||||
|
||||
result.cleanup();
|
||||
// delete result.data_array2;
|
||||
// if (result.tag_array2!=NULL)
|
||||
// delete result.tag_array2;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
28
ext/mcpat/cacti/makefile
Normal file
28
ext/mcpat/cacti/makefile
Normal file
|
@ -0,0 +1,28 @@
|
|||
# Top-level driver: delegates the real build to $(TAR).mk with TAG=dbg or
# TAG=opt, after making sure the matching object directory exists.
TAR = cacti

# `all` is listed too, so a stray file named "all" cannot mask the default goal.
.PHONY: all dbg opt depend clean clean_dbg clean_opt

all: opt

dbg: $(TAR).mk obj_dbg
	@$(MAKE) TAG=dbg -C . -f $(TAR).mk

opt: $(TAR).mk obj_opt
	@$(MAKE) TAG=opt -C . -f $(TAR).mk

# -p keeps directory creation from failing if the directory already exists.
obj_dbg:
	mkdir -p $@

obj_opt:
	mkdir -p $@

clean: clean_dbg clean_opt

# The object directory is a prerequisite, so it exists while the sub-make's
# clean target runs, and is removed afterwards.
clean_dbg: obj_dbg
	@$(MAKE) TAG=dbg -C . -f $(TAR).mk clean
	rm -rf $<

clean_opt: obj_opt
	@$(MAKE) TAG=opt -C . -f $(TAR).mk clean
	rm -rf $<
|
||||
|
1748
ext/mcpat/cacti/mat.cc
Executable file
1748
ext/mcpat/cacti/mat.cc
Executable file
File diff suppressed because it is too large
Load diff
148
ext/mcpat/cacti/mat.h
Executable file
148
ext/mcpat/cacti/mat.h
Executable file
|
@ -0,0 +1,148 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
|
||||
#ifndef __MAT_H__
|
||||
#define __MAT_H__
|
||||
|
||||
#include "component.h"
|
||||
#include "decoder.h"
|
||||
#include "subarray.h"
|
||||
#include "wire.h"
|
||||
|
||||
class Mat : public Component
|
||||
{
|
||||
public:
|
||||
Mat(const DynamicParameter & dyn_p);
|
||||
~Mat();
|
||||
double compute_delays(double inrisetime); // return outrisetime
|
||||
void compute_power_energy();
|
||||
|
||||
const DynamicParameter & dp;
|
||||
|
||||
// TODO: clean up pointers and powerDefs below
|
||||
Decoder * row_dec;
|
||||
Decoder * bit_mux_dec;
|
||||
Decoder * sa_mux_lev_1_dec;
|
||||
Decoder * sa_mux_lev_2_dec;
|
||||
PredecBlk * dummy_way_sel_predec_blk1;
|
||||
PredecBlk * dummy_way_sel_predec_blk2;
|
||||
PredecBlkDrv * way_sel_drv1;
|
||||
PredecBlkDrv * dummy_way_sel_predec_blk_drv2;
|
||||
|
||||
Predec * r_predec;
|
||||
Predec * b_mux_predec;
|
||||
Predec * sa_mux_lev_1_predec;
|
||||
Predec * sa_mux_lev_2_predec;
|
||||
|
||||
Wire * subarray_out_wire;
|
||||
Driver * bl_precharge_eq_drv;
|
||||
Driver * cam_bl_precharge_eq_drv;//bitline pre-charge circuit is separated for CAM and RAM arrays.
|
||||
Driver * ml_precharge_drv;//matchline prechange driver
|
||||
Driver * sl_precharge_eq_drv;//searchline prechage driver
|
||||
Driver * sl_data_drv;//search line data driver
|
||||
Driver * ml_to_ram_wl_drv;//search line data driver
|
||||
|
||||
|
||||
powerDef power_row_decoders;
|
||||
powerDef power_bit_mux_decoders;
|
||||
powerDef power_sa_mux_lev_1_decoders;
|
||||
powerDef power_sa_mux_lev_2_decoders;
|
||||
powerDef power_fa_cam; // TODO: leakage power is not computed yet
|
||||
powerDef power_bl_precharge_eq_drv;
|
||||
powerDef power_subarray_out_drv;
|
||||
powerDef power_cam_all_active;
|
||||
powerDef power_searchline_precharge;
|
||||
powerDef power_matchline_precharge;
|
||||
powerDef power_ml_to_ram_wl_drv;
|
||||
|
||||
double delay_fa_tag, delay_cam;
|
||||
double delay_before_decoder;
|
||||
double delay_bitline;
|
||||
double delay_wl_reset;
|
||||
double delay_bl_restore;
|
||||
|
||||
double delay_searchline;
|
||||
double delay_matchchline;
|
||||
double delay_cam_sl_restore;
|
||||
double delay_cam_ml_reset;
|
||||
double delay_fa_ram_wl;
|
||||
|
||||
double delay_hit_miss_reset;
|
||||
double delay_hit_miss;
|
||||
|
||||
Subarray subarray;
|
||||
powerDef power_bitline, power_searchline, power_matchline;
|
||||
double per_bitline_read_energy;
|
||||
int deg_bl_muxing;
|
||||
int num_act_mats_hor_dir;
|
||||
double delay_writeback;
|
||||
Area cell,cam_cell;
|
||||
bool is_dram,is_fa, pure_cam, camFlag;
|
||||
int num_mats;
|
||||
powerDef power_sa;
|
||||
double delay_sa;
|
||||
double leak_power_sense_amps_closed_page_state;
|
||||
double leak_power_sense_amps_open_page_state;
|
||||
double delay_subarray_out_drv;
|
||||
double delay_subarray_out_drv_htree;
|
||||
double delay_comparator;
|
||||
powerDef power_comparator;
|
||||
int num_do_b_mat;
|
||||
int num_so_b_mat;
|
||||
int num_sa_subarray;
|
||||
int num_sa_subarray_search;
|
||||
double C_bl;
|
||||
|
||||
uint32_t num_subarrays_per_mat; // the number of subarrays in a mat
|
||||
uint32_t num_subarrays_per_row; // the number of subarrays in a row of a mat
|
||||
|
||||
|
||||
private:
|
||||
double compute_bit_mux_sa_precharge_sa_mux_wr_drv_wr_mux_h();
|
||||
double width_write_driver_or_write_mux();
|
||||
double compute_comparators_height(int tagbits, int number_ways_in_mat, double subarray_mem_cell_area_w);
|
||||
double compute_cam_delay(double inrisetime);
|
||||
double compute_bitline_delay(double inrisetime);
|
||||
double compute_sa_delay(double inrisetime);
|
||||
double compute_subarray_out_drv(double inrisetime);
|
||||
double compute_comparator_delay(double inrisetime);
|
||||
|
||||
int RWP;
|
||||
int ERP;
|
||||
int EWP;
|
||||
int SCHP;
|
||||
};
|
||||
|
||||
|
||||
|
||||
#endif
|
612
ext/mcpat/cacti/nuca.cc
Normal file
612
ext/mcpat/cacti/nuca.cc
Normal file
|
@ -0,0 +1,612 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
|
||||
#include <cassert>
|
||||
|
||||
#include "Ucache.h"
|
||||
#include "nuca.h"
|
||||
|
||||
unsigned int MIN_BANKSIZE=65536;
|
||||
#define FIXED_OVERHEAD 55e-12 /* clock skew and jitter in s. Ref: Hrishikesh et al ISCA 01 */
|
||||
#define LATCH_DELAY 28e-12 /* latch delay in s (later should use FO4 TODO) */
|
||||
#define CONTR_2_BANK_LAT 0
|
||||
|
||||
int cont_stats[2 /*l2 or l3*/][5/* cores */][ROUTER_TYPES][7 /*banks*/][8 /* cycle time */];
|
||||
|
||||
// Stores the device type (defaulting to the global peripheral device type)
// and loads the contention table via init_cont().
Nuca::Nuca(TechnologyParameter::DeviceType *dt = &(g_tp.peri_global))
    : deviceType(dt)
{
    init_cont();
}
|
||||
|
||||
void
|
||||
Nuca::init_cont()
|
||||
{
|
||||
FILE *cont;
|
||||
char line[5000];
|
||||
char jk[5000];
|
||||
cont = fopen("contention.dat", "r");
|
||||
if (!cont) {
|
||||
cout << "contention.dat file is missing!\n";
|
||||
exit(0);
|
||||
}
|
||||
|
||||
for(int i=0; i<2; i++) {
|
||||
for(int j=2; j<5; j++) {
|
||||
for(int k=0; k<ROUTER_TYPES; k++) {
|
||||
for(int l=0;l<7; l++) {
|
||||
int *temp = cont_stats[i/*l2 or l3*/][j/*core*/][k/*64 or 128 or 256 link bw*/][l /* no banks*/];
|
||||
assert(fscanf(cont, "%[^\n]\n", line) != EOF);
|
||||
sscanf(line, "%[^:]: %d %d %d %d %d %d %d %d",jk, &temp[0], &temp[1], &temp[2], &temp[3],
|
||||
&temp[4], &temp[5], &temp[6], &temp[7]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
fclose(cont);
|
||||
}
|
||||
|
||||
void
|
||||
Nuca::print_cont_stats()
|
||||
{
|
||||
for(int i=0; i<2; i++) {
|
||||
for(int j=2; j<5; j++) {
|
||||
for(int k=0; k<ROUTER_TYPES; k++) {
|
||||
for(int l=0;l<7; l++) {
|
||||
for(int m=0;l<7; l++) {
|
||||
cout << cont_stats[i][j][k][l][m] << " ";
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
|
||||
// Releases the Wire objects stored in wire_vertical[] / wire_horizontal[]
// for every wire type index in [wt_min, wt_max].
// NOTE(review): wt_min and wt_max are assigned in sim_nuca(); the visible
// constructor does not set them -- confirm they are always initialized
// before an object is destroyed.
Nuca::~Nuca()
{
    int idx = wt_min;
    while (idx <= wt_max) {
        delete wire_vertical[idx];
        delete wire_horizontal[idx];
        ++idx;
    }
}
|
||||
|
||||
/* converts latency (in s) to cycles depending upon the FREQUENCY (in GHz) */
|
||||
int
|
||||
Nuca::calc_cycles(double lat, double oper_freq)
|
||||
{
|
||||
//TODO: convert latch delay to FO4 */
|
||||
double cycle_time = (1.0/(oper_freq*1e9)); /*s*/
|
||||
cycle_time -= LATCH_DELAY;
|
||||
cycle_time -= FIXED_OVERHEAD;
|
||||
|
||||
return (int)ceil(lat/cycle_time);
|
||||
}
|
||||
|
||||
|
||||
// Intentionally empty: deletion of h_wire, v_wire and router is disabled.
// NOTE(review): presumably these objects are owned elsewhere -- confirm
// before re-enabling:
//   if(h_wire) delete h_wire;
//   if(v_wire) delete v_wire;
//   if(router) delete router;
nuca_org_t::~nuca_org_t()
{
}
|
||||
|
||||
/*
|
||||
* Version - 6.0
|
||||
*
|
||||
* Perform exhaustive search across different bank organizations,
|
||||
* router configurations, grid organizations, and wire models and
|
||||
* find an optimal NUCA organization
|
||||
* For different bank count values
|
||||
* 1. Optimal bank organization is calculated
|
||||
* 2. For each bank organization, find different NUCA organizations
|
||||
* using various router configurations, grid organizations,
|
||||
* and wire models.
|
||||
* 3. NUCA model with the least cost is picked for
|
||||
* this particular bank count
|
||||
* Finally include contention statistics and find the optimal
|
||||
* NUCA configuration
|
||||
*/
|
||||
void
|
||||
Nuca::sim_nuca()
|
||||
{
|
||||
/* temp variables */
|
||||
int it, ro, wr;
|
||||
int num_cyc;
|
||||
unsigned int i, j, k;
|
||||
unsigned int r, c;
|
||||
int l2_c;
|
||||
int bank_count = 0;
|
||||
uca_org_t ures;
|
||||
nuca_org_t *opt_n;
|
||||
mem_array tag, data;
|
||||
list<nuca_org_t *> nuca_list;
|
||||
Router *router_s[ROUTER_TYPES];
|
||||
router_s[0] = new Router(64.0, 8, 4, &(g_tp.peri_global));
|
||||
router_s[0]->print_router();
|
||||
router_s[1] = new Router(128.0, 8, 4, &(g_tp.peri_global));
|
||||
router_s[1]->print_router();
|
||||
router_s[2] = new Router(256.0, 8, 4, &(g_tp.peri_global));
|
||||
router_s[2]->print_router();
|
||||
|
||||
int core_in; // to store no. of cores
|
||||
|
||||
/* to search diff grid organizations */
|
||||
double curr_hop, totno_hops, totno_hhops, totno_vhops, tot_lat,
|
||||
curr_acclat;
|
||||
double avg_lat, avg_hop, avg_hhop, avg_vhop, avg_dyn_power,
|
||||
avg_leakage_power;
|
||||
|
||||
double opt_acclat = INF, opt_avg_lat = INF, opt_tot_lat = INF;
|
||||
int opt_rows = 0;
|
||||
int opt_columns = 0;
|
||||
double opt_totno_hops = 0;
|
||||
double opt_avg_hop = 0;
|
||||
double opt_dyn_power = 0, opt_leakage_power = 0;
|
||||
min_values_t minval;
|
||||
|
||||
int bank_start = 0;
|
||||
|
||||
int flit_width = 0;
|
||||
|
||||
/* vertical and horizontal hop latency values */
|
||||
int ver_hop_lat, hor_hop_lat; /* in cycles */
|
||||
|
||||
|
||||
/* no. of different bank sizes to consider */
|
||||
int iterations;
|
||||
|
||||
|
||||
g_ip->nuca_cache_sz = g_ip->cache_sz;
|
||||
nuca_list.push_back(new nuca_org_t());
|
||||
|
||||
if (g_ip->cache_level == 0) l2_c = 1;
|
||||
else l2_c = 0;
|
||||
|
||||
if (g_ip->cores <= 4) core_in = 2;
|
||||
else if (g_ip->cores <= 8) core_in = 3;
|
||||
else if (g_ip->cores <= 16) core_in = 4;
|
||||
else {cout << "Number of cores should be <= 16!\n"; exit(0);}
|
||||
|
||||
|
||||
// set the lower bound to an appropriate value. this depends on cache associativity
|
||||
if (g_ip->assoc > 2) {
|
||||
i = 2;
|
||||
while (i != g_ip->assoc) {
|
||||
MIN_BANKSIZE *= 2;
|
||||
i *= 2;
|
||||
}
|
||||
}
|
||||
|
||||
iterations = (int)logtwo((int)g_ip->cache_sz/MIN_BANKSIZE);
|
||||
|
||||
if (g_ip->force_wiretype)
|
||||
{
|
||||
if (g_ip->wt == Low_swing) {
|
||||
wt_min = Low_swing;
|
||||
wt_max = Low_swing;
|
||||
}
|
||||
else {
|
||||
wt_min = Global;
|
||||
wt_max = Low_swing-1;
|
||||
}
|
||||
}
|
||||
else {
|
||||
wt_min = Global;
|
||||
wt_max = Low_swing;
|
||||
}
|
||||
if (g_ip->nuca_bank_count != 0) { // simulate just one bank
|
||||
if (g_ip->nuca_bank_count != 2 && g_ip->nuca_bank_count != 4 &&
|
||||
g_ip->nuca_bank_count != 8 && g_ip->nuca_bank_count != 16 &&
|
||||
g_ip->nuca_bank_count != 32 && g_ip->nuca_bank_count != 64) {
|
||||
fprintf(stderr,"Incorrect bank count value! Please fix the value in cache.cfg\n");
|
||||
}
|
||||
bank_start = (int)logtwo((double)g_ip->nuca_bank_count);
|
||||
iterations = bank_start+1;
|
||||
g_ip->cache_sz = g_ip->cache_sz/g_ip->nuca_bank_count;
|
||||
}
|
||||
cout << "Simulating various NUCA configurations\n";
|
||||
for (it=bank_start; it<iterations; it++) { /* different bank count values */
|
||||
ures.tag_array2 = &tag;
|
||||
ures.data_array2 = &data;
|
||||
/*
|
||||
* find the optimal bank organization
|
||||
*/
|
||||
solve(&ures);
|
||||
// output_UCA(&ures);
|
||||
bank_count = g_ip->nuca_cache_sz/g_ip->cache_sz;
|
||||
cout << "====" << g_ip->cache_sz << "\n";
|
||||
|
||||
for (wr=wt_min; wr<=wt_max; wr++) {
|
||||
|
||||
for (ro=0; ro<ROUTER_TYPES; ro++)
|
||||
{
|
||||
flit_width = (int) router_s[ro]->flit_size; //initialize router
|
||||
nuca_list.back()->nuca_pda.cycle_time = router_s[ro]->cycle_time;
|
||||
|
||||
/* calculate router and wire parameters */
|
||||
|
||||
double vlength = ures.cache_ht; /* length of the wire (u)*/
|
||||
double hlength = ures.cache_len; // u
|
||||
|
||||
/* find delay, area, and power for wires */
|
||||
wire_vertical[wr] = new Wire((enum Wire_type) wr, vlength);
|
||||
wire_horizontal[wr] = new Wire((enum Wire_type) wr, hlength);
|
||||
|
||||
|
||||
hor_hop_lat = calc_cycles(wire_horizontal[wr]->delay,
|
||||
1/(nuca_list.back()->nuca_pda.cycle_time*.001));
|
||||
ver_hop_lat = calc_cycles(wire_vertical[wr]->delay,
|
||||
1/(nuca_list.back()->nuca_pda.cycle_time*.001));
|
||||
|
||||
/*
|
||||
* assume a grid like topology and explore for optimal network
|
||||
* configuration using different row and column count values.
|
||||
*/
|
||||
for (c=1; c<=(unsigned int)bank_count; c++) {
|
||||
while (bank_count%c != 0) c++;
|
||||
r = bank_count/c;
|
||||
|
||||
/*
|
||||
* to find the avg access latency of a NUCA cache, uncontended
|
||||
* access time to each bank from the
|
||||
* cache controller is calculated.
|
||||
* avg latency =
|
||||
* sum of the access latencies to individual banks)/bank
|
||||
* count value.
|
||||
*/
|
||||
totno_hops = totno_hhops = totno_vhops = tot_lat = 0;
|
||||
k = 1;
|
||||
for (i=0; i<r; i++) {
|
||||
for (j=0; j<c; j++) {
|
||||
/*
|
||||
* vertical hops including the
|
||||
* first hop from the cache controller
|
||||
*/
|
||||
curr_hop = i + 1;
|
||||
curr_hop += j; /* horizontal hops */
|
||||
totno_hhops += j;
|
||||
totno_vhops += (i+1);
|
||||
curr_acclat = (i * ver_hop_lat + CONTR_2_BANK_LAT +
|
||||
j * hor_hop_lat);
|
||||
|
||||
tot_lat += curr_acclat;
|
||||
totno_hops += curr_hop;
|
||||
}
|
||||
}
|
||||
avg_lat = tot_lat/bank_count;
|
||||
avg_hop = totno_hops/bank_count;
|
||||
avg_hhop = totno_hhops/bank_count;
|
||||
avg_vhop = totno_vhops/bank_count;
|
||||
|
||||
/* net access latency */
|
||||
curr_acclat = 2*avg_lat + 2*(router_s[ro]->delay*avg_hop) +
|
||||
calc_cycles(ures.access_time,
|
||||
1/(nuca_list.back()->nuca_pda.cycle_time*.001));
|
||||
|
||||
/* avg access lat of nuca */
|
||||
avg_dyn_power =
|
||||
avg_hop *
|
||||
(router_s[ro]->power.readOp.dynamic) + avg_hhop *
|
||||
(wire_horizontal[wr]->power.readOp.dynamic) *
|
||||
(g_ip->block_sz*8 + 64) + avg_vhop *
|
||||
(wire_vertical[wr]->power.readOp.dynamic) *
|
||||
(g_ip->block_sz*8 + 64) + ures.power.readOp.dynamic;
|
||||
|
||||
avg_leakage_power =
|
||||
bank_count * router_s[ro]->power.readOp.leakage +
|
||||
avg_hhop * (wire_horizontal[wr]->power.readOp.leakage*
|
||||
wire_horizontal[wr]->delay) * flit_width +
|
||||
avg_vhop * (wire_vertical[wr]->power.readOp.leakage *
|
||||
wire_horizontal[wr]->delay);
|
||||
|
||||
if (curr_acclat < opt_acclat) {
|
||||
opt_acclat = curr_acclat;
|
||||
opt_tot_lat = tot_lat;
|
||||
opt_avg_lat = avg_lat;
|
||||
opt_totno_hops = totno_hops;
|
||||
opt_avg_hop = avg_hop;
|
||||
opt_rows = r;
|
||||
opt_columns = c;
|
||||
opt_dyn_power = avg_dyn_power;
|
||||
opt_leakage_power = avg_leakage_power;
|
||||
}
|
||||
totno_hops = 0;
|
||||
tot_lat = 0;
|
||||
totno_hhops = 0;
|
||||
totno_vhops = 0;
|
||||
}
|
||||
nuca_list.back()->wire_pda.power.readOp.dynamic =
|
||||
opt_avg_hop * flit_width *
|
||||
(wire_horizontal[wr]->power.readOp.dynamic +
|
||||
wire_vertical[wr]->power.readOp.dynamic);
|
||||
nuca_list.back()->avg_hops = opt_avg_hop;
|
||||
/* network delay/power */
|
||||
nuca_list.back()->h_wire = wire_horizontal[wr];
|
||||
nuca_list.back()->v_wire = wire_vertical[wr];
|
||||
nuca_list.back()->router = router_s[ro];
|
||||
/* bank delay/power */
|
||||
|
||||
nuca_list.back()->bank_pda.delay = ures.access_time;
|
||||
nuca_list.back()->bank_pda.power = ures.power;
|
||||
nuca_list.back()->bank_pda.area.h = ures.cache_ht;
|
||||
nuca_list.back()->bank_pda.area.w = ures.cache_len;
|
||||
nuca_list.back()->bank_pda.cycle_time = ures.cycle_time;
|
||||
|
||||
num_cyc = calc_cycles(nuca_list.back()->bank_pda.delay /*s*/,
|
||||
1/(nuca_list.back()->nuca_pda.cycle_time*.001/*GHz*/));
|
||||
if(num_cyc%2 != 0) num_cyc++;
|
||||
if (num_cyc > 16) num_cyc = 16; // we have data only up to 16 cycles
|
||||
|
||||
if (it < 7) {
|
||||
nuca_list.back()->nuca_pda.delay = opt_acclat +
|
||||
cont_stats[l2_c][core_in][ro][it][num_cyc/2-1];
|
||||
nuca_list.back()->contention =
|
||||
cont_stats[l2_c][core_in][ro][it][num_cyc/2-1];
|
||||
}
|
||||
else {
|
||||
nuca_list.back()->nuca_pda.delay = opt_acclat +
|
||||
cont_stats[l2_c][core_in][ro][7][num_cyc/2-1];
|
||||
nuca_list.back()->contention =
|
||||
cont_stats[l2_c][core_in][ro][7][num_cyc/2-1];
|
||||
}
|
||||
nuca_list.back()->nuca_pda.power.readOp.dynamic = opt_dyn_power;
|
||||
nuca_list.back()->nuca_pda.power.readOp.leakage = opt_leakage_power;
|
||||
|
||||
/* array organization */
|
||||
nuca_list.back()->bank_count = bank_count;
|
||||
nuca_list.back()->rows = opt_rows;
|
||||
nuca_list.back()->columns = opt_columns;
|
||||
calculate_nuca_area (nuca_list.back());
|
||||
|
||||
minval.update_min_values(nuca_list.back());
|
||||
nuca_list.push_back(new nuca_org_t());
|
||||
opt_acclat = BIGNUM;
|
||||
|
||||
}
|
||||
}
|
||||
g_ip->cache_sz /= 2;
|
||||
}
|
||||
|
||||
delete(nuca_list.back());
|
||||
nuca_list.pop_back();
|
||||
opt_n = find_optimal_nuca(&nuca_list, &minval);
|
||||
print_nuca(opt_n);
|
||||
g_ip->cache_sz = g_ip->nuca_cache_sz/opt_n->bank_count;
|
||||
|
||||
list<nuca_org_t *>::iterator niter;
|
||||
for (niter = nuca_list.begin(); niter != nuca_list.end(); ++niter)
|
||||
{
|
||||
delete *niter;
|
||||
}
|
||||
nuca_list.clear();
|
||||
|
||||
for(int i=0; i < ROUTER_TYPES; i++)
|
||||
{
|
||||
delete router_s[i];
|
||||
}
|
||||
g_ip->display_ip();
|
||||
// g_ip->force_cache_config = true;
|
||||
// g_ip->ndwl = 8;
|
||||
// g_ip->ndbl = 16;
|
||||
// g_ip->nspd = 4;
|
||||
// g_ip->ndcm = 1;
|
||||
// g_ip->ndsam1 = 8;
|
||||
// g_ip->ndsam2 = 32;
|
||||
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
Nuca::print_nuca (nuca_org_t *fr)
|
||||
{
|
||||
printf("\n---------- CACTI version 6.5, Non-uniform Cache Access "
|
||||
"----------\n\n");
|
||||
printf("Optimal number of banks - %d\n", fr->bank_count);
|
||||
printf("Grid organization rows x columns - %d x %d\n",
|
||||
fr->rows, fr->columns);
|
||||
printf("Network frequency - %g GHz\n",
|
||||
(1/fr->nuca_pda.cycle_time)*1e3);
|
||||
printf("Cache dimension (mm x mm) - %g x %g\n",
|
||||
fr->nuca_pda.area.h,
|
||||
fr->nuca_pda.area.w);
|
||||
|
||||
fr->router->print_router();
|
||||
|
||||
printf("\n\nWire stats:\n");
|
||||
if (fr->h_wire->wt == Global) {
|
||||
printf("\tWire type - Full swing global wires with least "
|
||||
"possible delay\n");
|
||||
}
|
||||
else if (fr->h_wire->wt == Global_5) {
|
||||
printf("\tWire type - Full swing global wires with "
|
||||
"5%% delay penalty\n");
|
||||
}
|
||||
else if (fr->h_wire->wt == Global_10) {
|
||||
printf("\tWire type - Full swing global wires with "
|
||||
"10%% delay penalty\n");
|
||||
}
|
||||
else if (fr->h_wire->wt == Global_20) {
|
||||
printf("\tWire type - Full swing global wires with "
|
||||
"20%% delay penalty\n");
|
||||
}
|
||||
else if (fr->h_wire->wt == Global_30) {
|
||||
printf("\tWire type - Full swing global wires with "
|
||||
"30%% delay penalty\n");
|
||||
}
|
||||
else if(fr->h_wire->wt == Low_swing) {
|
||||
printf("\tWire type - Low swing wires\n");
|
||||
}
|
||||
|
||||
printf("\tHorizontal link delay - %g (ns)\n",
|
||||
fr->h_wire->delay*1e9);
|
||||
printf("\tVertical link delay - %g (ns)\n",
|
||||
fr->v_wire->delay*1e9);
|
||||
printf("\tDelay/length - %g (ns/mm)\n",
|
||||
fr->h_wire->delay*1e9/fr->bank_pda.area.w);
|
||||
printf("\tHorizontal link energy -dynamic/access %g (nJ)\n"
|
||||
"\t -leakage %g (nW)\n\n",
|
||||
fr->h_wire->power.readOp.dynamic*1e9,
|
||||
fr->h_wire->power.readOp.leakage*1e9);
|
||||
printf("\tVertical link energy -dynamic/access %g (nJ)\n"
|
||||
"\t -leakage %g (nW)\n\n",
|
||||
fr->v_wire->power.readOp.dynamic*1e9,
|
||||
fr->v_wire->power.readOp.leakage*1e9);
|
||||
printf("\n\n");
|
||||
fr->v_wire->print_wire();
|
||||
printf("\n\nBank stats:\n");
|
||||
}
|
||||
|
||||
|
||||
/*
 * Select the lowest-cost NUCA organization from a list of candidates.
 *
 * The cost function depends on g_ip->ed:
 *   ed == 1 : normalized delay * normalized dynamic power
 *   ed == 2 : normalized delay^2 * normalized dynamic power
 *   else    : weighted sum of normalized delay, cycle time, dynamic power,
 *             leakage power and area, considered only for candidates that
 *             pass check_nuca_org(); candidates that fail are erased from
 *             the list.
 *
 * n      - candidate list; may be shrunk (see above).
 * minval - per-metric minima used for normalization. min_leakage is
 *          overwritten with 0.1 if it is found to be 0.
 *
 * Returns the best candidate, or NULL if the list is empty or no candidate
 * qualifies. A line of statistics is printed for every candidate visited.
 *
 * NOTE(review): erased candidates are only unlinked from the list, not
 * deleted, so whoever owns the list can no longer free them -- confirm
 * whether they should be deleted here.
 */
nuca_org_t *
Nuca::find_optimal_nuca (list<nuca_org_t *> *n, min_values_t *minval)
{
  double cost = 0;
  double min_cost = BIGNUM;
  nuca_org_t *res = NULL;

  /* weights of the individual terms of the weighted-sum objective */
  const float dp = g_ip->dynamic_power_wt_nuca;
  const float lp = g_ip->leakage_power_wt_nuca;
  const float a = g_ip->area_wt_nuca;
  const float d = g_ip->delay_wt_nuca;
  const float c = g_ip->cycle_time_wt_nuca;

  /*
   * Erase-or-advance loop: the iterator is advanced explicitly, and only
   * when the current element is kept. The previous for-loop decremented
   * the iterator after erase() so that the loop increment would land on
   * the successor, which skipped a candidate whenever the erased element
   * was the first one and incremented end() when the list became empty.
   */
  list<nuca_org_t *>::iterator niter = n->begin();
  while (niter != n->end()) {
    nuca_org_t *org = *niter;

    fprintf(stderr, "\n-----------------------------"
        "---------------\n");

    printf("NUCA___stats %d \tbankcount: lat = %g \tdynP = %g \twt = %d\t "
        "bank_dpower = %g \tleak = %g \tcycle = %g\n",
        org->bank_count,
        org->nuca_pda.delay,
        org->nuca_pda.power.readOp.dynamic,
        org->h_wire->wt,
        org->bank_pda.power.readOp.dynamic,
        org->nuca_pda.power.readOp.leakage,
        org->nuca_pda.cycle_time);

    if (g_ip->ed == 1) {
      cost = (org->nuca_pda.delay/minval->min_delay)*
        (org->nuca_pda.power.readOp.dynamic/minval->min_dyn);
    }
    else if (g_ip->ed == 2) {
      cost = (org->nuca_pda.delay/minval->min_delay)*
        (org->nuca_pda.delay/minval->min_delay)*
        (org->nuca_pda.power.readOp.dynamic/minval->min_dyn);
    }
    else {
      /*
       * check whether the current organization
       * meets the input deviation constraints
       */
      const int v = check_nuca_org(org, minval);
      if (minval->min_leakage == 0) minval->min_leakage = 0.1; //FIXME remove this after leakage modeling

      if (!v) {
        /* erase() returns the iterator to the next element */
        niter = n->erase(niter);
        continue;
      }

      cost = (d * (org->nuca_pda.delay/minval->min_delay) +
          c * (org->nuca_pda.cycle_time/minval->min_cyc) +
          dp * (org->nuca_pda.power.readOp.dynamic/minval->min_dyn) +
          lp * (org->nuca_pda.power.readOp.leakage/minval->min_leakage) +
          a * (org->nuca_pda.area.get_area()/minval->min_area));
      fprintf(stderr, "cost = %g\n", cost);
    }

    if (min_cost > cost) {
      min_cost = cost;
      res = org;
    }
    ++niter;
  }
  return res;
}
|
||||
|
||||
int
|
||||
Nuca::check_nuca_org (nuca_org_t *n, min_values_t *minval)
|
||||
{
|
||||
if (((n->nuca_pda.delay - minval->min_delay)*100/minval->min_delay) > g_ip->delay_dev_nuca) {
|
||||
return 0;
|
||||
}
|
||||
if (((n->nuca_pda.power.readOp.dynamic - minval->min_dyn)/minval->min_dyn)*100 >
|
||||
g_ip->dynamic_power_dev_nuca) {
|
||||
return 0;
|
||||
}
|
||||
if (((n->nuca_pda.power.readOp.leakage - minval->min_leakage)/minval->min_leakage)*100 >
|
||||
g_ip->leakage_power_dev_nuca) {
|
||||
return 0;
|
||||
}
|
||||
if (((n->nuca_pda.cycle_time - minval->min_cyc)/minval->min_cyc)*100 >
|
||||
g_ip->cycle_time_dev_nuca) {
|
||||
return 0;
|
||||
}
|
||||
if (((n->nuca_pda.area.get_area() - minval->min_area)/minval->min_area)*100 >
|
||||
g_ip->area_dev_nuca) {
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
void
|
||||
Nuca::calculate_nuca_area (nuca_org_t *nuca)
|
||||
{
|
||||
nuca->nuca_pda.area.h=
|
||||
nuca->rows * ((nuca->h_wire->wire_width +
|
||||
nuca->h_wire->wire_spacing)
|
||||
* nuca->router->flit_size +
|
||||
nuca->bank_pda.area.h);
|
||||
|
||||
nuca->nuca_pda.area.w =
|
||||
nuca->columns * ((nuca->v_wire->wire_width +
|
||||
nuca->v_wire->wire_spacing)
|
||||
* nuca->router->flit_size +
|
||||
nuca->bank_pda.area.w);
|
||||
}
|
||||
|
100
ext/mcpat/cacti/nuca.h
Normal file
100
ext/mcpat/cacti/nuca.h
Normal file
|
@ -0,0 +1,100 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
#ifndef __NUCA_H__
|
||||
#define __NUCA_H__
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "assert.h"
|
||||
#include "basic_circuit.h"
|
||||
#include "cacti_interface.h"
|
||||
#include "component.h"
|
||||
#include "io.h"
|
||||
#include "mat.h"
|
||||
#include "parameter.h"
|
||||
#include "router.h"
|
||||
#include "wire.h"
|
||||
|
||||
class nuca_org_t {
|
||||
public:
|
||||
~nuca_org_t();
|
||||
// int size;
|
||||
/* area, power, access time, and cycle time stats */
|
||||
Component nuca_pda;
|
||||
Component bank_pda;
|
||||
Component wire_pda;
|
||||
Wire *h_wire;
|
||||
Wire *v_wire;
|
||||
Router *router;
|
||||
/* for particular network configuration
|
||||
* calculated based on a cycle accurate
|
||||
* simulation Ref: CACTI 6 - Tech report
|
||||
*/
|
||||
double contention;
|
||||
|
||||
/* grid network stats */
|
||||
double avg_hops;
|
||||
int rows;
|
||||
int columns;
|
||||
int bank_count;
|
||||
};
|
||||
|
||||
|
||||
|
||||
class Nuca : public Component
|
||||
{
|
||||
public:
|
||||
Nuca(
|
||||
TechnologyParameter::DeviceType *dt);
|
||||
void print_router();
|
||||
~Nuca();
|
||||
void sim_nuca();
|
||||
void init_cont();
|
||||
int calc_cycles(double lat, double oper_freq);
|
||||
void calculate_nuca_area (nuca_org_t *nuca);
|
||||
int check_nuca_org (nuca_org_t *n, min_values_t *minval);
|
||||
nuca_org_t * find_optimal_nuca (list<nuca_org_t *> *n, min_values_t *minval);
|
||||
void print_nuca(nuca_org_t *n);
|
||||
void print_cont_stats();
|
||||
|
||||
private:
|
||||
|
||||
TechnologyParameter::DeviceType *deviceType;
|
||||
int wt_min, wt_max;
|
||||
Wire *wire_vertical[WIRE_TYPES],
|
||||
*wire_horizontal[WIRE_TYPES];
|
||||
|
||||
};
|
||||
|
||||
|
||||
#endif
|
713
ext/mcpat/cacti/parameter.cc
Normal file
713
ext/mcpat/cacti/parameter.cc
Normal file
|
@ -0,0 +1,713 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include "area.h"
|
||||
#include "parameter.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
InputParameter * g_ip;
|
||||
TechnologyParameter g_tp;
|
||||
|
||||
|
||||
|
||||
void TechnologyParameter::DeviceType::display(uint32_t indent)
|
||||
{
|
||||
string indent_str(indent, ' ');
|
||||
|
||||
cout << indent_str << "C_g_ideal = " << setw(12) << C_g_ideal << " F/um" << endl;
|
||||
cout << indent_str << "C_fringe = " << setw(12) << C_fringe << " F/um" << endl;
|
||||
cout << indent_str << "C_overlap = " << setw(12) << C_overlap << " F/um" << endl;
|
||||
cout << indent_str << "C_junc = " << setw(12) << C_junc << " F/um^2" << endl;
|
||||
cout << indent_str << "l_phy = " << setw(12) << l_phy << " um" << endl;
|
||||
cout << indent_str << "l_elec = " << setw(12) << l_elec << " um" << endl;
|
||||
cout << indent_str << "R_nch_on = " << setw(12) << R_nch_on << " ohm-um" << endl;
|
||||
cout << indent_str << "R_pch_on = " << setw(12) << R_pch_on << " ohm-um" << endl;
|
||||
cout << indent_str << "Vdd = " << setw(12) << Vdd << " V" << endl;
|
||||
cout << indent_str << "Vth = " << setw(12) << Vth << " V" << endl;
|
||||
cout << indent_str << "I_on_n = " << setw(12) << I_on_n << " A/um" << endl;
|
||||
cout << indent_str << "I_on_p = " << setw(12) << I_on_p << " A/um" << endl;
|
||||
cout << indent_str << "I_off_n = " << setw(12) << I_off_n << " A/um" << endl;
|
||||
cout << indent_str << "I_off_p = " << setw(12) << I_off_p << " A/um" << endl;
|
||||
cout << indent_str << "C_ox = " << setw(12) << C_ox << " F/um^2" << endl;
|
||||
cout << indent_str << "t_ox = " << setw(12) << t_ox << " um" << endl;
|
||||
cout << indent_str << "n_to_p_eff_curr_drv_ratio = " << n_to_p_eff_curr_drv_ratio << endl;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void TechnologyParameter::InterconnectType::display(uint32_t indent)
|
||||
{
|
||||
string indent_str(indent, ' ');
|
||||
|
||||
cout << indent_str << "pitch = " << setw(12) << pitch << " um" << endl;
|
||||
cout << indent_str << "R_per_um = " << setw(12) << R_per_um << " ohm/um" << endl;
|
||||
cout << indent_str << "C_per_um = " << setw(12) << C_per_um << " F/um" << endl;
|
||||
}
|
||||
|
||||
void TechnologyParameter::ScalingFactor::display(uint32_t indent)
|
||||
{
|
||||
string indent_str(indent, ' ');
|
||||
|
||||
cout << indent_str << "logic_scaling_co_eff = " << setw(12) << logic_scaling_co_eff << endl;
|
||||
cout << indent_str << "curr_core_tx_density = " << setw(12) << core_tx_density << " # of tx/um^2" << endl;
|
||||
}
|
||||
|
||||
void TechnologyParameter::MemoryType::display(uint32_t indent)
|
||||
{
|
||||
string indent_str(indent, ' ');
|
||||
|
||||
cout << indent_str << "b_w = " << setw(12) << b_w << " um" << endl;
|
||||
cout << indent_str << "b_h = " << setw(12) << b_h << " um" << endl;
|
||||
cout << indent_str << "cell_a_w = " << setw(12) << cell_a_w << " um" << endl;
|
||||
cout << indent_str << "cell_pmos_w = " << setw(12) << cell_pmos_w << " um" << endl;
|
||||
cout << indent_str << "cell_nmos_w = " << setw(12) << cell_nmos_w << " um" << endl;
|
||||
cout << indent_str << "Vbitpre = " << setw(12) << Vbitpre << " V" << endl;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void TechnologyParameter::display(uint32_t indent)
|
||||
{
|
||||
string indent_str(indent, ' ');
|
||||
|
||||
cout << indent_str << "ram_wl_stitching_overhead_ = " << setw(12) << ram_wl_stitching_overhead_ << " um" << endl;
|
||||
cout << indent_str << "min_w_nmos_ = " << setw(12) << min_w_nmos_ << " um" << endl;
|
||||
cout << indent_str << "max_w_nmos_ = " << setw(12) << max_w_nmos_ << " um" << endl;
|
||||
cout << indent_str << "unit_len_wire_del = " << setw(12) << unit_len_wire_del << " s/um^2" << endl;
|
||||
cout << indent_str << "FO4 = " << setw(12) << FO4 << " s" << endl;
|
||||
cout << indent_str << "kinv = " << setw(12) << kinv << " s" << endl;
|
||||
cout << indent_str << "vpp = " << setw(12) << vpp << " V" << endl;
|
||||
cout << indent_str << "w_sense_en = " << setw(12) << w_sense_en << " um" << endl;
|
||||
cout << indent_str << "w_sense_n = " << setw(12) << w_sense_n << " um" << endl;
|
||||
cout << indent_str << "w_sense_p = " << setw(12) << w_sense_p << " um" << endl;
|
||||
cout << indent_str << "w_iso = " << setw(12) << w_iso << " um" << endl;
|
||||
cout << indent_str << "w_poly_contact = " << setw(12) << w_poly_contact << " um" << endl;
|
||||
cout << indent_str << "spacing_poly_to_poly = " << setw(12) << spacing_poly_to_poly << " um" << endl;
|
||||
cout << indent_str << "spacing_poly_to_contact = " << setw(12) << spacing_poly_to_contact << " um" << endl;
|
||||
cout << endl;
|
||||
cout << indent_str << "w_comp_inv_p1 = " << setw(12) << w_comp_inv_p1 << " um" << endl;
|
||||
cout << indent_str << "w_comp_inv_p2 = " << setw(12) << w_comp_inv_p2 << " um" << endl;
|
||||
cout << indent_str << "w_comp_inv_p3 = " << setw(12) << w_comp_inv_p3 << " um" << endl;
|
||||
cout << indent_str << "w_comp_inv_n1 = " << setw(12) << w_comp_inv_n1 << " um" << endl;
|
||||
cout << indent_str << "w_comp_inv_n2 = " << setw(12) << w_comp_inv_n2 << " um" << endl;
|
||||
cout << indent_str << "w_comp_inv_n3 = " << setw(12) << w_comp_inv_n3 << " um" << endl;
|
||||
cout << indent_str << "w_eval_inv_p = " << setw(12) << w_eval_inv_p << " um" << endl;
|
||||
cout << indent_str << "w_eval_inv_n = " << setw(12) << w_eval_inv_n << " um" << endl;
|
||||
cout << indent_str << "w_comp_n = " << setw(12) << w_comp_n << " um" << endl;
|
||||
cout << indent_str << "w_comp_p = " << setw(12) << w_comp_p << " um" << endl;
|
||||
cout << endl;
|
||||
cout << indent_str << "dram_cell_I_on = " << setw(12) << dram_cell_I_on << " A/um" << endl;
|
||||
cout << indent_str << "dram_cell_Vdd = " << setw(12) << dram_cell_Vdd << " V" << endl;
|
||||
cout << indent_str << "dram_cell_I_off_worst_case_len_temp = " << setw(12) << dram_cell_I_off_worst_case_len_temp << " A/um" << endl;
|
||||
cout << indent_str << "dram_cell_C = " << setw(12) << dram_cell_C << " F" << endl;
|
||||
cout << indent_str << "gm_sense_amp_latch = " << setw(12) << gm_sense_amp_latch << " F/s" << endl;
|
||||
cout << endl;
|
||||
cout << indent_str << "w_nmos_b_mux = " << setw(12) << w_nmos_b_mux << " um" << endl;
|
||||
cout << indent_str << "w_nmos_sa_mux = " << setw(12) << w_nmos_sa_mux << " um" << endl;
|
||||
cout << indent_str << "w_pmos_bl_precharge = " << setw(12) << w_pmos_bl_precharge << " um" << endl;
|
||||
cout << indent_str << "w_pmos_bl_eq = " << setw(12) << w_pmos_bl_eq << " um" << endl;
|
||||
cout << indent_str << "MIN_GAP_BET_P_AND_N_DIFFS = " << setw(12) << MIN_GAP_BET_P_AND_N_DIFFS << " um" << endl;
|
||||
cout << indent_str << "HPOWERRAIL = " << setw(12) << HPOWERRAIL << " um" << endl;
|
||||
cout << indent_str << "cell_h_def = " << setw(12) << cell_h_def << " um" << endl;
|
||||
|
||||
cout << endl;
|
||||
cout << indent_str << "SRAM cell transistor: " << endl;
|
||||
sram_cell.display(indent + 2);
|
||||
|
||||
cout << endl;
|
||||
cout << indent_str << "DRAM access transistor: " << endl;
|
||||
dram_acc.display(indent + 2);
|
||||
|
||||
cout << endl;
|
||||
cout << indent_str << "DRAM wordline transistor: " << endl;
|
||||
dram_wl.display(indent + 2);
|
||||
|
||||
cout << endl;
|
||||
cout << indent_str << "peripheral global transistor: " << endl;
|
||||
peri_global.display(indent + 2);
|
||||
|
||||
cout << endl;
|
||||
cout << indent_str << "wire local" << endl;
|
||||
wire_local.display(indent + 2);
|
||||
|
||||
cout << endl;
|
||||
cout << indent_str << "wire inside mat" << endl;
|
||||
wire_inside_mat.display(indent + 2);
|
||||
|
||||
cout << endl;
|
||||
cout << indent_str << "wire outside mat" << endl;
|
||||
wire_outside_mat.display(indent + 2);
|
||||
|
||||
cout << endl;
|
||||
cout << indent_str << "SRAM" << endl;
|
||||
sram.display(indent + 2);
|
||||
|
||||
cout << endl;
|
||||
cout << indent_str << "DRAM" << endl;
|
||||
dram.display(indent + 2);
|
||||
}
|
||||
|
||||
|
||||
/*
 * Default constructor: produces a parameter set that is marked valid.
 * Only the three members named in the initializer list are set here.
 */
DynamicParameter::DynamicParameter()
    : use_inp_params(0),
      cell(),
      is_valid(true)
{
}
|
||||
|
||||
|
||||
|
||||
DynamicParameter::DynamicParameter(
|
||||
bool is_tag_,
|
||||
int pure_ram_,
|
||||
int pure_cam_,
|
||||
double Nspd_,
|
||||
unsigned int Ndwl_,
|
||||
unsigned int Ndbl_,
|
||||
unsigned int Ndcm_,
|
||||
unsigned int Ndsam_lev_1_,
|
||||
unsigned int Ndsam_lev_2_,
|
||||
bool is_main_mem_):
|
||||
is_tag(is_tag_), pure_ram(pure_ram_), pure_cam(pure_cam_), tagbits(0), Nspd(Nspd_), Ndwl(Ndwl_), Ndbl(Ndbl_),Ndcm(Ndcm_),
|
||||
Ndsam_lev_1(Ndsam_lev_1_), Ndsam_lev_2(Ndsam_lev_2_),
|
||||
number_way_select_signals_mat(0), V_b_sense(0), use_inp_params(0),
|
||||
is_main_mem(is_main_mem_), cell(), is_valid(false)
|
||||
{
|
||||
ram_cell_tech_type = (is_tag) ? g_ip->tag_arr_ram_cell_tech_type : g_ip->data_arr_ram_cell_tech_type;
|
||||
is_dram = ((ram_cell_tech_type == lp_dram) || (ram_cell_tech_type == comm_dram));
|
||||
|
||||
unsigned int capacity_per_die = g_ip->cache_sz / NUMBER_STACKED_DIE_LAYERS; // capacity per stacked die layer
|
||||
const TechnologyParameter::InterconnectType & wire_local = g_tp.wire_local;
|
||||
fully_assoc = (g_ip->fully_assoc) ? true : false;
|
||||
|
||||
if (fully_assoc || pure_cam)
|
||||
{ // fully-assocative cache -- ref: CACTi 2.0 report
|
||||
if (Ndwl != 1 || //Ndwl is fixed to 1 for FA
|
||||
Ndcm != 1 || //Ndcm is fixed to 1 for FA
|
||||
Nspd < 1 || Nspd > 1 || //Nspd is fixed to 1 for FA
|
||||
Ndsam_lev_1 != 1 || //Ndsam_lev_1 is fixed to one
|
||||
Ndsam_lev_2 != 1 || //Ndsam_lev_2 is fixed to one
|
||||
Ndbl < 2)
|
||||
{
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if ((is_dram) && (!is_tag) && (Ndcm > 1))
|
||||
{
|
||||
return; // For a DRAM array, each bitline has its own sense-amp
|
||||
}
|
||||
|
||||
// If it's not an FA tag/data array, Ndwl should be at least two and Ndbl should be
|
||||
// at least two because an array is assumed to have at least one mat. And a mat
|
||||
// is formed out of two horizontal subarrays and two vertical subarrays
|
||||
if (fully_assoc == false && (Ndwl < 1 || Ndbl < 1))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
//***********compute row, col of an subarray
|
||||
if (!(fully_assoc || pure_cam))//Not fully_asso nor cam
|
||||
{
|
||||
// if data array, let tagbits = 0
|
||||
if (is_tag)
|
||||
{
|
||||
if (g_ip->specific_tag)
|
||||
{
|
||||
tagbits = g_ip->tag_w;
|
||||
}
|
||||
else
|
||||
{
|
||||
tagbits = ADDRESS_BITS + EXTRA_TAG_BITS - _log2(capacity_per_die) +
|
||||
_log2(g_ip->tag_assoc*2 - 1) - _log2(g_ip->nbanks);
|
||||
|
||||
}
|
||||
tagbits = (((tagbits + 3) >> 2) << 2);
|
||||
|
||||
num_r_subarray = (int)ceil(capacity_per_die / (g_ip->nbanks *
|
||||
g_ip->block_sz * g_ip->tag_assoc * Ndbl * Nspd));// + EPSILON);
|
||||
num_c_subarray = (int)ceil((tagbits * g_ip->tag_assoc * Nspd / Ndwl));// + EPSILON);
|
||||
//burst_length = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
num_r_subarray = (int)ceil(capacity_per_die / (g_ip->nbanks *
|
||||
g_ip->block_sz * g_ip->data_assoc * Ndbl * Nspd));// + EPSILON);
|
||||
num_c_subarray = (int)ceil((8 * g_ip->block_sz * g_ip->data_assoc * Nspd / Ndwl));// + EPSILON); + EPSILON);
|
||||
// burst_length = g_ip->block_sz * 8 / g_ip->out_w;
|
||||
}
|
||||
|
||||
if (num_r_subarray < MINSUBARRAYROWS) return;
|
||||
if (num_r_subarray == 0) return;
|
||||
if (num_r_subarray > MAXSUBARRAYROWS) return;
|
||||
if (num_c_subarray < MINSUBARRAYCOLS) return;
|
||||
if (num_c_subarray > MAXSUBARRAYCOLS) return;
|
||||
|
||||
}
|
||||
|
||||
else
|
||||
{//either fully-asso or cam
|
||||
if (pure_cam)
|
||||
{
|
||||
if (g_ip->specific_tag)
|
||||
{
|
||||
tagbits = int(ceil(g_ip->tag_w/8.0)*8);
|
||||
}
|
||||
else
|
||||
{
|
||||
tagbits = int(ceil((ADDRESS_BITS + EXTRA_TAG_BITS)/8.0)*8);
|
||||
// cout<<"Pure CAM needs tag width to be specified"<<endl;
|
||||
// exit(0);
|
||||
}
|
||||
//tagbits = (((tagbits + 3) >> 2) << 2);
|
||||
|
||||
tag_num_r_subarray = (int)ceil(capacity_per_die / (g_ip->nbanks*tagbits/8.0 * Ndbl));//TODO: error check input of tagbits and blocksize //TODO: for pure CAM, g_ip->block should be number of entries.
|
||||
//tag_num_c_subarray = (int)(tagbits + EPSILON);
|
||||
tag_num_c_subarray = tagbits;
|
||||
if (tag_num_r_subarray == 0) return;
|
||||
if (tag_num_r_subarray > MAXSUBARRAYROWS) return;
|
||||
if (tag_num_c_subarray < MINSUBARRAYCOLS) return;
|
||||
if (tag_num_c_subarray > MAXSUBARRAYCOLS) return;
|
||||
num_r_subarray = tag_num_r_subarray;
|
||||
}
|
||||
else //fully associative
|
||||
{
|
||||
if (g_ip->specific_tag)
|
||||
{
|
||||
tagbits = g_ip->tag_w;
|
||||
}
|
||||
else
|
||||
{
|
||||
tagbits = ADDRESS_BITS + EXTRA_TAG_BITS - _log2(g_ip->block_sz);//TODO: should be the page_offset=log2(page size), but this info is not avail with CACTI, for McPAT this is no problem.
|
||||
}
|
||||
tagbits = (((tagbits + 3) >> 2) << 2);
|
||||
|
||||
tag_num_r_subarray = (int)(capacity_per_die / (g_ip->nbanks*g_ip->block_sz * Ndbl));
|
||||
tag_num_c_subarray = (int)ceil((tagbits * Nspd / Ndwl));// + EPSILON);
|
||||
if (tag_num_r_subarray == 0) return;
|
||||
if (tag_num_r_subarray > MAXSUBARRAYROWS) return;
|
||||
if (tag_num_c_subarray < MINSUBARRAYCOLS) return;
|
||||
if (tag_num_c_subarray > MAXSUBARRAYCOLS) return;
|
||||
|
||||
data_num_r_subarray = tag_num_r_subarray;
|
||||
data_num_c_subarray = 8 * g_ip->block_sz;
|
||||
if (data_num_r_subarray == 0) return;
|
||||
if (data_num_r_subarray > MAXSUBARRAYROWS) return;
|
||||
if (data_num_c_subarray < MINSUBARRAYCOLS) return;
|
||||
if (data_num_c_subarray > MAXSUBARRAYCOLS) return;
|
||||
num_r_subarray = tag_num_r_subarray;
|
||||
}
|
||||
}
|
||||
|
||||
num_subarrays = Ndwl * Ndbl;
|
||||
//****************end of computation of row, col of an subarray
|
||||
|
||||
// calculate wire parameters
|
||||
if (fully_assoc || pure_cam)
|
||||
{
|
||||
cam_cell.h = g_tp.cam.b_h + 2 * wire_local.pitch * (g_ip->num_rw_ports-1 + g_ip->num_rd_ports + g_ip->num_wr_ports)
|
||||
+ 2 * wire_local.pitch*(g_ip->num_search_ports-1) + wire_local.pitch * g_ip->num_se_rd_ports;
|
||||
cam_cell.w = g_tp.cam.b_w + 2 * wire_local.pitch * (g_ip->num_rw_ports-1 + g_ip->num_rd_ports + g_ip->num_wr_ports)
|
||||
+ 2 * wire_local.pitch*(g_ip->num_search_ports-1) + wire_local.pitch * g_ip->num_se_rd_ports;
|
||||
|
||||
cell.h = g_tp.sram.b_h + 2 * wire_local.pitch * (g_ip->num_wr_ports +g_ip->num_rw_ports-1 + g_ip->num_rd_ports)
|
||||
+ 2 * wire_local.pitch*(g_ip->num_search_ports-1);
|
||||
cell.w = g_tp.sram.b_w + 2 * wire_local.pitch * (g_ip->num_rw_ports -1 + (g_ip->num_rd_ports - g_ip->num_se_rd_ports)
|
||||
+ g_ip->num_wr_ports) + g_tp.wire_local.pitch * g_ip->num_se_rd_ports + 2 * wire_local.pitch*(g_ip->num_search_ports-1);
|
||||
}
|
||||
else
|
||||
{
|
||||
if(is_tag)
|
||||
{
|
||||
cell.h = g_tp.sram.b_h + 2 * wire_local.pitch * (g_ip->num_rw_ports - 1 + g_ip->num_rd_ports +
|
||||
g_ip->num_wr_ports);
|
||||
cell.w = g_tp.sram.b_w + 2 * wire_local.pitch * (g_ip->num_rw_ports - 1 + g_ip->num_wr_ports +
|
||||
(g_ip->num_rd_ports - g_ip->num_se_rd_ports)) +
|
||||
wire_local.pitch * g_ip->num_se_rd_ports;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (is_dram)
|
||||
{
|
||||
cell.h = g_tp.dram.b_h;
|
||||
cell.w = g_tp.dram.b_w;
|
||||
}
|
||||
else
|
||||
{
|
||||
cell.h = g_tp.sram.b_h + 2 * wire_local.pitch * (g_ip->num_wr_ports +
|
||||
g_ip->num_rw_ports - 1 + g_ip->num_rd_ports);
|
||||
cell.w = g_tp.sram.b_w + 2 * wire_local.pitch * (g_ip->num_rw_ports - 1 +
|
||||
(g_ip->num_rd_ports - g_ip->num_se_rd_ports) +
|
||||
g_ip->num_wr_ports) + g_tp.wire_local.pitch * g_ip->num_se_rd_ports;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
double c_b_metal = cell.h * wire_local.C_per_um;
|
||||
double C_bl;
|
||||
|
||||
if (!(fully_assoc || pure_cam))
|
||||
{
|
||||
if (is_dram)
|
||||
{
|
||||
deg_bl_muxing = 1;
|
||||
if (ram_cell_tech_type == comm_dram)
|
||||
{
|
||||
C_bl = num_r_subarray * c_b_metal;
|
||||
V_b_sense = (g_tp.dram_cell_Vdd/2) * g_tp.dram_cell_C / (g_tp.dram_cell_C + C_bl);
|
||||
if (V_b_sense < VBITSENSEMIN)
|
||||
{
|
||||
return;
|
||||
}
|
||||
V_b_sense = VBITSENSEMIN; // in any case, we fix sense amp input signal to a constant value
|
||||
dram_refresh_period = 64e-3;
|
||||
}
|
||||
else
|
||||
{
|
||||
double Cbitrow_drain_cap = drain_C_(g_tp.dram.cell_a_w, NCH, 1, 0, cell.w, true, true) / 2.0;
|
||||
C_bl = num_r_subarray * (Cbitrow_drain_cap + c_b_metal);
|
||||
V_b_sense = (g_tp.dram_cell_Vdd/2) * g_tp.dram_cell_C /(g_tp.dram_cell_C + C_bl);
|
||||
|
||||
if (V_b_sense < VBITSENSEMIN)
|
||||
{
|
||||
return; //Sense amp input signal is smaller that minimum allowable sense amp input signal
|
||||
}
|
||||
V_b_sense = VBITSENSEMIN; // in any case, we fix sense amp input signal to a constant value
|
||||
//v_storage_worst = g_tp.dram_cell_Vdd / 2 - VBITSENSEMIN * (g_tp.dram_cell_C + C_bl) / g_tp.dram_cell_C;
|
||||
//dram_refresh_period = 1.1 * g_tp.dram_cell_C * v_storage_worst / g_tp.dram_cell_I_off_worst_case_len_temp;
|
||||
dram_refresh_period = 0.9 * g_tp.dram_cell_C * VDD_STORAGE_LOSS_FRACTION_WORST * g_tp.dram_cell_Vdd / g_tp.dram_cell_I_off_worst_case_len_temp;
|
||||
}
|
||||
}
|
||||
else
|
||||
{ //SRAM
|
||||
V_b_sense = (0.05 * g_tp.sram_cell.Vdd > VBITSENSEMIN) ? 0.05 * g_tp.sram_cell.Vdd : VBITSENSEMIN;
|
||||
deg_bl_muxing = Ndcm;
|
||||
// "/ 2.0" below is due to the fact that two adjacent access transistors share drain
|
||||
// contacts in a physical layout
|
||||
double Cbitrow_drain_cap = drain_C_(g_tp.sram.cell_a_w, NCH, 1, 0, cell.w, false, true) / 2.0;
|
||||
C_bl = num_r_subarray * (Cbitrow_drain_cap + c_b_metal);
|
||||
dram_refresh_period = 0;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
c_b_metal = cam_cell.h * wire_local.C_per_um;//IBM and SUN design, SRAM array uses dummy cells to fill the blank space due to mismatch on CAM-RAM
|
||||
V_b_sense = (0.05 * g_tp.sram_cell.Vdd > VBITSENSEMIN) ? 0.05 * g_tp.sram_cell.Vdd : VBITSENSEMIN;
|
||||
deg_bl_muxing = 1;//FA fix as 1
|
||||
// "/ 2.0" below is due to the fact that two adjacent access transistors share drain
|
||||
// contacts in a physical layout
|
||||
double Cbitrow_drain_cap = drain_C_(g_tp.cam.cell_a_w, NCH, 1, 0, cam_cell.w, false, true) / 2.0;//TODO: comment out these two lines
|
||||
C_bl = num_r_subarray * (Cbitrow_drain_cap + c_b_metal);
|
||||
dram_refresh_period = 0;
|
||||
}
|
||||
|
||||
|
||||
// do/di: data in/out, for fully associative they are the data width for normal read and write
|
||||
// so/si: search data in/out, for fully associative they are the data width for the search ops
|
||||
// for CAM, si=di, but so = matching address. do = data out = di (for normal read/write)
|
||||
// so/si needs broadcase while do/di do not
|
||||
|
||||
if (fully_assoc || pure_cam)
|
||||
{
|
||||
switch (Ndbl) {
|
||||
case (0):
|
||||
cout << " Invalid Ndbl \n"<<endl;
|
||||
exit(0);
|
||||
break;
|
||||
case (1):
|
||||
num_mats_h_dir = 1;//one subarray per mat
|
||||
num_mats_v_dir = 1;
|
||||
break;
|
||||
case (2):
|
||||
num_mats_h_dir = 1;//two subarrays per mat
|
||||
num_mats_v_dir = 1;
|
||||
break;
|
||||
default:
|
||||
num_mats_h_dir = int(floor(sqrt(Ndbl/4.0)));//4 subbarrys per mat
|
||||
num_mats_v_dir = int(Ndbl/4.0 / num_mats_h_dir);
|
||||
}
|
||||
num_mats = num_mats_h_dir * num_mats_v_dir;
|
||||
|
||||
if (fully_assoc)
|
||||
{
|
||||
num_so_b_mat = data_num_c_subarray;
|
||||
num_do_b_mat = data_num_c_subarray + tagbits;
|
||||
}
|
||||
else
|
||||
{
|
||||
num_so_b_mat = int(ceil(log2(num_r_subarray)) + ceil(log2(num_subarrays)));//the address contains the matched data
|
||||
num_do_b_mat = tagbits;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
num_mats_h_dir = MAX(Ndwl / 2, 1);
|
||||
num_mats_v_dir = MAX(Ndbl / 2, 1);
|
||||
num_mats = num_mats_h_dir * num_mats_v_dir;
|
||||
num_do_b_mat = MAX((num_subarrays/num_mats) * num_c_subarray / (deg_bl_muxing * Ndsam_lev_1 * Ndsam_lev_2), 1);
|
||||
}
|
||||
|
||||
if (!(fully_assoc|| pure_cam) && (num_do_b_mat < (num_subarrays/num_mats)))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
int deg_sa_mux_l1_non_assoc;
|
||||
//TODO:the i/o for subbank is not necessary and should be removed.
|
||||
if (!(fully_assoc || pure_cam))
|
||||
{
|
||||
if (!is_tag)
|
||||
{
|
||||
if (is_main_mem == true)
|
||||
{
|
||||
num_do_b_subbank = g_ip->int_prefetch_w * g_ip->out_w;
|
||||
deg_sa_mux_l1_non_assoc = Ndsam_lev_1;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (g_ip->fast_access == true)
|
||||
{
|
||||
num_do_b_subbank = g_ip->out_w * g_ip->data_assoc;
|
||||
deg_sa_mux_l1_non_assoc = Ndsam_lev_1;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
num_do_b_subbank = g_ip->out_w;
|
||||
deg_sa_mux_l1_non_assoc = Ndsam_lev_1 / g_ip->data_assoc;
|
||||
if (deg_sa_mux_l1_non_assoc < 1)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
num_do_b_subbank = tagbits * g_ip->tag_assoc;
|
||||
if (num_do_b_mat < tagbits)
|
||||
{
|
||||
return;
|
||||
}
|
||||
deg_sa_mux_l1_non_assoc = Ndsam_lev_1;
|
||||
//num_do_b_mat = g_ip->tag_assoc / num_mats_h_dir;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (fully_assoc)
|
||||
{
|
||||
num_so_b_subbank = 8 * g_ip->block_sz;//TODO:internal perfetch should be considered also for fa
|
||||
num_do_b_subbank = num_so_b_subbank + tag_num_c_subarray;
|
||||
}
|
||||
else
|
||||
{
|
||||
num_so_b_subbank = int(ceil(log2(num_r_subarray)) + ceil(log2(num_subarrays)));//the address contains the matched data
|
||||
num_do_b_subbank = tag_num_c_subarray;
|
||||
}
|
||||
|
||||
deg_sa_mux_l1_non_assoc = 1;
|
||||
}
|
||||
|
||||
deg_senseamp_muxing_non_associativity = deg_sa_mux_l1_non_assoc;
|
||||
|
||||
if (fully_assoc || pure_cam)
|
||||
{
|
||||
num_act_mats_hor_dir = 1;
|
||||
num_act_mats_hor_dir_sl = num_mats_h_dir;//TODO: this is unnecessary, since search op, num_mats is used
|
||||
}
|
||||
else
|
||||
{
|
||||
num_act_mats_hor_dir = num_do_b_subbank / num_do_b_mat;
|
||||
if (num_act_mats_hor_dir == 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
//compute num_do_mat for tag
|
||||
if (is_tag)
|
||||
{
|
||||
if (!(fully_assoc || pure_cam))
|
||||
{
|
||||
num_do_b_mat = g_ip->tag_assoc / num_act_mats_hor_dir;
|
||||
num_do_b_subbank = num_act_mats_hor_dir * num_do_b_mat;
|
||||
}
|
||||
}
|
||||
|
||||
if ((g_ip->is_cache == false && is_main_mem == true) || (PAGE_MODE == 1 && is_dram))
|
||||
{
|
||||
if (num_act_mats_hor_dir * num_do_b_mat * Ndsam_lev_1 * Ndsam_lev_2 != (int)g_ip->page_sz_bits)
|
||||
{
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// if (is_tag == false && g_ip->is_cache == true && !fully_assoc && !pure_cam && //TODO: TODO burst transfer should also apply to RAM arrays
|
||||
if (is_tag == false && g_ip->is_main_mem == true &&
|
||||
num_act_mats_hor_dir*num_do_b_mat*Ndsam_lev_1*Ndsam_lev_2 < ((int) g_ip->out_w * (int) g_ip->burst_len * (int) g_ip->data_assoc))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (num_act_mats_hor_dir > num_mats_h_dir)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
//compute di for mat subbank and bank
|
||||
if (!(fully_assoc ||pure_cam))
|
||||
{
|
||||
if(!is_tag)
|
||||
{
|
||||
if(g_ip->fast_access == true)
|
||||
{
|
||||
num_di_b_mat = num_do_b_mat / g_ip->data_assoc;
|
||||
}
|
||||
else
|
||||
{
|
||||
num_di_b_mat = num_do_b_mat;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
num_di_b_mat = tagbits;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (fully_assoc)
|
||||
{
|
||||
num_di_b_mat = num_do_b_mat;
|
||||
//*num_subarrays/num_mats; bits per mat of CAM/FA is as same as cache,
|
||||
//but inside the mat wire tracks need to be reserved for search data bus
|
||||
num_si_b_mat = tagbits;
|
||||
}
|
||||
else
|
||||
{
|
||||
num_di_b_mat = tagbits;
|
||||
num_si_b_mat = tagbits;//*num_subarrays/num_mats;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
num_di_b_subbank = num_di_b_mat * num_act_mats_hor_dir;//normal cache or normal r/w for FA
|
||||
num_si_b_subbank = num_si_b_mat; //* num_act_mats_hor_dir_sl; inside the data is broadcast
|
||||
|
||||
int num_addr_b_row_dec = _log2(num_r_subarray);
|
||||
if ((fully_assoc ||pure_cam))
|
||||
num_addr_b_row_dec +=_log2(num_subarrays/num_mats);
|
||||
int number_subbanks = num_mats / num_act_mats_hor_dir;
|
||||
number_subbanks_decode = _log2(number_subbanks);//TODO: add log2(num_subarray_per_bank) to FA/CAM
|
||||
|
||||
num_rw_ports = g_ip->num_rw_ports;
|
||||
num_rd_ports = g_ip->num_rd_ports;
|
||||
num_wr_ports = g_ip->num_wr_ports;
|
||||
num_se_rd_ports = g_ip->num_se_rd_ports;
|
||||
num_search_ports = g_ip->num_search_ports;
|
||||
|
||||
if (is_dram && is_main_mem)
|
||||
{
|
||||
number_addr_bits_mat = MAX((unsigned int) num_addr_b_row_dec,
|
||||
_log2(deg_bl_muxing) + _log2(deg_sa_mux_l1_non_assoc) + _log2(Ndsam_lev_2));
|
||||
}
|
||||
else
|
||||
{
|
||||
number_addr_bits_mat = num_addr_b_row_dec + _log2(deg_bl_muxing) +
|
||||
_log2(deg_sa_mux_l1_non_assoc) + _log2(Ndsam_lev_2);
|
||||
}
|
||||
|
||||
if (!(fully_assoc ||pure_cam))
|
||||
{
|
||||
if (is_tag)
|
||||
{
|
||||
num_di_b_bank_per_port = tagbits;
|
||||
num_do_b_bank_per_port = g_ip->data_assoc;
|
||||
}
|
||||
else
|
||||
{
|
||||
num_di_b_bank_per_port = g_ip->out_w + g_ip->data_assoc;
|
||||
num_do_b_bank_per_port = g_ip->out_w;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (fully_assoc)
|
||||
{
|
||||
num_di_b_bank_per_port = g_ip->out_w + tagbits;//TODO: out_w or block_sz?
|
||||
num_si_b_bank_per_port = tagbits;
|
||||
num_do_b_bank_per_port = g_ip->out_w + tagbits;
|
||||
num_so_b_bank_per_port = g_ip->out_w;
|
||||
}
|
||||
else
|
||||
{
|
||||
num_di_b_bank_per_port = tagbits;
|
||||
num_si_b_bank_per_port = tagbits;
|
||||
num_do_b_bank_per_port = tagbits;
|
||||
num_so_b_bank_per_port = int(ceil(log2(num_r_subarray)) + ceil(log2(num_subarrays)));
|
||||
}
|
||||
}
|
||||
|
||||
if ((!is_tag) && (g_ip->data_assoc > 1) && (!g_ip->fast_access))
|
||||
{
|
||||
number_way_select_signals_mat = g_ip->data_assoc;
|
||||
}
|
||||
|
||||
// add ECC adjustment to all data signals that traverse on H-trees.
|
||||
if (g_ip->add_ecc_b_ == true)
|
||||
{
|
||||
num_do_b_mat += (int) (ceil(num_do_b_mat / num_bits_per_ecc_b_));
|
||||
num_di_b_mat += (int) (ceil(num_di_b_mat / num_bits_per_ecc_b_));
|
||||
num_di_b_subbank += (int) (ceil(num_di_b_subbank / num_bits_per_ecc_b_));
|
||||
num_do_b_subbank += (int) (ceil(num_do_b_subbank / num_bits_per_ecc_b_));
|
||||
num_di_b_bank_per_port += (int) (ceil(num_di_b_bank_per_port / num_bits_per_ecc_b_));
|
||||
num_do_b_bank_per_port += (int) (ceil(num_do_b_bank_per_port / num_bits_per_ecc_b_));
|
||||
|
||||
num_so_b_mat += (int) (ceil(num_so_b_mat / num_bits_per_ecc_b_));
|
||||
num_si_b_mat += (int) (ceil(num_si_b_mat / num_bits_per_ecc_b_));
|
||||
num_si_b_subbank += (int) (ceil(num_si_b_subbank / num_bits_per_ecc_b_));
|
||||
num_so_b_subbank += (int) (ceil(num_so_b_subbank / num_bits_per_ecc_b_));
|
||||
num_si_b_bank_per_port += (int) (ceil(num_si_b_bank_per_port / num_bits_per_ecc_b_));
|
||||
num_so_b_bank_per_port += (int) (ceil(num_so_b_bank_per_port / num_bits_per_ecc_b_));
|
||||
}
|
||||
|
||||
is_valid = true;
|
||||
}
|
||||
|
367
ext/mcpat/cacti/parameter.h
Normal file
367
ext/mcpat/cacti/parameter.h
Normal file
|
@ -0,0 +1,367 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
|
||||
#ifndef __PARAMETER_H__
|
||||
#define __PARAMETER_H__
|
||||
|
||||
#include "area.h"
|
||||
#include "cacti_interface.h"
|
||||
#include "const.h"
|
||||
#include "io.h"
|
||||
|
||||
// Parameters which are functions of a certain device technology.
//
// Groups per-transistor (DeviceType), per-wire (InterconnectType),
// per-memory-cell (MemoryType) and scaling (ScalingFactor) values together
// with a set of scalar technology values.  Every nested type offers reset()
// to zero its members and declares display(), which is defined elsewhere.
class TechnologyParameter
{
 public:
  // Electrical parameters of one transistor flavour.
  class DeviceType
  {
   public:
    double C_g_ideal;
    double C_fringe;
    double C_overlap;
    double C_junc;  // C_junc_area
    double C_junc_sidewall;
    double l_phy;
    double l_elec;
    double R_nch_on;
    double R_pch_on;
    double Vdd;
    double Vth;
    double I_on_n;
    double I_on_p;
    double I_off_n;
    double I_off_p;
    double I_g_on_n;
    double I_g_on_p;
    double C_ox;
    double t_ox;
    double n_to_p_eff_curr_drv_ratio;
    double long_channel_leakage_reduction;

    // Start with every member zeroed.
    DeviceType()
        : C_g_ideal(0), C_fringe(0), C_overlap(0), C_junc(0),
          C_junc_sidewall(0), l_phy(0), l_elec(0), R_nch_on(0), R_pch_on(0),
          Vdd(0), Vth(0),
          I_on_n(0), I_on_p(0), I_off_n(0), I_off_p(0),
          I_g_on_n(0), I_g_on_p(0),
          C_ox(0), t_ox(0), n_to_p_eff_curr_drv_ratio(0),
          long_channel_leakage_reduction(0) { }

    // Return every member to zero, i.e. the state the constructor produces.
    void reset()
    {
      C_g_ideal = 0;
      C_fringe = 0;
      C_overlap = 0;
      C_junc = 0;
      // Previously skipped here although the constructor zeroes it, so a
      // reset object could keep a stale sidewall capacitance.
      C_junc_sidewall = 0;
      l_phy = 0;
      l_elec = 0;
      R_nch_on = 0;
      R_pch_on = 0;
      Vdd = 0;
      Vth = 0;
      I_on_n = 0;
      I_on_p = 0;
      I_off_n = 0;
      I_off_p = 0;
      I_g_on_n = 0;
      I_g_on_p = 0;
      C_ox = 0;
      t_ox = 0;
      n_to_p_eff_curr_drv_ratio = 0;
      long_channel_leakage_reduction = 0;
    }

    void display(uint32_t indent = 0);
  };

  // Parameters of one wire type.
  class InterconnectType
  {
   public:
    double pitch;
    double R_per_um;
    double C_per_um;
    double horiz_dielectric_constant;
    double vert_dielectric_constant;
    double aspect_ratio;
    double miller_value;
    double ild_thickness;

    // Zero every member.  The constructor used to initialise only pitch,
    // R_per_um and C_per_um, leaving the remaining five indeterminate in a
    // default-constructed object.
    InterconnectType()
        : pitch(0), R_per_um(0), C_per_um(0),
          horiz_dielectric_constant(0), vert_dielectric_constant(0),
          aspect_ratio(0), miller_value(0), ild_thickness(0) { }

    // Return every member to zero.
    void reset()
    {
      pitch = 0;
      R_per_um = 0;
      C_per_um = 0;
      horiz_dielectric_constant = 0;
      vert_dielectric_constant = 0;
      aspect_ratio = 0;
      miller_value = 0;
      ild_thickness = 0;
    }

    void display(uint32_t indent = 0);
  };

  // Parameters of one memory cell type.  There is no user-declared
  // constructor, so members are only zeroed by reset() or by
  // zero-initialisation of the enclosing object.
  class MemoryType
  {
   public:
    double b_w;
    double b_h;
    double cell_a_w;
    double cell_pmos_w;
    double cell_nmos_w;
    double Vbitpre;

    // Return every member to zero.
    void reset()
    {
      b_w = 0;
      b_h = 0;
      cell_a_w = 0;
      cell_pmos_w = 0;
      cell_nmos_w = 0;
      Vbitpre = 0;
    }

    void display(uint32_t indent = 0);
  };

  // Scaling coefficients.
  class ScalingFactor
  {
   public:
    double logic_scaling_co_eff;
    double core_tx_density;
    double long_channel_leakage_reduction;

    ScalingFactor()
        : logic_scaling_co_eff(0), core_tx_density(0),
          long_channel_leakage_reduction(0) { }

    // Return every member to zero.
    void reset()
    {
      logic_scaling_co_eff = 0;
      core_tx_density = 0;
      long_channel_leakage_reduction = 0;
    }

    void display(uint32_t indent = 0);
  };

  double ram_wl_stitching_overhead_;
  double min_w_nmos_;
  double max_w_nmos_;
  double max_w_nmos_dec;
  double unit_len_wire_del;
  double FO4;
  double kinv;
  double vpp;
  double w_sense_en;
  double w_sense_n;
  double w_sense_p;
  double sense_delay;
  double sense_dy_power;
  double w_iso;
  double w_poly_contact;
  double spacing_poly_to_poly;
  double spacing_poly_to_contact;

  double w_comp_inv_p1;
  double w_comp_inv_p2;
  double w_comp_inv_p3;
  double w_comp_inv_n1;
  double w_comp_inv_n2;
  double w_comp_inv_n3;
  double w_eval_inv_p;
  double w_eval_inv_n;
  double w_comp_n;
  double w_comp_p;

  double dram_cell_I_on;
  double dram_cell_Vdd;
  double dram_cell_I_off_worst_case_len_temp;
  double dram_cell_C;
  double gm_sense_amp_latch;

  double w_nmos_b_mux;
  double w_nmos_sa_mux;
  double w_pmos_bl_precharge;
  double w_pmos_bl_eq;
  double MIN_GAP_BET_P_AND_N_DIFFS;
  double MIN_GAP_BET_SAME_TYPE_DIFFS;
  double HPOWERRAIL;
  double cell_h_def;

  double chip_layout_overhead;
  double macro_layout_overhead;
  double sckt_co_eff;

  double fringe_cap;

  uint64_t h_dec;

  DeviceType sram_cell;    // SRAM cell transistor
  DeviceType dram_acc;     // DRAM access transistor
  DeviceType dram_wl;      // DRAM wordline transistor
  DeviceType peri_global;  // peripheral global
  DeviceType cam_cell;     // original comment read "SRAM cell transistor";
                           // presumably the CAM cell transistor -- confirm

  InterconnectType wire_local;
  InterconnectType wire_inside_mat;
  InterconnectType wire_outside_mat;

  ScalingFactor scaling_factor;

  MemoryType sram;
  MemoryType dram;
  MemoryType cam;

  void display(uint32_t indent = 0);

  // Zero the DRAM-cell, sense-amp, fringe-cap and layout-overhead scalars
  // listed below and reset every nested device / wire / memory / scaling
  // member.  The remaining scalar members (transistor widths, spacings, FO4,
  // h_dec, ...) are intentionally left untouched, exactly as before.
  void reset()
  {
    dram_cell_Vdd = 0;
    dram_cell_I_on = 0;
    dram_cell_C = 0;
    vpp = 0;

    sense_delay = 0;
    sense_dy_power = 0;
    fringe_cap = 0;

    dram_cell_I_off_worst_case_len_temp = 0;

    sram_cell.reset();
    dram_acc.reset();
    dram_wl.reset();
    peri_global.reset();
    cam_cell.reset();

    scaling_factor.reset();

    wire_local.reset();
    wire_inside_mat.reset();
    wire_outside_mat.reset();

    sram.reset();
    dram.reset();
    cam.reset();

    chip_layout_overhead = 0;
    macro_layout_overhead = 0;
    sckt_co_eff = 0;
  }
};
|
||||
|
||||
|
||||
|
||||
class DynamicParameter
|
||||
{
|
||||
public:
|
||||
bool is_tag;
|
||||
bool pure_ram;
|
||||
bool pure_cam;
|
||||
bool fully_assoc;
|
||||
int tagbits;
|
||||
int num_subarrays; // only for leakage computation -- the number of subarrays per bank
|
||||
int num_mats; // only for leakage computation -- the number of mats per bank
|
||||
double Nspd;
|
||||
int Ndwl;
|
||||
int Ndbl;
|
||||
int Ndcm;
|
||||
int deg_bl_muxing;
|
||||
int deg_senseamp_muxing_non_associativity;
|
||||
int Ndsam_lev_1;
|
||||
int Ndsam_lev_2;
|
||||
int number_addr_bits_mat; // per port
|
||||
int number_subbanks_decode; // per_port
|
||||
int num_di_b_bank_per_port;
|
||||
int num_do_b_bank_per_port;
|
||||
int num_di_b_mat;
|
||||
int num_do_b_mat;
|
||||
int num_di_b_subbank;
|
||||
int num_do_b_subbank;
|
||||
|
||||
int num_si_b_mat;
|
||||
int num_so_b_mat;
|
||||
int num_si_b_subbank;
|
||||
int num_so_b_subbank;
|
||||
int num_si_b_bank_per_port;
|
||||
int num_so_b_bank_per_port;
|
||||
|
||||
int number_way_select_signals_mat;
|
||||
int num_act_mats_hor_dir;
|
||||
|
||||
int num_act_mats_hor_dir_sl;
|
||||
bool is_dram;
|
||||
double V_b_sense;
|
||||
unsigned int num_r_subarray;
|
||||
unsigned int num_c_subarray;
|
||||
int tag_num_r_subarray;//sheng: fully associative cache tag and data must be computed together, data and tag must be separate
|
||||
int tag_num_c_subarray;
|
||||
int data_num_r_subarray;
|
||||
int data_num_c_subarray;
|
||||
int num_mats_h_dir;
|
||||
int num_mats_v_dir;
|
||||
uint32_t ram_cell_tech_type;
|
||||
double dram_refresh_period;
|
||||
|
||||
DynamicParameter();
|
||||
DynamicParameter(
|
||||
bool is_tag_,
|
||||
int pure_ram_,
|
||||
int pure_cam_,
|
||||
double Nspd_,
|
||||
unsigned int Ndwl_,
|
||||
unsigned int Ndbl_,
|
||||
unsigned int Ndcm_,
|
||||
unsigned int Ndsam_lev_1_,
|
||||
unsigned int Ndsam_lev_2_,
|
||||
bool is_main_mem_);
|
||||
|
||||
int use_inp_params;
|
||||
unsigned int num_rw_ports;
|
||||
unsigned int num_rd_ports;
|
||||
unsigned int num_wr_ports;
|
||||
unsigned int num_se_rd_ports; // number of single ended read ports
|
||||
unsigned int num_search_ports;
|
||||
unsigned int out_w;// == nr_bits_out
|
||||
bool is_main_mem;
|
||||
Area cell, cam_cell;//cell is the sram_cell in both nomal cache/ram and FA.
|
||||
bool is_valid;
|
||||
};
|
||||
|
||||
|
||||
|
||||
extern InputParameter * g_ip;
|
||||
extern TechnologyParameter g_tp;
|
||||
|
||||
#endif
|
||||
|
311
ext/mcpat/cacti/router.cc
Normal file
311
ext/mcpat/cacti/router.cc
Normal file
|
@ -0,0 +1,311 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
|
||||
#include "router.h"
|
||||
|
||||
// Construct a router model.
//
//   flit_size_  stored as flit_size
//   vc_buf      stored as vc_buffer_size (vc size = vc_buffer_size * flit_size)
//   vc_c        stored as vc_count
//   dt          device type; supplies Vdd and the n-to-p drive ratio
//   I_, O_      crossbar input / output port counts
//   M_          stored as M
//
// After the sizing constants are derived from the feature size,
// calc_router_parameters() is invoked.
Router::Router(
    double flit_size_,
    double vc_buf, /* vc size = vc_buffer_size * flit_size */
    double vc_c,
    TechnologyParameter::DeviceType *dt,
    double I_,
    double O_,
    double M_)
    : flit_size(flit_size_),
      deviceType(dt),
      I(I_),
      O(O_),
      M(M_)
{
  const double feature_um = g_ip->F_sz_um;

  vc_count = vc_c;
  vc_buffer_size = vc_buf;
  Vdd = dt->Vdd;
  min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio * g_tp.min_w_nmos_;

  /* Crossbar parameters. A transmission gate is employed as the connector. */
  NTtr = 10 * feature_um * 1e-6 / 2;  /* transmission gate nmos tr. length */
  PTtr = 20 * feature_um * 1e-6 / 2;  /* transmission gate pmos tr. length */
  wt = 15 * feature_um * 1e-6 / 2;    /* track width */
  ht = 15 * feature_um * 1e-6 / 2;    /* track height */

  NTi = 12.5 * feature_um * 1e-6 / 2;
  PTi = 25 * feature_um * 1e-6 / 2;

  NTid = 60 * feature_um * 1e-6 / 2;   // m
  PTid = 120 * feature_um * 1e-6 / 2;  // m
  NTod = 60 * feature_um * 1e-6 / 2;   // m
  PTod = 120 * feature_um * 1e-6 / 2;  // m

  calc_router_parameters();
}
|
||||
|
||||
// Nothing to release explicitly.
Router::~Router()
{
}
|
||||
|
||||
|
||||
double //wire cap with triple spacing
|
||||
Router::Cw3(double length) {
|
||||
Wire wc(g_ip->wt, length, 1, 3, 3);
|
||||
return (wc.wire_cap(length));
|
||||
}
|
||||
|
||||
/*Function to calculate the gate capacitance*/
|
||||
double
|
||||
Router::gate_cap(double w) {
|
||||
return (double) gate_C (w*1e6 /*u*/, 0);
|
||||
}
|
||||
|
||||
/*Function to calculate the diffusion capacitance*/
|
||||
double
|
||||
Router::diff_cap(double w, int type /*0 for n-mos and 1 for p-mos*/,
|
||||
double s /*number of stacking transistors*/) {
|
||||
return (double) drain_C_(w*1e6 /*u*/, type, (int) s, 1, g_tp.cell_h_def);
|
||||
}
|
||||
|
||||
|
||||
/*crossbar related functions */
|
||||
|
||||
// Model for simple transmission gate
|
||||
double
|
||||
Router::transmission_buf_inpcap() {
|
||||
return diff_cap(NTtr, 0, 1)+diff_cap(PTtr, 1, 1);
|
||||
}
|
||||
|
||||
double
|
||||
Router::transmission_buf_outcap() {
|
||||
return diff_cap(NTtr, 0, 1)+diff_cap(PTtr, 1, 1);
|
||||
}
|
||||
|
||||
double
|
||||
Router::transmission_buf_ctrcap() {
|
||||
return gate_cap(NTtr)+gate_cap(PTtr);
|
||||
}
|
||||
|
||||
double
|
||||
Router::crossbar_inpline() {
|
||||
return (Cw3(O*flit_size*wt) + O*transmission_buf_inpcap() + gate_cap(NTid) +
|
||||
gate_cap(PTid) + diff_cap(NTid, 0, 1) + diff_cap(PTid, 1, 1));
|
||||
}
|
||||
|
||||
double
|
||||
Router::crossbar_outline() {
|
||||
return (Cw3(I*flit_size*ht) + I*transmission_buf_outcap() + gate_cap(NTod) +
|
||||
gate_cap(PTod) + diff_cap(NTod, 0, 1) + diff_cap(PTod, 1, 1));
|
||||
}
|
||||
|
||||
double
|
||||
Router::crossbar_ctrline() {
|
||||
return (Cw3(0.5*O*flit_size*wt) + flit_size*transmission_buf_ctrcap() +
|
||||
diff_cap(NTi, 0, 1) + diff_cap(PTi, 1, 1) +
|
||||
gate_cap(NTi) + gate_cap(PTi));
|
||||
}
|
||||
|
||||
double
|
||||
Router::tr_crossbar_power() {
|
||||
return (crossbar_inpline()*Vdd*Vdd*flit_size/2 +
|
||||
crossbar_outline()*Vdd*Vdd*flit_size/2)*2;
|
||||
}
|
||||
|
||||
void Router::buffer_stats()
|
||||
{
|
||||
DynamicParameter dyn_p;
|
||||
dyn_p.is_tag = false;
|
||||
dyn_p.pure_cam = false;
|
||||
dyn_p.fully_assoc = false;
|
||||
dyn_p.pure_ram = true;
|
||||
dyn_p.is_dram = false;
|
||||
dyn_p.is_main_mem = false;
|
||||
dyn_p.num_subarrays = 1;
|
||||
dyn_p.num_mats = 1;
|
||||
dyn_p.Ndbl = 1;
|
||||
dyn_p.Ndwl = 1;
|
||||
dyn_p.Nspd = 1;
|
||||
dyn_p.deg_bl_muxing = 1;
|
||||
dyn_p.deg_senseamp_muxing_non_associativity = 1;
|
||||
dyn_p.Ndsam_lev_1 = 1;
|
||||
dyn_p.Ndsam_lev_2 = 1;
|
||||
dyn_p.Ndcm = 1;
|
||||
dyn_p.number_addr_bits_mat = 8;
|
||||
dyn_p.number_way_select_signals_mat = 1;
|
||||
dyn_p.number_subbanks_decode = 0;
|
||||
dyn_p.num_act_mats_hor_dir = 1;
|
||||
dyn_p.V_b_sense = Vdd; // FIXME check power calc.
|
||||
dyn_p.ram_cell_tech_type = 0;
|
||||
dyn_p.num_r_subarray = (int) vc_buffer_size;
|
||||
dyn_p.num_c_subarray = (int) flit_size * (int) vc_count;
|
||||
dyn_p.num_mats_h_dir = 1;
|
||||
dyn_p.num_mats_v_dir = 1;
|
||||
dyn_p.num_do_b_subbank = (int)flit_size;
|
||||
dyn_p.num_di_b_subbank = (int)flit_size;
|
||||
dyn_p.num_do_b_mat = (int) flit_size;
|
||||
dyn_p.num_di_b_mat = (int) flit_size;
|
||||
dyn_p.num_do_b_mat = (int) flit_size;
|
||||
dyn_p.num_di_b_mat = (int) flit_size;
|
||||
dyn_p.num_do_b_bank_per_port = (int) flit_size;
|
||||
dyn_p.num_di_b_bank_per_port = (int) flit_size;
|
||||
dyn_p.out_w = (int) flit_size;
|
||||
|
||||
dyn_p.use_inp_params = 1;
|
||||
dyn_p.num_wr_ports = (unsigned int) vc_count;
|
||||
dyn_p.num_rd_ports = 1;//(unsigned int) vc_count;//based on Bill Dally's book
|
||||
dyn_p.num_rw_ports = 0;
|
||||
dyn_p.num_se_rd_ports =0;
|
||||
dyn_p.num_search_ports =0;
|
||||
|
||||
|
||||
|
||||
dyn_p.cell.h = g_tp.sram.b_h + 2 * g_tp.wire_outside_mat.pitch * (dyn_p.num_wr_ports +
|
||||
dyn_p.num_rw_ports - 1 + dyn_p.num_rd_ports);
|
||||
dyn_p.cell.w = g_tp.sram.b_w + 2 * g_tp.wire_outside_mat.pitch * (dyn_p.num_rw_ports - 1 +
|
||||
(dyn_p.num_rd_ports - dyn_p.num_se_rd_ports) +
|
||||
dyn_p.num_wr_ports) + g_tp.wire_outside_mat.pitch * dyn_p.num_se_rd_ports;
|
||||
|
||||
Mat buff(dyn_p);
|
||||
buff.compute_delays(0);
|
||||
buff.compute_power_energy();
|
||||
buffer.power.readOp = buff.power.readOp;
|
||||
buffer.power.writeOp = buffer.power.readOp; //FIXME
|
||||
buffer.area = buff.area;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void
|
||||
Router::cb_stats ()
|
||||
{
|
||||
if (1) {
|
||||
Crossbar c_b(I, O, flit_size);
|
||||
c_b.compute_power();
|
||||
crossbar.delay = c_b.delay;
|
||||
crossbar.power.readOp.dynamic = c_b.power.readOp.dynamic;
|
||||
crossbar.power.readOp.leakage = c_b.power.readOp.leakage;
|
||||
crossbar.power.readOp.gate_leakage = c_b.power.readOp.gate_leakage;
|
||||
crossbar.area = c_b.area;
|
||||
// c_b.print_crossbar();
|
||||
}
|
||||
else {
|
||||
crossbar.power.readOp.dynamic = tr_crossbar_power();
|
||||
crossbar.power.readOp.leakage = flit_size * I * O *
|
||||
cmos_Isub_leakage(NTtr*g_tp.min_w_nmos_, PTtr*min_w_pmos, 1, tg);
|
||||
crossbar.power.readOp.gate_leakage = flit_size * I * O *
|
||||
cmos_Ig_leakage(NTtr*g_tp.min_w_nmos_, PTtr*min_w_pmos, 1, tg);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Router::get_router_power()
|
||||
{
|
||||
/* calculate buffer stats */
|
||||
buffer_stats();
|
||||
|
||||
/* calculate cross-bar stats */
|
||||
cb_stats();
|
||||
|
||||
/* calculate arbiter stats */
|
||||
Arbiter vcarb(vc_count, flit_size, buffer.area.w);
|
||||
Arbiter cbarb(I, flit_size, crossbar.area.w);
|
||||
vcarb.compute_power();
|
||||
cbarb.compute_power();
|
||||
arbiter.power.readOp.dynamic = vcarb.power.readOp.dynamic * I +
|
||||
cbarb.power.readOp.dynamic * O;
|
||||
arbiter.power.readOp.leakage = vcarb.power.readOp.leakage * I +
|
||||
cbarb.power.readOp.leakage * O;
|
||||
arbiter.power.readOp.gate_leakage = vcarb.power.readOp.gate_leakage * I +
|
||||
cbarb.power.readOp.gate_leakage * O;
|
||||
|
||||
// arb_stats();
|
||||
power.readOp.dynamic = ((buffer.power.readOp.dynamic+buffer.power.writeOp.dynamic) +
|
||||
crossbar.power.readOp.dynamic +
|
||||
arbiter.power.readOp.dynamic)*MIN(I, O)*M;
|
||||
double pppm_t[4] = {1,I,I,1};
|
||||
power = power + (buffer.power*pppm_t + crossbar.power + arbiter.power)*pppm_lkg;
|
||||
|
||||
}
|
||||
|
||||
void
|
||||
Router::get_router_delay ()
|
||||
{
|
||||
FREQUENCY=5; // move this to config file --TODO
|
||||
cycle_time = (1/(double)FREQUENCY)*1e3; //ps
|
||||
delay = 4;
|
||||
max_cyc = 17 * g_tp.FO4; //s
|
||||
max_cyc *= 1e12; //ps
|
||||
if (cycle_time < max_cyc) {
|
||||
FREQUENCY = (1/max_cyc)*1e3; //GHz
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Router::get_router_area()
|
||||
{
|
||||
area.h = I*buffer.area.h;
|
||||
area.w = buffer.area.w+crossbar.area.w;
|
||||
}
|
||||
|
||||
void
|
||||
Router::calc_router_parameters()
|
||||
{
|
||||
/* calculate router frequency and pipeline cycles */
|
||||
get_router_delay();
|
||||
|
||||
/* router power stats */
|
||||
get_router_power();
|
||||
|
||||
/* area stats */
|
||||
get_router_area();
|
||||
}
|
||||
|
||||
void
|
||||
Router::print_router()
|
||||
{
|
||||
cout << "\n\nRouter stats:\n";
|
||||
cout << "\tRouter Area - "<< area.get_area()*1e-6<<"(mm^2)\n";
|
||||
cout << "\tMaximum possible network frequency - " << (1/max_cyc)*1e3 << "GHz\n";
|
||||
cout << "\tNetwork frequency - " << FREQUENCY <<" GHz\n";
|
||||
cout << "\tNo. of Virtual channels - " << vc_count << "\n";
|
||||
cout << "\tNo. of pipeline stages - " << delay << endl;
|
||||
cout << "\tLink bandwidth - " << flit_size << " (bits)\n";
|
||||
cout << "\tNo. of buffer entries per virtual channel - "<< vc_buffer_size << "\n";
|
||||
cout << "\tSimple buffer Area - "<< buffer.area.get_area()*1e-6<<"(mm^2)\n";
|
||||
cout << "\tSimple buffer access (Read) - " << buffer.power.readOp.dynamic * 1e9 <<" (nJ)\n";
|
||||
cout << "\tSimple buffer leakage - " << buffer.power.readOp.leakage * 1e3 <<" (mW)\n";
|
||||
cout << "\tCrossbar Area - "<< crossbar.area.get_area()*1e-6<<"(mm^2)\n";
|
||||
cout << "\tCross bar access energy - " << crossbar.power.readOp.dynamic * 1e9<<" (nJ)\n";
|
||||
cout << "\tCross bar leakage power - " << crossbar.power.readOp.leakage * 1e3<<" (mW)\n";
|
||||
cout << "\tArbiter access energy (VC arb + Crossbar arb) - "<<arbiter.power.readOp.dynamic * 1e9 <<" (nJ)\n";
|
||||
cout << "\tArbiter leakage (VC arb + Crossbar arb) - "<<arbiter.power.readOp.leakage * 1e3 <<" (mW)\n";
|
||||
|
||||
}
|
||||
|
115
ext/mcpat/cacti/router.h
Normal file
115
ext/mcpat/cacti/router.h
Normal file
|
@ -0,0 +1,115 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
|
||||
#ifndef __ROUTER_H__
|
||||
#define __ROUTER_H__
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "arbiter.h"
|
||||
#include "basic_circuit.h"
|
||||
#include "cacti_interface.h"
|
||||
#include "component.h"
|
||||
#include "crossbar.h"
|
||||
#include "mat.h"
|
||||
#include "parameter.h"
|
||||
#include "wire.h"
|
||||
|
||||
class Router : public Component
|
||||
{
|
||||
public:
|
||||
Router(
|
||||
double flit_size_,
|
||||
double vc_buf, /* vc size = vc_buffer_size * flit_size */
|
||||
double vc_count,
|
||||
TechnologyParameter::DeviceType *dt = &(g_tp.peri_global),
|
||||
double I_ = 5,
|
||||
double O_ = 5,
|
||||
double M_ = 0.6);
|
||||
~Router();
|
||||
|
||||
|
||||
void print_router();
|
||||
|
||||
Component arbiter, crossbar, buffer;
|
||||
|
||||
double cycle_time, max_cyc;
|
||||
double flit_size;
|
||||
double vc_count;
|
||||
double vc_buffer_size; /* vc size = vc_buffer_size * flit_size */
|
||||
|
||||
private:
|
||||
TechnologyParameter::DeviceType *deviceType;
|
||||
double FREQUENCY; // move this to config file --TODO
|
||||
double Cw3(double len);
|
||||
double gate_cap(double w);
|
||||
double diff_cap(double w, int type /*0 for n-mos and 1 for p-mos*/, double stack);
|
||||
enum Wire_type wtype;
|
||||
enum Wire_placement wire_placement;
|
||||
//corssbar
|
||||
double NTtr, PTtr, wt, ht, I, O, NTi, PTi, NTid, PTid, NTod, PTod, TriS1, TriS2;
|
||||
double M; //network load
|
||||
double transmission_buf_inpcap();
|
||||
double transmission_buf_outcap();
|
||||
double transmission_buf_ctrcap();
|
||||
double crossbar_inpline();
|
||||
double crossbar_outline();
|
||||
double crossbar_ctrline();
|
||||
double tr_crossbar_power();
|
||||
void cb_stats ();
|
||||
double arb_power();
|
||||
void arb_stats ();
|
||||
double buffer_params();
|
||||
void buffer_stats();
|
||||
|
||||
|
||||
//arbiter
|
||||
|
||||
//buffer
|
||||
|
||||
//router params
|
||||
double Vdd;
|
||||
|
||||
void calc_router_parameters();
|
||||
void get_router_area();
|
||||
void get_router_power();
|
||||
void get_router_delay();
|
||||
|
||||
double min_w_pmos;
|
||||
|
||||
|
||||
};
|
||||
|
||||
#endif
|
196
ext/mcpat/cacti/subarray.cc
Executable file
196
ext/mcpat/cacti/subarray.cc
Executable file
|
@ -0,0 +1,196 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
|
||||
#include "subarray.h"
|
||||
|
||||
/*
 * Build a subarray from the dynamic parameters `dp_`.
 *
 * Copies the row/column counts and cell dimensions out of `dp_`, adds ECC
 * columns when g_ip->add_ecc_b_ is set, computes area.h / area.w for either
 * a plain RAM array or a CAM / fully-associative array, and finally calls
 * compute_C() to derive the wordline and bitline parasitics.
 *
 * is_fa_: true when the subarray belongs to a fully-associative array.
 *
 * Fix: the wordline-stitching term used to be
 *   ceil(<integer columns> / <integer cells-per-stitch>)
 * so the division truncated before ceil() ran and the rounding-up never
 * happened. The quotient is now computed in double. Results only change
 * when the column count is not a multiple of the stitching interval.
 */
Subarray::Subarray(const DynamicParameter & dp_, bool is_fa_):
    dp(dp_), num_rows(dp.num_r_subarray), num_cols(dp.num_c_subarray),
    num_cols_fa_cam(dp.tag_num_c_subarray), num_cols_fa_ram(dp.data_num_c_subarray),
    cell(dp.cell), cam_cell(dp.cam_cell), is_fa(is_fa_)
{
    if (!(is_fa || dp.pure_cam))
    {
        // Plain RAM array.
        if (g_ip->add_ecc_b_)
        {
            num_cols += (int)ceil(num_cols / num_bits_per_ecc_b_);  // ECC overhead
        }

        // Cells between wordline stitches depend on the cell technology.
        uint32_t ram_num_cells_wl_stitching =
            (dp.ram_cell_tech_type == lp_dram)   ? dram_num_cells_wl_stitching_ :
            (dp.ram_cell_tech_type == comm_dram) ? comm_dram_num_cells_wl_stitching_ :
                                                   sram_num_cells_wl_stitching_;

        area.h = cell.h * num_rows;

        // Divide in double so ceil() actually rounds the stitch count up.
        const double num_stitches =
            ceil(static_cast<double>(num_cols) / ram_num_cells_wl_stitching);
        area.w = cell.w * num_cols +
                 num_stitches * g_tp.ram_wl_stitching_overhead_;  // stitching overhead
    }
    else
    {
        // CAM or fully-associative array.
        // No dummy row is added to the column counts here since the dummy
        // row does not need a decoder.
        if (g_ip->add_ecc_b_)
        {
            num_cols_fa_cam += (int)ceil(num_cols_fa_cam / num_bits_per_ecc_b_);
        }

        if (is_fa)
        {
            // Fully associative: CAM tag columns plus RAM data columns.
            if (g_ip->add_ecc_b_)
            {
                num_cols_fa_ram += (int)ceil(num_cols_fa_ram / num_bits_per_ecc_b_);
            }
        }
        else
        {
            // Pure CAM: there is no RAM portion.
            num_cols_fa_ram = 0;
        }
        num_cols = num_cols_fa_cam + num_cols_fa_ram;

        // Height of the subarray is decided by the CAM array; blank space
        // in the SRAM array is filled with dummy cells.
        area.h = cam_cell.h * (num_rows + 1);

        // Divide in double so ceil() actually rounds the stitch count up.
        const double num_stitches =
            ceil(static_cast<double>(num_cols_fa_cam + num_cols_fa_ram) /
                 sram_num_cells_wl_stitching_);
        area.w = cam_cell.w * num_cols_fa_cam + cell.w * num_cols_fa_ram
                 + num_stitches * g_tp.ram_wl_stitching_overhead_
                 + 16 * g_tp.wire_local.pitch    // NAND gate connecting the two halves
                 + 128 * g_tp.wire_local.pitch;  // drivers from matchline to RAM wordline
    }

    assert(area.h > 0);
    assert(area.w > 0);
    compute_C();
}
|
||||
|
||||
|
||||
|
||||
// Nothing to release: the destructor body is intentionally empty.
Subarray::~Subarray() {}
|
||||
|
||||
|
||||
|
||||
double Subarray::get_total_cell_area()
|
||||
{
|
||||
// return (is_fa==false? cell.get_area() * num_rows * num_cols
|
||||
// //: cam_cell.h*(num_rows+1)*(num_cols_fa_cam + sram_cell.get_area()*num_cols_fa_ram));
|
||||
// : cam_cell.get_area()*(num_rows+1)*(num_cols_fa_cam + num_cols_fa_ram));
|
||||
// //: cam_cell.get_area()*(num_rows+1)*num_cols_fa_cam + sram_cell.get_area()*(num_rows+1)*num_cols_fa_ram);//for FA, this area does not include the dummy cells in SRAM arrays.
|
||||
|
||||
if (!(is_fa || dp.pure_cam))
|
||||
return (cell.get_area() * num_rows * num_cols);
|
||||
else if (is_fa)
|
||||
{ //for FA, this area includes the dummy cells in SRAM arrays.
|
||||
//return (cam_cell.get_area()*(num_rows+1)*(num_cols_fa_cam + num_cols_fa_ram));
|
||||
//cout<<"diff" <<cam_cell.get_area()*(num_rows+1)*(num_cols_fa_cam + num_cols_fa_ram)- cam_cell.h*(num_rows+1)*(cam_cell.w*num_cols_fa_cam + cell.w*num_cols_fa_ram)<<endl;
|
||||
return (cam_cell.h*(num_rows+1)*(cam_cell.w*num_cols_fa_cam + cell.w*num_cols_fa_ram));
|
||||
}
|
||||
else
|
||||
return (cam_cell.get_area()*(num_rows+1)*num_cols_fa_cam );
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
void Subarray::compute_C()
|
||||
{
|
||||
double c_w_metal = cell.w * g_tp.wire_local.C_per_um;
|
||||
double r_w_metal = cell.w * g_tp.wire_local.R_per_um;
|
||||
double C_b_metal = cell.h * g_tp.wire_local.C_per_um;
|
||||
double C_b_row_drain_C;
|
||||
|
||||
if (dp.is_dram)
|
||||
{
|
||||
C_wl = (gate_C_pass(g_tp.dram.cell_a_w, g_tp.dram.b_w, true, true) + c_w_metal) * num_cols;
|
||||
|
||||
if (dp.ram_cell_tech_type == comm_dram)
|
||||
{
|
||||
C_bl = num_rows * C_b_metal;
|
||||
}
|
||||
else
|
||||
{
|
||||
C_b_row_drain_C = drain_C_(g_tp.dram.cell_a_w, NCH, 1, 0, cell.w, true, true) / 2.0; // due to shared contact
|
||||
C_bl = num_rows * (C_b_row_drain_C + C_b_metal);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!(is_fa ||dp.pure_cam))
|
||||
{
|
||||
C_wl = (gate_C_pass(g_tp.sram.cell_a_w, (g_tp.sram.b_w-2*g_tp.sram.cell_a_w)/2.0, false, true)*2 +
|
||||
c_w_metal) * num_cols;
|
||||
C_b_row_drain_C = drain_C_(g_tp.sram.cell_a_w, NCH, 1, 0, cell.w, false, true) / 2.0; // due to shared contact
|
||||
C_bl = num_rows * (C_b_row_drain_C + C_b_metal);
|
||||
}
|
||||
else
|
||||
{
|
||||
//Following is wordline not matchline
|
||||
//CAM portion
|
||||
c_w_metal = cam_cell.w * g_tp.wire_local.C_per_um;
|
||||
r_w_metal = cam_cell.w * g_tp.wire_local.R_per_um;
|
||||
C_wl_cam = (gate_C_pass(g_tp.cam.cell_a_w, (g_tp.cam.b_w-2*g_tp.cam.cell_a_w)/2.0, false, true)*2 +
|
||||
c_w_metal) * num_cols_fa_cam;
|
||||
R_wl_cam = (r_w_metal) * num_cols_fa_cam;
|
||||
|
||||
if (!dp.pure_cam)
|
||||
{
|
||||
//RAM portion
|
||||
c_w_metal = cell.w * g_tp.wire_local.C_per_um;
|
||||
r_w_metal = cell.w * g_tp.wire_local.R_per_um;
|
||||
C_wl_ram = (gate_C_pass(g_tp.sram.cell_a_w, (g_tp.sram.b_w-2*g_tp.sram.cell_a_w)/2.0, false, true)*2 +
|
||||
c_w_metal) * num_cols_fa_ram;
|
||||
R_wl_ram = (r_w_metal) * num_cols_fa_ram;
|
||||
}
|
||||
else
|
||||
{
|
||||
C_wl_ram = R_wl_ram =0;
|
||||
}
|
||||
C_wl = C_wl_cam + C_wl_ram;
|
||||
C_wl += (16+128)*g_tp.wire_local.pitch*g_tp.wire_local.C_per_um;
|
||||
|
||||
R_wl = R_wl_cam + R_wl_ram;
|
||||
R_wl += (16+128)*g_tp.wire_local.pitch*g_tp.wire_local.R_per_um;
|
||||
|
||||
//there are two ways to write to a FA,
|
||||
//1) Write to CAM array then force a match on match line to active the corresponding wordline in RAM;
|
||||
//2) using separate wordline for read/write and search in RAM.
|
||||
//We are using the second approach.
|
||||
|
||||
//Bitline CAM portion This is bitline not searchline. We assume no sharing between bitline and searchline according to SUN's implementations.
|
||||
C_b_metal = cam_cell.h * g_tp.wire_local.C_per_um;
|
||||
C_b_row_drain_C = drain_C_(g_tp.cam.cell_a_w, NCH, 1, 0, cam_cell.w, false, true) / 2.0; // due to shared contact
|
||||
C_bl_cam = (num_rows+1) * (C_b_row_drain_C + C_b_metal);
|
||||
//height of subarray is decided by CAM array. blank space in sram array are filled with dummy cells
|
||||
C_b_row_drain_C = drain_C_(g_tp.sram.cell_a_w, NCH, 1, 0, cell.w, false, true) / 2.0; // due to shared contact
|
||||
C_bl = (num_rows +1) * (C_b_row_drain_C + C_b_metal);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
70
ext/mcpat/cacti/subarray.h
Executable file
70
ext/mcpat/cacti/subarray.h
Executable file
|
@ -0,0 +1,70 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
|
||||
#ifndef __SUBARRAY_H__
|
||||
#define __SUBARRAY_H__
|
||||
|
||||
#include "area.h"
|
||||
#include "component.h"
|
||||
#include "parameter.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
class Subarray : public Component
|
||||
{
|
||||
public:
|
||||
Subarray(const DynamicParameter & dp, bool is_fa_);
|
||||
~Subarray();
|
||||
|
||||
const DynamicParameter & dp;
|
||||
double get_total_cell_area();
|
||||
unsigned int num_rows;
|
||||
unsigned int num_cols;
|
||||
int32_t num_cols_fa_cam;
|
||||
int32_t num_cols_fa_ram;
|
||||
Area cell, cam_cell;
|
||||
|
||||
bool is_fa;
|
||||
double C_wl, C_wl_cam, C_wl_ram;
|
||||
double R_wl, R_wl_cam, R_wl_ram;
|
||||
double C_bl, C_bl_cam;
|
||||
private:
|
||||
|
||||
void compute_C(); // compute bitline and wordline capacitance
|
||||
};
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
2921
ext/mcpat/cacti/technology.cc
Normal file
2921
ext/mcpat/cacti/technology.cc
Normal file
File diff suppressed because it is too large
Load diff
426
ext/mcpat/cacti/uca.cc
Executable file
426
ext/mcpat/cacti/uca.cc
Executable file
|
@ -0,0 +1,426 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
|
||||
#include "uca.h"
|
||||
|
||||
/*
 * Build a UCA from `dyn_p`: lay the banks out on a grid, size the per-bank
 * address / data / search buses, construct the H-trees that route to the
 * banks, and then run the delay and power/energy calculations.
 *
 * Change from the previous version: for arrays that are neither fully
 * associative nor pure CAM, htree_in_search and htree_out_search are now
 * set to null instead of being left indeterminate.
 */
UCA::UCA(const DynamicParameter & dyn_p)
    :dp(dyn_p), bank(dp), nbanks(g_ip->nbanks), refresh_power(0)
{
    // Split the banks over a 2^a x 2^b grid; the choice of a and b depends
    // on whether a bank is taller than it is wide.
    int num_banks_ver_dir = 1 << ((bank.area.h > bank.area.w) ? _log2(nbanks)/2 : (_log2(nbanks) - _log2(nbanks)/2));
    int num_banks_hor_dir = nbanks/num_banks_ver_dir;

    // Port counts come from the dynamic parameters when they carry their
    // own, otherwise from the global input parameters.
    if (dp.use_inp_params)
    {
        RWP  = dp.num_rw_ports;
        ERP  = dp.num_rd_ports;
        EWP  = dp.num_wr_ports;
        SCHP = dp.num_search_ports;
    }
    else
    {
        RWP  = g_ip->num_rw_ports;
        ERP  = g_ip->num_rd_ports;
        EWP  = g_ip->num_wr_ports;
        SCHP = g_ip->num_search_ports;
    }

    num_addr_b_bank = (dp.number_addr_bits_mat + dp.number_subbanks_decode)*(RWP+ERP+EWP);
    num_di_b_bank   = dp.num_di_b_bank_per_port * (RWP + EWP);
    num_do_b_bank   = dp.num_do_b_bank_per_port * (RWP + ERP);
    num_si_b_bank   = dp.num_si_b_bank_per_port * SCHP;
    num_so_b_bank   = dp.num_so_b_bank_per_port * SCHP;

    // Every H-tree below is built with twice the bank count per direction.
    const int htree_ver = num_banks_ver_dir*2;
    const int htree_hor = num_banks_hor_dir*2;

    if (!dp.fully_assoc && !dp.pure_cam)
    {
        if (g_ip->fast_access && dp.is_tag == false)
        {
            num_do_b_bank *= g_ip->data_assoc;
        }

        // No search ports: the search bit counts are passed as 0.
        htree_in_add   = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
                                    num_addr_b_bank, num_di_b_bank, 0, num_do_b_bank, 0,
                                    htree_ver, htree_hor, Add_htree, true);
        htree_in_data  = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
                                    num_addr_b_bank, num_di_b_bank, 0, num_do_b_bank, 0,
                                    htree_ver, htree_hor, Data_in_htree, true);
        htree_out_data = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
                                    num_addr_b_bank, num_di_b_bank, 0, num_do_b_bank, 0,
                                    htree_ver, htree_hor, Data_out_htree, true);

        // No search trees in this configuration; keep the pointers defined.
        htree_in_search  = 0;
        htree_out_search = 0;
    }
    else
    {
        htree_in_add     = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
                                      num_addr_b_bank, num_di_b_bank, num_si_b_bank, num_do_b_bank, num_so_b_bank,
                                      htree_ver, htree_hor, Add_htree, true);
        htree_in_data    = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
                                      num_addr_b_bank, num_di_b_bank, num_si_b_bank, num_do_b_bank, num_so_b_bank,
                                      htree_ver, htree_hor, Data_in_htree, true);
        htree_out_data   = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
                                      num_addr_b_bank, num_di_b_bank, num_si_b_bank, num_do_b_bank, num_so_b_bank,
                                      htree_ver, htree_hor, Data_out_htree, true);
        // The search trees reuse the data-in / data-out tree types.
        htree_in_search  = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
                                      num_addr_b_bank, num_di_b_bank, num_si_b_bank, num_do_b_bank, num_so_b_bank,
                                      htree_ver, htree_hor, Data_in_htree, true);
        htree_out_search = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
                                      num_addr_b_bank, num_di_b_bank, num_si_b_bank, num_do_b_bank, num_so_b_bank,
                                      htree_ver, htree_hor, Data_out_htree, true);
    }

    // The UCA footprint is taken from the data-in H-tree.
    area.w = htree_in_data->area.w;
    area.h = htree_in_data->area.h;

    area_all_dataramcells = bank.mat.subarray.get_total_cell_area() * dp.num_subarrays * g_ip->nbanks;

    // Delay calculation (input rise time of 0), then power and energy.
    double inrisetime = 0.0;
    compute_delays(inrisetime);
    compute_power_energy();
}
|
||||
|
||||
|
||||
|
||||
/*
 * Release the H-trees allocated by the constructor.
 *
 * The address / data-in / data-out trees always exist. The two search trees
 * are only allocated for fully-associative or pure-CAM arrays, so they are
 * freed under that same condition; previously they were never deleted and
 * leaked.
 */
UCA::~UCA()
{
    delete htree_in_add;
    delete htree_in_data;
    delete htree_out_data;

    if (dp.fully_assoc || dp.pure_cam)
    {
        delete htree_in_search;
        delete htree_out_search;
    }
}
|
||||
|
||||
|
||||
|
||||
double UCA::compute_delays(double inrisetime)
|
||||
{
|
||||
double outrisetime = bank.compute_delays(inrisetime);
|
||||
|
||||
double delay_array_to_mat = htree_in_add->delay + bank.htree_in_add->delay;
|
||||
double max_delay_before_row_decoder = delay_array_to_mat + bank.mat.r_predec->delay;
|
||||
delay_array_to_sa_mux_lev_1_decoder = delay_array_to_mat +
|
||||
bank.mat.sa_mux_lev_1_predec->delay +
|
||||
bank.mat.sa_mux_lev_1_dec->delay;
|
||||
delay_array_to_sa_mux_lev_2_decoder = delay_array_to_mat +
|
||||
bank.mat.sa_mux_lev_2_predec->delay +
|
||||
bank.mat.sa_mux_lev_2_dec->delay;
|
||||
double delay_inside_mat = bank.mat.row_dec->delay + bank.mat.delay_bitline + bank.mat.delay_sa;
|
||||
|
||||
delay_before_subarray_output_driver =
|
||||
MAX(MAX(max_delay_before_row_decoder + delay_inside_mat, // row_path
|
||||
delay_array_to_mat + bank.mat.b_mux_predec->delay + bank.mat.bit_mux_dec->delay + bank.mat.delay_sa), // col_path
|
||||
MAX(delay_array_to_sa_mux_lev_1_decoder, // sa_mux_lev_1_path
|
||||
delay_array_to_sa_mux_lev_2_decoder)); // sa_mux_lev_2_path
|
||||
delay_from_subarray_out_drv_to_out = bank.mat.delay_subarray_out_drv_htree +
|
||||
bank.htree_out_data->delay + htree_out_data->delay;
|
||||
access_time = bank.mat.delay_comparator;
|
||||
|
||||
double ram_delay_inside_mat;
|
||||
if (dp.fully_assoc)
|
||||
{
|
||||
//delay of FA contains both CAM tag and RAM data
|
||||
{ //delay of CAM
|
||||
ram_delay_inside_mat = bank.mat.delay_bitline + bank.mat.delay_matchchline;
|
||||
access_time = htree_in_add->delay + bank.htree_in_add->delay;
|
||||
//delay of fully-associative data array
|
||||
access_time += ram_delay_inside_mat + delay_from_subarray_out_drv_to_out;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
access_time = delay_before_subarray_output_driver + delay_from_subarray_out_drv_to_out; //data_acc_path
|
||||
}
|
||||
|
||||
if (dp.is_main_mem)
|
||||
{
|
||||
double t_rcd = max_delay_before_row_decoder + delay_inside_mat;
|
||||
double cas_latency = MAX(delay_array_to_sa_mux_lev_1_decoder, delay_array_to_sa_mux_lev_2_decoder) +
|
||||
delay_from_subarray_out_drv_to_out;
|
||||
access_time = t_rcd + cas_latency;
|
||||
}
|
||||
|
||||
double temp;
|
||||
|
||||
if (!dp.fully_assoc)
|
||||
{
|
||||
temp = delay_inside_mat + bank.mat.delay_wl_reset + bank.mat.delay_bl_restore;//TODO: Sheng: revisit
|
||||
if (dp.is_dram)
|
||||
{
|
||||
temp += bank.mat.delay_writeback; // temp stores random cycle time
|
||||
}
|
||||
|
||||
|
||||
temp = MAX(temp, bank.mat.r_predec->delay);
|
||||
temp = MAX(temp, bank.mat.b_mux_predec->delay);
|
||||
temp = MAX(temp, bank.mat.sa_mux_lev_1_predec->delay);
|
||||
temp = MAX(temp, bank.mat.sa_mux_lev_2_predec->delay);
|
||||
}
|
||||
else
|
||||
{
|
||||
ram_delay_inside_mat = bank.mat.delay_bitline + bank.mat.delay_matchchline;
|
||||
temp = ram_delay_inside_mat + bank.mat.delay_cam_sl_restore + bank.mat.delay_cam_ml_reset + bank.mat.delay_bl_restore
|
||||
+ bank.mat.delay_hit_miss_reset + bank.mat.delay_wl_reset;
|
||||
|
||||
temp = MAX(temp, bank.mat.b_mux_predec->delay);//TODO: Sheng revisit whether distinguish cam and ram bitline etc.
|
||||
temp = MAX(temp, bank.mat.sa_mux_lev_1_predec->delay);
|
||||
temp = MAX(temp, bank.mat.sa_mux_lev_2_predec->delay);
|
||||
}
|
||||
|
||||
// The following is true only if the input parameter "repeaters_in_htree" is set to false --Nav
|
||||
if (g_ip->rpters_in_htree == false)
|
||||
{
|
||||
temp = MAX(temp, bank.htree_in_add->max_unpipelined_link_delay);
|
||||
}
|
||||
cycle_time = temp;
|
||||
|
||||
double delay_req_network = max_delay_before_row_decoder;
|
||||
double delay_rep_network = delay_from_subarray_out_drv_to_out;
|
||||
multisubbank_interleave_cycle_time = MAX(delay_req_network, delay_rep_network);
|
||||
|
||||
if (dp.is_main_mem)
|
||||
{
|
||||
multisubbank_interleave_cycle_time = htree_in_add->delay;
|
||||
precharge_delay = htree_in_add->delay +
|
||||
bank.htree_in_add->delay + bank.mat.delay_writeback +
|
||||
bank.mat.delay_wl_reset + bank.mat.delay_bl_restore;
|
||||
cycle_time = access_time + precharge_delay;
|
||||
}
|
||||
else
|
||||
{
|
||||
precharge_delay = 0;
|
||||
}
|
||||
|
||||
double dram_array_availability = 0;
|
||||
if (dp.is_dram)
|
||||
{
|
||||
dram_array_availability = (1 - dp.num_r_subarray * cycle_time / dp.dram_refresh_period) * 100;
|
||||
}
|
||||
|
||||
return outrisetime;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// note: currently, power numbers are for a bank of an array
|
||||
void UCA::compute_power_energy()
|
||||
{
|
||||
bank.compute_power_energy();
|
||||
power = bank.power;
|
||||
|
||||
power_routing_to_bank.readOp.dynamic = htree_in_add->power.readOp.dynamic + htree_out_data->power.readOp.dynamic;
|
||||
power_routing_to_bank.writeOp.dynamic = htree_in_add->power.readOp.dynamic + htree_in_data->power.readOp.dynamic;
|
||||
if (dp.fully_assoc || dp.pure_cam)
|
||||
power_routing_to_bank.searchOp.dynamic= htree_in_search->power.searchOp.dynamic + htree_out_search->power.searchOp.dynamic;
|
||||
|
||||
power_routing_to_bank.readOp.leakage += htree_in_add->power.readOp.leakage +
|
||||
htree_in_data->power.readOp.leakage +
|
||||
htree_out_data->power.readOp.leakage;
|
||||
|
||||
power_routing_to_bank.readOp.gate_leakage += htree_in_add->power.readOp.gate_leakage +
|
||||
htree_in_data->power.readOp.gate_leakage +
|
||||
htree_out_data->power.readOp.gate_leakage;
|
||||
if (dp.fully_assoc || dp.pure_cam)
|
||||
{
|
||||
power_routing_to_bank.readOp.leakage += htree_in_search->power.readOp.leakage + htree_out_search->power.readOp.leakage;
|
||||
power_routing_to_bank.readOp.gate_leakage += htree_in_search->power.readOp.gate_leakage + htree_out_search->power.readOp.gate_leakage;
|
||||
}
|
||||
|
||||
power.searchOp.dynamic += power_routing_to_bank.searchOp.dynamic;
|
||||
power.readOp.dynamic += power_routing_to_bank.readOp.dynamic;
|
||||
power.readOp.leakage += power_routing_to_bank.readOp.leakage;
|
||||
power.readOp.gate_leakage += power_routing_to_bank.readOp.gate_leakage;
|
||||
|
||||
// calculate total write energy per access
|
||||
power.writeOp.dynamic = power.readOp.dynamic
|
||||
- bank.mat.power_bitline.readOp.dynamic * dp.num_act_mats_hor_dir
|
||||
+ bank.mat.power_bitline.writeOp.dynamic * dp.num_act_mats_hor_dir
|
||||
- power_routing_to_bank.readOp.dynamic
|
||||
+ power_routing_to_bank.writeOp.dynamic
|
||||
+ bank.htree_in_data->power.readOp.dynamic
|
||||
- bank.htree_out_data->power.readOp.dynamic;
|
||||
|
||||
if (dp.is_dram == false)
|
||||
{
|
||||
power.writeOp.dynamic -= bank.mat.power_sa.readOp.dynamic * dp.num_act_mats_hor_dir;
|
||||
}
|
||||
|
||||
dyn_read_energy_from_closed_page = power.readOp.dynamic;
|
||||
dyn_read_energy_from_open_page = power.readOp.dynamic -
|
||||
(bank.mat.r_predec->power.readOp.dynamic +
|
||||
bank.mat.power_row_decoders.readOp.dynamic +
|
||||
bank.mat.power_bl_precharge_eq_drv.readOp.dynamic +
|
||||
bank.mat.power_sa.readOp.dynamic +
|
||||
bank.mat.power_bitline.readOp.dynamic) * dp.num_act_mats_hor_dir;
|
||||
|
||||
dyn_read_energy_remaining_words_in_burst =
|
||||
(MAX((g_ip->burst_len / g_ip->int_prefetch_w), 1) - 1) *
|
||||
((bank.mat.sa_mux_lev_1_predec->power.readOp.dynamic +
|
||||
bank.mat.sa_mux_lev_2_predec->power.readOp.dynamic +
|
||||
bank.mat.power_sa_mux_lev_1_decoders.readOp.dynamic +
|
||||
bank.mat.power_sa_mux_lev_2_decoders.readOp.dynamic +
|
||||
bank.mat.power_subarray_out_drv.readOp.dynamic) * dp.num_act_mats_hor_dir +
|
||||
bank.htree_out_data->power.readOp.dynamic +
|
||||
power_routing_to_bank.readOp.dynamic);
|
||||
dyn_read_energy_from_closed_page += dyn_read_energy_remaining_words_in_burst;
|
||||
dyn_read_energy_from_open_page += dyn_read_energy_remaining_words_in_burst;
|
||||
|
||||
activate_energy = htree_in_add->power.readOp.dynamic +
|
||||
bank.htree_in_add->power_bit.readOp.dynamic * bank.num_addr_b_routed_to_mat_for_act +
|
||||
(bank.mat.r_predec->power.readOp.dynamic +
|
||||
bank.mat.power_row_decoders.readOp.dynamic +
|
||||
bank.mat.power_sa.readOp.dynamic) * dp.num_act_mats_hor_dir;
|
||||
read_energy = (htree_in_add->power.readOp.dynamic +
|
||||
bank.htree_in_add->power_bit.readOp.dynamic * bank.num_addr_b_routed_to_mat_for_rd_or_wr +
|
||||
(bank.mat.sa_mux_lev_1_predec->power.readOp.dynamic +
|
||||
bank.mat.sa_mux_lev_2_predec->power.readOp.dynamic +
|
||||
bank.mat.power_sa_mux_lev_1_decoders.readOp.dynamic +
|
||||
bank.mat.power_sa_mux_lev_2_decoders.readOp.dynamic +
|
||||
bank.mat.power_subarray_out_drv.readOp.dynamic) * dp.num_act_mats_hor_dir +
|
||||
bank.htree_out_data->power.readOp.dynamic +
|
||||
htree_in_data->power.readOp.dynamic) * g_ip->burst_len;
|
||||
write_energy = (htree_in_add->power.readOp.dynamic +
|
||||
bank.htree_in_add->power_bit.readOp.dynamic * bank.num_addr_b_routed_to_mat_for_rd_or_wr +
|
||||
htree_in_data->power.readOp.dynamic +
|
||||
bank.htree_in_data->power.readOp.dynamic +
|
||||
(bank.mat.sa_mux_lev_1_predec->power.readOp.dynamic +
|
||||
bank.mat.sa_mux_lev_2_predec->power.readOp.dynamic +
|
||||
bank.mat.power_sa_mux_lev_1_decoders.readOp.dynamic +
|
||||
bank.mat.power_sa_mux_lev_2_decoders.readOp.dynamic) * dp.num_act_mats_hor_dir) * g_ip->burst_len;
|
||||
precharge_energy = (bank.mat.power_bitline.readOp.dynamic +
|
||||
bank.mat.power_bl_precharge_eq_drv.readOp.dynamic) * dp.num_act_mats_hor_dir;
|
||||
|
||||
leak_power_subbank_closed_page =
|
||||
(bank.mat.r_predec->power.readOp.leakage +
|
||||
bank.mat.b_mux_predec->power.readOp.leakage +
|
||||
bank.mat.sa_mux_lev_1_predec->power.readOp.leakage +
|
||||
bank.mat.sa_mux_lev_2_predec->power.readOp.leakage +
|
||||
bank.mat.power_row_decoders.readOp.leakage +
|
||||
bank.mat.power_bit_mux_decoders.readOp.leakage +
|
||||
bank.mat.power_sa_mux_lev_1_decoders.readOp.leakage +
|
||||
bank.mat.power_sa_mux_lev_2_decoders.readOp.leakage +
|
||||
bank.mat.leak_power_sense_amps_closed_page_state) * dp.num_act_mats_hor_dir;
|
||||
|
||||
leak_power_subbank_closed_page +=
|
||||
(bank.mat.r_predec->power.readOp.gate_leakage +
|
||||
bank.mat.b_mux_predec->power.readOp.gate_leakage +
|
||||
bank.mat.sa_mux_lev_1_predec->power.readOp.gate_leakage +
|
||||
bank.mat.sa_mux_lev_2_predec->power.readOp.gate_leakage +
|
||||
bank.mat.power_row_decoders.readOp.gate_leakage +
|
||||
bank.mat.power_bit_mux_decoders.readOp.gate_leakage +
|
||||
bank.mat.power_sa_mux_lev_1_decoders.readOp.gate_leakage +
|
||||
bank.mat.power_sa_mux_lev_2_decoders.readOp.gate_leakage) * dp.num_act_mats_hor_dir; //+
|
||||
//bank.mat.leak_power_sense_amps_closed_page_state) * dp.num_act_mats_hor_dir;
|
||||
|
||||
leak_power_subbank_open_page =
|
||||
(bank.mat.r_predec->power.readOp.leakage +
|
||||
bank.mat.b_mux_predec->power.readOp.leakage +
|
||||
bank.mat.sa_mux_lev_1_predec->power.readOp.leakage +
|
||||
bank.mat.sa_mux_lev_2_predec->power.readOp.leakage +
|
||||
bank.mat.power_row_decoders.readOp.leakage +
|
||||
bank.mat.power_bit_mux_decoders.readOp.leakage +
|
||||
bank.mat.power_sa_mux_lev_1_decoders.readOp.leakage +
|
||||
bank.mat.power_sa_mux_lev_2_decoders.readOp.leakage +
|
||||
bank.mat.leak_power_sense_amps_open_page_state) * dp.num_act_mats_hor_dir;
|
||||
|
||||
leak_power_subbank_open_page +=
|
||||
(bank.mat.r_predec->power.readOp.gate_leakage +
|
||||
bank.mat.b_mux_predec->power.readOp.gate_leakage +
|
||||
bank.mat.sa_mux_lev_1_predec->power.readOp.gate_leakage +
|
||||
bank.mat.sa_mux_lev_2_predec->power.readOp.gate_leakage +
|
||||
bank.mat.power_row_decoders.readOp.gate_leakage +
|
||||
bank.mat.power_bit_mux_decoders.readOp.gate_leakage +
|
||||
bank.mat.power_sa_mux_lev_1_decoders.readOp.gate_leakage +
|
||||
bank.mat.power_sa_mux_lev_2_decoders.readOp.gate_leakage ) * dp.num_act_mats_hor_dir;
|
||||
//bank.mat.leak_power_sense_amps_open_page_state) * dp.num_act_mats_hor_dir;
|
||||
|
||||
leak_power_request_and_reply_networks =
|
||||
power_routing_to_bank.readOp.leakage +
|
||||
bank.htree_in_add->power.readOp.leakage +
|
||||
bank.htree_in_data->power.readOp.leakage +
|
||||
bank.htree_out_data->power.readOp.leakage;
|
||||
|
||||
leak_power_request_and_reply_networks +=
|
||||
power_routing_to_bank.readOp.gate_leakage +
|
||||
bank.htree_in_add->power.readOp.gate_leakage +
|
||||
bank.htree_in_data->power.readOp.gate_leakage +
|
||||
bank.htree_out_data->power.readOp.gate_leakage;
|
||||
|
||||
if (dp.fully_assoc || dp.pure_cam)
|
||||
{
|
||||
leak_power_request_and_reply_networks += htree_in_search->power.readOp.leakage + htree_out_search->power.readOp.leakage;
|
||||
leak_power_request_and_reply_networks += htree_in_search->power.readOp.gate_leakage + htree_out_search->power.readOp.gate_leakage;
|
||||
}
|
||||
|
||||
|
||||
if (dp.is_dram)
|
||||
{ // if DRAM, add contribution of power spent in row predecoder drivers, blocks and decoders to refresh power
|
||||
refresh_power = (bank.mat.r_predec->power.readOp.dynamic * dp.num_act_mats_hor_dir +
|
||||
bank.mat.row_dec->power.readOp.dynamic) * dp.num_r_subarray * dp.num_subarrays;
|
||||
refresh_power += bank.mat.per_bitline_read_energy * dp.num_c_subarray * dp.num_r_subarray * dp.num_subarrays;
|
||||
refresh_power += bank.mat.power_bl_precharge_eq_drv.readOp.dynamic * dp.num_act_mats_hor_dir;
|
||||
refresh_power += bank.mat.power_sa.readOp.dynamic * dp.num_act_mats_hor_dir;
|
||||
refresh_power /= dp.dram_refresh_period;
|
||||
}
|
||||
|
||||
|
||||
if (dp.is_tag == false)
|
||||
{
|
||||
power.readOp.dynamic = dyn_read_energy_from_closed_page;
|
||||
power.writeOp.dynamic = dyn_read_energy_from_closed_page
|
||||
- dyn_read_energy_remaining_words_in_burst
|
||||
- bank.mat.power_bitline.readOp.dynamic * dp.num_act_mats_hor_dir
|
||||
+ bank.mat.power_bitline.writeOp.dynamic * dp.num_act_mats_hor_dir
|
||||
+ (power_routing_to_bank.writeOp.dynamic -
|
||||
power_routing_to_bank.readOp.dynamic -
|
||||
bank.htree_out_data->power.readOp.dynamic +
|
||||
bank.htree_in_data->power.readOp.dynamic) *
|
||||
(MAX((g_ip->burst_len / g_ip->int_prefetch_w), 1) - 1); //FIXME
|
||||
|
||||
if (dp.is_dram == false)
|
||||
{
|
||||
power.writeOp.dynamic -= bank.mat.power_sa.readOp.dynamic * dp.num_act_mats_hor_dir;
|
||||
}
|
||||
}
|
||||
|
||||
// if DRAM, add refresh power to total leakage
|
||||
if (dp.is_dram)
|
||||
{
|
||||
power.readOp.leakage += refresh_power;
|
||||
}
|
||||
|
||||
// TODO: below should be avoided.
|
||||
/*if (dp.is_main_mem)
|
||||
{
|
||||
power.readOp.leakage += MAIN_MEM_PER_CHIP_STANDBY_CURRENT_mA * 1e-3 * g_tp.peri_global.Vdd / g_ip->nbanks;
|
||||
}*/
|
||||
|
||||
assert(power.readOp.dynamic > 0);
|
||||
assert(power.writeOp.dynamic > 0);
|
||||
assert(power.readOp.leakage > 0);
|
||||
}
|
||||
|
95
ext/mcpat/cacti/uca.h
Executable file
95
ext/mcpat/cacti/uca.h
Executable file
|
@ -0,0 +1,95 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
|
||||
#ifndef __UCA_H__
|
||||
#define __UCA_H__
|
||||
|
||||
#include "area.h"
|
||||
#include "bank.h"
|
||||
#include "component.h"
|
||||
#include "htree2.h"
|
||||
#include "parameter.h"
|
||||
|
||||
class UCA : public Component
|
||||
{
|
||||
public:
|
||||
UCA(const DynamicParameter & dyn_p);
|
||||
~UCA();
|
||||
double compute_delays(double inrisetime); // returns outrisetime
|
||||
void compute_power_energy();
|
||||
|
||||
DynamicParameter dp;
|
||||
Bank bank;
|
||||
|
||||
Htree2 * htree_in_add;
|
||||
Htree2 * htree_in_data;
|
||||
Htree2 * htree_out_data;
|
||||
Htree2 * htree_in_search;
|
||||
Htree2 * htree_out_search;
|
||||
|
||||
powerDef power_routing_to_bank;
|
||||
|
||||
uint32_t nbanks;
|
||||
|
||||
int num_addr_b_bank;
|
||||
int num_di_b_bank;
|
||||
int num_do_b_bank;
|
||||
int num_si_b_bank;
|
||||
int num_so_b_bank;
|
||||
int RWP, ERP, EWP,SCHP;
|
||||
double area_all_dataramcells;
|
||||
|
||||
double dyn_read_energy_from_closed_page;
|
||||
double dyn_read_energy_from_open_page;
|
||||
double dyn_read_energy_remaining_words_in_burst;
|
||||
|
||||
double refresh_power; // only for DRAM
|
||||
double activate_energy;
|
||||
double read_energy;
|
||||
double write_energy;
|
||||
double precharge_energy;
|
||||
double leak_power_subbank_closed_page;
|
||||
double leak_power_subbank_open_page;
|
||||
double leak_power_request_and_reply_networks;
|
||||
|
||||
double delay_array_to_sa_mux_lev_1_decoder;
|
||||
double delay_array_to_sa_mux_lev_2_decoder;
|
||||
double delay_before_subarray_output_driver;
|
||||
double delay_from_subarray_out_drv_to_out;
|
||||
double access_time;
|
||||
double precharge_delay;
|
||||
double multisubbank_interleave_cycle_time;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
832
ext/mcpat/cacti/wire.cc
Normal file
832
ext/mcpat/cacti/wire.cc
Normal file
|
@ -0,0 +1,832 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
#include "wire.h"
|
||||
#include "cmath"
|
||||
// use this constructor to calculate wire stats
|
||||
/*
 * Builds a wire of the given model and computes its statistics.
 *
 * wl is scaled by 1e-6 for the duration of the calculation; once
 * calculate_wire_stats() has run, the length-like members are scaled
 * back by 1e6.
 */
Wire::Wire(
    enum Wire_type wire_model,
    double wl,
    int n,
    double w_s,
    double s_s,
    enum Wire_placement wp,
    double resistivity,
    TechnologyParameter::DeviceType *dt)
    : wt(wire_model),
      wire_length(wl*1e-6),
      nsense(n),
      w_scale(w_s),
      s_scale(s_s),
      resistivity(resistivity),
      deviceType(dt)
{
    in_rise_time = 0;
    out_rise_time = 0;
    wire_placement = wp;
    min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio*g_tp.min_w_nmos_;

    // The static reference data must exist before any statistics can be
    // derived from it. Constructing a throw-away Wire here is done
    // purely for its side effects (presumably it resolves to the
    // technology-initialising constructor via default arguments declared
    // in the header -- confirm).
    if (initialized != 1) {
        cout << "Wire not initialized. Initializing it with default values\n";
        Wire winit;
    }

    calculate_wire_stats();

    // change everything back to seconds, microns, and Joules
    repeater_spacing *= 1e6;
    wire_length      *= 1e6;
    wire_width       *= 1e6;
    wire_spacing     *= 1e6;

    assert(wire_length > 0);
    assert(power.readOp.dynamic > 0);
    assert(power.readOp.leakage > 0);
    assert(power.readOp.gate_leakage > 0);
}
|
||||
|
||||
// the following values are for peripheral global technology
|
||||
// specified in the input config file
|
||||
Component Wire::global;
|
||||
Component Wire::global_5;
|
||||
Component Wire::global_10;
|
||||
Component Wire::global_20;
|
||||
Component Wire::global_30;
|
||||
Component Wire::low_swing;
|
||||
|
||||
int Wire::initialized;
|
||||
double Wire::wire_width_init;
|
||||
double Wire::wire_spacing_init;
|
||||
|
||||
|
||||
/*
 * Technology-initialising constructor: derives the wire width/spacing
 * for the given placement, marks the class as initialized, runs
 * init_wire() and records the resulting width and spacing in the
 * static *_init members.
 */
Wire::Wire(double w_s, double s_s, enum Wire_placement wp, double resis, TechnologyParameter::DeviceType *dt)
{
    w_scale        = w_s;
    s_scale        = s_s;
    deviceType     = dt;
    wire_placement = wp;
    resistivity    = resis;
    min_w_pmos     = deviceType->n_to_p_eff_curr_drv_ratio * g_tp.min_w_nmos_;
    in_rise_time   = 0;
    out_rise_time  = 0;

    // Start from the pitch of the metal layer matching the placement.
    if (wire_placement == outside_mat) {
        wire_width = g_tp.wire_outside_mat.pitch;
    } else if (wire_placement == inside_mat) {
        wire_width = g_tp.wire_inside_mat.pitch;
    } else {
        wire_width = g_tp.wire_local.pitch;
    }
    wire_spacing = wire_width;

    // Half the pitch each for width and spacing, scaled by the
    // requested factors.
    wire_width   *= (w_scale * 1e-6/2) /* (m) */;
    wire_spacing *= (s_scale * 1e-6/2) /* (m) */;

    // Flag is raised before init_wire() runs.
    initialized = 1;
    init_wire();
    wire_width_init   = wire_width;
    wire_spacing_init = wire_spacing;

    assert(power.readOp.dynamic > 0);
    assert(power.readOp.leakage > 0);
    assert(power.readOp.gate_leakage > 0);
}
|
||||
|
||||
|
||||
|
||||
// Nothing to release here: the destructor body is intentionally empty.
Wire::~Wire() {}
|
||||
|
||||
|
||||
|
||||
void
|
||||
Wire::calculate_wire_stats()
|
||||
{
|
||||
|
||||
if (wire_placement == outside_mat) {
|
||||
wire_width = g_tp.wire_outside_mat.pitch;
|
||||
}
|
||||
else if (wire_placement == inside_mat) {
|
||||
wire_width = g_tp.wire_inside_mat.pitch;
|
||||
}
|
||||
else {
|
||||
wire_width = g_tp.wire_local.pitch;
|
||||
}
|
||||
|
||||
wire_spacing = wire_width;
|
||||
|
||||
wire_width *= (w_scale * 1e-6/2) /* (m) */;
|
||||
wire_spacing *= (s_scale * 1e-6/2) /* (m) */;
|
||||
|
||||
|
||||
if (wt != Low_swing) {
|
||||
|
||||
// delay_optimal_wire();
|
||||
|
||||
if (wt == Global) {
|
||||
delay = global.delay * wire_length;
|
||||
power.readOp.dynamic = global.power.readOp.dynamic * wire_length;
|
||||
power.readOp.leakage = global.power.readOp.leakage * wire_length;
|
||||
power.readOp.gate_leakage = global.power.readOp.gate_leakage * wire_length;
|
||||
repeater_spacing = global.area.w;
|
||||
repeater_size = global.area.h;
|
||||
area.set_area((wire_length/repeater_spacing) *
|
||||
compute_gate_area(INV, 1, min_w_pmos * repeater_size,
|
||||
g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def));
|
||||
}
|
||||
else if (wt == Global_5) {
|
||||
delay = global_5.delay * wire_length;
|
||||
power.readOp.dynamic = global_5.power.readOp.dynamic * wire_length;
|
||||
power.readOp.leakage = global_5.power.readOp.leakage * wire_length;
|
||||
power.readOp.gate_leakage = global_5.power.readOp.gate_leakage * wire_length;
|
||||
repeater_spacing = global_5.area.w;
|
||||
repeater_size = global_5.area.h;
|
||||
area.set_area((wire_length/repeater_spacing) *
|
||||
compute_gate_area(INV, 1, min_w_pmos * repeater_size,
|
||||
g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def));
|
||||
}
|
||||
else if (wt == Global_10) {
|
||||
delay = global_10.delay * wire_length;
|
||||
power.readOp.dynamic = global_10.power.readOp.dynamic * wire_length;
|
||||
power.readOp.leakage = global_10.power.readOp.leakage * wire_length;
|
||||
power.readOp.gate_leakage = global_10.power.readOp.gate_leakage * wire_length;
|
||||
repeater_spacing = global_10.area.w;
|
||||
repeater_size = global_10.area.h;
|
||||
area.set_area((wire_length/repeater_spacing) *
|
||||
compute_gate_area(INV, 1, min_w_pmos * repeater_size,
|
||||
g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def));
|
||||
}
|
||||
else if (wt == Global_20) {
|
||||
delay = global_20.delay * wire_length;
|
||||
power.readOp.dynamic = global_20.power.readOp.dynamic * wire_length;
|
||||
power.readOp.leakage = global_20.power.readOp.leakage * wire_length;
|
||||
power.readOp.gate_leakage = global_20.power.readOp.gate_leakage * wire_length;
|
||||
repeater_spacing = global_20.area.w;
|
||||
repeater_size = global_20.area.h;
|
||||
area.set_area((wire_length/repeater_spacing) *
|
||||
compute_gate_area(INV, 1, min_w_pmos * repeater_size,
|
||||
g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def));
|
||||
}
|
||||
else if (wt == Global_30) {
|
||||
delay = global_30.delay * wire_length;
|
||||
power.readOp.dynamic = global_30.power.readOp.dynamic * wire_length;
|
||||
power.readOp.leakage = global_30.power.readOp.leakage * wire_length;
|
||||
power.readOp.gate_leakage = global_30.power.readOp.gate_leakage * wire_length;
|
||||
repeater_spacing = global_30.area.w;
|
||||
repeater_size = global_30.area.h;
|
||||
area.set_area((wire_length/repeater_spacing) *
|
||||
compute_gate_area(INV, 1, min_w_pmos * repeater_size,
|
||||
g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def));
|
||||
}
|
||||
out_rise_time = delay*repeater_spacing/deviceType->Vth;
|
||||
}
|
||||
else if (wt == Low_swing) {
|
||||
low_swing_model ();
|
||||
repeater_spacing = wire_length;
|
||||
repeater_size = 1;
|
||||
}
|
||||
else {
|
||||
assert(0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* The fall time of an input signal to the first stage of a circuit is
|
||||
* assumed to be same as the fall time of the output signal of two
|
||||
* inverters connected in series (refer: CACTI 1 Technical report,
|
||||
* section 6.1.3)
|
||||
*/
|
||||
double
|
||||
Wire::signal_fall_time ()
|
||||
{
|
||||
|
||||
/* rise time of inverter 1's output */
|
||||
double rt;
|
||||
/* fall time of inverter 2's output */
|
||||
double ft;
|
||||
double timeconst;
|
||||
|
||||
timeconst = (drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
|
||||
drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
|
||||
gate_C(min_w_pmos + g_tp.min_w_nmos_, 0)) *
|
||||
tr_R_on(min_w_pmos, PCH, 1);
|
||||
rt = horowitz (0, timeconst, deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, FALL) / (deviceType->Vdd - deviceType->Vth);
|
||||
timeconst = (drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
|
||||
drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
|
||||
gate_C(min_w_pmos + g_tp.min_w_nmos_, 0)) *
|
||||
tr_R_on(g_tp.min_w_nmos_, NCH, 1);
|
||||
ft = horowitz (rt, timeconst, deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, RISE) / deviceType->Vth;
|
||||
return ft;
|
||||
}
|
||||
|
||||
|
||||
|
||||
double Wire::signal_rise_time ()
|
||||
{
|
||||
|
||||
/* rise time of inverter 1's output */
|
||||
double ft;
|
||||
/* fall time of inverter 2's output */
|
||||
double rt;
|
||||
double timeconst;
|
||||
|
||||
timeconst = (drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
|
||||
drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
|
||||
gate_C(min_w_pmos + g_tp.min_w_nmos_, 0)) *
|
||||
tr_R_on(g_tp.min_w_nmos_, NCH, 1);
|
||||
rt = horowitz (0, timeconst, deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, RISE) / deviceType->Vth;
|
||||
timeconst = (drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
|
||||
drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
|
||||
gate_C(min_w_pmos + g_tp.min_w_nmos_, 0)) *
|
||||
tr_R_on(min_w_pmos, PCH, 1);
|
||||
ft = horowitz (rt, timeconst, deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, FALL) / (deviceType->Vdd - deviceType->Vth);
|
||||
return ft; //sec
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* Wire resistance and capacitance calculations
|
||||
* wire width
|
||||
*
|
||||
* /__/
|
||||
* | |
|
||||
* | | height = ASPECT_RATIO*wire width (ASPECT_RATIO = 2.2, ref: ITRS)
|
||||
* |__|/
|
||||
*
|
||||
* spacing between wires in same level = wire width
|
||||
* spacing between wires in adjacent levels = wire width---this is incorrect,
|
||||
* according to R.Ho's paper and thesis. ILD != wire width
|
||||
*
|
||||
*/
|
||||
|
||||
double Wire::wire_cap (double len /* in m */, bool call_from_outside)
|
||||
{
|
||||
//TODO: this should be consistent with the wire_res in technology file
|
||||
double sidewall, adj, tot_cap;
|
||||
double wire_height;
|
||||
double epsilon0 = 8.8542e-12;
|
||||
double aspect_ratio, horiz_dielectric_constant, vert_dielectric_constant, miller_value,ild_thickness;
|
||||
|
||||
switch (wire_placement)
|
||||
{
|
||||
case outside_mat:
|
||||
{
|
||||
aspect_ratio = g_tp.wire_outside_mat.aspect_ratio;
|
||||
horiz_dielectric_constant = g_tp.wire_outside_mat.horiz_dielectric_constant;
|
||||
vert_dielectric_constant = g_tp.wire_outside_mat.vert_dielectric_constant;
|
||||
miller_value = g_tp.wire_outside_mat.miller_value;
|
||||
ild_thickness = g_tp.wire_outside_mat.ild_thickness;
|
||||
break;
|
||||
}
|
||||
case inside_mat :
|
||||
{
|
||||
aspect_ratio = g_tp.wire_inside_mat.aspect_ratio;
|
||||
horiz_dielectric_constant = g_tp.wire_inside_mat.horiz_dielectric_constant;
|
||||
vert_dielectric_constant = g_tp.wire_inside_mat.vert_dielectric_constant;
|
||||
miller_value = g_tp.wire_inside_mat.miller_value;
|
||||
ild_thickness = g_tp.wire_inside_mat.ild_thickness;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
aspect_ratio = g_tp.wire_local.aspect_ratio;
|
||||
horiz_dielectric_constant = g_tp.wire_local.horiz_dielectric_constant;
|
||||
vert_dielectric_constant = g_tp.wire_local.vert_dielectric_constant;
|
||||
miller_value = g_tp.wire_local.miller_value;
|
||||
ild_thickness = g_tp.wire_local.ild_thickness;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (call_from_outside)
|
||||
{
|
||||
wire_width *= 1e-6;
|
||||
wire_spacing *= 1e-6;
|
||||
}
|
||||
wire_height = wire_width/w_scale*aspect_ratio;
|
||||
/*
|
||||
* assuming height does not change. wire_width = width_original*w_scale
|
||||
* So wire_height does not change as wire width increases
|
||||
*/
|
||||
|
||||
// capacitance between wires in the same level
|
||||
// sidewall = 2*miller_value * horiz_dielectric_constant * (wire_height/wire_spacing)
|
||||
// * epsilon0;
|
||||
|
||||
sidewall = miller_value * horiz_dielectric_constant * (wire_height/wire_spacing)
|
||||
* epsilon0;
|
||||
|
||||
|
||||
// capacitance between wires in adjacent levels
|
||||
//adj = miller_value * vert_dielectric_constant *w_scale * epsilon0;
|
||||
//adj = 2*vert_dielectric_constant *wire_width/(ild_thickness*1e-6) * epsilon0;
|
||||
|
||||
adj = miller_value *vert_dielectric_constant *wire_width/(ild_thickness*1e-6) * epsilon0;
|
||||
//Change ild_thickness from micron to M
|
||||
|
||||
//tot_cap = (sidewall + adj + (deviceType->C_fringe * 1e6)); //F/m
|
||||
tot_cap = (sidewall + adj + (g_tp.fringe_cap * 1e6)); //F/m
|
||||
|
||||
if (call_from_outside)
|
||||
{
|
||||
wire_width *= 1e6;
|
||||
wire_spacing *= 1e6;
|
||||
}
|
||||
return (tot_cap*len); // (F)
|
||||
}
|
||||
|
||||
|
||||
double
|
||||
Wire::wire_res (double len /*(in m)*/)
|
||||
{
|
||||
|
||||
double aspect_ratio,alpha_scatter =1.05, dishing_thickness=0, barrier_thickness=0;
|
||||
//TODO: this should be consistent with the wire_res in technology file
|
||||
//The whole computation should be consistent with the wire_res in technology.cc too!
|
||||
|
||||
switch (wire_placement)
|
||||
{
|
||||
case outside_mat:
|
||||
{
|
||||
aspect_ratio = g_tp.wire_outside_mat.aspect_ratio;
|
||||
break;
|
||||
}
|
||||
case inside_mat :
|
||||
{
|
||||
aspect_ratio = g_tp.wire_inside_mat.aspect_ratio;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
aspect_ratio = g_tp.wire_local.aspect_ratio;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return (alpha_scatter * resistivity * 1e-6 * len/((aspect_ratio*wire_width/w_scale-dishing_thickness - barrier_thickness)*
|
||||
(wire_width-2*barrier_thickness)));
|
||||
}
|
||||
|
||||
/*
|
||||
* Calculates the delay, power and area of the transmitter circuit.
|
||||
*
|
||||
* The transmitter delay is the sum of nand gate delay, inverter delay
|
||||
* low swing nmos delay, and the wire delay
|
||||
* (ref: Technical report 6)
|
||||
*/
|
||||
void
|
||||
Wire::low_swing_model()
|
||||
{
|
||||
double len = wire_length;
|
||||
double beta = pmos_to_nmos_sz_ratio();
|
||||
|
||||
|
||||
double inputrise = (in_rise_time == 0) ? signal_rise_time() : in_rise_time;
|
||||
|
||||
/* Final nmos low swing driver size calculation:
|
||||
* Try to size the driver such that the delay
|
||||
* is less than 8FO4.
|
||||
* If the driver size is greater than
|
||||
* the max allowable size, assume max size for the driver.
|
||||
* In either case, recalculate the delay using
|
||||
* the final driver size assuming slow input with
|
||||
* finite rise time instead of ideal step input
|
||||
*
|
||||
* (ref: Technical report 6)
|
||||
*/
|
||||
double cwire = wire_cap(len); /* load capacitance */
|
||||
double rwire = wire_res(len);
|
||||
|
||||
#define RES_ADJ (8.6) // Increase in resistance due to low driving vol.
|
||||
|
||||
double driver_res = (-8*g_tp.FO4/(log(0.5) * cwire))/RES_ADJ;
|
||||
double nsize = R_to_w(driver_res, NCH);
|
||||
|
||||
nsize = MIN(nsize, g_tp.max_w_nmos_);
|
||||
nsize = MAX(nsize, g_tp.min_w_nmos_);
|
||||
|
||||
if(rwire*cwire > 8*g_tp.FO4)
|
||||
{
|
||||
nsize = g_tp.max_w_nmos_;
|
||||
}
|
||||
|
||||
// size the inverter appropriately to minimize the transmitter delay
|
||||
// Note - In order to minimize leakage, we are not adding a set of inverters to
|
||||
// bring down delay. Instead, we are sizing the single gate
|
||||
// based on the logical effort.
|
||||
double st_eff = sqrt((2+beta/1+beta)*gate_C(nsize, 0)/(gate_C(2*g_tp.min_w_nmos_, 0)
|
||||
+ gate_C(2*min_w_pmos, 0)));
|
||||
double req_cin = ((2+beta/1+beta)*gate_C(nsize, 0))/st_eff;
|
||||
double inv_size = req_cin/(gate_C(min_w_pmos, 0) + gate_C(g_tp.min_w_nmos_, 0));
|
||||
inv_size = MAX(inv_size, 1);
|
||||
|
||||
/* nand gate delay */
|
||||
double res_eq = (2 * tr_R_on(g_tp.min_w_nmos_, NCH, 1));
|
||||
double cap_eq = 2 * drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
|
||||
drain_C_(2*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
|
||||
gate_C(inv_size*g_tp.min_w_nmos_, 0) +
|
||||
gate_C(inv_size*min_w_pmos, 0);
|
||||
|
||||
double timeconst = res_eq * cap_eq;
|
||||
|
||||
delay = horowitz(inputrise, timeconst, deviceType->Vth/deviceType->Vdd,
|
||||
deviceType->Vth/deviceType->Vdd, RISE);
|
||||
double temp_power = cap_eq*deviceType->Vdd*deviceType->Vdd;
|
||||
|
||||
inputrise = delay / (deviceType->Vdd - deviceType->Vth); /* for the next stage */
|
||||
|
||||
/* Inverter delay:
|
||||
* The load capacitance of this inv depends on
|
||||
* the gate capacitance of the final stage nmos
|
||||
* transistor which in turn depends on nsize
|
||||
*/
|
||||
res_eq = tr_R_on(inv_size*min_w_pmos, PCH, 1);
|
||||
cap_eq = drain_C_(inv_size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
|
||||
drain_C_(inv_size*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
|
||||
gate_C(nsize, 0);
|
||||
timeconst = res_eq * cap_eq;
|
||||
|
||||
delay += horowitz(inputrise, timeconst, deviceType->Vth/deviceType->Vdd,
|
||||
deviceType->Vth/deviceType->Vdd, FALL);
|
||||
temp_power += cap_eq*deviceType->Vdd*deviceType->Vdd;
|
||||
|
||||
|
||||
transmitter.delay = delay;
|
||||
transmitter.power.readOp.dynamic = temp_power*2; /* since it is a diff. model*/
|
||||
transmitter.power.readOp.leakage = deviceType->Vdd *
|
||||
(4 * cmos_Isub_leakage(g_tp.min_w_nmos_, min_w_pmos, 2, nand) +
|
||||
4 * cmos_Isub_leakage(g_tp.min_w_nmos_, min_w_pmos, 1, inv));
|
||||
|
||||
transmitter.power.readOp.gate_leakage = deviceType->Vdd *
|
||||
(4 * cmos_Ig_leakage(g_tp.min_w_nmos_, min_w_pmos, 2, nand) +
|
||||
4 * cmos_Ig_leakage(g_tp.min_w_nmos_, min_w_pmos, 1, inv));
|
||||
|
||||
inputrise = delay / deviceType->Vth;
|
||||
|
||||
/* nmos delay + wire delay */
|
||||
cap_eq = cwire + drain_C_(nsize, NCH, 1, 1, g_tp.cell_h_def)*2 +
|
||||
nsense * sense_amp_input_cap(); //+receiver cap
|
||||
/*
|
||||
* NOTE: nmos is used as both pull up and pull down transistor
|
||||
* in the transmitter. This is because for low voltage swing, drive
|
||||
* resistance of nmos is less than pmos
|
||||
* (for a detailed graph ref: On-Chip Wires: Scaling and Efficiency)
|
||||
*/
|
||||
timeconst = (tr_R_on(nsize, NCH, 1)*RES_ADJ) * (cwire +
|
||||
drain_C_(nsize, NCH, 1, 1, g_tp.cell_h_def)*2) +
|
||||
rwire*cwire/2 +
|
||||
(tr_R_on(nsize, NCH, 1)*RES_ADJ + rwire) *
|
||||
nsense * sense_amp_input_cap();
|
||||
|
||||
/*
|
||||
* since we are pre-equalizing and overdriving the low
|
||||
* swing wires, the net time constant is less
|
||||
* than the actual value
|
||||
*/
|
||||
delay += horowitz(inputrise, timeconst, deviceType->Vth/deviceType->Vdd, .25, 0);
|
||||
#define VOL_SWING .1
|
||||
temp_power += cap_eq*VOL_SWING*.400; /* .4v is the over drive voltage */
|
||||
temp_power *= 2; /* differential wire */
|
||||
|
||||
l_wire.delay = delay - transmitter.delay;
|
||||
l_wire.power.readOp.dynamic = temp_power - transmitter.power.readOp.dynamic;
|
||||
l_wire.power.readOp.leakage = deviceType->Vdd*
|
||||
(4* cmos_Isub_leakage(nsize, 0, 1, nmos));
|
||||
|
||||
l_wire.power.readOp.gate_leakage = deviceType->Vdd*
|
||||
(4* cmos_Ig_leakage(nsize, 0, 1, nmos));
|
||||
|
||||
//double rt = horowitz(inputrise, timeconst, deviceType->Vth/deviceType->Vdd,
|
||||
// deviceType->Vth/deviceType->Vdd, RISE)/deviceType->Vth;
|
||||
|
||||
delay += g_tp.sense_delay;
|
||||
|
||||
sense_amp.delay = g_tp.sense_delay;
|
||||
out_rise_time = g_tp.sense_delay/(deviceType->Vth);
|
||||
sense_amp.power.readOp.dynamic = g_tp.sense_dy_power;
|
||||
sense_amp.power.readOp.leakage = 0; //FIXME
|
||||
sense_amp.power.readOp.gate_leakage = 0;
|
||||
|
||||
power.readOp.dynamic = temp_power + sense_amp.power.readOp.dynamic;
|
||||
power.readOp.leakage = transmitter.power.readOp.leakage +
|
||||
l_wire.power.readOp.leakage +
|
||||
sense_amp.power.readOp.leakage;
|
||||
power.readOp.gate_leakage = transmitter.power.readOp.gate_leakage +
|
||||
l_wire.power.readOp.gate_leakage +
|
||||
sense_amp.power.readOp.gate_leakage;
|
||||
}
|
||||
|
||||
double
|
||||
Wire::sense_amp_input_cap()
|
||||
{
|
||||
return drain_C_(g_tp.w_iso, PCH, 1, 1, g_tp.cell_h_def) +
|
||||
gate_C(g_tp.w_sense_en + g_tp.w_sense_n, 0) +
|
||||
drain_C_(g_tp.w_sense_n, NCH, 1, 1, g_tp.cell_h_def) +
|
||||
drain_C_(g_tp.w_sense_p, PCH, 1, 1, g_tp.cell_h_def);
|
||||
}
|
||||
|
||||
|
||||
void Wire::delay_optimal_wire ()
|
||||
{
|
||||
double len = wire_length;
|
||||
//double min_wire_width = wire_width; //m
|
||||
double beta = pmos_to_nmos_sz_ratio();
|
||||
double switching = 0; // switching energy
|
||||
double short_ckt = 0; // short-circuit energy
|
||||
double tc = 0; // time constant
|
||||
// input cap of min sized driver
|
||||
double input_cap = gate_C(g_tp.min_w_nmos_ + min_w_pmos, 0);
|
||||
|
||||
// output parasitic capacitance of
|
||||
// the min. sized driver
|
||||
double out_cap = drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
|
||||
drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def);
|
||||
// drive resistance
|
||||
double out_res = (tr_R_on(g_tp.min_w_nmos_, NCH, 1) +
|
||||
tr_R_on(min_w_pmos, PCH, 1))/2;
|
||||
double wr = wire_res(len); //ohm
|
||||
|
||||
// wire cap /m
|
||||
double wc = wire_cap(len);
|
||||
|
||||
// size the repeater such that the delay of the wire is minimum
|
||||
double repeater_scaling = sqrt(out_res*wc/(wr*input_cap)); // len will cancel
|
||||
|
||||
// calc the optimum spacing between the repeaters (m)
|
||||
|
||||
repeater_spacing = sqrt(2 * out_res * (out_cap + input_cap)/
|
||||
((wr/len)*(wc/len)));
|
||||
repeater_size = repeater_scaling;
|
||||
|
||||
switching = (repeater_scaling * (input_cap + out_cap) +
|
||||
repeater_spacing * (wc/len)) * deviceType->Vdd * deviceType->Vdd;
|
||||
|
||||
tc = out_res * (input_cap + out_cap) +
|
||||
out_res * wc/len * repeater_spacing/repeater_scaling +
|
||||
wr/len * repeater_spacing * input_cap * repeater_scaling +
|
||||
0.5 * (wr/len) * (wc/len)* repeater_spacing * repeater_spacing;
|
||||
|
||||
delay = 0.693 * tc * len/repeater_spacing;
|
||||
|
||||
#define Ishort_ckt 65e-6 /* across all tech Ref:Banerjee et al. {IEEE TED} */
|
||||
short_ckt = deviceType->Vdd * g_tp.min_w_nmos_ * Ishort_ckt * 1.0986 *
|
||||
repeater_scaling * tc;
|
||||
|
||||
area.set_area((len/repeater_spacing) *
|
||||
compute_gate_area(INV, 1, min_w_pmos * repeater_scaling,
|
||||
g_tp.min_w_nmos_ * repeater_scaling, g_tp.cell_h_def));
|
||||
power.readOp.dynamic = ((len/repeater_spacing)*(switching + short_ckt));
|
||||
power.readOp.leakage = ((len/repeater_spacing)*
|
||||
deviceType->Vdd*
|
||||
cmos_Isub_leakage(g_tp.min_w_nmos_*repeater_scaling, beta*g_tp.min_w_nmos_*repeater_scaling, 1, inv));
|
||||
power.readOp.gate_leakage = ((len/repeater_spacing)*
|
||||
deviceType->Vdd*
|
||||
cmos_Ig_leakage(g_tp.min_w_nmos_*repeater_scaling, beta*g_tp.min_w_nmos_*repeater_scaling, 1, inv));
|
||||
}
|
||||
|
||||
|
||||
|
||||
// calculate power/delay values for wires with suboptimal repeater sizing/spacing
|
||||
void
|
||||
Wire::init_wire(){
|
||||
wire_length = 1;
|
||||
delay_optimal_wire();
|
||||
double sp, si;
|
||||
powerDef pow;
|
||||
si = repeater_size;
|
||||
sp = repeater_spacing;
|
||||
sp *= 1e6; // in microns
|
||||
|
||||
double i, j, del;
|
||||
repeated_wire.push_back(Component());
|
||||
for (j=sp; j < 4*sp; j+=100) {
|
||||
for (i = si; i > 1; i--) {
|
||||
pow = wire_model(j*1e-6, i, &del);
|
||||
if (j == sp && i == si) {
|
||||
global.delay = del;
|
||||
global.power = pow;
|
||||
global.area.h = si;
|
||||
global.area.w = sp*1e-6; // m
|
||||
}
|
||||
// cout << "Repeater size - "<< i <<
|
||||
// " Repeater spacing - " << j <<
|
||||
// " Delay - " << del <<
|
||||
// " PowerD - " << pow.readOp.dynamic <<
|
||||
// " PowerL - " << pow.readOp.leakage <<endl;
|
||||
repeated_wire.back().delay = del;
|
||||
repeated_wire.back().power.readOp = pow.readOp;
|
||||
repeated_wire.back().area.w = j*1e-6; //m
|
||||
repeated_wire.back().area.h = i;
|
||||
repeated_wire.push_back(Component());
|
||||
|
||||
}
|
||||
}
|
||||
repeated_wire.pop_back();
|
||||
update_fullswing();
|
||||
Wire *l_wire = new Wire(Low_swing, 0.001/* 1 mm*/, 1);
|
||||
low_swing.delay = l_wire->delay;
|
||||
low_swing.power = l_wire->power;
|
||||
delete l_wire;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void Wire::update_fullswing()
|
||||
{
|
||||
|
||||
list<Component>::iterator citer;
|
||||
double del[4];
|
||||
del[3] = this->global.delay + this->global.delay*.3;
|
||||
del[2] = global.delay + global.delay*.2;
|
||||
del[1] = global.delay + global.delay*.1;
|
||||
del[0] = global.delay + global.delay*.05;
|
||||
double threshold;
|
||||
double ncost;
|
||||
double cost;
|
||||
int i = 4;
|
||||
while (i>0) {
|
||||
threshold = del[i-1];
|
||||
cost = BIGNUM;
|
||||
for (citer = repeated_wire.begin(); citer != repeated_wire.end(); citer++)
|
||||
{
|
||||
if (citer->delay > threshold) {
|
||||
citer = repeated_wire.erase(citer);
|
||||
citer --;
|
||||
}
|
||||
else {
|
||||
ncost = citer->power.readOp.dynamic/global.power.readOp.dynamic +
|
||||
citer->power.readOp.leakage/global.power.readOp.leakage;
|
||||
if(ncost < cost)
|
||||
{
|
||||
cost = ncost;
|
||||
if (i == 4) {
|
||||
global_30.delay = citer->delay;
|
||||
global_30.power = citer->power;
|
||||
global_30.area = citer->area;
|
||||
}
|
||||
else if (i==3) {
|
||||
global_20.delay = citer->delay;
|
||||
global_20.power = citer->power;
|
||||
global_20.area = citer->area;
|
||||
}
|
||||
else if(i==2) {
|
||||
global_10.delay = citer->delay;
|
||||
global_10.power = citer->power;
|
||||
global_10.area = citer->area;
|
||||
}
|
||||
else if(i==1) {
|
||||
global_5.delay = citer->delay;
|
||||
global_5.power = citer->power;
|
||||
global_5.area = citer->area;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
i--;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
 * Evaluate a repeated wire of unit length for a given repeater spacing and
 * size (not necessarily the delay-optimal ones).
 *
 *   space  repeater spacing, same length unit as used by wire_res/wire_cap
 *   size   repeater size as a multiple of the minimum-sized driver
 *   delay  out: delay of the unit-length wire
 *
 * Returns the dynamic, leakage and gate-leakage figures in powerDef::readOp.
 * Side effect: overwrites the members repeater_spacing and repeater_size
 * with `space` and `size`.
 */
powerDef Wire::wire_model (double space, double size, double *delay)
{
    powerDef result;
    const double unit_len = 1;
    const double pn_ratio = pmos_to_nmos_sz_ratio();

    // Minimum-sized driver: input (gate) cap, output parasitic (drain) cap,
    // and the average of the NCH/PCH on-resistance.
    const double c_in = gate_C (g_tp.min_w_nmos_ +
                                min_w_pmos, 0);
    const double c_out = drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
                         drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def);
    const double r_drv = (tr_R_on(g_tp.min_w_nmos_, NCH, 1) +
                          tr_R_on(min_w_pmos, PCH, 1))/2;

    const double r_wire = wire_res(unit_len); // ohm
    const double c_wire = wire_cap(unit_len);
    const double r_per_len = r_wire/unit_len;
    const double c_per_len = c_wire/unit_len;

    repeater_spacing = space;
    repeater_size = size;

    // Switching energy of one repeater stage.
    const double e_switch = (repeater_size * (c_in + c_out) +
                             repeater_spacing * c_per_len) *
                            deviceType->Vdd * deviceType->Vdd;

    // Time constant of one repeater stage.
    // NOTE(review): the third term uses the driver's output cap (c_out),
    // whereas the matching term in delay_optimal_wire() uses the input cap.
    // Kept as is to preserve results -- confirm which one is intended.
    const double tau = r_drv * (c_in + c_out) +
        r_drv * c_wire/unit_len * repeater_spacing/repeater_size +
        r_per_len * repeater_spacing * c_out * repeater_size +
        0.5 * r_per_len * c_per_len * repeater_spacing * repeater_spacing;

    *delay = 0.693 * tau * unit_len/repeater_spacing;

#define Ishort_ckt 65e-6 /* across all tech Ref:Banerjee et al. {IEEE TED} */
    // Short-circuit energy of one repeater stage.
    const double e_short = deviceType->Vdd * g_tp.min_w_nmos_ * Ishort_ckt *
                           1.0986 * repeater_size * tau;

    // Number of stages along the unit-length wire.
    const double n_stages = unit_len/repeater_spacing;

    result.readOp.dynamic = (n_stages*(e_switch + e_short));
    result.readOp.leakage = (n_stages*
                             deviceType->Vdd*
                             cmos_Isub_leakage(g_tp.min_w_nmos_*repeater_size,
                                               pn_ratio*g_tp.min_w_nmos_*repeater_size,
                                               1, inv));
    result.readOp.gate_leakage = (n_stages*
                                  deviceType->Vdd*
                                  cmos_Ig_leakage(g_tp.min_w_nmos_*repeater_size,
                                                  pn_ratio*g_tp.min_w_nmos_*repeater_size,
                                                  1, inv));

    return result;
}
|
||||
|
||||
void
|
||||
Wire::print_wire()
|
||||
{
|
||||
|
||||
cout << "\nWire Properties:\n\n";
|
||||
cout << " Delay Optimal\n\tRepeater size - "<< global.area.h <<
|
||||
" \n\tRepeater spacing - " << global.area.w*1e3 << " (mm)"
|
||||
" \n\tDelay - " << global.delay*1e6 << " (ns/mm)"
|
||||
" \n\tPowerD - " << global.power.readOp.dynamic *1e6<< " (nJ/mm)"
|
||||
" \n\tPowerL - " << global.power.readOp.leakage << " (mW/mm)"
|
||||
" \n\tPowerLgate - " << global.power.readOp.gate_leakage << " (mW/mm)\n";
|
||||
cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n";
|
||||
cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n";
|
||||
cout <<endl;
|
||||
|
||||
cout << " 5% Overhead\n\tRepeater size - "<< global_5.area.h <<
|
||||
" \n\tRepeater spacing - " << global_5.area.w*1e3 << " (mm)"
|
||||
" \n\tDelay - " << global_5.delay *1e6<< " (ns/mm)"
|
||||
" \n\tPowerD - " << global_5.power.readOp.dynamic *1e6<< " (nJ/mm)"
|
||||
" \n\tPowerL - " << global_5.power.readOp.leakage << " (mW/mm)"
|
||||
" \n\tPowerLgate - " << global_5.power.readOp.gate_leakage << " (mW/mm)\n";
|
||||
cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n";
|
||||
cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n";
|
||||
cout <<endl;
|
||||
cout << " 10% Overhead\n\tRepeater size - "<< global_10.area.h <<
|
||||
" \n\tRepeater spacing - " << global_10.area.w*1e3 << " (mm)"
|
||||
" \n\tDelay - " << global_10.delay *1e6<< " (ns/mm)"
|
||||
" \n\tPowerD - " << global_10.power.readOp.dynamic *1e6<< " (nJ/mm)"
|
||||
" \n\tPowerL - " << global_10.power.readOp.leakage << " (mW/mm)"
|
||||
" \n\tPowerLgate - " << global_10.power.readOp.gate_leakage << " (mW/mm)\n";
|
||||
cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n";
|
||||
cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n";
|
||||
cout <<endl;
|
||||
cout << " 20% Overhead\n\tRepeater size - "<< global_20.area.h <<
|
||||
" \n\tRepeater spacing - " << global_20.area.w*1e3 << " (mm)"
|
||||
" \n\tDelay - " << global_20.delay *1e6<< " (ns/mm)"
|
||||
" \n\tPowerD - " << global_20.power.readOp.dynamic *1e6<< " (nJ/mm)"
|
||||
" \n\tPowerL - " << global_20.power.readOp.leakage << " (mW/mm)"
|
||||
" \n\tPowerLgate - " << global_20.power.readOp.gate_leakage << " (mW/mm)\n";
|
||||
cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n";
|
||||
cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n";
|
||||
cout <<endl;
|
||||
cout << " 30% Overhead\n\tRepeater size - "<< global_30.area.h <<
|
||||
" \n\tRepeater spacing - " << global_30.area.w*1e3 << " (mm)"
|
||||
" \n\tDelay - " << global_30.delay *1e6<< " (ns/mm)"
|
||||
" \n\tPowerD - " << global_30.power.readOp.dynamic *1e6<< " (nJ/mm)"
|
||||
" \n\tPowerL - " << global_30.power.readOp.leakage << " (mW/mm)"
|
||||
" \n\tPowerLgate - " << global_30.power.readOp.gate_leakage << " (mW/mm)\n";
|
||||
cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n";
|
||||
cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n";
|
||||
cout <<endl;
|
||||
cout << " Low-swing wire (1 mm) - Note: Unlike repeated wires, \n\tdelay and power "
|
||||
"values of low-swing wires do not\n\thave a linear relationship with length." <<
|
||||
" \n\tdelay - " << low_swing.delay *1e9<< " (ns)"
|
||||
" \n\tpowerD - " << low_swing.power.readOp.dynamic *1e9<< " (nJ)"
|
||||
" \n\tPowerL - " << low_swing.power.readOp.leakage << " (mW)"
|
||||
" \n\tPowerLgate - " << low_swing.power.readOp.gate_leakage << " (mW)\n";
|
||||
cout << "\tWire width - " <<wire_width_init * 2 /* differential */<< " microns\n";
|
||||
cout << "\tWire spacing - " <<wire_spacing_init * 2 /* differential */<< " microns\n";
|
||||
cout <<endl;
|
||||
cout <<endl;
|
||||
|
||||
}
|
||||
|
124
ext/mcpat/cacti/wire.h
Normal file
124
ext/mcpat/cacti/wire.h
Normal file
|
@ -0,0 +1,124 @@
|
|||
/*****************************************************************************
|
||||
* McPAT/CACTI
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
|
||||
#ifndef __WIRE_H__
|
||||
#define __WIRE_H__
|
||||
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
|
||||
#include "assert.h"
|
||||
#include "basic_circuit.h"
|
||||
#include "cacti_interface.h"
|
||||
#include "component.h"
|
||||
#include "parameter.h"
|
||||
|
||||
class Wire : public Component
|
||||
{
|
||||
public:
|
||||
Wire(enum Wire_type wire_model, double len /* in u*/,
|
||||
int nsense = 1/* no. of sense amps connected to the low-swing wire */,
|
||||
double width_scaling = 1,
|
||||
double spacing_scaling = 1,
|
||||
enum Wire_placement wire_placement = outside_mat,
|
||||
double resistivity = CU_RESISTIVITY,
|
||||
TechnologyParameter::DeviceType *dt = &(g_tp.peri_global));
|
||||
~Wire();
|
||||
|
||||
Wire( double width_scaling = 1,
|
||||
double spacing_scaling = 1,
|
||||
enum Wire_placement wire_placement = outside_mat,
|
||||
double resistivity = CU_RESISTIVITY,
|
||||
TechnologyParameter::DeviceType *dt = &(g_tp.peri_global)
|
||||
); // should be used only once for initializing static members
|
||||
void init_wire();
|
||||
|
||||
void calculate_wire_stats();
|
||||
void delay_optimal_wire();
|
||||
double wire_cap(double len, bool call_from_outside=false);
|
||||
double wire_res(double len);
|
||||
void low_swing_model();
|
||||
double signal_fall_time();
|
||||
double signal_rise_time();
|
||||
double sense_amp_input_cap();
|
||||
|
||||
enum Wire_type wt;
|
||||
double wire_spacing;
|
||||
double wire_width;
|
||||
enum Wire_placement wire_placement;
|
||||
double repeater_size;
|
||||
double repeater_spacing;
|
||||
double wire_length;
|
||||
double in_rise_time, out_rise_time;
|
||||
|
||||
void set_in_rise_time(double rt)
|
||||
{
|
||||
in_rise_time = rt;
|
||||
}
|
||||
static Component global;
|
||||
static Component global_5;
|
||||
static Component global_10;
|
||||
static Component global_20;
|
||||
static Component global_30;
|
||||
static Component low_swing;
|
||||
static double wire_width_init;
|
||||
static double wire_spacing_init;
|
||||
void print_wire();
|
||||
|
||||
private:
|
||||
|
||||
int nsense; // no. of sense amps connected to a low-swing wire if it
|
||||
// is broadcasting data to multiple destinations
|
||||
// width and spacing scaling factor can be used
|
||||
// to model low level wires or special
|
||||
// fat wires
|
||||
double w_scale, s_scale;
|
||||
double resistivity;
|
||||
powerDef wire_model (double space, double size, double *delay);
|
||||
list <Component> repeated_wire;
|
||||
void update_fullswing();
|
||||
static int initialized;
|
||||
|
||||
|
||||
//low-swing
|
||||
Component transmitter;
|
||||
Component l_wire;
|
||||
Component sense_amp;
|
||||
|
||||
double min_w_pmos;
|
||||
|
||||
TechnologyParameter::DeviceType *deviceType;
|
||||
|
||||
};
|
||||
|
||||
#endif
|
4135
ext/mcpat/core.cc
Normal file
4135
ext/mcpat/core.cc
Normal file
File diff suppressed because it is too large
Load diff
262
ext/mcpat/core.h
Normal file
262
ext/mcpat/core.h
Normal file
|
@ -0,0 +1,262 @@
|
|||
/*****************************************************************************
|
||||
* McPAT
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
#ifndef CORE_H_
|
||||
#define CORE_H_
|
||||
|
||||
#include "XML_Parse.h"
|
||||
#include "array.h"
|
||||
#include "basic_components.h"
|
||||
#include "interconnect.h"
|
||||
#include "logic.h"
|
||||
#include "parameter.h"
|
||||
#include "sharedcache.h"
|
||||
|
||||
class BranchPredictor :public Component {
|
||||
public:
|
||||
|
||||
ParseXML *XML;
|
||||
int ithCore;
|
||||
InputParameter interface_ip;
|
||||
CoreDynParam coredynp;
|
||||
double clockRate,executionTime;
|
||||
double scktRatio, chip_PR_overhead, macro_PR_overhead;
|
||||
ArrayST * globalBPT;
|
||||
ArrayST * localBPT;
|
||||
ArrayST * L1_localBPT;
|
||||
ArrayST * L2_localBPT;
|
||||
ArrayST * chooser;
|
||||
ArrayST * RAS;
|
||||
bool exist;
|
||||
|
||||
BranchPredictor(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, bool exsit=true);
|
||||
void computeEnergy(bool is_tdp=true);
|
||||
void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
|
||||
~BranchPredictor();
|
||||
};
|
||||
|
||||
|
||||
class InstFetchU :public Component {
|
||||
public:
|
||||
|
||||
ParseXML *XML;
|
||||
int ithCore;
|
||||
InputParameter interface_ip;
|
||||
CoreDynParam coredynp;
|
||||
double clockRate,executionTime;
|
||||
double scktRatio, chip_PR_overhead, macro_PR_overhead;
|
||||
enum Cache_policy cache_p;
|
||||
InstCache icache;
|
||||
ArrayST * IB;
|
||||
ArrayST * BTB;
|
||||
BranchPredictor * BPT;
|
||||
inst_decoder * ID_inst;
|
||||
inst_decoder * ID_operand;
|
||||
inst_decoder * ID_misc;
|
||||
bool exist;
|
||||
|
||||
InstFetchU(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, bool exsit=true);
|
||||
void computeEnergy(bool is_tdp=true);
|
||||
void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
|
||||
~InstFetchU();
|
||||
};
|
||||
|
||||
|
||||
class SchedulerU :public Component {
|
||||
public:
|
||||
|
||||
ParseXML *XML;
|
||||
int ithCore;
|
||||
InputParameter interface_ip;
|
||||
CoreDynParam coredynp;
|
||||
double clockRate,executionTime;
|
||||
double scktRatio, chip_PR_overhead, macro_PR_overhead;
|
||||
double Iw_height, fp_Iw_height,ROB_height;
|
||||
ArrayST * int_inst_window;
|
||||
ArrayST * fp_inst_window;
|
||||
ArrayST * ROB;
|
||||
selection_logic * instruction_selection;
|
||||
bool exist;
|
||||
|
||||
SchedulerU(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, bool exist_=true);
|
||||
void computeEnergy(bool is_tdp=true);
|
||||
void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
|
||||
~SchedulerU();
|
||||
};
|
||||
|
||||
class RENAMINGU :public Component {
|
||||
public:
|
||||
|
||||
ParseXML *XML;
|
||||
int ithCore;
|
||||
InputParameter interface_ip;
|
||||
double clockRate,executionTime;
|
||||
CoreDynParam coredynp;
|
||||
ArrayST * iFRAT;
|
||||
ArrayST * fFRAT;
|
||||
ArrayST * iRRAT;
|
||||
ArrayST * fRRAT;
|
||||
ArrayST * ifreeL;
|
||||
ArrayST * ffreeL;
|
||||
dep_resource_conflict_check * idcl;
|
||||
dep_resource_conflict_check * fdcl;
|
||||
ArrayST * RAHT;//register alias history table Used to store GC
|
||||
bool exist;
|
||||
|
||||
|
||||
RENAMINGU(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_, bool exist_=true);
|
||||
void computeEnergy(bool is_tdp=true);
|
||||
void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
|
||||
~RENAMINGU();
|
||||
};
|
||||
|
||||
class LoadStoreU :public Component {
|
||||
public:
|
||||
|
||||
ParseXML *XML;
|
||||
int ithCore;
|
||||
InputParameter interface_ip;
|
||||
CoreDynParam coredynp;
|
||||
enum Cache_policy cache_p;
|
||||
double clockRate,executionTime;
|
||||
double scktRatio, chip_PR_overhead, macro_PR_overhead;
|
||||
double lsq_height;
|
||||
DataCache dcache;
|
||||
ArrayST * LSQ;//it is actually the store queue but for inorder processors it serves as both loadQ and StoreQ
|
||||
ArrayST * LoadQ;
|
||||
bool exist;
|
||||
|
||||
LoadStoreU(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, bool exist_=true);
|
||||
void computeEnergy(bool is_tdp=true);
|
||||
void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
|
||||
~LoadStoreU();
|
||||
};
|
||||
|
||||
class MemManU :public Component {
|
||||
public:
|
||||
|
||||
ParseXML *XML;
|
||||
int ithCore;
|
||||
InputParameter interface_ip;
|
||||
CoreDynParam coredynp;
|
||||
double clockRate,executionTime;
|
||||
double scktRatio, chip_PR_overhead, macro_PR_overhead;
|
||||
ArrayST * itlb;
|
||||
ArrayST * dtlb;
|
||||
bool exist;
|
||||
|
||||
MemManU(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, bool exist_=true);
|
||||
void computeEnergy(bool is_tdp=true);
|
||||
void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
|
||||
~MemManU();
|
||||
};
|
||||
|
||||
class RegFU :public Component {
|
||||
public:
|
||||
|
||||
ParseXML *XML;
|
||||
int ithCore;
|
||||
InputParameter interface_ip;
|
||||
CoreDynParam coredynp;
|
||||
double clockRate,executionTime;
|
||||
double scktRatio, chip_PR_overhead, macro_PR_overhead;
|
||||
double int_regfile_height, fp_regfile_height;
|
||||
ArrayST * IRF;
|
||||
ArrayST * FRF;
|
||||
ArrayST * RFWIN;
|
||||
bool exist;
|
||||
|
||||
RegFU(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, bool exist_=true);
|
||||
void computeEnergy(bool is_tdp=true);
|
||||
void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
|
||||
~RegFU();
|
||||
};
|
||||
|
||||
class EXECU :public Component {
|
||||
public:
|
||||
|
||||
ParseXML *XML;
|
||||
int ithCore;
|
||||
InputParameter interface_ip;
|
||||
double clockRate,executionTime;
|
||||
double scktRatio, chip_PR_overhead, macro_PR_overhead;
|
||||
double lsq_height;
|
||||
CoreDynParam coredynp;
|
||||
RegFU * rfu;
|
||||
SchedulerU * scheu;
|
||||
FunctionalUnit * fp_u;
|
||||
FunctionalUnit * exeu;
|
||||
FunctionalUnit * mul;
|
||||
interconnect * int_bypass;
|
||||
interconnect * intTagBypass;
|
||||
interconnect * int_mul_bypass;
|
||||
interconnect * intTag_mul_Bypass;
|
||||
interconnect * fp_bypass;
|
||||
interconnect * fpTagBypass;
|
||||
|
||||
Component bypass;
|
||||
bool exist;
|
||||
|
||||
EXECU(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_, double lsq_height_,const CoreDynParam & dyn_p_, bool exist_=true);
|
||||
void computeEnergy(bool is_tdp=true);
|
||||
void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
|
||||
~EXECU();
|
||||
};
|
||||
|
||||
|
||||
class Core :public Component {
|
||||
public:
|
||||
|
||||
ParseXML *XML;
|
||||
int ithCore;
|
||||
InputParameter interface_ip;
|
||||
double clockRate,executionTime;
|
||||
double scktRatio, chip_PR_overhead, macro_PR_overhead;
|
||||
InstFetchU * ifu;
|
||||
LoadStoreU * lsu;
|
||||
MemManU * mmu;
|
||||
EXECU * exu;
|
||||
RENAMINGU * rnu;
|
||||
Pipeline * corepipe;
|
||||
UndiffCore * undiffCore;
|
||||
SharedCache * l2cache;
|
||||
CoreDynParam coredynp;
|
||||
//full_decoder inst_decoder;
|
||||
//clock_network clockNetwork;
|
||||
Core(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_);
|
||||
void set_core_param();
|
||||
void computeEnergy(bool is_tdp=true);
|
||||
void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
|
||||
~Core();
|
||||
};
|
||||
|
||||
#endif /* CORE_H_ */
|
48
ext/mcpat/globalvar.h
Normal file
48
ext/mcpat/globalvar.h
Normal file
|
@ -0,0 +1,48 @@
|
|||
/*****************************************************************************
|
||||
* McPAT
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
#ifndef GLOBALVAR_H_
#define GLOBALVAR_H_

/*
 * EXTERN expands to nothing when GLOBALVAR is defined before this header is
 * included, and to `extern` otherwise. A translation unit that defines
 * GLOBALVAR therefore gets the variable definitions below; every other
 * includer gets plain declarations.
 */
#if defined(GLOBALVAR)
#define EXTERN
#else
#define EXTERN extern
#endif

/* Program-wide flag shared through this header. */
EXTERN bool opt_for_clk;

#endif /* GLOBALVAR_H_ */
|
||||
|
||||
|
||||
|
||||
|
222
ext/mcpat/interconnect.cc
Normal file
222
ext/mcpat/interconnect.cc
Normal file
|
@ -0,0 +1,222 @@
|
|||
/*****************************************************************************
|
||||
* McPAT
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
#include "globalvar.h"
|
||||
#include "interconnect.h"
|
||||
#include "wire.h"
|
||||
|
||||
/**
 * Build a wire interconnect model and size it against its timing target.
 *
 * @param name_               Label used in the latency warning message.
 * @param device_ty_          Device class passed to
 *                            longer_channel_device_reduction().
 * @param base_w, base_h      Stored in base_width / base_height; not used
 *                            further in this constructor.
 * @param data_w              Number of parallel wires; per-wire power and
 *                            area are multiplied by this value.
 * @param len                 Wire length handed to the Wire model.
 * @param configure_interface Copied into l_ip and passed to
 *                            init_interface(); its latency and throughput
 *                            fields become the timing targets. Must not be
 *                            null.
 * @param start_wiring_level_ Stored only.
 * @param pipelinable_        false: optimise against the latency target.
 *                            true: optimise against the throughput target
 *                            and insert pipeline stages if still too slow.
 * @param route_over_perc_    Blend factor between the wire area and the
 *                            no-device-under-wire area (pipelinable only).
 * @param opt_local_          Together with the global opt_for_clk, enables
 *                            the width/spacing scaling loop.
 * @param core_ty_            Core type passed to
 *                            longer_channel_device_reduction().
 * @param wire_model          Stored in wt but immediately overridden by
 *                            Global (see body).
 * @param width_s, space_s    Initial wire width / spacing scaling factors.
 * @param dt                  Device type supplying the N-to-P drive ratio.
 *                            Must not be null.
 */
interconnect::interconnect(
    string name_,
    enum Device_ty device_ty_,
    double base_w, double base_h,
    int data_w, double len, const InputParameter *configure_interface,
    int start_wiring_level_,
    bool pipelinable_,
    double route_over_perc_,
    bool opt_local_,
    enum Core_type core_ty_,
    enum Wire_type wire_model,
    double width_s, double space_s,
    TechnologyParameter::DeviceType *dt)
    : name(name_),
      device_ty(device_ty_),
      in_rise_time(0),
      out_rise_time(0),
      base_width(base_w),
      base_height(base_h),
      data_width(data_w),
      wt(wire_model),
      width_scaling(width_s),
      space_scaling(space_s),
      start_wiring_level(start_wiring_level_),
      length(len),
      //interconnect_latency(1e-12),
      //interconnect_throughput(1e-12),
      opt_local(opt_local_),
      core_ty(core_ty_),
      pipelinable(pipelinable_),
      route_over_perc(route_over_perc_),
      // Previously left indeterminate unless pipeline stages were inserted
      // below; an un-split wire is treated as a single stage.
      num_pipe_stages(1),
      deviceType(dt)
{
    // Both pointers are dereferenced below; fail loudly instead of
    // invoking undefined behaviour.
    assert(configure_interface);
    assert(deviceType);

    // NOTE(review): the wire_model argument is overridden here, so every
    // interconnect is modelled as a Global wire regardless of the caller's
    // choice. Preserved as-is.
    wt = Global;
    l_ip = *configure_interface;
    local_result = init_interface(&l_ip);

    max_unpipelined_link_delay = 0; //TODO
    min_w_nmos = g_tp.min_w_nmos_;
    min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio * min_w_nmos;

    latency = l_ip.latency;
    throughput = l_ip.throughput;
    latency_overflow = false;
    throughput_overflow = false;

    /*
     * TODO: Add wiring option from semi-global to global automatically
     * And directly jump to global if semi-global cannot satisfy timing
     * Fat wires only available for global wires, thus
     * if signal wiring layer starts from semi-global,
     * the next layer up will be global, i.e., semi-global does
     * not have fat wires.
     */

    // Non-pipelinable wires (e.g. bypass logic) are bounded by latency;
    // pipelinable wires (e.g. buses) are bounded by throughput.
    // TODO (from original): add pipe regs power, area, and timing.
    const double timing_target = pipelinable ? throughput : latency;

    compute();
    if (opt_for_clk && opt_local) {
        // Double width and spacing until the wire meets its target or the
        // scaling factor reaches the 3.0 cap.
        while (delay > timing_target && width_scaling < 3.0) {
            width_scaling *= 2;
            space_scaling *= 2;
            // Constructed only for its side effects; presumably refreshes
            // shared wire parameters for the new scaling -- confirm in
            // wire.h.
            Wire winit(width_scaling, space_scaling);
            compute();
        }

        if (delay > timing_target) {
            if (pipelinable) {
                // Insert pipeline stages; each stage adds a 5% overhead of
                // the unpipelined delay.
                num_pipe_stages = (int)ceil(delay / throughput);
                assert(num_pipe_stages > 0);
                delay = delay / num_pipe_stages +
                        num_pipe_stages * 0.05 * delay;
            } else {
                latency_overflow = true;
            }
        }
    }

    // Keep the single-wire figures, then scale to the full data width.
    power_bit = power;
    power.readOp.dynamic *= data_width;
    power.readOp.leakage *= data_width;
    power.readOp.gate_leakage *= data_width;
    area.set_area(area.get_area() * data_width);
    no_device_under_wire_area.h *= data_width;

    if (latency_overflow == true)
        cout << "Warning: " << name
             << " wire structure cannot satisfy latency constraint." << endl;

    assert(power.readOp.dynamic > 0);
    assert(power.readOp.leakage > 0);
    assert(power.readOp.gate_leakage > 0);

    double long_channel_device_reduction =
        longer_channel_device_reduction(device_ty, core_ty);

    double sckRation = g_tp.sckt_co_eff;
    power.readOp.dynamic *= sckRation;
    power.writeOp.dynamic *= sckRation;
    power.searchOp.dynamic *= sckRation;

    power.readOp.longer_channel_leakage =
        power.readOp.leakage * long_channel_device_reduction;

    // Only global wires have the option to choose whether routing over or not
    if (pipelinable)
        area.set_area(area.get_area() * route_over_perc +
                      no_device_under_wire_area.get_area() *
                          (1 - route_over_perc));

    // NOTE(review): this statement parses as a declaration of a function
    // named wreset returning Wire, not as an object definition, so no Wire
    // is constructed and nothing is reset here. Left unchanged because
    // turning it into an object would alter behaviour; confirm the intent.
    Wire wreset();
}
|
||||
|
||||
|
||||
|
||||
void
|
||||
interconnect::compute()
|
||||
{
|
||||
|
||||
Wire *wtemp1 = 0;
|
||||
wtemp1 = new Wire(wt, length, 1, width_scaling, space_scaling);
|
||||
delay = wtemp1->delay;
|
||||
power.readOp.dynamic = wtemp1->power.readOp.dynamic;
|
||||
power.readOp.leakage = wtemp1->power.readOp.leakage;
|
||||
power.readOp.gate_leakage = wtemp1->power.readOp.gate_leakage;
|
||||
|
||||
area.set_area(wtemp1->area.get_area());
|
||||
no_device_under_wire_area.h = (wtemp1->wire_width + wtemp1->wire_spacing);
|
||||
no_device_under_wire_area.w = length;
|
||||
|
||||
if (wtemp1)
|
||||
delete wtemp1;
|
||||
|
||||
}
|
||||
|
||||
void interconnect::leakage_feedback(double temperature)
|
||||
{
|
||||
l_ip.temp = (unsigned int)round(temperature/10.0)*10;
|
||||
uca_org_t init_result = init_interface(&l_ip); // init_result is dummy
|
||||
|
||||
compute();
|
||||
|
||||
power_bit = power;
|
||||
power.readOp.dynamic *= data_width;
|
||||
power.readOp.leakage *= data_width;
|
||||
power.readOp.gate_leakage *= data_width;
|
||||
|
||||
assert(power.readOp.dynamic > 0);
|
||||
assert(power.readOp.leakage > 0);
|
||||
assert(power.readOp.gate_leakage > 0);
|
||||
|
||||
double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
|
||||
|
||||
double sckRation = g_tp.sckt_co_eff;
|
||||
power.readOp.dynamic *= sckRation;
|
||||
power.writeOp.dynamic *= sckRation;
|
||||
power.searchOp.dynamic *= sckRation;
|
||||
|
||||
power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;
|
||||
}
|
||||
|
111
ext/mcpat/interconnect.h
Normal file
111
ext/mcpat/interconnect.h
Normal file
|
@ -0,0 +1,111 @@
|
|||
/*****************************************************************************
|
||||
* McPAT
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
#ifndef __INTERCONNECT_H__
|
||||
#define __INTERCONNECT_H__
|
||||
|
||||
#include "assert.h"
|
||||
#include "basic_circuit.h"
|
||||
#include "basic_components.h"
|
||||
#include "cacti_interface.h"
|
||||
#include "component.h"
|
||||
#include "parameter.h"
|
||||
#include "subarray.h"
|
||||
#include "wire.h"
|
||||
|
||||
// leakage power includes entire htree in a bank (when uca_tree == false)
|
||||
// leakage power includes only part to one bank when uca_tree == true
|
||||
|
||||
class interconnect : public Component
|
||||
{
|
||||
public:
|
||||
interconnect(
|
||||
string name_,
|
||||
enum Device_ty device_ty_,
|
||||
double base_w, double base_h, int data_w, double len,
|
||||
const InputParameter *configure_interface, int start_wiring_level_,
|
||||
bool pipelinable_ = false,
|
||||
double route_over_perc_ =0.5,
|
||||
bool opt_local_=true,
|
||||
enum Core_type core_ty_=Inorder,
|
||||
enum Wire_type wire_model=Global,
|
||||
double width_s=1.0, double space_s=1.0,
|
||||
TechnologyParameter::DeviceType *dt = &(g_tp.peri_global)
|
||||
);
|
||||
|
||||
~interconnect() {};
|
||||
|
||||
void compute();
|
||||
string name;
|
||||
enum Device_ty device_ty;
|
||||
double in_rise_time, out_rise_time;
|
||||
InputParameter l_ip;
|
||||
uca_org_t local_result;
|
||||
Area no_device_under_wire_area;
|
||||
void set_in_rise_time(double rt)
|
||||
{
|
||||
in_rise_time = rt;
|
||||
}
|
||||
|
||||
void leakage_feedback(double temperature);
|
||||
double max_unpipelined_link_delay;
|
||||
powerDef power_bit;
|
||||
|
||||
double wire_bw;
|
||||
double init_wire_bw; // bus width at root
|
||||
double base_width;
|
||||
double base_height;
|
||||
int data_width;
|
||||
enum Wire_type wt;
|
||||
double width_scaling, space_scaling;
|
||||
int start_wiring_level;
|
||||
double length;
|
||||
double min_w_nmos;
|
||||
double min_w_pmos;
|
||||
double latency, throughput;
|
||||
bool latency_overflow;
|
||||
bool throughput_overflow;
|
||||
double interconnect_latency;
|
||||
double interconnect_throughput;
|
||||
bool opt_local;
|
||||
enum Core_type core_ty;
|
||||
bool pipelinable;
|
||||
double route_over_perc;
|
||||
int num_pipe_stages;
|
||||
|
||||
private:
|
||||
TechnologyParameter::DeviceType *deviceType;
|
||||
|
||||
};
|
||||
|
||||
#endif
|
||||
|
446
ext/mcpat/iocontrollers.cc
Normal file
446
ext/mcpat/iocontrollers.cc
Normal file
|
@ -0,0 +1,446 @@
|
|||
/*****************************************************************************
|
||||
* McPAT
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include "XML_Parse.h"
|
||||
#include "basic_circuit.h"
|
||||
#include "basic_components.h"
|
||||
#include "const.h"
|
||||
#include "io.h"
|
||||
#include "iocontrollers.h"
|
||||
#include "logic.h"
|
||||
#include "parameter.h"
|
||||
|
||||
/*
|
||||
SUN Niagara 2 I/O power analysis:
|
||||
total signal bits: 711
|
||||
Total FBDIMM bits: (14+10)*2*8= 384
|
||||
PCIe bits: (8 + 8)*2 = 32
|
||||
10Gb NIC: (4*2+4*2)*2 = 32
|
||||
Debug I/Os: 168
|
||||
Other I/Os: 711- 32-32 - 384 - 168 = 95
|
||||
|
||||
According to "Implementation of an 8-Core, 64-Thread, Power-Efficient SPARC Server on a Chip"
|
||||
90% of I/Os are SerDes (the calculation is (384+64)/(711-168)=83%, about the same as the 90% reported in the paper)
|
||||
--> around 80Pins are common I/Os.
|
||||
Common I/Os consumes 71mW/Gb/s according to Cadence ChipEstimate @65nm
|
||||
Niagara 2 I/O clock is 1/4 of core clock. --> 87pin (<--((711-168)*17%)) * 71mW/Gb/s *0.25*1.4Ghz = 2.17W
|
||||
|
||||
Total dynamic power of FBDIMM, NIC, PCIe = 84*0.132 + 84*0.049*0.132 = 11.14 - 2.17 = 8.98
|
||||
Further, if assuming I/O logic power is about 50% of I/Os then Total energy of FBDIMM, NIC, PCIe = 11.14 - 2.17*1.5 = 7.89
|
||||
*/
|
||||
|
||||
/*
|
||||
 * A bug in Cadence ChipEstimator: after updating the clock rate in the clock tab, a user
|
||||
 * needs to re-select the IP clock (the same clk) and then click Estimate. If it is not re-selected,
|
||||
 * the new clock rate may not be propagated into the IPs.
|
||||
*
|
||||
*/
|
||||
|
||||
/**
 * Build the network interface unit (NIU) model.
 *
 * Area, per-cycle dynamic energy and leakage are derived from fixed 65nm
 * reference figures (see inline comments) and scaled by feature size and
 * supply voltage. niup.type == 0 selects the high-performance figures;
 * any other value selects the low-power figures.
 *
 * @param XML_interface Parsed configuration; read by set_niu_param().
 * @param interface_ip_ Technology / interface parameters; copied into
 *                      interface_ip.
 */
NIUController::NIUController(ParseXML *XML_interface,InputParameter* interface_ip_)
    : XML(XML_interface),
      interface_ip(*interface_ip_)
{
    local_result = init_interface(&interface_ip);

    double frontend_area, mac_area, SerDer_area;
    double frontend_dyn, mac_dyn, SerDer_dyn;
    double frontend_gates, mac_gates, SerDer_gates;
    double NMOS_sizing, PMOS_sizing;
    const double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();

    set_niu_param();

    // Linear feature-size ratio relative to the 65nm reference node.
    const double f_ratio = interface_ip.F_sz_um / 0.065;
    const double vdd = g_tp.peri_global.Vdd;

    if (niup.type == 0) { // high performance NIU
        // Area estimation based on average of die photo from Niagara 2 and
        // Cadence ChipEstimate using 65nm.
        mac_area = (1.53 + 0.3) / 2 * f_ratio * f_ratio;
        // Area estimation based on average of die photo from Niagara 2,
        // ISSCC "An 800mW 10Gb Ethernet Transceiver in 0.13um CMOS" and
        // "A 1.2-V-Only 900-mW 10 Gb Ethernet Transceiver and XAUI
        // Interface With Robust VCO Tuning Technique". Frontend is PCS.
        frontend_area = (9.8 + (6 + 18)*65/130*65/130) / 3 * f_ratio * f_ratio;
        // Area estimation based on average of die photo from Niagara 2 and
        // Cadence ChipEstimate hard IP @65nm. SerDes is very hard to
        // scale, hence linear rather than quadratic scaling.
        SerDer_area = (1.39 + 0.36) * f_ratio;

        // Power
        // Cadence ChipEstimate using 65nm (mac, front_end are all energy.
        // E=P*T = P/F = 1.37/1Ghz = 1.37e-9);
        // 2.19W@1GHz fully active according to Cadence ChipEstimate @65nm
        mac_dyn = 2.19e-9*vdd/1.1*vdd/1.1*(interface_ip.F_sz_nm/65.0);
        // Cadence ChipEstimate using 65nm soft IP;
        frontend_dyn = 0.27e-9*vdd/1.1*vdd/1.1*(interface_ip.F_sz_nm/65.0);
        // According to "A 100mW 9.6Gb/s Transceiver in 90nm CMOS..."
        // ISSCC 2006. SerDer_dyn is power not energy, scaling from
        // 10mW/Gb/s @90nm.
        SerDer_dyn = 0.01*10*sqrt(interface_ip.F_sz_um/0.09)*vdd/1.2*vdd/1.2;

        // Cadence ChipEstimate using 65nm
        mac_gates = 111700;
        frontend_gates = 320000;
        SerDer_gates = 200000;
        NMOS_sizing = 5*g_tp.min_w_nmos_;
        PMOS_sizing = 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
    } else {
        // Low power implementations are mostly from Cadence ChipEstimator.
        // When there are multiple IPs (same kind or not) selected, Cadence
        // ChipEstimator results are not a simple summation of all IPs;
        // this effect is ignored.
        mac_area = 0.24 * f_ratio * f_ratio;
        frontend_area = 0.1 * f_ratio * f_ratio; // Frontend is the PCS layer
        // Comparing the 130nm implementation in "A 1.2-V-Only 900-mW 10 Gb
        // Ethernet Transceiver and XAUI Interface With Robust VCO Tuning
        // Technique" with the ChipEstimator XAUI PHY hard IP confirms that
        // even the PHY can scale perfectly with the technology.
        SerDer_area = 0.35 * f_ratio * f_ratio;

        // Power
        // Cadence ChipEstimate using 65nm (mac, front_end are all energy);
        // same form as the high-performance case with low-power
        // coefficients.
        mac_dyn = 1.257e-9*vdd/1.1*vdd/1.1*(interface_ip.F_sz_nm/65.0);
        // Cadence ChipEstimate using 65nm soft IP;
        frontend_dyn = 0.6e-9*vdd/1.1*vdd/1.1*(interface_ip.F_sz_nm/65.0);
        // SerDer_dyn is power not energy, scaling from 216mW/10Gb/s @130nm
        SerDer_dyn = 0.0216*10*(interface_ip.F_sz_um/0.13)*vdd/1.2*vdd/1.2;

        mac_gates = 111700;
        frontend_gates = 52000;
        SerDer_gates = 199260;
        NMOS_sizing = g_tp.min_w_nmos_;
        PMOS_sizing = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
    }

    // Convert SerDes power to energy per clock cycle of the whole NIU.
    SerDer_dyn /= niup.clockRate;

    // Total area; the 1e6 factor is undone (1e-6) when displayEnergy()
    // prints mm^2.
    area.set_area((mac_area + frontend_area + SerDer_area)*1e6);

    power_t.readOp.dynamic = mac_dyn + frontend_dyn + SerDer_dyn;

    // Fix: leakage previously summed frontend_gates twice and never used
    // SerDer_gates, so the SerDes gate count was ignored.
    const double total_gates = mac_gates + frontend_gates + SerDer_gates;

    power_t.readOp.leakage =
        total_gates*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*vdd; // unit W
    double long_channel_device_reduction =
        longer_channel_device_reduction(Uncore_device);
    power_t.readOp.longer_channel_leakage =
        power_t.readOp.leakage * long_channel_device_reduction;
    power_t.readOp.gate_leakage =
        total_gates*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*vdd; // unit W
}
|
||||
|
||||
void NIUController::computeEnergy(bool is_tdp)
|
||||
{
|
||||
if (is_tdp)
|
||||
{
|
||||
|
||||
|
||||
power = power_t;
|
||||
power.readOp.dynamic *= niup.duty_cycle;
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
rt_power = power_t;
|
||||
rt_power.readOp.dynamic *= niup.perc_load;
|
||||
}
|
||||
}
|
||||
|
||||
void NIUController::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
|
||||
{
|
||||
string indent_str(indent, ' ');
|
||||
string indent_str_next(indent+2, ' ');
|
||||
bool long_channel = XML->sys.longer_channel_device;
|
||||
|
||||
if (is_tdp)
|
||||
{
|
||||
cout << "NIU:" << endl;
|
||||
cout << indent_str<< "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic*niup.clockRate << " W" << endl;
|
||||
cout << indent_str<< "Subthreshold Leakage = "
|
||||
<< (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
|
||||
//cout << indent_str<< "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl;
|
||||
cout << indent_str<< "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic*niup.clockRate << " W" << endl;
|
||||
cout<<endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void NIUController::set_niu_param()
|
||||
{
|
||||
niup.clockRate = XML->sys.niu.clockrate;
|
||||
niup.clockRate *= 1e6;
|
||||
niup.num_units = XML->sys.niu.number_units;
|
||||
niup.duty_cycle = XML->sys.niu.duty_cycle;
|
||||
niup.perc_load = XML->sys.niu.total_load_perc;
|
||||
niup.type = XML->sys.niu.type;
|
||||
// niup.executionTime = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
|
||||
}
|
||||
|
||||
/**
 * Build the PCIe controller model.
 *
 * PCIe is assumed to be a bit-slice architecture: the reference figures
 * are for an 8-lane block, so area, energy and gate counts are divided by
 * 8 to get per-lane numbers and then multiplied by pciep.num_channels.
 * pciep.type == 0 selects the high-performance figures; any other value
 * selects the low-power figures. The SerDes (PHY) contribution is added
 * only when pciep.withPHY is set.
 *
 * @param XML_interface Parsed configuration; read by set_pcie_param().
 * @param interface_ip_ Technology / interface parameters; copied into
 *                      interface_ip.
 */
PCIeController::PCIeController(ParseXML *XML_interface,InputParameter* interface_ip_)
    : XML(XML_interface),
      interface_ip(*interface_ip_)
{
    local_result = init_interface(&interface_ip);

    double ctrl_area, SerDer_area;
    double ctrl_dyn, SerDer_dyn;
    double ctrl_gates, SerDer_gates;
    double NMOS_sizing, PMOS_sizing;
    const double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();

    set_pcie_param();

    // Linear feature-size ratio relative to the 65nm reference node.
    const double f_ratio = interface_ip.F_sz_um / 0.065;
    const double vdd = g_tp.peri_global.Vdd;

    if (pciep.type == 0) { // high performance PCIe
        // Area estimation based on average of die photo from Niagara 2 and
        // Cadence ChipEstimate @ 65nm.
        ctrl_area = (5.2 + 0.5) / 2 * f_ratio * f_ratio;
        // Area estimation based on average of die photo from Niagara 2 and
        // Cadence ChipEstimate hard IP @65nm. SerDes is very hard to
        // scale, hence linear rather than quadratic scaling.
        // (A separate front-end estimate, (5.2 + 0.1)/2 scaled
        // quadratically, was computed here originally but never used.)
        SerDer_area = (3.03 + 0.36) * f_ratio;

        // Power
        // Cadence ChipEstimate using 65nm; the controller includes
        // everything: the PHY, the data link and transaction layer.
        ctrl_dyn = 3.75e-9/8*vdd/1.1*vdd/1.1*(interface_ip.F_sz_nm/65.0);

        // Cadence ChipEstimate using 65nm
        ctrl_gates = 900000/8*pciep.num_channels;
        // Fix: SerDer_gates was never assigned on this path (the
        // assignment was commented out) yet is read below when withPHY is
        // set, i.e. an uninitialised read. Use the same per-lane figure as
        // the low-power path.
        // NOTE(review): confirm 200000 gates per 8 lanes is the intended
        // high-performance SerDes gate count.
        SerDer_gates = 200000/8*pciep.num_channels;
        NMOS_sizing = 5*g_tp.min_w_nmos_;
        PMOS_sizing = 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
    } else {
        ctrl_area = 0.412 * f_ratio * f_ratio;
        // Area estimation based on average of die photo from Niagara 2 and
        // Cadence ChipEstimate @ 65nm.
        SerDer_area = 0.36 * f_ratio * f_ratio;

        // Power
        // Cadence ChipEstimate using 65nm; the controller includes
        // everything: the PHY, the data link and transaction layer.
        ctrl_dyn = 2.21e-9/8*vdd/1.1*vdd/1.1*(interface_ip.F_sz_nm/65.0);

        // Cadence ChipEstimate using 65nm
        ctrl_gates = 200000/8*pciep.num_channels;
        SerDer_gates = 200000/8*pciep.num_channels;
        NMOS_sizing = g_tp.min_w_nmos_;
        PMOS_sizing = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
    }

    // SerDer_dyn is power not energy, scaling from 10mW/Gb/s @90nm;
    // PCIe 2.0 max per lane speed is 4Gb/s. Identical for both types.
    SerDer_dyn = 0.01*4*(interface_ip.F_sz_um/0.09)*vdd/1.2*vdd/1.2;
    SerDer_dyn /= pciep.clockRate; // convert to energy per clock cycle

    area.set_area(((ctrl_area + (pciep.withPHY ? SerDer_area : 0))/8*pciep.num_channels)*1e6);
    power_t.readOp.dynamic =
        (ctrl_dyn + (pciep.withPHY ? SerDer_dyn : 0))*pciep.num_channels;
    power_t.readOp.leakage =
        (ctrl_gates + (pciep.withPHY ? SerDer_gates : 0))*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*vdd; // unit W
    double long_channel_device_reduction =
        longer_channel_device_reduction(Uncore_device);
    power_t.readOp.longer_channel_leakage =
        power_t.readOp.leakage * long_channel_device_reduction;
    power_t.readOp.gate_leakage =
        (ctrl_gates + (pciep.withPHY ? SerDer_gates : 0))*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*vdd; // unit W
}
|
||||
|
||||
void PCIeController::computeEnergy(bool is_tdp)
|
||||
{
|
||||
if (is_tdp)
|
||||
{
|
||||
|
||||
|
||||
power = power_t;
|
||||
power.readOp.dynamic *= pciep.duty_cycle;
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
rt_power = power_t;
|
||||
rt_power.readOp.dynamic *= pciep.perc_load;
|
||||
}
|
||||
}
|
||||
|
||||
void PCIeController::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
|
||||
{
|
||||
string indent_str(indent, ' ');
|
||||
string indent_str_next(indent+2, ' ');
|
||||
bool long_channel = XML->sys.longer_channel_device;
|
||||
|
||||
if (is_tdp)
|
||||
{
|
||||
cout << "PCIe:" << endl;
|
||||
cout << indent_str<< "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic*pciep.clockRate << " W" << endl;
|
||||
cout << indent_str<< "Subthreshold Leakage = "
|
||||
<< (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
|
||||
//cout << indent_str<< "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl;
|
||||
cout << indent_str<< "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic*pciep.clockRate << " W" << endl;
|
||||
cout<<endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void PCIeController::set_pcie_param()
|
||||
{
|
||||
pciep.clockRate = XML->sys.pcie.clockrate;
|
||||
pciep.clockRate *= 1e6;
|
||||
pciep.num_units = XML->sys.pcie.number_units;
|
||||
pciep.num_channels = XML->sys.pcie.num_channels;
|
||||
pciep.duty_cycle = XML->sys.pcie.duty_cycle;
|
||||
pciep.perc_load = XML->sys.pcie.total_load_perc;
|
||||
pciep.type = XML->sys.pcie.type;
|
||||
pciep.withPHY = XML->sys.pcie.withPHY;
|
||||
// pciep.executionTime = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
|
||||
|
||||
}
|
||||
|
||||
/**
 * Build the flash controller model.
 *
 * Only the low-power implementation is supported: when fcp.type == 0 a
 * message is printed and the process exits. Unlike the NIU and PCIe
 * models, the dynamic figures here are power, not energy per cycle.
 * Additional channels beyond the first each add 20% of the single-channel
 * area, power and gate count.
 *
 * @param XML_interface Parsed configuration; read by set_fc_param().
 * @param interface_ip_ Technology / interface parameters; copied into
 *                      interface_ip.
 */
FlashController::FlashController(ParseXML *XML_interface,InputParameter* interface_ip_)
    : XML(XML_interface),
      interface_ip(*interface_ip_)
{
    local_result = init_interface(&interface_ip);

    set_fc_param();

    if (fcp.type == 0) { // high performance flash controller
        cout<<"Current McPAT does not support high performance flash contorller since even low power designs are enough for maintain throughput"<<endl;
        // NOTE(review): exits with status 0 although this is an
        // unsupported configuration; kept as-is so callers that check the
        // exit status see no change.
        exit(0);
    }

    // Low-power implementation (the only one modelled). Statements that
    // followed exit(0) in the original were unreachable and are dropped,
    // along with locals that were never used.
    const double f_ratio = interface_ip.F_sz_um / 0.065;
    const double vdd = g_tp.peri_global.Vdd;
    const double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();

    // Area estimation based on Cadence ChipEstimate @ 65nm:
    // NANDFLASH-CTRL from CAST.
    const double ctrl_area = 0.243 * f_ratio * f_ratio;
    // Based on PCIe PHY TSMC65GP from Cadence ChipEstimate @ 65nm; it
    // supports 8 lanes with each lane speed up to 250MB/s (PCIe 1.1x),
    // which already saturates the 200MB/s of the flash controller core
    // above. Hence one lane (/8) of the PHY is used.
    const double SerDer_area = 0.36/8 * f_ratio * f_ratio;

    const double ctrl_gates = 129267;
    const double SerDer_gates = 200000/8;
    const double NMOS_sizing = g_tp.min_w_nmos_;
    const double PMOS_sizing = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;

    // Power
    // Cadence ChipEstimate using 65nm: the controller draws 125mW for
    // every 200MB/s. This is power not energy!
    const double ctrl_dyn =
        0.125*vdd/1.1*vdd/1.1*(interface_ip.F_sz_nm/65.0);
    // SerDer_dyn is power not energy, scaling from 10mW/Gb/s @90nm;
    // max per-controller speed is 1.6Gb/s (200MB/s).
    const double SerDer_dyn =
        0.01*1.6*(interface_ip.F_sz_um/0.09)*vdd/1.2*vdd/1.2;

    // Effective channel multiplier: the first channel counts fully, each
    // further channel adds 20%.
    double number_channel = 1+(fcp.num_channels-1)*0.2;

    area.set_area((ctrl_area + (fcp.withPHY ? SerDer_area : 0))*1e6*number_channel);
    power_t.readOp.dynamic =
        (ctrl_dyn + (fcp.withPHY ? SerDer_dyn : 0))*number_channel;
    power_t.readOp.leakage =
        ((ctrl_gates + (fcp.withPHY ? SerDer_gates : 0))*number_channel)*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*vdd; // unit W
    double long_channel_device_reduction =
        longer_channel_device_reduction(Uncore_device);
    power_t.readOp.longer_channel_leakage =
        power_t.readOp.leakage * long_channel_device_reduction;
    power_t.readOp.gate_leakage =
        ((ctrl_gates + (fcp.withPHY ? SerDer_gates : 0))*number_channel)*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*vdd; // unit W
}
|
||||
|
||||
void FlashController::computeEnergy(bool is_tdp)
|
||||
{
|
||||
if (is_tdp)
|
||||
{
|
||||
|
||||
|
||||
power = power_t;
|
||||
power.readOp.dynamic *= fcp.duty_cycle;
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
rt_power = power_t;
|
||||
rt_power.readOp.dynamic *= fcp.perc_load;
|
||||
}
|
||||
}
|
||||
|
||||
void FlashController::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
|
||||
{
|
||||
string indent_str(indent, ' ');
|
||||
string indent_str_next(indent+2, ' ');
|
||||
bool long_channel = XML->sys.longer_channel_device;
|
||||
|
||||
if (is_tdp)
|
||||
{
|
||||
cout << "Flash Controller:" << endl;
|
||||
cout << indent_str<< "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic << " W" << endl;//no multiply of clock since this is power already
|
||||
cout << indent_str<< "Subthreshold Leakage = "
|
||||
<< (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
|
||||
//cout << indent_str<< "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl;
|
||||
cout << indent_str<< "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic << " W" << endl;
|
||||
cout<<endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void FlashController::set_fc_param()
|
||||
{
|
||||
// fcp.clockRate = XML->sys.flashc.mc_clock;
|
||||
// fcp.clockRate *= 1e6;
|
||||
fcp.peakDataTransferRate = XML->sys.flashc.peak_transfer_rate;
|
||||
fcp.num_channels = ceil(fcp.peakDataTransferRate/200);
|
||||
fcp.num_mcs = XML->sys.flashc.number_mcs;
|
||||
fcp.duty_cycle = XML->sys.flashc.duty_cycle;
|
||||
fcp.perc_load = XML->sys.flashc.total_load_perc;
|
||||
fcp.type = XML->sys.flashc.type;
|
||||
fcp.withPHY = XML->sys.flashc.withPHY;
|
||||
// flashcp.executionTime = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
|
||||
|
||||
}
|
87
ext/mcpat/iocontrollers.h
Normal file
87
ext/mcpat/iocontrollers.h
Normal file
|
@ -0,0 +1,87 @@
|
|||
/*****************************************************************************
|
||||
* McPAT
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
#ifndef IOCONTROLLERS_H_
#define IOCONTROLLERS_H_

// The include guard now wraps the whole header. Previously the #endif
// followed the #define immediately, so a second inclusion redefined every
// class below.

#include "XML_Parse.h"
#include "parameter.h"
//#include "io.h"
#include "array.h"
//#include "Undifferentiated_Core_Area.h"
#include <vector>

#include "basic_components.h"

// Network interface unit (NIU) controller component.
// Built from the parsed XML description and a CACTI-style input parameter
// block; exposes the usual computeEnergy/displayEnergy pair.
class NIUController : public Component {
  public:
    ParseXML *XML;              // parsed configuration (not owned, as far as this header shows)
    InputParameter interface_ip;
    NIUParam niup;              // NIU-specific parameters, filled by set_niu_param()
    powerDef power_t;           // template power used to derive reported power
    uca_org_t local_result;

    NIUController(ParseXML *XML_interface,InputParameter* interface_ip_);
    void set_niu_param();
    void computeEnergy(bool is_tdp=true);
    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
    ~NIUController() {}
};

// PCIe controller component; same interface shape as NIUController.
class PCIeController : public Component {
  public:
    ParseXML *XML;
    InputParameter interface_ip;
    PCIeParam pciep;            // PCIe-specific parameters, filled by set_pcie_param()
    powerDef power_t;
    uca_org_t local_result;

    PCIeController(ParseXML *XML_interface,InputParameter* interface_ip_);
    void set_pcie_param();
    void computeEnergy(bool is_tdp=true);
    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
    ~PCIeController() {}
};

// Flash controller component. Reuses MCParam for its parameter block.
class FlashController : public Component {
  public:
    ParseXML *XML;
    InputParameter interface_ip;
    MCParam fcp;                // flash controller parameters, filled by set_fc_param()
    powerDef power_t;
    uca_org_t local_result;

    FlashController(ParseXML *XML_interface,InputParameter* interface_ip_);
    void set_fc_param();
    void computeEnergy(bool is_tdp=true);
    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
    ~FlashController() {}
};

#endif /* IOCONTROLLERS_H_ */
|
||||
|
1014
ext/mcpat/logic.cc
Normal file
1014
ext/mcpat/logic.cc
Normal file
File diff suppressed because it is too large
Load diff
233
ext/mcpat/logic.h
Normal file
233
ext/mcpat/logic.h
Normal file
|
@ -0,0 +1,233 @@
|
|||
/*****************************************************************************
|
||||
* McPAT
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
#ifndef LOGIC_H_
|
||||
#define LOGIC_H_
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
|
||||
#include "XML_Parse.h"
|
||||
#include "arch_const.h"
|
||||
#include "basic_circuit.h"
|
||||
#include "basic_components.h"
|
||||
#include "cacti_interface.h"
|
||||
#include "component.h"
|
||||
#include "const.h"
|
||||
#include "decoder.h"
|
||||
#include "parameter.h"
|
||||
#include "xmlParser.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
class selection_logic : public Component{
|
||||
public:
|
||||
selection_logic(bool _is_default, int win_entries_,
|
||||
int issue_width_, const InputParameter *configure_interface,
|
||||
enum Device_ty device_ty_=Core_device,
|
||||
enum Core_type core_ty_=Inorder);//, const ParseXML *_XML_interface);
|
||||
bool is_default;
|
||||
InputParameter l_ip;
|
||||
uca_org_t local_result;
|
||||
const ParseXML *XML_interface;
|
||||
int win_entries;
|
||||
int issue_width;
|
||||
int num_threads;
|
||||
enum Device_ty device_ty;
|
||||
enum Core_type core_ty;
|
||||
|
||||
void selection_power();
|
||||
void leakage_feedback(double temperature); // TODO
|
||||
};
|
||||
|
||||
class dep_resource_conflict_check : public Component{
|
||||
public:
|
||||
dep_resource_conflict_check(const InputParameter *configure_interface, const CoreDynParam & dyn_p_, int compare_bits_, bool _is_default=true);
|
||||
InputParameter l_ip;
|
||||
uca_org_t local_result;
|
||||
double WNORn, WNORp, Wevalinvp, Wevalinvn, Wcompn, Wcompp, Wcomppreequ;
|
||||
CoreDynParam coredynp;
|
||||
int compare_bits;
|
||||
bool is_default;
|
||||
statsDef tdp_stats;
|
||||
statsDef rtp_stats;
|
||||
statsDef stats_t;
|
||||
powerDef power_t;
|
||||
|
||||
void conflict_check_power();
|
||||
double compare_cap();
|
||||
~dep_resource_conflict_check(){
|
||||
local_result.cleanup();
|
||||
}
|
||||
|
||||
void leakage_feedback(double temperature);
|
||||
};
|
||||
|
||||
class inst_decoder: public Component{
|
||||
public:
|
||||
inst_decoder(bool _is_default, const InputParameter *configure_interface,
|
||||
int opcode_length_,
|
||||
int num_decoders_,
|
||||
bool x86_,
|
||||
enum Device_ty device_ty_=Core_device,
|
||||
enum Core_type core_ty_=Inorder);
|
||||
inst_decoder();
|
||||
bool is_default;
|
||||
int opcode_length;
|
||||
int num_decoders;
|
||||
bool x86;
|
||||
int num_decoder_segments;
|
||||
int num_decoded_signals;
|
||||
InputParameter l_ip;
|
||||
uca_org_t local_result;
|
||||
enum Device_ty device_ty;
|
||||
enum Core_type core_ty;
|
||||
|
||||
Decoder * final_dec;
|
||||
Predec * pre_dec;
|
||||
|
||||
statsDef tdp_stats;
|
||||
statsDef rtp_stats;
|
||||
statsDef stats_t;
|
||||
powerDef power_t;
|
||||
void inst_decoder_delay_power();
|
||||
~inst_decoder();
|
||||
void leakage_feedback(double temperature);
|
||||
};
|
||||
|
||||
class DFFCell : public Component {
|
||||
public:
|
||||
DFFCell(bool _is_dram, double _WdecNANDn, double _WdecNANDp,double _cell_load,
|
||||
const InputParameter *configure_interface);
|
||||
InputParameter l_ip;
|
||||
bool is_dram;
|
||||
double cell_load;
|
||||
double WdecNANDn;
|
||||
double WdecNANDp;
|
||||
double clock_cap;
|
||||
int model;
|
||||
int n_switch;
|
||||
int n_keep_1;
|
||||
int n_keep_0;
|
||||
int n_clock;
|
||||
powerDef e_switch;
|
||||
powerDef e_keep_1;
|
||||
powerDef e_keep_0;
|
||||
powerDef e_clock;
|
||||
|
||||
double fpfp_node_cap(unsigned int fan_in, unsigned int fan_out);
|
||||
void compute_DFF_cell(void);
|
||||
};
|
||||
|
||||
class Pipeline : public Component{
|
||||
public:
|
||||
Pipeline(const InputParameter *configure_interface, const CoreDynParam & dyn_p_, enum Device_ty device_ty_=Core_device, bool _is_core_pipeline=true, bool _is_default=true);
|
||||
InputParameter l_ip;
|
||||
uca_org_t local_result;
|
||||
CoreDynParam coredynp;
|
||||
enum Device_ty device_ty;
|
||||
bool is_core_pipeline, is_default;
|
||||
double num_piperegs;
|
||||
// int pipeline_stages;
|
||||
// int tot_stage_vector, per_stage_vector;
|
||||
bool process_ind;
|
||||
double WNANDn ;
|
||||
double WNANDp;
|
||||
double load_per_pipeline_stage;
|
||||
// int Hthread, num_thread, fetchWidth, decodeWidth, issueWidth, commitWidth, instruction_length;
|
||||
// int PC_width, opcode_length, num_arch_reg_tag, data_width,num_phsical_reg_tag, address_width;
|
||||
// bool thread_clock_gated;
|
||||
// bool in_order, multithreaded;
|
||||
void compute_stage_vector();
|
||||
void compute();
|
||||
~Pipeline(){
|
||||
local_result.cleanup();
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
//class core_pipeline :public pipeline{
|
||||
//public:
|
||||
// int Hthread, num_thread, fetchWidth, decodeWidth, issueWidth, commitWidth, instruction_length;
|
||||
// int PC_width, opcode_length, num_arch_reg_tag, data_width,num_phsical_reg_tag, address_width;
|
||||
// bool thread_clock_gated;
|
||||
// bool in_order, multithreaded;
|
||||
// core_pipeline(bool _is_default, const InputParameter *configure_interface);
|
||||
// virtual void compute_stage_vector();
|
||||
//
|
||||
//};
|
||||
|
||||
class FunctionalUnit :public Component{
|
||||
public:
|
||||
ParseXML *XML;
|
||||
int ithCore;
|
||||
InputParameter interface_ip;
|
||||
CoreDynParam coredynp;
|
||||
double FU_height;
|
||||
double clockRate,executionTime;
|
||||
double num_fu;
|
||||
double energy, base_energy,per_access_energy, leakage, gate_leakage;
|
||||
bool is_default;
|
||||
enum FU_type fu_type;
|
||||
statsDef tdp_stats;
|
||||
statsDef rtp_stats;
|
||||
statsDef stats_t;
|
||||
powerDef power_t;
|
||||
|
||||
FunctionalUnit(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, enum FU_type fu_type);
|
||||
void computeEnergy(bool is_tdp=true);
|
||||
void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
|
||||
void leakage_feedback(double temperature);
|
||||
|
||||
};
|
||||
|
||||
class UndiffCore :public Component{
|
||||
public:
|
||||
UndiffCore(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_, bool exist_=true, bool embedded_=false);
|
||||
ParseXML *XML;
|
||||
int ithCore;
|
||||
InputParameter interface_ip;
|
||||
CoreDynParam coredynp;
|
||||
double clockRate,executionTime;
|
||||
double scktRatio, chip_PR_overhead, macro_PR_overhead;
|
||||
enum Core_type core_ty;
|
||||
bool opt_performance, embedded;
|
||||
double pipeline_stage,num_hthreads,issue_width;
|
||||
bool is_default;
|
||||
|
||||
void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
|
||||
~UndiffCore(){};
|
||||
bool exist;
|
||||
|
||||
|
||||
};
|
||||
#endif /* LOGIC_H_ */
|
101
ext/mcpat/main.cc
Normal file
101
ext/mcpat/main.cc
Normal file
|
@ -0,0 +1,101 @@
|
|||
/*****************************************************************************
|
||||
* McPAT
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
#include <iostream>
|
||||
|
||||
#include "XML_Parse.h"
|
||||
#include "globalvar.h"
|
||||
#include "io.h"
|
||||
#include "processor.h"
|
||||
#include "version.h"
|
||||
#include "xmlParser.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
void print_usage(char * argv0);
|
||||
|
||||
int main(int argc,char *argv[])
|
||||
{
|
||||
char * fb ;
|
||||
bool infile_specified = false;
|
||||
int plevel = 2;
|
||||
opt_for_clk =true;
|
||||
//cout.precision(10);
|
||||
if (argc <= 1 || argv[1] == string("-h") || argv[1] == string("--help"))
|
||||
{
|
||||
print_usage(argv[0]);
|
||||
}
|
||||
|
||||
for (int32_t i = 0; i < argc; i++)
|
||||
{
|
||||
if (argv[i] == string("-infile"))
|
||||
{
|
||||
infile_specified = true;
|
||||
i++;
|
||||
fb = argv[ i];
|
||||
}
|
||||
|
||||
if (argv[i] == string("-print_level"))
|
||||
{
|
||||
i++;
|
||||
plevel = atoi(argv[i]);
|
||||
}
|
||||
|
||||
if (argv[i] == string("-opt_for_clk"))
|
||||
{
|
||||
i++;
|
||||
opt_for_clk = (bool)atoi(argv[i]);
|
||||
}
|
||||
}
|
||||
if (infile_specified == false)
|
||||
{
|
||||
print_usage(argv[0]);
|
||||
}
|
||||
|
||||
|
||||
cout<<"McPAT (version "<< VER_MAJOR <<"."<< VER_MINOR
|
||||
<< " of " << VER_UPDATE << ") is computing the target processor...\n "<<endl;
|
||||
|
||||
//parse XML-based interface
|
||||
ParseXML *p1= new ParseXML();
|
||||
p1->parse(fb);
|
||||
Processor proc(p1);
|
||||
proc.displayEnergy(2, plevel);
|
||||
delete p1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void print_usage(char * argv0)
|
||||
{
|
||||
cerr << "How to use McPAT:" << endl;
|
||||
cerr << " mcpat -infile <input file name> -print_level < level of details 0~5 > -opt_for_clk < 0 (optimize for ED^2P only)/1 (optimzed for target clock rate)>"<< endl;
|
||||
//cerr << " Note:default print level is at processor level, please increase it to see the details" << endl;
|
||||
exit(1);
|
||||
}
|
28
ext/mcpat/makefile
Normal file
28
ext/mcpat/makefile
Normal file
|
@ -0,0 +1,28 @@
|
|||
# Top-level driver: selects a debug or optimised build and delegates the real
# work to $(TAR).mk with TAG set accordingly.
TAR = mcpat

# `all` is listed too so a stray file named "all" cannot mask the default goal.
.PHONY: all dbg opt depend clean clean_dbg clean_opt

all: opt

dbg: $(TAR).mk obj_dbg
	@$(MAKE) TAG=dbg -C . -f $(TAR).mk

opt: $(TAR).mk obj_opt
	@$(MAKE) TAG=opt -C . -f $(TAR).mk

# Object directories; -p keeps the rule harmless if the directory appears
# between make's check and the recipe (e.g. parallel builds).
obj_dbg:
	mkdir -p $@

obj_opt:
	mkdir -p $@

clean: clean_dbg clean_opt

clean_dbg: obj_dbg
	@$(MAKE) TAG=dbg -C . -f $(TAR).mk clean
	rm -rf $<

clean_opt: obj_opt
	@$(MAKE) TAG=opt -C . -f $(TAR).mk clean
	rm -rf $<
|
||||
|
81
ext/mcpat/mcpat.mk
Normal file
81
ext/mcpat/mcpat.mk
Normal file
|
@ -0,0 +1,81 @@
|
|||
TARGET = mcpat
|
||||
SHELL = /bin/sh
|
||||
.PHONY: all depend clean
|
||||
.SUFFIXES: .cc .o
|
||||
|
||||
ifndef NTHREADS
|
||||
NTHREADS = 4
|
||||
endif
|
||||
|
||||
|
||||
LIBS =
|
||||
INCS = -lm
|
||||
|
||||
ifeq ($(TAG),dbg)
|
||||
DBG = -Wall
|
||||
OPT = -ggdb -g -O0 -DNTHREADS=1 -Icacti
|
||||
else
|
||||
DBG =
|
||||
OPT = -O3 -msse2 -mfpmath=sse -DNTHREADS=$(NTHREADS) -Icacti
|
||||
#OPT = -O0 -DNTHREADS=$(NTHREADS)
|
||||
endif
|
||||
|
||||
#CXXFLAGS = -Wall -Wno-unknown-pragmas -Winline $(DBG) $(OPT)
|
||||
CXXFLAGS = -Wno-unknown-pragmas $(DBG) $(OPT)
|
||||
CXX = g++ -m32
|
||||
CC = gcc -m32
|
||||
|
||||
VPATH = cacti
|
||||
|
||||
SRCS = \
|
||||
Ucache.cc \
|
||||
XML_Parse.cc \
|
||||
arbiter.cc \
|
||||
area.cc \
|
||||
array.cc \
|
||||
bank.cc \
|
||||
basic_circuit.cc \
|
||||
basic_components.cc \
|
||||
cacti_interface.cc \
|
||||
component.cc \
|
||||
core.cc \
|
||||
crossbar.cc \
|
||||
decoder.cc \
|
||||
htree2.cc \
|
||||
interconnect.cc \
|
||||
io.cc \
|
||||
iocontrollers.cc \
|
||||
logic.cc \
|
||||
main.cc \
|
||||
mat.cc \
|
||||
memoryctrl.cc \
|
||||
noc.cc \
|
||||
nuca.cc \
|
||||
parameter.cc \
|
||||
processor.cc \
|
||||
router.cc \
|
||||
sharedcache.cc \
|
||||
subarray.cc \
|
||||
technology.cc \
|
||||
uca.cc \
|
||||
wire.cc \
|
||||
xmlParser.cc
|
||||
|
||||
OBJS = $(patsubst %.cc,obj_$(TAG)/%.o,$(SRCS))
|
||||
|
||||
all: obj_$(TAG)/$(TARGET)
|
||||
cp -f obj_$(TAG)/$(TARGET) $(TARGET)
|
||||
|
||||
obj_$(TAG)/$(TARGET) : $(OBJS)
|
||||
$(CXX) $(OBJS) -o $@ $(INCS) $(CXXFLAGS) $(LIBS) -pthread
|
||||
|
||||
#obj_$(TAG)/%.o : %.cc
|
||||
# $(CXX) -c $(CXXFLAGS) $(INCS) -o $@ $<
|
||||
|
||||
obj_$(TAG)/%.o : %.cc
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
# Objects and the linked binary are written to obj_$(TAG)/, so remove them
# there as well as the copied binary and any stray objects in this directory.
clean:
	-rm -f *.o obj_$(TAG)/*.o obj_$(TAG)/$(TARGET) $(TARGET)
|
||||
|
||||
|
81
ext/mcpat/mcpatXeonCore.mk
Normal file
81
ext/mcpat/mcpatXeonCore.mk
Normal file
|
@ -0,0 +1,81 @@
|
|||
TARGET = mcpatXeonCore
|
||||
SHELL = /bin/sh
|
||||
.PHONY: all depend clean
|
||||
.SUFFIXES: .cc .o
|
||||
|
||||
ifndef NTHREADS
|
||||
NTHREADS = 4
|
||||
endif
|
||||
|
||||
|
||||
LIBS =
|
||||
INCS = -lm
|
||||
|
||||
ifeq ($(TAG),dbg)
|
||||
DBG = -Wall
|
||||
OPT = -ggdb -g -O0 -DNTHREADS=1 -Icacti
|
||||
else
|
||||
DBG =
|
||||
OPT = -O3 -msse2 -mfpmath=sse -DNTHREADS=$(NTHREADS) -Icacti
|
||||
#OPT = -O0 -DNTHREADS=$(NTHREADS)
|
||||
endif
|
||||
|
||||
#CXXFLAGS = -Wall -Wno-unknown-pragmas -Winline $(DBG) $(OPT)
|
||||
CXXFLAGS = -Wno-unknown-pragmas $(DBG) $(OPT)
|
||||
CXX = g++ -m32
|
||||
CC = gcc -m32
|
||||
|
||||
VPATH = cacti
|
||||
|
||||
SRCS = \
|
||||
Ucache.cc \
|
||||
XML_Parse.cc \
|
||||
arbiter.cc \
|
||||
area.cc \
|
||||
array.cc \
|
||||
bank.cc \
|
||||
basic_circuit.cc \
|
||||
basic_components.cc \
|
||||
cacti_interface.cc \
|
||||
component.cc \
|
||||
core.cc \
|
||||
crossbar.cc \
|
||||
decoder.cc \
|
||||
htree2.cc \
|
||||
interconnect.cc \
|
||||
io.cc \
|
||||
iocontrollers.cc \
|
||||
logic.cc \
|
||||
main.cc \
|
||||
mat.cc \
|
||||
memoryctrl.cc \
|
||||
noc.cc \
|
||||
nuca.cc \
|
||||
parameter.cc \
|
||||
processor.cc \
|
||||
router.cc \
|
||||
sharedcache.cc \
|
||||
subarray.cc \
|
||||
technology_xeon_core.cc \
|
||||
uca.cc \
|
||||
wire.cc \
|
||||
xmlParser.cc
|
||||
|
||||
OBJS = $(patsubst %.cc,obj_$(TAG)/%.o,$(SRCS))
|
||||
|
||||
all: obj_$(TAG)/$(TARGET)
|
||||
cp -f obj_$(TAG)/$(TARGET) $(TARGET)
|
||||
|
||||
obj_$(TAG)/$(TARGET) : $(OBJS)
|
||||
$(CXX) $(OBJS) -o $@ $(INCS) $(CXXFLAGS) $(LIBS) -pthread
|
||||
|
||||
#obj_$(TAG)/%.o : %.cc
|
||||
# $(CXX) -c $(CXXFLAGS) $(INCS) -o $@ $<
|
||||
|
||||
obj_$(TAG)/%.o : %.cc
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
# Objects and the linked binary are written to obj_$(TAG)/, so remove them
# there as well as the copied binary and any stray objects in this directory.
clean:
	-rm -f *.o obj_$(TAG)/*.o obj_$(TAG)/$(TARGET) $(TARGET)
|
||||
|
||||
|
736
ext/mcpat/memoryctrl.cc
Normal file
736
ext/mcpat/memoryctrl.cc
Normal file
|
@ -0,0 +1,736 @@
|
|||
/*****************************************************************************
|
||||
* McPAT
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include "XML_Parse.h"
|
||||
#include "basic_circuit.h"
|
||||
#include "basic_components.h"
|
||||
#include "const.h"
|
||||
#include "io.h"
|
||||
#include "logic.h"
|
||||
#include "memoryctrl.h"
|
||||
#include "parameter.h"
|
||||
|
||||
/* overview of MC models:
|
||||
* McPAT memory controllers are modeled according to large number of industrial data points.
|
||||
* The basic memory controller architecture is based on the Synopsys designs
|
||||
* (DesignWare DDR2/DDR3-Lite memory controllers and DDR2/DDR3-Lite protocol controllers)
|
||||
* as in Cadence ChipEstimator Tool
|
||||
*
|
||||
* An MC has 3 parts as shown in this design. McPAT models both high performance MC
|
||||
* based on Niagara processor designs and curving and low power MC based on data points in
|
||||
* Cadence ChipEstimator Tool.
|
||||
*
|
||||
* The frontend is modeled analytically, the backend is modeled empirically according to
|
||||
* DDR2/DDR3-Lite protocol controllers in Cadence ChipEstimator Tool
|
||||
* The PHY is modeled based on
|
||||
* "A 100mW 9.6Gb/s Transceiver in 90nm CMOS for next-generation memory interfaces ," ISSCC 2006,
|
||||
* and "A 14mW 6.25Gb/s Transceiver in 90nm CMOS for Serial Chip-to-Chip Communication," ISSCC 2007
|
||||
*
|
||||
* In Cadence ChipEstimator Tool there are two types of memory controllers: the full memory controllers
|
||||
* that includes the frontend as the DesignWare DDR2/DDR3-Lite memory controllers and the backend only
|
||||
* memory controllers as the DDR2/DDR3-Lite protocol controllers (except DesignWare DDR2/DDR3-Lite memory
|
||||
* controllers, all memory controller IP in Cadence ChipEstimator Tool are backend memory controllers such as
|
||||
* DDRC 1600A and DDRC 800A). Thus, to some extent the area and power difference between DesignWare
|
||||
* DDR2/DDR3-Lite memory controllers and DDR2/DDR3-Lite protocol controllers can be an estimation to the
|
||||
* frontend power and area, which is very close to the analytically modeled results of the frontend for Niagara2@65nm
|
||||
*
|
||||
*/
|
||||
|
||||
// Builds the memory-controller backend model: takes private copies of the
// input parameter block and the controller parameters, initialises the
// interface from the copied parameters, then runs compute() immediately so
// area and template power are available once construction finishes.
MCBackend::MCBackend(InputParameter* interface_ip_, const MCParam & mcp_, enum MemoryCtrl_type mc_type_)
    : l_ip(*interface_ip_),
      mc_type(mc_type_),
      mcp(mcp_)
{
    local_result = init_interface(&l_ip);
    compute();
}
|
||||
|
||||
|
||||
void MCBackend::compute()
|
||||
{
|
||||
//double max_row_addr_width = 20.0;//Current address 12~18bits
|
||||
double C_MCB, mc_power, backend_dyn, backend_gates;//, refresh_period,refresh_freq;//Equivalent per bit Cap for backend,
|
||||
double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
|
||||
double NMOS_sizing, PMOS_sizing;
|
||||
|
||||
if (mc_type == MC)
|
||||
{
|
||||
if (mcp.type == 0)
|
||||
{
|
||||
//area = (2.2927*log(peakDataTransferRate)-14.504)*memDataWidth/144.0*(l_ip.F_sz_um/0.09);
|
||||
area.set_area((2.7927*log(mcp.peakDataTransferRate*2)-19.862)/2.0*mcp.dataBusWidth/128.0*(l_ip.F_sz_um/0.09)*mcp.num_channels*1e6);//um^2
|
||||
//assuming the approximately same scaling factor as seen in processors.
|
||||
//C_MCB=0.2/1.3/1.3/266/64/0.09*g_ip.F_sz_um;//based on AMD Geode processor which has a very basic mc on chip.
|
||||
//C_MCB = 1.6/200/1e6/144/1.2/1.2*g_ip.F_sz_um/0.19;//Based on Niagara power numbers.The base power (W) is divided by device frequency and vdd and scale to target process.
|
||||
//mc_power = 0.0291*2;//29.1mW@200MHz @130nm From Power Analysis of SystemLevel OnChip Communication Architectures by Lahiri et
|
||||
mc_power = 4.32*0.1;//4.32W@1GhzMHz @65nm Cadence ChipEstimator 10% for backend
|
||||
C_MCB = mc_power/1e9/72/1.1/1.1*l_ip.F_sz_um/0.065;
|
||||
power_t.readOp.dynamic = C_MCB*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(mcp.dataBusWidth/*+mcp.addressBusWidth*/);//per access energy in memory controller
|
||||
power_t.readOp.leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
|
||||
power_t.readOp.gate_leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
|
||||
|
||||
}
|
||||
else
|
||||
{ NMOS_sizing = g_tp.min_w_nmos_;
|
||||
PMOS_sizing = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
|
||||
area.set_area(0.15*mcp.dataBusWidth/72.0*(l_ip.F_sz_um/0.065)* (l_ip.F_sz_um/0.065)*mcp.num_channels*1e6);//um^2
|
||||
backend_dyn = 0.9e-9/800e6*mcp.clockRate/12800*mcp.peakDataTransferRate*mcp.dataBusWidth/72.0*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(l_ip.F_sz_nm/65.0);//Average on DDR2/3 protocol controller and DDRC 1600/800A in Cadence ChipEstimate
|
||||
//Scaling to technology and DIMM feature. The base IP support DDR3-1600(PC3 12800)
|
||||
backend_gates = 50000*mcp.dataBusWidth/64.0;//5000 is from Cadence ChipEstimator
|
||||
|
||||
power_t.readOp.dynamic = backend_dyn;
|
||||
power_t.readOp.leakage = (backend_gates)*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
|
||||
power_t.readOp.gate_leakage = (backend_gates)*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
|
||||
|
||||
}
|
||||
}
|
||||
else
|
||||
{//skip old model
|
||||
cout<<"Unknown memory controllers"<<endl;exit(0);
|
||||
area.set_area(0.243*mcp.dataBusWidth/8);//area based on Cadence ChipEstimator for 8bit bus
|
||||
//mc_power = 4.32*0.1;//4.32W@1GhzMHz @65nm Cadence ChipEstimator 10% for backend
|
||||
C_MCB = mc_power/1e9/72/1.1/1.1*l_ip.F_sz_um/0.065;
|
||||
power_t.readOp.leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
|
||||
power_t.readOp.gate_leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
|
||||
power_t.readOp.dynamic *= 1.2;
|
||||
power_t.readOp.leakage *= 1.2;
|
||||
power_t.readOp.gate_leakage *= 1.2;
|
||||
//flash controller has about 20% more backend power since BCH ECC in flash is complex and power hungry
|
||||
}
|
||||
double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device);
|
||||
power_t.readOp.longer_channel_leakage = power_t.readOp.leakage * long_channel_device_reduction;
|
||||
}
|
||||
|
||||
void MCBackend::computeEnergy(bool is_tdp)
|
||||
{
|
||||
//backend uses internal data buswidth
|
||||
if (is_tdp)
|
||||
{
|
||||
//init stats for Peak
|
||||
stats_t.readAc.access = 0.5*mcp.num_channels;
|
||||
stats_t.writeAc.access = 0.5*mcp.num_channels;
|
||||
tdp_stats = stats_t;
|
||||
}
|
||||
else
|
||||
{
|
||||
//init stats for runtime power (RTP)
|
||||
stats_t.readAc.access = mcp.reads;
|
||||
stats_t.writeAc.access = mcp.writes;
|
||||
tdp_stats = stats_t;
|
||||
}
|
||||
if (is_tdp)
|
||||
{
|
||||
power = power_t;
|
||||
power.readOp.dynamic = (stats_t.readAc.access + stats_t.writeAc.access)*power_t.readOp.dynamic;
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
rt_power.readOp.dynamic = (stats_t.readAc.access + stats_t.writeAc.access)*mcp.llcBlockSize*8.0/mcp.dataBusWidth*power_t.readOp.dynamic;
|
||||
rt_power = rt_power + power_t*pppm_lkg;
|
||||
rt_power.readOp.dynamic = rt_power.readOp.dynamic + power.readOp.dynamic*0.1*mcp.clockRate*mcp.num_mcs*mcp.executionTime;
|
||||
//Assume 10% of peak power is consumed by routine job including memory refreshing and scrubbing
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Builds the memory-controller PHY model: takes private copies of the input
// parameter block and the controller parameters, initialises the interface
// from the copied parameters, then runs compute() immediately.
MCPHY::MCPHY(InputParameter* interface_ip_, const MCParam & mcp_, enum MemoryCtrl_type mc_type_)
    : l_ip(*interface_ip_),
      mc_type(mc_type_),
      mcp(mcp_)
{
    local_result = init_interface(&l_ip);
    compute();
}
|
||||
|
||||
void MCPHY::compute()
|
||||
{
|
||||
//PHY uses internal data buswidth but the actuall off-chip datawidth is 64bits + ecc
|
||||
double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio() ;
|
||||
/*
|
||||
* according to "A 100mW 9.6Gb/s Transceiver in 90nm CMOS for next-generation memory interfaces ," ISSCC 2006;
|
||||
* From Cadence ChipEstimator for normal I/O around 0.4~0.8 mW/Gb/s
|
||||
*/
|
||||
double power_per_gb_per_s, phy_dyn,phy_gates, NMOS_sizing, PMOS_sizing;
|
||||
|
||||
if (mc_type == MC)
|
||||
{
|
||||
if (mcp.type == 0)
|
||||
{
|
||||
power_per_gb_per_s = mcp.LVDS? 0.01:0.04;
|
||||
//Based on die photos from Niagara 1 and 2.
|
||||
//TODO merge this into undifferentiated core.PHY only achieves square root of the ideal scaling.
|
||||
//area = (6.4323*log(peakDataTransferRate)-34.76)*memDataWidth/128.0*(l_ip.F_sz_um/0.09);
|
||||
area.set_area((6.4323*log(mcp.peakDataTransferRate*2)-48.134)*mcp.dataBusWidth/128.0*(l_ip.F_sz_um/0.09)*mcp.num_channels*1e6/2);//TODO:/2
|
||||
//This is from curve fitting based on Niagara 1 and 2's PHY die photo.
|
||||
//This is power not energy, 10mw/Gb/s @90nm for each channel and scaling down
|
||||
//power.readOp.dynamic = 0.02*memAccesses*llcBlocksize*8;//change from Bytes to bits.
|
||||
power_t.readOp.dynamic = power_per_gb_per_s*sqrt(l_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;
|
||||
power_t.readOp.leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
|
||||
power_t.readOp.gate_leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
NMOS_sizing = g_tp.min_w_nmos_;
|
||||
PMOS_sizing = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
|
||||
//Designware/synopsis 16bit DDR3 PHY is 1.3mm (WITH IOs) at 40nm for upto DDR3 2133 (PC3 17066)
|
||||
double non_IO_percentage = 0.2;
|
||||
area.set_area(1.3*non_IO_percentage/2133.0e6*mcp.clockRate/17066*mcp.peakDataTransferRate*mcp.dataBusWidth/16.0*(l_ip.F_sz_um/0.040)* (l_ip.F_sz_um/0.040)*mcp.num_channels*1e6);//um^2
|
||||
phy_gates = 200000*mcp.dataBusWidth/64.0;
|
||||
power_per_gb_per_s = 0.01;
|
||||
//This is power not energy, 10mw/Gb/s @90nm for each channel and scaling down
|
||||
power_t.readOp.dynamic = power_per_gb_per_s*(l_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;
|
||||
power_t.readOp.leakage = (mcp.withPHY? phy_gates:0)*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
|
||||
power_t.readOp.gate_leakage = (mcp.withPHY? phy_gates:0)*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
area.set_area(0.4e6/2*mcp.dataBusWidth/8);//area based on Cadence ChipEstimator for 8bit bus
|
||||
}
|
||||
|
||||
// double phy_factor = (int)ceil(mcp.dataBusWidth/72.0);//Previous phy power numbers are based on 72 bit DIMM interface
|
||||
// power_t.readOp.dynamic *= phy_factor;
|
||||
// power_t.readOp.leakage *= phy_factor;
|
||||
// power_t.readOp.gate_leakage *= phy_factor;
|
||||
|
||||
double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device);
|
||||
power_t.readOp.longer_channel_leakage = power_t.readOp.leakage * long_channel_device_reduction;
|
||||
}
|
||||
|
||||
|
||||
void MCPHY::computeEnergy(bool is_tdp)
|
||||
{
|
||||
if (is_tdp)
|
||||
{
|
||||
//init stats for Peak
|
||||
stats_t.readAc.access = 0.5*mcp.num_channels; //time share on buses
|
||||
stats_t.writeAc.access = 0.5*mcp.num_channels;
|
||||
tdp_stats = stats_t;
|
||||
}
|
||||
else
|
||||
{
|
||||
//init stats for runtime power (RTP)
|
||||
stats_t.readAc.access = mcp.reads;
|
||||
stats_t.writeAc.access = mcp.writes;
|
||||
tdp_stats = stats_t;
|
||||
}
|
||||
|
||||
if (is_tdp)
|
||||
{
|
||||
double data_transfer_unit = (mc_type == MC)? 72:16;/*DIMM data width*/
|
||||
power = power_t;
|
||||
power.readOp.dynamic = power.readOp.dynamic * (mcp.peakDataTransferRate*8*1e6/1e9/*change to Gbs*/)*mcp.dataBusWidth/data_transfer_unit*mcp.num_channels/mcp.clockRate;
|
||||
// divide by clock rate is for match the final computation where *clock is used
|
||||
//(stats_t.readAc.access*power_t.readOp.dynamic+
|
||||
// stats_t.writeAc.access*power_t.readOp.dynamic);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
rt_power = power_t;
|
||||
// rt_power.readOp.dynamic = (stats_t.readAc.access*power_t.readOp.dynamic+
|
||||
// stats_t.writeAc.access*power_t.readOp.dynamic);
|
||||
|
||||
rt_power.readOp.dynamic=power_t.readOp.dynamic*(stats_t.readAc.access + stats_t.writeAc.access)*(mcp.llcBlockSize)*8/1e9/mcp.executionTime*(mcp.executionTime);
|
||||
rt_power.readOp.dynamic = rt_power.readOp.dynamic + power.readOp.dynamic*0.1*mcp.clockRate*mcp.num_mcs*mcp.executionTime;
|
||||
}
|
||||
}
|
||||
|
||||
// Build the memory-controller front end for a single MC: a request reorder
// buffer, the selection/arbitration logic, and separate read and write data
// buffers. Each ArrayST is constructed from the shared `interface_ip`, which
// is reconfigured in place before every construction, so the order of the
// assignments relative to each `new ArrayST` matters.
MCFrontEnd::MCFrontEnd(ParseXML *XML_interface,InputParameter* interface_ip_, const MCParam & mcp_, enum MemoryCtrl_type mc_type_)
    : XML(XML_interface),
      interface_ip(*interface_ip_),
      mc_type(mc_type_),
      mcp(mcp_),
      MC_arb(0),
      frontendBuffer(0),
      readBuffer(0),
      writeBuffer(0)
{
    bool is_default = true; //indication for default setup

    /* MC frontend engine channels share the same engines but logically partitioned
     * For all hardware inside MC. different channels do not share resources.
     * TODO: add decoding/mux stage to steer memory requests to different channels.
     */

    // ---- memory request reorder buffer ----
    const int tag_bits       = mcp.addressBusWidth + EXTRA_TAG_BITS + mcp.opcodeW;
    const int rob_line_bytes = int(ceil((XML->sys.physical_address_width + mcp.opcodeW)/8.0));

    interface_ip.cache_sz            = rob_line_bytes*XML->sys.mc.req_window_size_per_channel;
    interface_ip.line_sz             = rob_line_bytes;
    interface_ip.assoc               = 0;
    interface_ip.nbanks              = 1;
    interface_ip.out_w               = interface_ip.line_sz*8;
    interface_ip.specific_tag        = 1;
    interface_ip.tag_w               = tag_bits;
    interface_ip.access_mode         = 0;
    interface_ip.throughput          = 1.0/mcp.clockRate;
    interface_ip.latency             = 1.0/mcp.clockRate;
    interface_ip.is_cache            = true;
    interface_ip.pure_cam            = false;
    interface_ip.pure_ram            = false;
    interface_ip.obj_func_dyn_energy = 0;
    interface_ip.obj_func_dyn_power  = 0;
    interface_ip.obj_func_leak_power = 0;
    interface_ip.obj_func_cycle_t    = 1;
    // One read, one write and one search port per memory channel.
    interface_ip.num_rw_ports        = 0;
    interface_ip.num_rd_ports        = XML->sys.mc.memory_channels_per_mc;
    interface_ip.num_wr_ports        = interface_ip.num_rd_ports;
    interface_ip.num_se_rd_ports     = 0;
    interface_ip.num_search_ports    = XML->sys.mc.memory_channels_per_mc;

    frontendBuffer = new ArrayST(&interface_ip, "MC ReorderBuffer", Uncore_device);
    // Add local_result.area once per channel to the buffer's and to this
    // component's area.
    frontendBuffer->area.set_area(frontendBuffer->area.get_area()+ frontendBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);
    area.set_area(area.get_area()+ frontendBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);

    // ---- selection and arbitration logic ----
    MC_arb = new selection_logic(is_default, XML->sys.mc.req_window_size_per_channel,1,&interface_ip, Uncore_device);

    // ---- read buffer ----
    // One data-bus word per line. Support key words first operation; 8 means converting bit to Byte.
    const int io_line_bytes = (int)ceil(mcp.dataBusWidth/8.0);

    interface_ip.cache_sz            = io_line_bytes*XML->sys.mc.IO_buffer_size_per_channel;//*llcBlockSize;
    interface_ip.line_sz             = io_line_bytes;
    interface_ip.assoc               = 1;
    interface_ip.nbanks              = 1;
    interface_ip.out_w               = interface_ip.line_sz*8;
    interface_ip.access_mode         = 1;
    interface_ip.throughput          = 1.0/mcp.clockRate;
    interface_ip.latency             = 1.0/mcp.clockRate;
    interface_ip.is_cache            = false;
    interface_ip.pure_cam            = false;
    interface_ip.pure_ram            = true;
    interface_ip.obj_func_dyn_energy = 0;
    interface_ip.obj_func_dyn_power  = 0;
    interface_ip.obj_func_leak_power = 0;
    interface_ip.obj_func_cycle_t    = 1;
    interface_ip.num_rw_ports        = 0;//XML->sys.mc.memory_channels_per_mc*2>2?2:XML->sys.mc.memory_channels_per_mc*2;
    interface_ip.num_rd_ports        = XML->sys.mc.memory_channels_per_mc;
    interface_ip.num_wr_ports        = interface_ip.num_rd_ports;
    interface_ip.num_se_rd_ports     = 0;

    readBuffer = new ArrayST(&interface_ip, "MC ReadBuffer", Uncore_device);
    readBuffer->area.set_area(readBuffer->area.get_area()+ readBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);
    area.set_area(area.get_area()+ readBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);

    // ---- write buffer ----
    // Same line size and capacity as the read buffer, but access_mode 0.
    // is_cache / pure_cam / pure_ram are not reassigned here, so they keep
    // the values set for the read buffer.
    interface_ip.cache_sz            = io_line_bytes*XML->sys.mc.IO_buffer_size_per_channel;//*llcBlockSize;
    interface_ip.line_sz             = io_line_bytes;
    interface_ip.assoc               = 1;
    interface_ip.nbanks              = 1;
    interface_ip.out_w               = interface_ip.line_sz*8;
    interface_ip.access_mode         = 0;
    interface_ip.throughput          = 1.0/mcp.clockRate;
    interface_ip.latency             = 1.0/mcp.clockRate;
    interface_ip.obj_func_dyn_energy = 0;
    interface_ip.obj_func_dyn_power  = 0;
    interface_ip.obj_func_leak_power = 0;
    interface_ip.obj_func_cycle_t    = 1;
    interface_ip.num_rw_ports        = 0;
    interface_ip.num_rd_ports        = XML->sys.mc.memory_channels_per_mc;
    interface_ip.num_wr_ports        = interface_ip.num_rd_ports;
    interface_ip.num_se_rd_ports     = 0;

    writeBuffer = new ArrayST(&interface_ip, "MC writeBuffer", Uncore_device);
    writeBuffer->area.set_area(writeBuffer->area.get_area()+ writeBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);
    area.set_area(area.get_area()+ writeBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);
}
|
||||
|
||||
void MCFrontEnd::computeEnergy(bool is_tdp)
|
||||
{
|
||||
if (is_tdp)
|
||||
{
|
||||
//init stats for Peak
|
||||
frontendBuffer->stats_t.readAc.access = frontendBuffer->l_ip.num_search_ports;
|
||||
frontendBuffer->stats_t.writeAc.access = frontendBuffer->l_ip.num_wr_ports;
|
||||
frontendBuffer->tdp_stats = frontendBuffer->stats_t;
|
||||
|
||||
readBuffer->stats_t.readAc.access = readBuffer->l_ip.num_rd_ports*mcp.frontend_duty_cycle;
|
||||
readBuffer->stats_t.writeAc.access = readBuffer->l_ip.num_wr_ports*mcp.frontend_duty_cycle;
|
||||
readBuffer->tdp_stats = readBuffer->stats_t;
|
||||
|
||||
writeBuffer->stats_t.readAc.access = writeBuffer->l_ip.num_rd_ports*mcp.frontend_duty_cycle;
|
||||
writeBuffer->stats_t.writeAc.access = writeBuffer->l_ip.num_wr_ports*mcp.frontend_duty_cycle;
|
||||
writeBuffer->tdp_stats = writeBuffer->stats_t;
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
//init stats for runtime power (RTP)
|
||||
frontendBuffer->stats_t.readAc.access = XML->sys.mc.memory_reads *mcp.llcBlockSize*8.0/mcp.dataBusWidth*mcp.dataBusWidth/72;
|
||||
//For each channel, each memory word need to check the address data to achieve best scheduling results.
|
||||
//and this need to be done on all physical DIMMs in each logical memory DIMM *mcp.dataBusWidth/72
|
||||
frontendBuffer->stats_t.writeAc.access = XML->sys.mc.memory_writes*mcp.llcBlockSize*8.0/mcp.dataBusWidth*mcp.dataBusWidth/72;
|
||||
frontendBuffer->rtp_stats = frontendBuffer->stats_t;
|
||||
|
||||
readBuffer->stats_t.readAc.access = XML->sys.mc.memory_reads*mcp.llcBlockSize*8.0/mcp.dataBusWidth;//support key word first
|
||||
readBuffer->stats_t.writeAc.access = XML->sys.mc.memory_reads*mcp.llcBlockSize*8.0/mcp.dataBusWidth;//support key word first
|
||||
readBuffer->rtp_stats = readBuffer->stats_t;
|
||||
|
||||
writeBuffer->stats_t.readAc.access = XML->sys.mc.memory_writes*mcp.llcBlockSize*8.0/mcp.dataBusWidth;
|
||||
writeBuffer->stats_t.writeAc.access = XML->sys.mc.memory_writes*mcp.llcBlockSize*8.0/mcp.dataBusWidth;
|
||||
writeBuffer->rtp_stats = writeBuffer->stats_t;
|
||||
}
|
||||
|
||||
frontendBuffer->power_t.reset();
|
||||
readBuffer->power_t.reset();
|
||||
writeBuffer->power_t.reset();
|
||||
|
||||
// frontendBuffer->power_t.readOp.dynamic += (frontendBuffer->stats_t.readAc.access*
|
||||
// (frontendBuffer->local_result.power.searchOp.dynamic+frontendBuffer->local_result.power.readOp.dynamic)+
|
||||
// frontendBuffer->stats_t.writeAc.access*frontendBuffer->local_result.power.writeOp.dynamic);
|
||||
|
||||
frontendBuffer->power_t.readOp.dynamic += (frontendBuffer->stats_t.readAc.access +
|
||||
frontendBuffer->stats_t.writeAc.access)*frontendBuffer->local_result.power.searchOp.dynamic
|
||||
+ frontendBuffer->stats_t.readAc.access * frontendBuffer->local_result.power.readOp.dynamic
|
||||
+ frontendBuffer->stats_t.writeAc.access*frontendBuffer->local_result.power.writeOp.dynamic;
|
||||
|
||||
readBuffer->power_t.readOp.dynamic += (readBuffer->stats_t.readAc.access*
|
||||
readBuffer->local_result.power.readOp.dynamic+
|
||||
readBuffer->stats_t.writeAc.access*readBuffer->local_result.power.writeOp.dynamic);
|
||||
writeBuffer->power_t.readOp.dynamic += (writeBuffer->stats_t.readAc.access*
|
||||
writeBuffer->local_result.power.readOp.dynamic+
|
||||
writeBuffer->stats_t.writeAc.access*writeBuffer->local_result.power.writeOp.dynamic);
|
||||
|
||||
if (is_tdp)
|
||||
{
|
||||
power = power + frontendBuffer->power_t + readBuffer->power_t + writeBuffer->power_t +
|
||||
(frontendBuffer->local_result.power +
|
||||
readBuffer->local_result.power +
|
||||
writeBuffer->local_result.power)*pppm_lkg;
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
rt_power = rt_power + frontendBuffer->power_t + readBuffer->power_t + writeBuffer->power_t +
|
||||
(frontendBuffer->local_result.power +
|
||||
readBuffer->local_result.power +
|
||||
writeBuffer->local_result.power)*pppm_lkg;
|
||||
rt_power.readOp.dynamic = rt_power.readOp.dynamic + power.readOp.dynamic*0.1*mcp.clockRate*mcp.num_mcs*mcp.executionTime;
|
||||
}
|
||||
}
|
||||
|
||||
void MCFrontEnd::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
|
||||
{
|
||||
string indent_str(indent, ' ');
|
||||
string indent_str_next(indent+2, ' ');
|
||||
|
||||
if (is_tdp)
|
||||
{
|
||||
cout << indent_str << "Front End ROB:" << endl;
|
||||
cout << indent_str_next << "Area = " << frontendBuffer->area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str_next << "Peak Dynamic = " << frontendBuffer->power.readOp.dynamic*mcp.clockRate << " W" << endl;
|
||||
cout << indent_str_next << "Subthreshold Leakage = " << frontendBuffer->power.readOp.leakage <<" W" << endl;
|
||||
cout << indent_str_next << "Gate Leakage = " << frontendBuffer->power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str_next << "Runtime Dynamic = " << frontendBuffer->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
|
||||
|
||||
cout <<endl;
|
||||
cout << indent_str<< "Read Buffer:" << endl;
|
||||
cout << indent_str_next << "Area = " << readBuffer->area.get_area()*1e-6 << " mm^2" << endl;
|
||||
cout << indent_str_next << "Peak Dynamic = " << readBuffer->power.readOp.dynamic*mcp.clockRate << " W" << endl;
|
||||
cout << indent_str_next << "Subthreshold Leakage = " << readBuffer->power.readOp.leakage << " W" << endl;
|
||||
cout << indent_str_next << "Gate Leakage = " << readBuffer->power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str_next << "Runtime Dynamic = " << readBuffer->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
|
||||
cout <<endl;
|
||||
cout << indent_str << "Write Buffer:" << endl;
|
||||
cout << indent_str_next << "Area = " << writeBuffer->area.get_area() *1e-6 << " mm^2" << endl;
|
||||
cout << indent_str_next << "Peak Dynamic = " << writeBuffer->power.readOp.dynamic*mcp.clockRate << " W" << endl;
|
||||
cout << indent_str_next << "Subthreshold Leakage = " << writeBuffer->power.readOp.leakage << " W" << endl;
|
||||
cout << indent_str_next << "Gate Leakage = " << writeBuffer->power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str_next << "Runtime Dynamic = " << writeBuffer->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
|
||||
cout <<endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
cout << indent_str << "Front End ROB:" << endl;
|
||||
cout << indent_str_next << "Area = " << frontendBuffer->area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str_next << "Peak Dynamic = " << frontendBuffer->rt_power.readOp.dynamic*mcp.clockRate << " W" << endl;
|
||||
cout << indent_str_next << "Subthreshold Leakage = " << frontendBuffer->rt_power.readOp.leakage <<" W" << endl;
|
||||
cout << indent_str_next << "Gate Leakage = " << frontendBuffer->rt_power.readOp.gate_leakage << " W" << endl;
|
||||
cout <<endl;
|
||||
cout << indent_str<< "Read Buffer:" << endl;
|
||||
cout << indent_str_next << "Area = " << readBuffer->area.get_area()*1e-6 << " mm^2" << endl;
|
||||
cout << indent_str_next << "Peak Dynamic = " << readBuffer->rt_power.readOp.dynamic*mcp.clockRate << " W" << endl;
|
||||
cout << indent_str_next << "Subthreshold Leakage = " << readBuffer->rt_power.readOp.leakage << " W" << endl;
|
||||
cout << indent_str_next << "Gate Leakage = " << readBuffer->rt_power.readOp.gate_leakage << " W" << endl;
|
||||
cout <<endl;
|
||||
cout << indent_str << "Write Buffer:" << endl;
|
||||
cout << indent_str_next << "Area = " << writeBuffer->area.get_area() *1e-6 << " mm^2" << endl;
|
||||
cout << indent_str_next << "Peak Dynamic = " << writeBuffer->rt_power.readOp.dynamic*mcp.clockRate << " W" << endl;
|
||||
cout << indent_str_next << "Subthreshold Leakage = " << writeBuffer->rt_power.readOp.leakage << " W" << endl;
|
||||
cout << indent_str_next << "Gate Leakage = " << writeBuffer->rt_power.readOp.gate_leakage << " W" << endl;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
// Assemble one memory controller: front end, transaction engine (backend)
// and, when configured, the PHY. The area of each sub-component is added to
// this component's area. All computations are for a single MC.
//
// pipeLogic is initialised to null and never created here; the earlier
// pipeline / clock-network modelling and the in-place initialisation of the
// transaction engine and PHY are no longer part of this constructor.
MemoryController::MemoryController(ParseXML *XML_interface,InputParameter* interface_ip_, enum MemoryCtrl_type mc_type_)
    : XML(XML_interface),
      interface_ip(*interface_ip_),
      mc_type(mc_type_),
      frontend(0),
      transecEngine(0),
      PHY(0),
      pipeLogic(0)
{
    // Wire settings used for every array built from interface_ip.
    interface_ip.wire_is_mat_type = 2;
    interface_ip.wire_os_mat_type = 2;
    interface_ip.wt               = Global;

    // Populate mcp from the XML description before any sub-component copies it.
    set_mc_param();

    frontend = new MCFrontEnd(XML, &interface_ip, mcp, mc_type);
    area.set_area(area.get_area()+ frontend->area.get_area());

    //TODO needs better numbers, Run the RTL code from OpenSparc.
    transecEngine = new MCBackend(&interface_ip, mcp, mc_type);
    area.set_area(area.get_area()+ transecEngine->area.get_area());

    // A PHY is modelled for type 0 always, and for type 1 only when requested.
    const bool has_phy = (mcp.type==0) || (mcp.type==1 && mcp.withPHY);
    if (has_phy)
    {
        PHY = new MCPHY(&interface_ip, mcp, mc_type);
        area.set_area(area.get_area()+ PHY->area.get_area());
    }
}
|
||||
void MemoryController::computeEnergy(bool is_tdp)
|
||||
{
|
||||
|
||||
frontend->computeEnergy(is_tdp);
|
||||
transecEngine->computeEnergy(is_tdp);
|
||||
if (mcp.type==0 || (mcp.type==1&&mcp.withPHY))
|
||||
{
|
||||
PHY->computeEnergy(is_tdp);
|
||||
}
|
||||
if (is_tdp)
|
||||
{
|
||||
power = power + frontend->power + transecEngine->power;
|
||||
if (mcp.type==0 || (mcp.type==1&&mcp.withPHY))
|
||||
{
|
||||
power = power + PHY->power;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
rt_power = rt_power + frontend->rt_power + transecEngine->rt_power;
|
||||
if (mcp.type==0 || (mcp.type==1&&mcp.withPHY))
|
||||
{
|
||||
rt_power = rt_power + PHY->rt_power;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void MemoryController::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
|
||||
{
|
||||
string indent_str(indent, ' ');
|
||||
string indent_str_next(indent+2, ' ');
|
||||
bool long_channel = XML->sys.longer_channel_device;
|
||||
|
||||
if (is_tdp)
|
||||
{
|
||||
cout << "Memory Controller:" << endl;
|
||||
cout << indent_str<< "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic*mcp.clockRate << " W" << endl;
|
||||
cout << indent_str<< "Subthreshold Leakage = "
|
||||
<< (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
|
||||
//cout << indent_str<< "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl;
|
||||
cout << indent_str<< "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
|
||||
cout<<endl;
|
||||
cout << indent_str << "Front End Engine:" << endl;
|
||||
cout << indent_str_next << "Area = " << frontend->area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str_next << "Peak Dynamic = " << frontend->power.readOp.dynamic*mcp.clockRate << " W" << endl;
|
||||
cout << indent_str_next << "Subthreshold Leakage = "
|
||||
<< (long_channel? frontend->power.readOp.longer_channel_leakage:frontend->power.readOp.leakage) <<" W" << endl;
|
||||
cout << indent_str_next << "Gate Leakage = " << frontend->power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str_next << "Runtime Dynamic = " << frontend->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
|
||||
cout <<endl;
|
||||
if (plevel >2){
|
||||
frontend->displayEnergy(indent+4,is_tdp);
|
||||
}
|
||||
cout << indent_str << "Transaction Engine:" << endl;
|
||||
cout << indent_str_next << "Area = " << transecEngine->area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str_next << "Peak Dynamic = " << transecEngine->power.readOp.dynamic*mcp.clockRate << " W" << endl;
|
||||
cout << indent_str_next << "Subthreshold Leakage = "
|
||||
<< (long_channel? transecEngine->power.readOp.longer_channel_leakage:transecEngine->power.readOp.leakage) <<" W" << endl;
|
||||
cout << indent_str_next << "Gate Leakage = " << transecEngine->power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str_next << "Runtime Dynamic = " << transecEngine->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
|
||||
cout <<endl;
|
||||
if (mcp.type==0 || (mcp.type==1&&mcp.withPHY))
|
||||
{
|
||||
cout << indent_str << "PHY:" << endl;
|
||||
cout << indent_str_next << "Area = " << PHY->area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str_next << "Peak Dynamic = " << PHY->power.readOp.dynamic*mcp.clockRate << " W" << endl;
|
||||
cout << indent_str_next << "Subthreshold Leakage = "
|
||||
<< (long_channel? PHY->power.readOp.longer_channel_leakage:PHY->power.readOp.leakage) <<" W" << endl;
|
||||
cout << indent_str_next << "Gate Leakage = " << PHY->power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str_next << "Runtime Dynamic = " << PHY->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
|
||||
cout <<endl;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
cout << "Memory Controller:" << endl;
|
||||
cout << indent_str_next << "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*mcp.clockRate << " W" << endl;
|
||||
cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage <<" W" << endl;
|
||||
cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
|
||||
cout<<endl;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void MemoryController::set_mc_param()
|
||||
{
|
||||
|
||||
if (mc_type==MC)
|
||||
{
|
||||
mcp.clockRate =XML->sys.mc.mc_clock*2;//DDR double pumped
|
||||
mcp.clockRate *= 1e6;
|
||||
mcp.executionTime = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
|
||||
|
||||
mcp.llcBlockSize =int(ceil(XML->sys.mc.llc_line_length/8.0))+XML->sys.mc.llc_line_length;//ecc overhead
|
||||
mcp.dataBusWidth =int(ceil(XML->sys.mc.databus_width/8.0)) + XML->sys.mc.databus_width;
|
||||
mcp.addressBusWidth =int(ceil(XML->sys.mc.addressbus_width));//XML->sys.physical_address_width;
|
||||
mcp.opcodeW =16;
|
||||
mcp.num_mcs = XML->sys.mc.number_mcs;
|
||||
mcp.num_channels = XML->sys.mc.memory_channels_per_mc;
|
||||
mcp.reads = XML->sys.mc.memory_reads;
|
||||
mcp.writes = XML->sys.mc.memory_writes;
|
||||
//+++++++++Transaction engine +++++++++++++++++ ////TODO needs better numbers, Run the RTL code from OpenSparc.
|
||||
mcp.peakDataTransferRate = XML->sys.mc.peak_transfer_rate;
|
||||
mcp.memRank = XML->sys.mc.number_ranks;
|
||||
//++++++++++++++PHY ++++++++++++++++++++++++++ //TODO needs better numbers
|
||||
//PHY.memAccesses=PHY.peakDataTransferRate;//this is the max power
|
||||
//PHY.llcBlocksize=llcBlockSize;
|
||||
mcp.frontend_duty_cycle = 0.5;//for max power, the actual off-chip links is bidirectional but time shared
|
||||
mcp.LVDS = XML->sys.mc.LVDS;
|
||||
mcp.type = XML->sys.mc.type;
|
||||
mcp.withPHY = XML->sys.mc.withPHY;
|
||||
}
|
||||
// else if (mc_type==FLASHC)
|
||||
// {
|
||||
// mcp.clockRate =XML->sys.flashc.mc_clock*2;//DDR double pumped
|
||||
// mcp.clockRate *= 1e6;
|
||||
// mcp.executionTime = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
|
||||
//
|
||||
// mcp.llcBlockSize =int(ceil(XML->sys.flashc.llc_line_length/8.0))+XML->sys.flashc.llc_line_length;//ecc overhead
|
||||
// mcp.dataBusWidth =int(ceil(XML->sys.flashc.databus_width/8.0)) + XML->sys.flashc.databus_width;
|
||||
// mcp.addressBusWidth =int(ceil(XML->sys.flashc.addressbus_width));//XML->sys.physical_address_width;
|
||||
// mcp.opcodeW =16;
|
||||
// mcp.num_mcs = XML->sys.flashc.number_mcs;
|
||||
// mcp.num_channels = XML->sys.flashc.memory_channels_per_mc;
|
||||
// mcp.reads = XML->sys.flashc.memory_reads;
|
||||
// mcp.writes = XML->sys.flashc.memory_writes;
|
||||
// //+++++++++Transaction engine +++++++++++++++++ ////TODO needs better numbers, Run the RTL code from OpenSparc.
|
||||
// mcp.peakDataTransferRate = XML->sys.flashc.peak_transfer_rate;
|
||||
// mcp.memRank = XML->sys.flashc.number_ranks;
|
||||
// //++++++++++++++PHY ++++++++++++++++++++++++++ //TODO needs better numbers
|
||||
// //PHY.memAccesses=PHY.peakDataTransferRate;//this is the max power
|
||||
// //PHY.llcBlocksize=llcBlockSize;
|
||||
// mcp.frontend_duty_cycle = 0.5;//for max power, the actual off-chip links is bidirectional but time shared
|
||||
// mcp.LVDS = XML->sys.flashc.LVDS;
|
||||
// mcp.type = XML->sys.flashc.type;
|
||||
// }
|
||||
else
|
||||
{
|
||||
cout<<"Unknown memory controller type: neither DRAM controller nor Flash controller" <<endl;
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
// Release the arbitration logic and the three buffer arrays created by the
// constructor. Deleting a null pointer is a no-op, so no guards are needed;
// the pointers are cleared afterwards.
MCFrontEnd ::~MCFrontEnd()
{
    delete MC_arb;
    MC_arb = 0;

    delete frontendBuffer;
    frontendBuffer = 0;

    delete readBuffer;
    readBuffer = 0;

    delete writeBuffer;
    writeBuffer = 0;
}
|
||||
|
||||
// Release the sub-components owned by the controller. PHY and pipeLogic may
// be null (they are initialised to 0 and only optionally created); deleting
// a null pointer is a no-op, so no guards are needed.
MemoryController ::~MemoryController()
{
    delete frontend;
    frontend = 0;

    delete transecEngine;
    transecEngine = 0;

    delete PHY;
    PHY = 0;

    delete pipeLogic;
    pipeLogic = 0;
}
|
||||
|
113
ext/mcpat/memoryctrl.h
Normal file
113
ext/mcpat/memoryctrl.h
Normal file
|
@ -0,0 +1,113 @@
|
|||
/*****************************************************************************
|
||||
* McPAT
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef MEMORYCTRL_H_
|
||||
#define MEMORYCTRL_H_
|
||||
|
||||
#include "XML_Parse.h"
|
||||
#include "parameter.h"
|
||||
//#include "io.h"
|
||||
#include "array.h"
|
||||
//#include "Undifferentiated_Core_Area.h"
|
||||
#include <vector>
|
||||
|
||||
#include "basic_components.h"
|
||||
|
||||
class MCBackend : public Component {
|
||||
public:
|
||||
InputParameter l_ip;
|
||||
uca_org_t local_result;
|
||||
enum MemoryCtrl_type mc_type;
|
||||
MCParam mcp;
|
||||
statsDef tdp_stats;
|
||||
statsDef rtp_stats;
|
||||
statsDef stats_t;
|
||||
powerDef power_t;
|
||||
MCBackend(InputParameter* interface_ip_, const MCParam & mcp_, enum MemoryCtrl_type mc_type_);
|
||||
void compute();
|
||||
void computeEnergy(bool is_tdp=true);
|
||||
void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
|
||||
~MCBackend(){};
|
||||
};
|
||||
|
||||
class MCPHY : public Component {
|
||||
public:
|
||||
InputParameter l_ip;
|
||||
uca_org_t local_result;
|
||||
enum MemoryCtrl_type mc_type;
|
||||
MCParam mcp;
|
||||
statsDef tdp_stats;
|
||||
statsDef rtp_stats;
|
||||
statsDef stats_t;
|
||||
powerDef power_t;
|
||||
MCPHY(InputParameter* interface_ip_, const MCParam & mcp_, enum MemoryCtrl_type mc_type_);
|
||||
void compute();
|
||||
void computeEnergy(bool is_tdp=true);
|
||||
void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
|
||||
~MCPHY(){};
|
||||
};
|
||||
|
||||
class MCFrontEnd : public Component {
|
||||
public:
|
||||
ParseXML *XML;
|
||||
InputParameter interface_ip;
|
||||
enum MemoryCtrl_type mc_type;
|
||||
MCParam mcp;
|
||||
selection_logic * MC_arb;
|
||||
ArrayST * frontendBuffer;
|
||||
ArrayST * readBuffer;
|
||||
ArrayST * writeBuffer;
|
||||
|
||||
MCFrontEnd(ParseXML *XML_interface,InputParameter* interface_ip_, const MCParam & mcp_, enum MemoryCtrl_type mc_type_);
|
||||
void computeEnergy(bool is_tdp=true);
|
||||
void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
|
||||
~MCFrontEnd();
|
||||
};
|
||||
|
||||
class MemoryController : public Component {
|
||||
public:
|
||||
ParseXML *XML;
|
||||
InputParameter interface_ip;
|
||||
enum MemoryCtrl_type mc_type;
|
||||
MCParam mcp;
|
||||
MCFrontEnd * frontend;
|
||||
MCBackend * transecEngine;
|
||||
MCPHY * PHY;
|
||||
Pipeline * pipeLogic;
|
||||
|
||||
//clock_network clockNetwork;
|
||||
MemoryController(ParseXML *XML_interface,InputParameter* interface_ip_, enum MemoryCtrl_type mc_type_);
|
||||
void set_mc_param();
|
||||
void computeEnergy(bool is_tdp=true);
|
||||
void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
|
||||
~MemoryController();
|
||||
};
|
||||
#endif /* MEMORYCTRL_H_ */
|
355
ext/mcpat/noc.cc
Normal file
355
ext/mcpat/noc.cc
Normal file
|
@ -0,0 +1,355 @@
|
|||
/*****************************************************************************
|
||||
* McPAT
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include "XML_Parse.h"
|
||||
#include "basic_circuit.h"
|
||||
#include "const.h"
|
||||
#include "io.h"
|
||||
#include "noc.h"
|
||||
#include "parameter.h"
|
||||
|
||||
/*
 * Build the interconnect model for entry ithNoC_ of the parsed XML
 * configuration.
 *
 * XML_interface      parsed configuration; the pointer is stored, not copied.
 * ithNoC_            index into XML->sys.NoC[].
 * interface_ip_      input parameters; copied into interface_ip and then
 *                    adjusted locally (wire type / material types).
 * M_traffic_pattern_ traffic-pattern factor forwarded to the router and used
 *                    when scaling link power.
 * link_len_          link length, used only when the entry describes a bus
 *                    (nocdynp.type == 0); init_link_bus() asserts it is > 0.
 *
 * When nocdynp.type is non-zero only the router is built here; the router
 * links have to be created later through init_link_bus() by the caller,
 * because the total chip area must be known first.
 *
 * Every scalar member is given a defined value in the initializer list (in
 * declaration order) so that members which are only assigned on one of the
 * two paths -- e.g. link_len on the router path -- are never left
 * indeterminate.
 */
NoC::NoC(ParseXML *XML_interface, int ithNoC_, InputParameter* interface_ip_, double M_traffic_pattern_, double link_len_)
    : XML(XML_interface),
      ithNoC(ithNoC_),
      interface_ip(*interface_ip_),
      link_len(0),
      executionTime(0),
      scktRatio(0),
      chip_PR_overhead(0),
      macro_PR_overhead(0),
      router(0),
      link_bus(0),
      link_bus_exist(false),
      router_exist(false),
      M_traffic_pattern(M_traffic_pattern_)
{
    /*
     * initialize, compute and optimize individual components.
     */

    // Select wire type and material types depending on the Embedded flag.
    if (XML->sys.Embedded) {
        interface_ip.wt = Global_30;
        interface_ip.wire_is_mat_type = 0;
        interface_ip.wire_os_mat_type = 1;
    } else {
        interface_ip.wt = Global;
        interface_ip.wire_is_mat_type = 2;
        interface_ip.wire_os_mat_type = 2;
    }

    set_noc_param();
    local_result = init_interface(&interface_ip);
    scktRatio = g_tp.sckt_co_eff;

    if (nocdynp.type) {
        /*
         * if NOC compute router, router links must be computed separately
         * and called from external
         * since total chip area must be known first
         */
        init_router();
    } else {
        init_link_bus(link_len_); //if bus compute bus
    }

    // //clock power
    // clockNetwork.init_wire_external(is_default, &interface_ip);
    // clockNetwork.clk_area =area*1.1;//10% of placement overhead. rule of thumb
    // clockNetwork.end_wiring_level =5;//toplevel metal
    // clockNetwork.start_wiring_level =5;//toplevel metal
    // clockNetwork.num_regs = corepipe.tot_stage_vector;
    // clockNetwork.optimize_wire();
}
|
||||
|
||||
void NoC::init_router()
|
||||
{
|
||||
router = new Router(nocdynp.flit_size,
|
||||
nocdynp.virtual_channel_per_port*nocdynp.input_buffer_entries_per_vc,
|
||||
nocdynp.virtual_channel_per_port, &(g_tp.peri_global),
|
||||
nocdynp.input_ports,nocdynp.output_ports, M_traffic_pattern);
|
||||
//router->print_router();
|
||||
area.set_area(area.get_area()+ router->area.get_area()*nocdynp.total_nodes);
|
||||
|
||||
double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device);
|
||||
router->power.readOp.longer_channel_leakage = router->power.readOp.leakage * long_channel_device_reduction;
|
||||
router->buffer.power.readOp.longer_channel_leakage = router->buffer.power.readOp.leakage * long_channel_device_reduction;
|
||||
router->crossbar.power.readOp.longer_channel_leakage = router->crossbar.power.readOp.leakage * long_channel_device_reduction;
|
||||
router->arbiter.power.readOp.longer_channel_leakage = router->arbiter.power.readOp.leakage * long_channel_device_reduction;
|
||||
router_exist = true;
|
||||
}
|
||||
|
||||
void NoC ::init_link_bus(double link_len_)
|
||||
{
|
||||
|
||||
|
||||
// if (nocdynp.min_ports==1 )
|
||||
if (nocdynp.type)
|
||||
link_name = "Links";
|
||||
else
|
||||
link_name = "Bus";
|
||||
|
||||
link_len=link_len_;
|
||||
assert(link_len>0);
|
||||
|
||||
interface_ip.throughput = nocdynp.link_throughput/nocdynp.clockRate;
|
||||
interface_ip.latency = nocdynp.link_latency/nocdynp.clockRate;
|
||||
|
||||
link_len /= (nocdynp.horizontal_nodes + nocdynp.vertical_nodes)/2;
|
||||
|
||||
if (nocdynp.total_nodes >1) link_len /=2; //All links are shared by neighbors
|
||||
link_bus = new interconnect(name, Uncore_device, 1, 1, nocdynp.flit_size,
|
||||
link_len, &interface_ip, 3, true/*pipelinable*/, nocdynp.route_over_perc);
|
||||
|
||||
link_bus_tot_per_Router.area.set_area(link_bus_tot_per_Router.area.get_area()+ link_bus->area.get_area()
|
||||
* nocdynp.global_linked_ports);
|
||||
|
||||
area.set_area(area.get_area()+ link_bus_tot_per_Router.area.get_area()* nocdynp.total_nodes);
|
||||
link_bus_exist = true;
|
||||
}
|
||||
void NoC::computeEnergy(bool is_tdp)
|
||||
{
|
||||
//power_point_product_masks
|
||||
double pppm_t[4] = {1,1,1,1};
|
||||
double M=nocdynp.duty_cycle;
|
||||
if (is_tdp)
|
||||
{
|
||||
//init stats for TDP
|
||||
stats_t.readAc.access = M;
|
||||
tdp_stats = stats_t;
|
||||
if (router_exist)
|
||||
{
|
||||
set_pppm(pppm_t, 1*M, 1, 1, 1);//reset traffic pattern
|
||||
router->power = router->power*pppm_t;
|
||||
set_pppm(pppm_t, nocdynp.total_nodes, nocdynp.total_nodes, nocdynp.total_nodes, nocdynp.total_nodes);
|
||||
power = power + router->power*pppm_t;
|
||||
}
|
||||
if (link_bus_exist)
|
||||
{
|
||||
if (nocdynp.type)
|
||||
set_pppm(pppm_t, 1*M_traffic_pattern*M*(nocdynp.min_ports -1), nocdynp.global_linked_ports,
|
||||
nocdynp.global_linked_ports, nocdynp.global_linked_ports);
|
||||
//reset traffic pattern; local port do not have router links
|
||||
else
|
||||
set_pppm(pppm_t, 1*M_traffic_pattern*M*(nocdynp.min_ports), nocdynp.global_linked_ports,
|
||||
nocdynp.global_linked_ports, nocdynp.global_linked_ports);//reset traffic pattern
|
||||
|
||||
link_bus_tot_per_Router.power = link_bus->power*pppm_t;
|
||||
|
||||
set_pppm(pppm_t, nocdynp.total_nodes,
|
||||
nocdynp.total_nodes,
|
||||
nocdynp.total_nodes,
|
||||
nocdynp.total_nodes);
|
||||
power = power + link_bus_tot_per_Router.power*pppm_t;
|
||||
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
//init stats for runtime power (RTP)
|
||||
stats_t.readAc.access = XML->sys.NoC[ithNoC].total_accesses;
|
||||
rtp_stats = stats_t;
|
||||
set_pppm(pppm_t, 1, 0 , 0, 0);
|
||||
if (router_exist)
|
||||
{
|
||||
router->buffer.rt_power.readOp.dynamic = (router->buffer.power.readOp.dynamic + router->buffer.power.writeOp.dynamic)*rtp_stats.readAc.access ;
|
||||
router->crossbar.rt_power.readOp.dynamic = router->crossbar.power.readOp.dynamic*rtp_stats.readAc.access ;
|
||||
router->arbiter.rt_power.readOp.dynamic = router->arbiter.power.readOp.dynamic*rtp_stats.readAc.access ;
|
||||
|
||||
router->rt_power = router->rt_power + (router->buffer.rt_power + router->crossbar.rt_power + router->arbiter.rt_power)*pppm_t +
|
||||
router->power*pppm_lkg;//TDP power must be calculated first!
|
||||
rt_power = rt_power + router->rt_power;
|
||||
}
|
||||
if (link_bus_exist)
|
||||
{
|
||||
set_pppm(pppm_t, rtp_stats.readAc.access, 1 , 1, rtp_stats.readAc.access);
|
||||
link_bus->rt_power = link_bus->power * pppm_t;
|
||||
rt_power = rt_power + link_bus->rt_power;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void NoC::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
|
||||
{
|
||||
string indent_str(indent, ' ');
|
||||
string indent_str_next(indent+2, ' ');
|
||||
bool long_channel = XML->sys.longer_channel_device;
|
||||
|
||||
double M =M_traffic_pattern*nocdynp.duty_cycle;
|
||||
/*only router as a whole has been applied the M_traffic_pattern(0.6 by default) factor in router.cc;
|
||||
* When power of crossbars, arbiters, etc need to be displayed, the M_traffic_pattern factor need to
|
||||
* be applied together with McPAT's extra traffic pattern.
|
||||
* */
|
||||
if (is_tdp)
|
||||
{
|
||||
cout << name << endl;
|
||||
cout << indent_str << "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str<< "Peak Dynamic = " << power.readOp.dynamic*nocdynp.clockRate << " W" << endl;
|
||||
cout << indent_str << "Subthreshold Leakage = "
|
||||
<< (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
|
||||
cout << indent_str << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str<< "Runtime Dynamic = " << rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl;
|
||||
cout<<endl;
|
||||
|
||||
if (router_exist)
|
||||
{
|
||||
cout << indent_str << "Router: " << endl;
|
||||
cout << indent_str_next << "Area = " << router->area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str_next<< "Peak Dynamic = " << router->power.readOp.dynamic*nocdynp.clockRate << " W" << endl;
|
||||
cout << indent_str_next << "Subthreshold Leakage = "
|
||||
<< (long_channel? router->power.readOp.longer_channel_leakage:router->power.readOp.leakage) <<" W" << endl;
|
||||
cout << indent_str_next << "Gate Leakage = " << router->power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str_next<< "Runtime Dynamic = " << router->rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl;
|
||||
cout<<endl;
|
||||
if (plevel >2){
|
||||
cout << indent_str<< indent_str << "Virtual Channel Buffer:" << endl;
|
||||
cout << indent_str<< indent_str_next << "Area = " << router->buffer.area.get_area()*1e-6*nocdynp.input_ports<< " mm^2" << endl;
|
||||
cout << indent_str<< indent_str_next << "Peak Dynamic = " <<(router->buffer.power.readOp.dynamic + router->buffer.power.writeOp.dynamic)
|
||||
*nocdynp.min_ports*M*nocdynp.clockRate << " W" << endl;
|
||||
cout << indent_str<< indent_str_next << "Subthreshold Leakage = "
|
||||
<< (long_channel? router->buffer.power.readOp.longer_channel_leakage*nocdynp.input_ports:router->buffer.power.readOp.leakage*nocdynp.input_ports) <<" W" << endl;
|
||||
cout << indent_str<< indent_str_next << "Gate Leakage = " << router->buffer.power.readOp.gate_leakage*nocdynp.input_ports << " W" << endl;
|
||||
cout << indent_str<< indent_str_next << "Runtime Dynamic = " << router->buffer.rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl;
|
||||
cout <<endl;
|
||||
cout << indent_str<< indent_str<< "Crossbar:" << endl;
|
||||
cout << indent_str<< indent_str_next << "Area = " << router->crossbar.area.get_area()*1e-6 << " mm^2" << endl;
|
||||
cout << indent_str<< indent_str_next << "Peak Dynamic = " << router->crossbar.power.readOp.dynamic*nocdynp.clockRate*nocdynp.min_ports*M << " W" << endl;
|
||||
cout << indent_str<< indent_str_next << "Subthreshold Leakage = "
|
||||
<< (long_channel? router->crossbar.power.readOp.longer_channel_leakage:router->crossbar.power.readOp.leakage) << " W" << endl;
|
||||
cout << indent_str<< indent_str_next << "Gate Leakage = " << router->crossbar.power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str<< indent_str_next << "Runtime Dynamic = " << router->crossbar.rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl;
|
||||
cout <<endl;
|
||||
cout << indent_str<< indent_str<< "Arbiter:" << endl;
|
||||
cout << indent_str<< indent_str_next << "Peak Dynamic = " << router->arbiter.power.readOp.dynamic*nocdynp.clockRate*nocdynp.min_ports*M << " W" << endl;
|
||||
cout << indent_str<< indent_str_next << "Subthreshold Leakage = "
|
||||
<< (long_channel? router->arbiter.power.readOp.longer_channel_leakage:router->arbiter.power.readOp.leakage) << " W" << endl;
|
||||
cout << indent_str<< indent_str_next << "Gate Leakage = " << router->arbiter.power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str<< indent_str_next << "Runtime Dynamic = " << router->arbiter.rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl;
|
||||
cout <<endl;
|
||||
}
|
||||
}
|
||||
if (link_bus_exist)
|
||||
{
|
||||
cout << indent_str << (nocdynp.type? "Per Router ":"") << link_name<<": " << endl;
|
||||
cout << indent_str_next << "Area = " << link_bus_tot_per_Router.area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str_next<< "Peak Dynamic = " << link_bus_tot_per_Router.power.readOp.dynamic*
|
||||
nocdynp.clockRate << " W" << endl;
|
||||
cout << indent_str_next << "Subthreshold Leakage = "
|
||||
<< (long_channel? link_bus_tot_per_Router.power.readOp.longer_channel_leakage:link_bus_tot_per_Router.power.readOp.leakage)
|
||||
<<" W" << endl;
|
||||
cout << indent_str_next << "Gate Leakage = " << link_bus_tot_per_Router.power.readOp.gate_leakage
|
||||
<< " W" << endl;
|
||||
cout << indent_str_next<< "Runtime Dynamic = " << link_bus->rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl;
|
||||
cout<<endl;
|
||||
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// cout << indent_str_next << "Instruction Fetch Unit Peak Dynamic = " << ifu->rt_power.readOp.dynamic*clockRate << " W" << endl;
|
||||
// cout << indent_str_next << "Instruction Fetch Unit Subthreshold Leakage = " << ifu->rt_power.readOp.leakage <<" W" << endl;
|
||||
// cout << indent_str_next << "Instruction Fetch Unit Gate Leakage = " << ifu->rt_power.readOp.gate_leakage << " W" << endl;
|
||||
// cout << indent_str_next << "Load Store Unit Peak Dynamic = " << lsu->rt_power.readOp.dynamic*clockRate << " W" << endl;
|
||||
// cout << indent_str_next << "Load Store Unit Subthreshold Leakage = " << lsu->rt_power.readOp.leakage << " W" << endl;
|
||||
// cout << indent_str_next << "Load Store Unit Gate Leakage = " << lsu->rt_power.readOp.gate_leakage << " W" << endl;
|
||||
// cout << indent_str_next << "Memory Management Unit Peak Dynamic = " << mmu->rt_power.readOp.dynamic*clockRate << " W" << endl;
|
||||
// cout << indent_str_next << "Memory Management Unit Subthreshold Leakage = " << mmu->rt_power.readOp.leakage << " W" << endl;
|
||||
// cout << indent_str_next << "Memory Management Unit Gate Leakage = " << mmu->rt_power.readOp.gate_leakage << " W" << endl;
|
||||
// cout << indent_str_next << "Execution Unit Peak Dynamic = " << exu->rt_power.readOp.dynamic*clockRate << " W" << endl;
|
||||
// cout << indent_str_next << "Execution Unit Subthreshold Leakage = " << exu->rt_power.readOp.leakage << " W" << endl;
|
||||
// cout << indent_str_next << "Execution Unit Gate Leakage = " << exu->rt_power.readOp.gate_leakage << " W" << endl;
|
||||
}
|
||||
}
|
||||
|
||||
void NoC::set_noc_param()
|
||||
{
|
||||
|
||||
nocdynp.type = XML->sys.NoC[ithNoC].type;
|
||||
nocdynp.clockRate =XML->sys.NoC[ithNoC].clockrate;
|
||||
nocdynp.clockRate *= 1e6;
|
||||
nocdynp.executionTime = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
|
||||
|
||||
nocdynp.flit_size = XML->sys.NoC[ithNoC].flit_bits;
|
||||
if (nocdynp.type)
|
||||
{
|
||||
nocdynp.input_ports = XML->sys.NoC[ithNoC].input_ports;
|
||||
nocdynp.output_ports = XML->sys.NoC[ithNoC].output_ports;//later minus 1
|
||||
nocdynp.min_ports = min(nocdynp.input_ports,nocdynp.output_ports);
|
||||
nocdynp.global_linked_ports = (nocdynp.input_ports-1) + (nocdynp.output_ports-1);
|
||||
/*
|
||||
* Except local i/o ports, all ports needs links( global_linked_ports);
|
||||
* However only min_ports can be fully active simultaneously
|
||||
* since the fewer number of ports (input or output ) is the bottleneck.
|
||||
*/
|
||||
}
|
||||
else
|
||||
{
|
||||
nocdynp.input_ports = 1;
|
||||
nocdynp.output_ports = 1;
|
||||
nocdynp.min_ports = min(nocdynp.input_ports,nocdynp.output_ports);
|
||||
nocdynp.global_linked_ports = 1;
|
||||
}
|
||||
|
||||
nocdynp.virtual_channel_per_port = XML->sys.NoC[ithNoC].virtual_channel_per_port;
|
||||
nocdynp.input_buffer_entries_per_vc = XML->sys.NoC[ithNoC].input_buffer_entries_per_vc;
|
||||
|
||||
nocdynp.horizontal_nodes = XML->sys.NoC[ithNoC].horizontal_nodes;
|
||||
nocdynp.vertical_nodes = XML->sys.NoC[ithNoC].vertical_nodes;
|
||||
nocdynp.total_nodes = nocdynp.horizontal_nodes*nocdynp.vertical_nodes;
|
||||
nocdynp.duty_cycle = XML->sys.NoC[ithNoC].duty_cycle;
|
||||
nocdynp.has_global_link = XML->sys.NoC[ithNoC].has_global_link;
|
||||
nocdynp.link_throughput = XML->sys.NoC[ithNoC].link_throughput;
|
||||
nocdynp.link_latency = XML->sys.NoC[ithNoC].link_latency;
|
||||
nocdynp.chip_coverage = XML->sys.NoC[ithNoC].chip_coverage;
|
||||
nocdynp.route_over_perc = XML->sys.NoC[ithNoC].route_over_perc;
|
||||
|
||||
assert (nocdynp.chip_coverage <=1);
|
||||
assert (nocdynp.route_over_perc <=1);
|
||||
|
||||
if (nocdynp.type)
|
||||
name = "NOC";
|
||||
else
|
||||
name = "BUSES";
|
||||
|
||||
}
|
||||
|
||||
|
||||
/*
 * Release the router and link/bus models created by init_router() and
 * init_link_bus(). Deleting a null pointer is a no-op, so no explicit
 * null checks are needed.
 */
NoC::~NoC()
{
    delete router;
    router = 0;

    delete link_bus;
    link_bus = 0;
}
|
75
ext/mcpat/noc.h
Normal file
75
ext/mcpat/noc.h
Normal file
|
@ -0,0 +1,75 @@
|
|||
/*****************************************************************************
|
||||
* McPAT
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef NOC_H_
|
||||
#define NOC_H_
|
||||
#include "XML_Parse.h"
|
||||
#include "array.h"
|
||||
#include "basic_components.h"
|
||||
#include "interconnect.h"
|
||||
#include "logic.h"
|
||||
#include "parameter.h"
|
||||
#include "router.h"
|
||||
|
||||
class NoC :public Component {
|
||||
public:
|
||||
|
||||
ParseXML *XML;
|
||||
int ithNoC;
|
||||
InputParameter interface_ip;
|
||||
double link_len;
|
||||
double executionTime;
|
||||
double scktRatio, chip_PR_overhead, macro_PR_overhead;
|
||||
Router * router;
|
||||
interconnect * link_bus;
|
||||
NoCParam nocdynp;
|
||||
uca_org_t local_result;
|
||||
statsDef tdp_stats;
|
||||
statsDef rtp_stats;
|
||||
statsDef stats_t;
|
||||
powerDef power_t;
|
||||
Component link_bus_tot_per_Router;
|
||||
bool link_bus_exist;
|
||||
bool router_exist;
|
||||
string name, link_name;
|
||||
double M_traffic_pattern;
|
||||
NoC(ParseXML *XML_interface, int ithNoC_, InputParameter* interface_ip_, double M_traffic_pattern_ = 0.6,double link_len_=0);
|
||||
void set_noc_param();
|
||||
void computeEnergy(bool is_tdp=true);
|
||||
void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
|
||||
void init_link_bus(double link_len_);
|
||||
void init_router();
|
||||
void computeEnergy_link_bus(bool is_tdp=true);
|
||||
void displayEnergy_link_bus(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
|
||||
~NoC();
|
||||
};
|
||||
|
||||
#endif /* NOC_H_ */
|
839
ext/mcpat/processor.cc
Normal file
839
ext/mcpat/processor.cc
Normal file
|
@ -0,0 +1,839 @@
|
|||
/*****************************************************************************
|
||||
* McPAT
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
|
||||
*
|
||||
***************************************************************************/
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
|
||||
#include "XML_Parse.h"
|
||||
#include "array.h"
|
||||
#include "basic_circuit.h"
|
||||
#include "const.h"
|
||||
#include "parameter.h"
|
||||
#include "processor.h"
|
||||
#include "version.h"
|
||||
|
||||
Processor::Processor(ParseXML *XML_interface)
|
||||
:XML(XML_interface),//TODO: using one global copy may have problems.
|
||||
mc(0),
|
||||
niu(0),
|
||||
pcie(0),
|
||||
flashcontroller(0)
|
||||
{
|
||||
/*
|
||||
* placement and routing overhead is 10%, core scales worse than cache 40% is accumulated from 90 to 22nm
|
||||
* There is no point to have heterogeneous memory controller on chip,
|
||||
* thus McPAT only support homogeneous memory controllers.
|
||||
*/
|
||||
int i;
|
||||
double pppm_t[4] = {1,1,1,1};
|
||||
set_proc_param();
|
||||
if (procdynp.homoCore)
|
||||
numCore = procdynp.numCore==0? 0:1;
|
||||
else
|
||||
numCore = procdynp.numCore;
|
||||
|
||||
if (procdynp.homoL2)
|
||||
numL2 = procdynp.numL2==0? 0:1;
|
||||
else
|
||||
numL2 = procdynp.numL2;
|
||||
|
||||
if (XML->sys.Private_L2 && numCore != numL2)
|
||||
{
|
||||
cout<<"Number of private L2 does not match number of cores"<<endl;
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if (procdynp.homoL3)
|
||||
numL3 = procdynp.numL3==0? 0:1;
|
||||
else
|
||||
numL3 = procdynp.numL3;
|
||||
|
||||
if (procdynp.homoNOC)
|
||||
numNOC = procdynp.numNOC==0? 0:1;
|
||||
else
|
||||
numNOC = procdynp.numNOC;
|
||||
|
||||
// if (!procdynp.homoNOC)
|
||||
// {
|
||||
// cout<<"Current McPAT does not support heterogeneous NOC"<<endl;
|
||||
// exit(0);
|
||||
// }
|
||||
|
||||
if (procdynp.homoL1Dir)
|
||||
numL1Dir = procdynp.numL1Dir==0? 0:1;
|
||||
else
|
||||
numL1Dir = procdynp.numL1Dir;
|
||||
|
||||
if (procdynp.homoL2Dir)
|
||||
numL2Dir = procdynp.numL2Dir==0? 0:1;
|
||||
else
|
||||
numL2Dir = procdynp.numL2Dir;
|
||||
|
||||
for (i = 0;i < numCore; i++)
|
||||
{
|
||||
cores.push_back(new Core(XML,i, &interface_ip));
|
||||
cores[i]->computeEnergy();
|
||||
cores[i]->computeEnergy(false);
|
||||
if (procdynp.homoCore){
|
||||
core.area.set_area(core.area.get_area() + cores[i]->area.get_area()*procdynp.numCore);
|
||||
set_pppm(pppm_t,cores[i]->clockRate*procdynp.numCore, procdynp.numCore,procdynp.numCore,procdynp.numCore);
|
||||
core.power = core.power + cores[i]->power*pppm_t;
|
||||
set_pppm(pppm_t,1/cores[i]->executionTime, procdynp.numCore,procdynp.numCore,procdynp.numCore);
|
||||
core.rt_power = core.rt_power + cores[i]->rt_power*pppm_t;
|
||||
area.set_area(area.get_area() + core.area.get_area());//placement and routing overhead is 10%, core scales worse than cache 40% is accumulated from 90 to 22nm
|
||||
power = power + core.power;
|
||||
rt_power = rt_power + core.rt_power;
|
||||
}
|
||||
else{
|
||||
core.area.set_area(core.area.get_area() + cores[i]->area.get_area());
|
||||
area.set_area(area.get_area() + cores[i]->area.get_area());//placement and routing overhead is 10%, core scales worse than cache 40% is accumulated from 90 to 22nm
|
||||
|
||||
set_pppm(pppm_t,cores[i]->clockRate, 1, 1, 1);
|
||||
core.power = core.power + cores[i]->power*pppm_t;
|
||||
power = power + cores[i]->power*pppm_t;
|
||||
|
||||
set_pppm(pppm_t,1/cores[i]->executionTime, 1, 1, 1);
|
||||
core.rt_power = core.rt_power + cores[i]->rt_power*pppm_t;
|
||||
rt_power = rt_power + cores[i]->rt_power*pppm_t;
|
||||
}
|
||||
}
|
||||
|
||||
if (!XML->sys.Private_L2)
|
||||
{
|
||||
if (numL2 >0)
|
||||
for (i = 0;i < numL2; i++)
|
||||
{
|
||||
l2array.push_back(new SharedCache(XML,i, &interface_ip));
|
||||
l2array[i]->computeEnergy();
|
||||
l2array[i]->computeEnergy(false);
|
||||
if (procdynp.homoL2){
|
||||
l2.area.set_area(l2.area.get_area() + l2array[i]->area.get_area()*procdynp.numL2);
|
||||
set_pppm(pppm_t,l2array[i]->cachep.clockRate*procdynp.numL2, procdynp.numL2,procdynp.numL2,procdynp.numL2);
|
||||
l2.power = l2.power + l2array[i]->power*pppm_t;
|
||||
set_pppm(pppm_t,1/l2array[i]->cachep.executionTime, procdynp.numL2,procdynp.numL2,procdynp.numL2);
|
||||
l2.rt_power = l2.rt_power + l2array[i]->rt_power*pppm_t;
|
||||
area.set_area(area.get_area() + l2.area.get_area());//placement and routing overhead is 10%, l2 scales worse than cache 40% is accumulated from 90 to 22nm
|
||||
power = power + l2.power;
|
||||
rt_power = rt_power + l2.rt_power;
|
||||
}
|
||||
else{
|
||||
l2.area.set_area(l2.area.get_area() + l2array[i]->area.get_area());
|
||||
area.set_area(area.get_area() + l2array[i]->area.get_area());//placement and routing overhead is 10%, l2 scales worse than cache 40% is accumulated from 90 to 22nm
|
||||
|
||||
set_pppm(pppm_t,l2array[i]->cachep.clockRate, 1, 1, 1);
|
||||
l2.power = l2.power + l2array[i]->power*pppm_t;
|
||||
power = power + l2array[i]->power*pppm_t;;
|
||||
set_pppm(pppm_t,1/l2array[i]->cachep.executionTime, 1, 1, 1);
|
||||
l2.rt_power = l2.rt_power + l2array[i]->rt_power*pppm_t;
|
||||
rt_power = rt_power + l2array[i]->rt_power*pppm_t;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (numL3 >0)
|
||||
for (i = 0;i < numL3; i++)
|
||||
{
|
||||
l3array.push_back(new SharedCache(XML,i, &interface_ip, L3));
|
||||
l3array[i]->computeEnergy();
|
||||
l3array[i]->computeEnergy(false);
|
||||
if (procdynp.homoL3){
|
||||
l3.area.set_area(l3.area.get_area() + l3array[i]->area.get_area()*procdynp.numL3);
|
||||
set_pppm(pppm_t,l3array[i]->cachep.clockRate*procdynp.numL3, procdynp.numL3,procdynp.numL3,procdynp.numL3);
|
||||
l3.power = l3.power + l3array[i]->power*pppm_t;
|
||||
set_pppm(pppm_t,1/l3array[i]->cachep.executionTime, procdynp.numL3,procdynp.numL3,procdynp.numL3);
|
||||
l3.rt_power = l3.rt_power + l3array[i]->rt_power*pppm_t;
|
||||
area.set_area(area.get_area() + l3.area.get_area());//placement and routing overhead is 10%, l3 scales worse than cache 40% is accumulated from 90 to 22nm
|
||||
power = power + l3.power;
|
||||
rt_power = rt_power + l3.rt_power;
|
||||
|
||||
}
|
||||
else{
|
||||
l3.area.set_area(l3.area.get_area() + l3array[i]->area.get_area());
|
||||
area.set_area(area.get_area() + l3array[i]->area.get_area());//placement and routing overhead is 10%, l3 scales worse than cache 40% is accumulated from 90 to 22nm
|
||||
set_pppm(pppm_t,l3array[i]->cachep.clockRate, 1, 1, 1);
|
||||
l3.power = l3.power + l3array[i]->power*pppm_t;
|
||||
power = power + l3array[i]->power*pppm_t;
|
||||
set_pppm(pppm_t,1/l3array[i]->cachep.executionTime, 1, 1, 1);
|
||||
l3.rt_power = l3.rt_power + l3array[i]->rt_power*pppm_t;
|
||||
rt_power = rt_power + l3array[i]->rt_power*pppm_t;
|
||||
|
||||
}
|
||||
}
|
||||
if (numL1Dir >0)
|
||||
for (i = 0;i < numL1Dir; i++)
|
||||
{
|
||||
l1dirarray.push_back(new SharedCache(XML,i, &interface_ip, L1Directory));
|
||||
l1dirarray[i]->computeEnergy();
|
||||
l1dirarray[i]->computeEnergy(false);
|
||||
if (procdynp.homoL1Dir){
|
||||
l1dir.area.set_area(l1dir.area.get_area() + l1dirarray[i]->area.get_area()*procdynp.numL1Dir);
|
||||
set_pppm(pppm_t,l1dirarray[i]->cachep.clockRate*procdynp.numL1Dir, procdynp.numL1Dir,procdynp.numL1Dir,procdynp.numL1Dir);
|
||||
l1dir.power = l1dir.power + l1dirarray[i]->power*pppm_t;
|
||||
set_pppm(pppm_t,1/l1dirarray[i]->cachep.executionTime, procdynp.numL1Dir,procdynp.numL1Dir,procdynp.numL1Dir);
|
||||
l1dir.rt_power = l1dir.rt_power + l1dirarray[i]->rt_power*pppm_t;
|
||||
area.set_area(area.get_area() + l1dir.area.get_area());//placement and routing overhead is 10%, l1dir scales worse than cache 40% is accumulated from 90 to 22nm
|
||||
power = power + l1dir.power;
|
||||
rt_power = rt_power + l1dir.rt_power;
|
||||
|
||||
}
|
||||
else{
|
||||
l1dir.area.set_area(l1dir.area.get_area() + l1dirarray[i]->area.get_area());
|
||||
area.set_area(area.get_area() + l1dirarray[i]->area.get_area());
|
||||
set_pppm(pppm_t,l1dirarray[i]->cachep.clockRate, 1, 1, 1);
|
||||
l1dir.power = l1dir.power + l1dirarray[i]->power*pppm_t;
|
||||
power = power + l1dirarray[i]->power;
|
||||
set_pppm(pppm_t,1/l1dirarray[i]->cachep.executionTime, 1, 1, 1);
|
||||
l1dir.rt_power = l1dir.rt_power + l1dirarray[i]->rt_power*pppm_t;
|
||||
rt_power = rt_power + l1dirarray[i]->rt_power;
|
||||
}
|
||||
}
|
||||
|
||||
if (numL2Dir >0)
|
||||
for (i = 0;i < numL2Dir; i++)
|
||||
{
|
||||
l2dirarray.push_back(new SharedCache(XML,i, &interface_ip, L2Directory));
|
||||
l2dirarray[i]->computeEnergy();
|
||||
l2dirarray[i]->computeEnergy(false);
|
||||
if (procdynp.homoL2Dir){
|
||||
l2dir.area.set_area(l2dir.area.get_area() + l2dirarray[i]->area.get_area()*procdynp.numL2Dir);
|
||||
set_pppm(pppm_t,l2dirarray[i]->cachep.clockRate*procdynp.numL2Dir, procdynp.numL2Dir,procdynp.numL2Dir,procdynp.numL2Dir);
|
||||
l2dir.power = l2dir.power + l2dirarray[i]->power*pppm_t;
|
||||
set_pppm(pppm_t,1/l2dirarray[i]->cachep.executionTime, procdynp.numL2Dir,procdynp.numL2Dir,procdynp.numL2Dir);
|
||||
l2dir.rt_power = l2dir.rt_power + l2dirarray[i]->rt_power*pppm_t;
|
||||
area.set_area(area.get_area() + l2dir.area.get_area());//placement and routing overhead is 10%, l2dir scales worse than cache 40% is accumulated from 90 to 22nm
|
||||
power = power + l2dir.power;
|
||||
rt_power = rt_power + l2dir.rt_power;
|
||||
|
||||
}
|
||||
else{
|
||||
l2dir.area.set_area(l2dir.area.get_area() + l2dirarray[i]->area.get_area());
|
||||
area.set_area(area.get_area() + l2dirarray[i]->area.get_area());
|
||||
set_pppm(pppm_t,l2dirarray[i]->cachep.clockRate, 1, 1, 1);
|
||||
l2dir.power = l2dir.power + l2dirarray[i]->power*pppm_t;
|
||||
power = power + l2dirarray[i]->power*pppm_t;
|
||||
set_pppm(pppm_t,1/l2dirarray[i]->cachep.executionTime, 1, 1, 1);
|
||||
l2dir.rt_power = l2dir.rt_power + l2dirarray[i]->rt_power*pppm_t;
|
||||
rt_power = rt_power + l2dirarray[i]->rt_power*pppm_t;
|
||||
}
|
||||
}
|
||||
|
||||
if (XML->sys.mc.number_mcs >0 && XML->sys.mc.memory_channels_per_mc>0)
|
||||
{
|
||||
mc = new MemoryController(XML, &interface_ip, MC);
|
||||
mc->computeEnergy();
|
||||
mc->computeEnergy(false);
|
||||
mcs.area.set_area(mcs.area.get_area()+mc->area.get_area()*XML->sys.mc.number_mcs);
|
||||
area.set_area(area.get_area()+mc->area.get_area()*XML->sys.mc.number_mcs);
|
||||
set_pppm(pppm_t,XML->sys.mc.number_mcs*mc->mcp.clockRate, XML->sys.mc.number_mcs,XML->sys.mc.number_mcs,XML->sys.mc.number_mcs);
|
||||
mcs.power = mc->power*pppm_t;
|
||||
power = power + mcs.power;
|
||||
set_pppm(pppm_t,1/mc->mcp.executionTime, XML->sys.mc.number_mcs,XML->sys.mc.number_mcs,XML->sys.mc.number_mcs);
|
||||
mcs.rt_power = mc->rt_power*pppm_t;
|
||||
rt_power = rt_power + mcs.rt_power;
|
||||
|
||||
}
|
||||
|
||||
if (XML->sys.flashc.number_mcs >0 )//flash controller
|
||||
{
|
||||
flashcontroller = new FlashController(XML, &interface_ip);
|
||||
flashcontroller->computeEnergy();
|
||||
flashcontroller->computeEnergy(false);
|
||||
double number_fcs = flashcontroller->fcp.num_mcs;
|
||||
flashcontrollers.area.set_area(flashcontrollers.area.get_area()+flashcontroller->area.get_area()*number_fcs);
|
||||
area.set_area(area.get_area()+flashcontrollers.area.get_area());
|
||||
set_pppm(pppm_t,number_fcs, number_fcs ,number_fcs, number_fcs );
|
||||
flashcontrollers.power = flashcontroller->power*pppm_t;
|
||||
power = power + flashcontrollers.power;
|
||||
set_pppm(pppm_t,number_fcs , number_fcs ,number_fcs ,number_fcs );
|
||||
flashcontrollers.rt_power = flashcontroller->rt_power*pppm_t;
|
||||
rt_power = rt_power + flashcontrollers.rt_power;
|
||||
|
||||
}
|
||||
|
||||
if (XML->sys.niu.number_units >0)
|
||||
{
|
||||
niu = new NIUController(XML, &interface_ip);
|
||||
niu->computeEnergy();
|
||||
niu->computeEnergy(false);
|
||||
nius.area.set_area(nius.area.get_area()+niu->area.get_area()*XML->sys.niu.number_units);
|
||||
area.set_area(area.get_area()+niu->area.get_area()*XML->sys.niu.number_units);
|
||||
set_pppm(pppm_t,XML->sys.niu.number_units*niu->niup.clockRate, XML->sys.niu.number_units,XML->sys.niu.number_units,XML->sys.niu.number_units);
|
||||
nius.power = niu->power*pppm_t;
|
||||
power = power + nius.power;
|
||||
set_pppm(pppm_t,XML->sys.niu.number_units*niu->niup.clockRate, XML->sys.niu.number_units,XML->sys.niu.number_units,XML->sys.niu.number_units);
|
||||
nius.rt_power = niu->rt_power*pppm_t;
|
||||
rt_power = rt_power + nius.rt_power;
|
||||
|
||||
}
|
||||
|
||||
if (XML->sys.pcie.number_units >0 && XML->sys.pcie.num_channels >0)
|
||||
{
|
||||
pcie = new PCIeController(XML, &interface_ip);
|
||||
pcie->computeEnergy();
|
||||
pcie->computeEnergy(false);
|
||||
pcies.area.set_area(pcies.area.get_area()+pcie->area.get_area()*XML->sys.pcie.number_units);
|
||||
area.set_area(area.get_area()+pcie->area.get_area()*XML->sys.pcie.number_units);
|
||||
set_pppm(pppm_t,XML->sys.pcie.number_units*pcie->pciep.clockRate, XML->sys.pcie.number_units,XML->sys.pcie.number_units,XML->sys.pcie.number_units);
|
||||
pcies.power = pcie->power*pppm_t;
|
||||
power = power + pcies.power;
|
||||
set_pppm(pppm_t,XML->sys.pcie.number_units*pcie->pciep.clockRate, XML->sys.pcie.number_units,XML->sys.pcie.number_units,XML->sys.pcie.number_units);
|
||||
pcies.rt_power = pcie->rt_power*pppm_t;
|
||||
rt_power = rt_power + pcies.rt_power;
|
||||
|
||||
}
|
||||
|
||||
if (numNOC >0)
|
||||
{
|
||||
for (i = 0;i < numNOC; i++)
|
||||
{
|
||||
if (XML->sys.NoC[i].type)
|
||||
{//First add up area of routers if NoC is used
|
||||
nocs.push_back(new NoC(XML,i, &interface_ip, 1));
|
||||
if (procdynp.homoNOC)
|
||||
{
|
||||
noc.area.set_area(noc.area.get_area() + nocs[i]->area.get_area()*procdynp.numNOC);
|
||||
area.set_area(area.get_area() + noc.area.get_area());
|
||||
}
|
||||
else
|
||||
{
|
||||
noc.area.set_area(noc.area.get_area() + nocs[i]->area.get_area());
|
||||
area.set_area(area.get_area() + nocs[i]->area.get_area());
|
||||
}
|
||||
}
|
||||
else
|
||||
{//Bus based interconnect
|
||||
nocs.push_back(new NoC(XML,i, &interface_ip, 1, sqrt(area.get_area()*XML->sys.NoC[i].chip_coverage)));
|
||||
if (procdynp.homoNOC){
|
||||
noc.area.set_area(noc.area.get_area() + nocs[i]->area.get_area()*procdynp.numNOC);
|
||||
area.set_area(area.get_area() + noc.area.get_area());
|
||||
}
|
||||
else
|
||||
{
|
||||
noc.area.set_area(noc.area.get_area() + nocs[i]->area.get_area());
|
||||
area.set_area(area.get_area() + nocs[i]->area.get_area());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Compute global links associated with each NOC, if any. This must be done at the end (even after the NOC router part) since the total chip
|
||||
* area must be obtain to decide the link routing
|
||||
*/
|
||||
for (i = 0;i < numNOC; i++)
|
||||
{
|
||||
if (nocs[i]->nocdynp.has_global_link && XML->sys.NoC[i].type)
|
||||
{
|
||||
nocs[i]->init_link_bus(sqrt(area.get_area()*XML->sys.NoC[i].chip_coverage));//compute global links
|
||||
if (procdynp.homoNOC)
|
||||
{
|
||||
noc.area.set_area(noc.area.get_area() + nocs[i]->link_bus_tot_per_Router.area.get_area()
|
||||
* nocs[i]->nocdynp.total_nodes
|
||||
* procdynp.numNOC);
|
||||
area.set_area(area.get_area() + nocs[i]->link_bus_tot_per_Router.area.get_area()
|
||||
* nocs[i]->nocdynp.total_nodes
|
||||
* procdynp.numNOC);
|
||||
}
|
||||
else
|
||||
{
|
||||
noc.area.set_area(noc.area.get_area() + nocs[i]->link_bus_tot_per_Router.area.get_area()
|
||||
* nocs[i]->nocdynp.total_nodes);
|
||||
area.set_area(area.get_area() + nocs[i]->link_bus_tot_per_Router.area.get_area()
|
||||
* nocs[i]->nocdynp.total_nodes);
|
||||
}
|
||||
}
|
||||
}
|
||||
//Compute energy of NoC (w or w/o links) or buses
|
||||
for (i = 0;i < numNOC; i++)
|
||||
{
|
||||
nocs[i]->computeEnergy();
|
||||
nocs[i]->computeEnergy(false);
|
||||
if (procdynp.homoNOC){
|
||||
set_pppm(pppm_t,procdynp.numNOC*nocs[i]->nocdynp.clockRate, procdynp.numNOC,procdynp.numNOC,procdynp.numNOC);
|
||||
noc.power = noc.power + nocs[i]->power*pppm_t;
|
||||
set_pppm(pppm_t,1/nocs[i]->nocdynp.executionTime, procdynp.numNOC,procdynp.numNOC,procdynp.numNOC);
|
||||
noc.rt_power = noc.rt_power + nocs[i]->rt_power*pppm_t;
|
||||
power = power + noc.power;
|
||||
rt_power = rt_power + noc.rt_power;
|
||||
}
|
||||
else
|
||||
{
|
||||
set_pppm(pppm_t,nocs[i]->nocdynp.clockRate, 1, 1, 1);
|
||||
noc.power = noc.power + nocs[i]->power*pppm_t;
|
||||
power = power + nocs[i]->power*pppm_t;
|
||||
set_pppm(pppm_t,1/nocs[i]->nocdynp.executionTime, 1, 1, 1);
|
||||
noc.rt_power = noc.rt_power + nocs[i]->rt_power*pppm_t;
|
||||
rt_power = rt_power + nocs[i]->rt_power*pppm_t;
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// //clock power
|
||||
// globalClock.init_wire_external(is_default, &interface_ip);
|
||||
// globalClock.clk_area =area*1e6; //change it from mm^2 to um^2
|
||||
// globalClock.end_wiring_level =5;//toplevel metal
|
||||
// globalClock.start_wiring_level =5;//toplevel metal
|
||||
// globalClock.l_ip.with_clock_grid=false;//global clock does not drive local final nodes
|
||||
// globalClock.optimize_wire();
|
||||
|
||||
}
|
||||
|
||||
void Processor::displayDeviceType(int device_type_, uint32_t indent)
|
||||
{
|
||||
string indent_str(indent, ' ');
|
||||
|
||||
switch ( device_type_ ) {
|
||||
|
||||
case 0 :
|
||||
cout <<indent_str<<"Device Type= "<<"ITRS high performance device type"<<endl;
|
||||
break;
|
||||
case 1 :
|
||||
cout <<indent_str<<"Device Type= "<<"ITRS low standby power device type"<<endl;
|
||||
break;
|
||||
case 2 :
|
||||
cout <<indent_str<<"Device Type= "<<"ITRS low operating power device type"<<endl;
|
||||
break;
|
||||
case 3 :
|
||||
cout <<indent_str<<"Device Type= "<<"LP-DRAM device type"<<endl;
|
||||
break;
|
||||
case 4 :
|
||||
cout <<indent_str<<"Device Type= "<<"COMM-DRAM device type"<<endl;
|
||||
break;
|
||||
default :
|
||||
{
|
||||
cout <<indent_str<<"Unknown Device Type"<<endl;
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Processor::displayInterconnectType(int interconnect_type_, uint32_t indent)
|
||||
{
|
||||
string indent_str(indent, ' ');
|
||||
|
||||
switch ( interconnect_type_ ) {
|
||||
|
||||
case 0 :
|
||||
cout <<indent_str<<"Interconnect metal projection= "<<"aggressive interconnect technology projection"<<endl;
|
||||
break;
|
||||
case 1 :
|
||||
cout <<indent_str<<"Interconnect metal projection= "<<"conservative interconnect technology projection"<<endl;
|
||||
break;
|
||||
default :
|
||||
{
|
||||
cout <<indent_str<<"Unknown Interconnect Projection Type"<<endl;
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Processor::displayEnergy(uint32_t indent, int plevel, bool is_tdp)
|
||||
{
|
||||
int i;
|
||||
bool long_channel = XML->sys.longer_channel_device;
|
||||
string indent_str(indent, ' ');
|
||||
string indent_str_next(indent+2, ' ');
|
||||
if (is_tdp)
|
||||
{
|
||||
|
||||
if (plevel<5)
|
||||
{
|
||||
cout<<"\nMcPAT (version "<< VER_MAJOR <<"."<< VER_MINOR
|
||||
<< " of " << VER_UPDATE << ") results (current print level is "<< plevel
|
||||
<<", please increase print level to see the details in components): "<<endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
cout<<"\nMcPAT (version "<< VER_MAJOR <<"."<< VER_MINOR
|
||||
<< " of " << VER_UPDATE << ") results (current print level is 5)"<< endl;
|
||||
}
|
||||
cout <<"*****************************************************************************************"<<endl;
|
||||
cout <<indent_str<<"Technology "<<XML->sys.core_tech_node<<" nm"<<endl;
|
||||
//cout <<indent_str<<"Device Type= "<<XML->sys.device_type<<endl;
|
||||
if (long_channel)
|
||||
cout <<indent_str<<"Using Long Channel Devices When Appropriate"<<endl;
|
||||
//cout <<indent_str<<"Interconnect metal projection= "<<XML->sys.interconnect_projection_type<<endl;
|
||||
displayInterconnectType(XML->sys.interconnect_projection_type, indent);
|
||||
cout <<indent_str<<"Core clock Rate(MHz) "<<XML->sys.core[0].clock_rate<<endl;
|
||||
cout <<endl;
|
||||
cout <<"*****************************************************************************************"<<endl;
|
||||
cout <<"Processor: "<<endl;
|
||||
cout << indent_str << "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str << "Peak Power = " << power.readOp.dynamic +
|
||||
(long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) + power.readOp.gate_leakage <<" W" << endl;
|
||||
cout << indent_str << "Total Leakage = " <<
|
||||
(long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) + power.readOp.gate_leakage <<" W" << endl;
|
||||
cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic << " W" << endl;
|
||||
cout << indent_str << "Subthreshold Leakage = " << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
|
||||
//cout << indent_str << "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl;
|
||||
cout << indent_str << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic << " W" << endl;
|
||||
cout <<endl;
|
||||
if (numCore >0){
|
||||
cout <<indent_str<<"Total Cores: "<<XML->sys.number_of_cores << " cores "<<endl;
|
||||
displayDeviceType(XML->sys.device_type,indent);
|
||||
cout << indent_str_next << "Area = " << core.area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str_next << "Peak Dynamic = " << core.power.readOp.dynamic << " W" << endl;
|
||||
cout << indent_str_next << "Subthreshold Leakage = "
|
||||
<< (long_channel? core.power.readOp.longer_channel_leakage:core.power.readOp.leakage) <<" W" << endl;
|
||||
//cout << indent_str_next << "Subthreshold Leakage = " << core.power.readOp.longer_channel_leakage <<" W" << endl;
|
||||
cout << indent_str_next << "Gate Leakage = " << core.power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str_next << "Runtime Dynamic = " << core.rt_power.readOp.dynamic << " W" << endl;
|
||||
cout <<endl;
|
||||
}
|
||||
if (!XML->sys.Private_L2)
|
||||
{
|
||||
if (numL2 >0){
|
||||
cout <<indent_str<<"Total L2s: "<<endl;
|
||||
displayDeviceType(XML->sys.L2[0].device_type,indent);
|
||||
cout << indent_str_next << "Area = " << l2.area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str_next << "Peak Dynamic = " << l2.power.readOp.dynamic << " W" << endl;
|
||||
cout << indent_str_next << "Subthreshold Leakage = "
|
||||
<< (long_channel? l2.power.readOp.longer_channel_leakage:l2.power.readOp.leakage) <<" W" << endl;
|
||||
//cout << indent_str_next << "Subthreshold Leakage = " << l2.power.readOp.longer_channel_leakage <<" W" << endl;
|
||||
cout << indent_str_next << "Gate Leakage = " << l2.power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str_next << "Runtime Dynamic = " << l2.rt_power.readOp.dynamic << " W" << endl;
|
||||
cout <<endl;
|
||||
}
|
||||
}
|
||||
if (numL3 >0){
|
||||
cout <<indent_str<<"Total L3s: "<<endl;
|
||||
displayDeviceType(XML->sys.L3[0].device_type, indent);
|
||||
cout << indent_str_next << "Area = " << l3.area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str_next << "Peak Dynamic = " << l3.power.readOp.dynamic << " W" << endl;
|
||||
cout << indent_str_next << "Subthreshold Leakage = "
|
||||
<< (long_channel? l3.power.readOp.longer_channel_leakage:l3.power.readOp.leakage) <<" W" << endl;
|
||||
//cout << indent_str_next << "Subthreshold Leakage = " << l3.power.readOp.longer_channel_leakage <<" W" << endl;
|
||||
cout << indent_str_next << "Gate Leakage = " << l3.power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str_next << "Runtime Dynamic = " << l3.rt_power.readOp.dynamic << " W" << endl;
|
||||
cout <<endl;
|
||||
}
|
||||
if (numL1Dir >0){
|
||||
cout <<indent_str<<"Total First Level Directory: "<<endl;
|
||||
displayDeviceType(XML->sys.L1Directory[0].device_type, indent);
|
||||
cout << indent_str_next << "Area = " << l1dir.area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str_next << "Peak Dynamic = " << l1dir.power.readOp.dynamic << " W" << endl;
|
||||
cout << indent_str_next << "Subthreshold Leakage = "
|
||||
<< (long_channel? l1dir.power.readOp.longer_channel_leakage:l1dir.power.readOp.leakage) <<" W" << endl;
|
||||
//cout << indent_str_next << "Subthreshold Leakage = " << l1dir.power.readOp.longer_channel_leakage <<" W" << endl;
|
||||
cout << indent_str_next << "Gate Leakage = " << l1dir.power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str_next << "Runtime Dynamic = " << l1dir.rt_power.readOp.dynamic << " W" << endl;
|
||||
cout <<endl;
|
||||
}
|
||||
if (numL2Dir >0){
|
||||
cout <<indent_str<<"Total First Level Directory: "<<endl;
|
||||
displayDeviceType(XML->sys.L1Directory[0].device_type, indent);
|
||||
cout << indent_str_next << "Area = " << l2dir.area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str_next << "Peak Dynamic = " << l2dir.power.readOp.dynamic << " W" << endl;
|
||||
cout << indent_str_next << "Subthreshold Leakage = "
|
||||
<< (long_channel? l2dir.power.readOp.longer_channel_leakage:l2dir.power.readOp.leakage) <<" W" << endl;
|
||||
//cout << indent_str_next << "Subthreshold Leakage = " << l2dir.power.readOp.longer_channel_leakage <<" W" << endl;
|
||||
cout << indent_str_next << "Gate Leakage = " << l2dir.power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str_next << "Runtime Dynamic = " << l2dir.rt_power.readOp.dynamic << " W" << endl;
|
||||
cout <<endl;
|
||||
}
|
||||
if (numNOC >0){
|
||||
cout <<indent_str<<"Total NoCs (Network/Bus): "<<endl;
|
||||
displayDeviceType(XML->sys.device_type, indent);
|
||||
cout << indent_str_next << "Area = " << noc.area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str_next << "Peak Dynamic = " << noc.power.readOp.dynamic << " W" << endl;
|
||||
cout << indent_str_next << "Subthreshold Leakage = "
|
||||
<< (long_channel? noc.power.readOp.longer_channel_leakage:noc.power.readOp.leakage) <<" W" << endl;
|
||||
//cout << indent_str_next << "Subthreshold Leakage = " << noc.power.readOp.longer_channel_leakage <<" W" << endl;
|
||||
cout << indent_str_next << "Gate Leakage = " << noc.power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str_next << "Runtime Dynamic = " << noc.rt_power.readOp.dynamic << " W" << endl;
|
||||
cout <<endl;
|
||||
}
|
||||
if (XML->sys.mc.number_mcs >0 && XML->sys.mc.memory_channels_per_mc>0)
|
||||
{
|
||||
cout <<indent_str<<"Total MCs: "<<XML->sys.mc.number_mcs << " Memory Controllers "<<endl;
|
||||
displayDeviceType(XML->sys.device_type, indent);
|
||||
cout << indent_str_next << "Area = " << mcs.area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str_next << "Peak Dynamic = " << mcs.power.readOp.dynamic << " W" << endl;
|
||||
cout << indent_str_next << "Subthreshold Leakage = "
|
||||
<< (long_channel? mcs.power.readOp.longer_channel_leakage:mcs.power.readOp.leakage) <<" W" << endl;
|
||||
cout << indent_str_next << "Gate Leakage = " << mcs.power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str_next << "Runtime Dynamic = " << mcs.rt_power.readOp.dynamic << " W" << endl;
|
||||
cout <<endl;
|
||||
}
|
||||
if (XML->sys.flashc.number_mcs >0)
|
||||
{
|
||||
cout <<indent_str<<"Total Flash/SSD Controllers: "<<flashcontroller->fcp.num_mcs << " Flash/SSD Controllers "<<endl;
|
||||
displayDeviceType(XML->sys.device_type, indent);
|
||||
cout << indent_str_next << "Area = " << flashcontrollers.area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str_next << "Peak Dynamic = " << flashcontrollers.power.readOp.dynamic << " W" << endl;
|
||||
cout << indent_str_next << "Subthreshold Leakage = "
|
||||
<< (long_channel? flashcontrollers.power.readOp.longer_channel_leakage:flashcontrollers.power.readOp.leakage) <<" W" << endl;
|
||||
cout << indent_str_next << "Gate Leakage = " << flashcontrollers.power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str_next << "Runtime Dynamic = " << flashcontrollers.rt_power.readOp.dynamic << " W" << endl;
|
||||
cout <<endl;
|
||||
}
|
||||
if (XML->sys.niu.number_units >0 )
|
||||
{
|
||||
cout <<indent_str<<"Total NIUs: "<<niu->niup.num_units << " Network Interface Units "<<endl;
|
||||
displayDeviceType(XML->sys.device_type, indent);
|
||||
cout << indent_str_next << "Area = " << nius.area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str_next << "Peak Dynamic = " << nius.power.readOp.dynamic << " W" << endl;
|
||||
cout << indent_str_next << "Subthreshold Leakage = "
|
||||
<< (long_channel? nius.power.readOp.longer_channel_leakage:nius.power.readOp.leakage) <<" W" << endl;
|
||||
cout << indent_str_next << "Gate Leakage = " << nius.power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str_next << "Runtime Dynamic = " << nius.rt_power.readOp.dynamic << " W" << endl;
|
||||
cout <<endl;
|
||||
}
|
||||
if (XML->sys.pcie.number_units >0 && XML->sys.pcie.num_channels>0)
|
||||
{
|
||||
cout <<indent_str<<"Total PCIes: "<<pcie->pciep.num_units << " PCIe Controllers "<<endl;
|
||||
displayDeviceType(XML->sys.device_type, indent);
|
||||
cout << indent_str_next << "Area = " << pcies.area.get_area()*1e-6<< " mm^2" << endl;
|
||||
cout << indent_str_next << "Peak Dynamic = " << pcies.power.readOp.dynamic << " W" << endl;
|
||||
cout << indent_str_next << "Subthreshold Leakage = "
|
||||
<< (long_channel? pcies.power.readOp.longer_channel_leakage:pcies.power.readOp.leakage) <<" W" << endl;
|
||||
cout << indent_str_next << "Gate Leakage = " << pcies.power.readOp.gate_leakage << " W" << endl;
|
||||
cout << indent_str_next << "Runtime Dynamic = " << pcies.rt_power.readOp.dynamic << " W" << endl;
|
||||
cout <<endl;
|
||||
}
|
||||
cout <<"*****************************************************************************************"<<endl;
|
||||
if (plevel >1)
|
||||
{
|
||||
for (i = 0;i < numCore; i++)
|
||||
{
|
||||
cores[i]->displayEnergy(indent+4,plevel,is_tdp);
|
||||
cout <<"*****************************************************************************************"<<endl;
|
||||
}
|
||||
if (!XML->sys.Private_L2)
|
||||
{
|
||||
for (i = 0;i < numL2; i++)
|
||||
{
|
||||
l2array[i]->displayEnergy(indent+4,is_tdp);
|
||||
cout <<"*****************************************************************************************"<<endl;
|
||||
}
|
||||
}
|
||||
for (i = 0;i < numL3; i++)
|
||||
{
|
||||
l3array[i]->displayEnergy(indent+4,is_tdp);
|
||||
cout <<"*****************************************************************************************"<<endl;
|
||||
}
|
||||
for (i = 0;i < numL1Dir; i++)
|
||||
{
|
||||
l1dirarray[i]->displayEnergy(indent+4,is_tdp);
|
||||
cout <<"*****************************************************************************************"<<endl;
|
||||
}
|
||||
for (i = 0;i < numL2Dir; i++)
|
||||
{
|
||||
l2dirarray[i]->displayEnergy(indent+4,is_tdp);
|
||||
cout <<"*****************************************************************************************"<<endl;
|
||||
}
|
||||
if (XML->sys.mc.number_mcs >0 && XML->sys.mc.memory_channels_per_mc>0)
|
||||
{
|
||||
mc->displayEnergy(indent+4,is_tdp);
|
||||
cout <<"*****************************************************************************************"<<endl;
|
||||
}
|
||||
if (XML->sys.flashc.number_mcs >0 && XML->sys.flashc.memory_channels_per_mc>0)
|
||||
{
|
||||
flashcontroller->displayEnergy(indent+4,is_tdp);
|
||||
cout <<"*****************************************************************************************"<<endl;
|
||||
}
|
||||
if (XML->sys.niu.number_units >0 )
|
||||
{
|
||||
niu->displayEnergy(indent+4,is_tdp);
|
||||
cout <<"*****************************************************************************************"<<endl;
|
||||
}
|
||||
if (XML->sys.pcie.number_units >0 && XML->sys.pcie.num_channels>0)
|
||||
{
|
||||
pcie->displayEnergy(indent+4,is_tdp);
|
||||
cout <<"*****************************************************************************************"<<endl;
|
||||
}
|
||||
|
||||
for (i = 0;i < numNOC; i++)
|
||||
{
|
||||
nocs[i]->displayEnergy(indent+4,plevel,is_tdp);
|
||||
cout <<"*****************************************************************************************"<<endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void Processor::set_proc_param()
|
||||
{
|
||||
bool debug = false;
|
||||
|
||||
procdynp.homoCore = bool(debug?1:XML->sys.homogeneous_cores);
|
||||
procdynp.homoL2 = bool(debug?1:XML->sys.homogeneous_L2s);
|
||||
procdynp.homoL3 = bool(debug?1:XML->sys.homogeneous_L3s);
|
||||
procdynp.homoNOC = bool(debug?1:XML->sys.homogeneous_NoCs);
|
||||
procdynp.homoL1Dir = bool(debug?1:XML->sys.homogeneous_L1Directories);
|
||||
procdynp.homoL2Dir = bool(debug?1:XML->sys.homogeneous_L2Directories);
|
||||
|
||||
procdynp.numCore = XML->sys.number_of_cores;
|
||||
procdynp.numL2 = XML->sys.number_of_L2s;
|
||||
procdynp.numL3 = XML->sys.number_of_L3s;
|
||||
procdynp.numNOC = XML->sys.number_of_NoCs;
|
||||
procdynp.numL1Dir = XML->sys.number_of_L1Directories;
|
||||
procdynp.numL2Dir = XML->sys.number_of_L2Directories;
|
||||
procdynp.numMC = XML->sys.mc.number_mcs;
|
||||
procdynp.numMCChannel = XML->sys.mc.memory_channels_per_mc;
|
||||
|
||||
// if (procdynp.numCore<1)
|
||||
// {
|
||||
// cout<<" The target processor should at least have one core on chip." <<endl;
|
||||
// exit(0);
|
||||
// }
|
||||
|
||||
// if (numNOCs<0 || numNOCs>2)
|
||||
// {
|
||||
// cout <<"number of NOCs must be 1 (only global NOCs) or 2 (both global and local NOCs)"<<endl;
|
||||
// exit(0);
|
||||
// }
|
||||
|
||||
/* Basic parameters*/
|
||||
interface_ip.data_arr_ram_cell_tech_type = debug?0:XML->sys.device_type;
|
||||
interface_ip.data_arr_peri_global_tech_type = debug?0:XML->sys.device_type;
|
||||
interface_ip.tag_arr_ram_cell_tech_type = debug?0:XML->sys.device_type;
|
||||
interface_ip.tag_arr_peri_global_tech_type = debug?0:XML->sys.device_type;
|
||||
|
||||
interface_ip.ic_proj_type = debug?0:XML->sys.interconnect_projection_type;
|
||||
interface_ip.delay_wt = 100;//Fixed number, make sure timing can be satisfied.
|
||||
interface_ip.area_wt = 0;//Fixed number, This is used to exhaustive search for individual components.
|
||||
interface_ip.dynamic_power_wt = 100;//Fixed number, This is used to exhaustive search for individual components.
|
||||
interface_ip.leakage_power_wt = 0;
|
||||
interface_ip.cycle_time_wt = 0;
|
||||
|
||||
interface_ip.delay_dev = 10000;//Fixed number, make sure timing can be satisfied.
|
||||
interface_ip.area_dev = 10000;//Fixed number, This is used to exhaustive search for individual components.
|
||||
interface_ip.dynamic_power_dev = 10000;//Fixed number, This is used to exhaustive search for individual components.
|
||||
interface_ip.leakage_power_dev = 10000;
|
||||
interface_ip.cycle_time_dev = 10000;
|
||||
|
||||
interface_ip.ed = 2;
|
||||
interface_ip.burst_len = 1;//parameters are fixed for processor section, since memory is processed separately
|
||||
interface_ip.int_prefetch_w = 1;
|
||||
interface_ip.page_sz_bits = 0;
|
||||
interface_ip.temp = debug?360: XML->sys.temperature;
|
||||
interface_ip.F_sz_nm = debug?90:XML->sys.core_tech_node;//XML->sys.core_tech_node;
|
||||
interface_ip.F_sz_um = interface_ip.F_sz_nm / 1000;
|
||||
|
||||
//***********This section of code does not have real meaning, they are just to ensure all data will have initial value to prevent errors.
|
||||
//They will be overridden during each components initialization
|
||||
interface_ip.cache_sz =64;
|
||||
interface_ip.line_sz = 1;
|
||||
interface_ip.assoc = 1;
|
||||
interface_ip.nbanks = 1;
|
||||
interface_ip.out_w = interface_ip.line_sz*8;
|
||||
interface_ip.specific_tag = 1;
|
||||
interface_ip.tag_w = 64;
|
||||
interface_ip.access_mode = 2;
|
||||
|
||||
interface_ip.obj_func_dyn_energy = 0;
|
||||
interface_ip.obj_func_dyn_power = 0;
|
||||
interface_ip.obj_func_leak_power = 0;
|
||||
interface_ip.obj_func_cycle_t = 1;
|
||||
|
||||
interface_ip.is_main_mem = false;
|
||||
interface_ip.rpters_in_htree = true ;
|
||||
interface_ip.ver_htree_wires_over_array = 0;
|
||||
interface_ip.broadcast_addr_din_over_ver_htrees = 0;
|
||||
|
||||
interface_ip.num_rw_ports = 1;
|
||||
interface_ip.num_rd_ports = 0;
|
||||
interface_ip.num_wr_ports = 0;
|
||||
interface_ip.num_se_rd_ports = 0;
|
||||
interface_ip.num_search_ports = 1;
|
||||
interface_ip.nuca = 0;
|
||||
interface_ip.nuca_bank_count = 0;
|
||||
interface_ip.is_cache =true;
|
||||
interface_ip.pure_ram =false;
|
||||
interface_ip.pure_cam =false;
|
||||
interface_ip.force_cache_config =false;
|
||||
if (XML->sys.Embedded)
|
||||
{
|
||||
interface_ip.wt =Global_30;
|
||||
interface_ip.wire_is_mat_type = 0;
|
||||
interface_ip.wire_os_mat_type = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
interface_ip.wt =Global;
|
||||
interface_ip.wire_is_mat_type = 2;
|
||||
interface_ip.wire_os_mat_type = 2;
|
||||
}
|
||||
interface_ip.force_wiretype = false;
|
||||
interface_ip.print_detail = 1;
|
||||
interface_ip.add_ecc_b_ =true;
|
||||
}
|
||||
|
||||
/*
 * Release every component object owned by the processor model.
 *
 * Each vector of component pointers is drained from the back. The two
 * directory arrays are included: they are filled with heap-allocated
 * objects just like the cache arrays and were previously never released.
 *
 * The single-instance controllers are deleted unconditionally. The former
 * `if (!ptr) delete ptr;` guards were inverted and therefore never freed
 * anything.
 * NOTE(review): this relies on the constructor leaving mc, niu, pcie and
 * flashcontroller null when the corresponding unit is not instantiated —
 * confirm against the constructor's initialiser list.
 */
Processor::~Processor()
{
    while (!cores.empty())
    {
        delete cores.back();
        cores.pop_back();
    }
    while (!l2array.empty())
    {
        delete l2array.back();
        l2array.pop_back();
    }
    while (!l3array.empty())
    {
        delete l3array.back();
        l3array.pop_back();
    }
    while (!l1dirarray.empty())
    {
        delete l1dirarray.back();
        l1dirarray.pop_back();
    }
    while (!l2dirarray.empty())
    {
        delete l2dirarray.back();
        l2dirarray.pop_back();
    }
    while (!nocs.empty())
    {
        delete nocs.back();
        nocs.pop_back();
    }

    // Deleting a null pointer is a no-op, so no guard is required.
    delete mc;
    mc = 0;
    delete niu;
    niu = 0;
    delete pcie;
    pcie = 0;
    delete flashcontroller;
    flashcontroller = 0;
}
|
79
ext/mcpat/processor.h
Normal file
79
ext/mcpat/processor.h
Normal file
|
@ -0,0 +1,79 @@
|
|||
/*****************************************************************************
|
||||
* McPAT
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
***************************************************************************/
|
||||
#ifndef PROCESSOR_H_
|
||||
#define PROCESSOR_H_
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "XML_Parse.h"
|
||||
#include "arbiter.h"
|
||||
#include "area.h"
|
||||
#include "array.h"
|
||||
#include "basic_components.h"
|
||||
#include "core.h"
|
||||
#include "decoder.h"
|
||||
#include "iocontrollers.h"
|
||||
#include "memoryctrl.h"
|
||||
#include "noc.h"
|
||||
#include "parameter.h"
|
||||
#include "router.h"
|
||||
#include "sharedcache.h"
|
||||
|
||||
class Processor : public Component
|
||||
{
|
||||
public:
|
||||
ParseXML *XML;
|
||||
vector<Core *> cores;
|
||||
vector<SharedCache *> l2array;
|
||||
vector<SharedCache *> l3array;
|
||||
vector<SharedCache *> l1dirarray;
|
||||
vector<SharedCache *> l2dirarray;
|
||||
vector<NoC *> nocs;
|
||||
MemoryController * mc;
|
||||
NIUController * niu;
|
||||
PCIeController * pcie;
|
||||
FlashController * flashcontroller;
|
||||
InputParameter interface_ip;
|
||||
ProcParam procdynp;
|
||||
//wire globalInterconnect;
|
||||
//clock_network globalClock;
|
||||
Component core, l2, l3, l1dir, l2dir, noc, mcs, cc, nius, pcies,flashcontrollers;
|
||||
int numCore, numL2, numL3, numNOC, numL1Dir, numL2Dir;
|
||||
Processor(ParseXML *XML_interface);
|
||||
void compute();
|
||||
void set_proc_param();
|
||||
void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
|
||||
void displayDeviceType(int device_type_, uint32_t indent = 0);
|
||||
void displayInterconnectType(int interconnect_type_, uint32_t indent = 0);
|
||||
~Processor();
|
||||
};
|
||||
|
||||
#endif /* PROCESSOR_H_ */
|
321
ext/mcpat/results/A9_2000
Normal file
321
ext/mcpat/results/A9_2000
Normal file
|
@ -0,0 +1,321 @@
|
|||
McPAT (version 0.8 of Aug, 2010) is computing the target processor...
|
||||
|
||||
Warning: Branch Target Buffer array structure cannot satisfy latency constraint.
|
||||
|
||||
McPAT (version 0.8 of Aug, 2010) results (current print level is 5)
|
||||
*****************************************************************************************
|
||||
Technology 40 nm
|
||||
Using Long Channel Devices When Appropriate
|
||||
Interconnect metal projection= conservative interconnect technology projection
|
||||
Core clock Rate(MHz) 2000
|
||||
|
||||
*****************************************************************************************
|
||||
Processor:
|
||||
Area = 5.83937 mm^2
|
||||
Peak Power = 1.32283 W
|
||||
Total Leakage = 0.182558 W
|
||||
Peak Dynamic = 1.14027 W
|
||||
Subthreshold Leakage = 0.0869601 W
|
||||
Gate Leakage = 0.095598 W
|
||||
Runtime Dynamic = 2.86361 W
|
||||
|
||||
Total Cores:
|
||||
Device Type= ITRS low operating power device type
|
||||
Area = 5.33485 mm^2
|
||||
Peak Dynamic = 1.07823 W
|
||||
Subthreshold Leakage = 0.0827641 W
|
||||
Gate Leakage = 0.0887315 W
|
||||
Runtime Dynamic = 0.975395 W
|
||||
|
||||
Total First Level Directory:
|
||||
Device Type= ITRS low operating power device type
|
||||
Area = 0.489711 mm^2
|
||||
Peak Dynamic = 0.0449752 W
|
||||
Subthreshold Leakage = 0.00397708 W
|
||||
Gate Leakage = 0.00655632 W
|
||||
Runtime Dynamic = 1.80289 W
|
||||
|
||||
Total NoCs (Network/Bus):
|
||||
Device Type= ITRS low operating power device type
|
||||
Area = 0.0148119 mm^2
|
||||
Peak Dynamic = 0.0170648 W
|
||||
Subthreshold Leakage = 0.000218992 W
|
||||
Gate Leakage = 0.000310207 W
|
||||
Runtime Dynamic = 0.0853239 W
|
||||
|
||||
*****************************************************************************************
|
||||
Core:
|
||||
Area = 2.66742 mm^2
|
||||
Peak Dynamic = 0.539116 W
|
||||
Subthreshold Leakage = 0.041382 W
|
||||
Gate Leakage = 0.0443657 W
|
||||
Runtime Dynamic = 0.975395 W
|
||||
|
||||
Instruction Fetch Unit:
|
||||
Area = 0.565848 mm^2
|
||||
Peak Dynamic = 0.184724 W
|
||||
Subthreshold Leakage = 0.00572394 W
|
||||
Gate Leakage = 0.00380598 W
|
||||
Runtime Dynamic = 0.283222 W
|
||||
|
||||
Instruction Cache:
|
||||
Area = 0.235613 mm^2
|
||||
Peak Dynamic = 0.0310428 W
|
||||
Subthreshold Leakage = 0.00309635 W
|
||||
Gate Leakage = 0.00216385 W
|
||||
Runtime Dynamic = 0.0461626 W
|
||||
|
||||
Branch Target Buffer:
|
||||
Area = 0.251259 mm^2
|
||||
Peak Dynamic = 0.0174433 W
|
||||
Subthreshold Leakage = 0.00170231 W
|
||||
Gate Leakage = 0.000908123 W
|
||||
Runtime Dynamic = 0.0697733 W
|
||||
|
||||
Branch Predictor:
|
||||
Area = 0.064441 mm^2
|
||||
Peak Dynamic = 0.00815792 W
|
||||
Subthreshold Leakage = 0.00070444 W
|
||||
Gate Leakage = 0.000477387 W
|
||||
Runtime Dynamic = 0.0113878 W
|
||||
|
||||
Global Predictor:
|
||||
Area = 0.0313969 mm^2
|
||||
Peak Dynamic = 0.00374527 W
|
||||
Subthreshold Leakage = 0.00034631 W
|
||||
Gate Leakage = 0.000233555 W
|
||||
Runtime Dynamic = 0.00545806 W
|
||||
|
||||
Local Predictor:
|
||||
Area = 0.000711939 mm^2
|
||||
Peak Dynamic = 0.000301014 W
|
||||
Subthreshold Leakage = 6.13457e-06 W
|
||||
Gate Leakage = 5.63471e-06 W
|
||||
Runtime Dynamic = 0.000471566 W
|
||||
|
||||
Area = 0.000650815 mm^2
|
||||
Peak Dynamic = 0.000230123 W
|
||||
Subthreshold Leakage = 5.7769e-06 W
|
||||
Gate Leakage = 4.75075e-06 W
|
||||
Runtime Dynamic = 0.000354988 W
|
||||
|
||||
Chooser:
|
||||
Area = 0.0313969 mm^2
|
||||
Peak Dynamic = 0.00374527 W
|
||||
Subthreshold Leakage = 0.00034631 W
|
||||
Gate Leakage = 0.000233555 W
|
||||
Runtime Dynamic = 0.00545806 W
|
||||
|
||||
RAS:
|
||||
Area = 0.000996272 mm^2
|
||||
Peak Dynamic = 0.000366372 W
|
||||
Subthreshold Leakage = 5.68653e-06 W
|
||||
Gate Leakage = 4.64147e-06 W
|
||||
Runtime Dynamic = 6.23994e-08 W
|
||||
|
||||
Instruction Buffer:
|
||||
Area = 0.00820192 mm^2
|
||||
Peak Dynamic = 0.0669878 W
|
||||
Subthreshold Leakage = 6.33536e-05 W
|
||||
Gate Leakage = 4.34841e-05 W
|
||||
Runtime Dynamic = 0.0382787 W
|
||||
|
||||
Instruction Decoder:
|
||||
Area = 0.00468731 mm^2
|
||||
Peak Dynamic = 0.05881 W
|
||||
Subthreshold Leakage = 0.000127696 W
|
||||
Gate Leakage = 0.000115494 W
|
||||
Runtime Dynamic = 0.11762 W
|
||||
|
||||
Renaming Unit:
|
||||
Area = 0.0903068 mm^2
|
||||
Peak Dynamic = 0.0451514 W
|
||||
Subthreshold Leakage = 0.000345688 W
|
||||
Gate Leakage = 0.00032022 W
|
||||
Runtime Dynamic = 0.0731287 W
|
||||
|
||||
Int Front End RAT:
|
||||
Area = 0.0543672 mm^2
|
||||
Peak Dynamic = 0.0237617 W
|
||||
Subthreshold Leakage = 0.000175223 W
|
||||
Gate Leakage = 0.000121525 W
|
||||
Runtime Dynamic = 0.0475234 W
|
||||
|
||||
FP Front End RAT:
|
||||
Area = 0.0185325 mm^2
|
||||
Peak Dynamic = 0.00949419 W
|
||||
Subthreshold Leakage = 0.000100325 W
|
||||
Gate Leakage = 6.76251e-05 W
|
||||
Runtime Dynamic = 0.00949419 W
|
||||
|
||||
Free List:
|
||||
Area = 0.00599955 mm^2
|
||||
Peak Dynamic = 0.00225065 W
|
||||
Subthreshold Leakage = 1.24363e-05 W
|
||||
Gate Leakage = 1.00844e-05 W
|
||||
Runtime Dynamic = 0.0090026 W
|
||||
|
||||
Int Retire RAT:
|
||||
Area = 0.00605969 mm^2
|
||||
Peak Dynamic = 0.00448392 W
|
||||
Subthreshold Leakage = 1.33231e-05 W
|
||||
Gate Leakage = 1.16235e-05 W
|
||||
Runtime Dynamic = 0.00448392 W
|
||||
|
||||
FP Retire RAT:
|
||||
Area = 0.000650815 mm^2
|
||||
Peak Dynamic = 0.00067334 W
|
||||
Subthreshold Leakage = 5.7769e-06 W
|
||||
Gate Leakage = 4.75075e-06 W
|
||||
Runtime Dynamic = 0.00067334 W
|
||||
|
||||
FP Free List:
|
||||
Area = 0.00305098 mm^2
|
||||
Peak Dynamic = 0.00195124 W
|
||||
Subthreshold Leakage = 8.81712e-06 W
|
||||
Gate Leakage = 6.96054e-06 W
|
||||
Runtime Dynamic = 0.00195124 W
|
||||
|
||||
Load Store Unit:
|
||||
Area = 0.274913 mm^2
|
||||
Peak Dynamic = 0.0347482 W
|
||||
Subthreshold Leakage = 0.0032012 W
|
||||
Gate Leakage = 0.00235752 W
|
||||
Runtime Dynamic = 0.195304 W
|
||||
|
||||
Data Cache:
|
||||
Area = 0.240878 mm^2
|
||||
Peak Dynamic = 0.0293665 W
|
||||
Subthreshold Leakage = 0.00312878 W
|
||||
Gate Leakage = 0.00220794 W
|
||||
Runtime Dynamic = 0.19026 W
|
||||
|
||||
StoreQ:
|
||||
Area = 0.00754674 mm^2
|
||||
Peak Dynamic = 0.00358087 W
|
||||
Subthreshold Leakage = 4.2633e-05 W
|
||||
Gate Leakage = 5.19212e-05 W
|
||||
Runtime Dynamic = 0.00504348 W
|
||||
|
||||
Memory Management Unit:
|
||||
Area = 0.021508 mm^2
|
||||
Peak Dynamic = 0.0127337 W
|
||||
Subthreshold Leakage = 0.000210621 W
|
||||
Gate Leakage = 0.000290666 W
|
||||
Runtime Dynamic = 0.037071 W
|
||||
|
||||
Itlb:
|
||||
Area = 0.00993091 mm^2
|
||||
Peak Dynamic = 0.00617846 W
|
||||
Subthreshold Leakage = 9.04168e-05 W
|
||||
Gate Leakage = 9.65082e-05 W
|
||||
Runtime Dynamic = 0.012357 W
|
||||
|
||||
Dtlb:
|
||||
Area = 0.00993091 mm^2
|
||||
Peak Dynamic = 0.00438671 W
|
||||
Subthreshold Leakage = 9.04168e-05 W
|
||||
Gate Leakage = 9.65082e-05 W
|
||||
Runtime Dynamic = 0.0247139 W
|
||||
|
||||
Execution Unit:
|
||||
Area = 1.65498 mm^2
|
||||
Peak Dynamic = 0.261758 W
|
||||
Subthreshold Leakage = 0.0305522 W
|
||||
Gate Leakage = 0.0360036 W
|
||||
Runtime Dynamic = 0.386669 W
|
||||
|
||||
Register Files:
|
||||
Area = 0.203203 mm^2
|
||||
Peak Dynamic = 0.0763282 W
|
||||
Subthreshold Leakage = 0.000197046 W
|
||||
Gate Leakage = 0.00016338 W
|
||||
Runtime Dynamic = 0.0386066 W
|
||||
|
||||
Integer RF:
|
||||
Area = 0.146073 mm^2
|
||||
Peak Dynamic = 0.0763282 W
|
||||
Subthreshold Leakage = 0.000120303 W
|
||||
Gate Leakage = 9.97867e-05 W
|
||||
Runtime Dynamic = 0.0345689 W
|
||||
|
||||
Floating Point RF:
|
||||
Area = 0.05713 mm^2
|
||||
Peak Dynamic = 0 W
|
||||
Subthreshold Leakage = 7.67427e-05 W
|
||||
Gate Leakage = 6.35938e-05 W
|
||||
Runtime Dynamic = 0.00403765 W
|
||||
|
||||
Instruction Scheduler:
|
||||
Area = 0.0582889 mm^2
|
||||
Peak Dynamic = 0.0522571 W
|
||||
Subthreshold Leakage = 0.000128698 W
|
||||
Gate Leakage = 0.000185714 W
|
||||
Runtime Dynamic = 0.0787473 W
|
||||
|
||||
Instruction Window:
|
||||
Area = 0.053925 mm^2
|
||||
Peak Dynamic = 0.0445895 W
|
||||
Subthreshold Leakage = 9.52936e-05 W
|
||||
Gate Leakage = 0.000130718 W
|
||||
Runtime Dynamic = 0.0602231 W
|
||||
|
||||
FP Instruction Window:
|
||||
Area = 0.00436388 mm^2
|
||||
Peak Dynamic = 0.00766759 W
|
||||
Subthreshold Leakage = 3.34043e-05 W
|
||||
Gate Leakage = 5.49962e-05 W
|
||||
Runtime Dynamic = 0.0185242 W
|
||||
|
||||
Integer ALUs (Count: 3 ):
|
||||
Area = 0.312404 mm^2
|
||||
Peak Dynamic = 0.0283684 W
|
||||
Subthreshold Leakage = 0.0140724 W
|
||||
Gate Leakage = 0.0165703 W
|
||||
Runtime Dynamic = 0.0373268 W
|
||||
|
||||
Floating Point Units (FPUs) (Count: 1 ):
|
||||
Area = 0.971259 mm^2
|
||||
Peak Dynamic = 0 W
|
||||
Subthreshold Leakage = 0.0109377 W
|
||||
Gate Leakage = 0.0128792 W
|
||||
Runtime Dynamic = 0.0373268 W
|
||||
|
||||
Complex ALUs (Mul/Div) (Count: 1 ):
|
||||
Area = 0.104135 mm^2
|
||||
Peak Dynamic = 0.0204053 W
|
||||
Subthreshold Leakage = 0.00469079 W
|
||||
Gate Leakage = 0.00552345 W
|
||||
Runtime Dynamic = 0.049769 W
|
||||
|
||||
Results Broadcast Bus:
|
||||
Area Overhead = 0.00404385 mm^2
|
||||
Peak Dynamic = 0.0824719 W
|
||||
Subthreshold Leakage = 0.000495836 W
|
||||
Gate Leakage = 0.000583852 W
|
||||
Runtime Dynamic = 0.144892 W
|
||||
|
||||
*****************************************************************************************
|
||||
First Level Directory
|
||||
Area = 0.244856 mm^2
|
||||
Peak Dynamic = 0.0224876 W
|
||||
Subthreshold Leakage = 0.00198854 W
|
||||
Gate Leakage = 0.00327816 W
|
||||
Runtime Dynamic = 1.80289 W
|
||||
|
||||
*****************************************************************************************
|
||||
BUSES
|
||||
Area = 0.0148119 mm^2
|
||||
Peak Dynamic = 0.0170648 W
|
||||
Subthreshold Leakage = 0.000218992 W
|
||||
Gate Leakage = 0.000310207 W
|
||||
Runtime Dynamic = 0.0853239 W
|
||||
|
||||
Bus:
|
||||
Area = 0.0148119 mm^2
|
||||
Peak Dynamic = 0.0170648 W
|
||||
Subthreshold Leakage = 0.000218992 W
|
||||
Gate Leakage = 0.000310207 W
|
||||
Runtime Dynamic = 0.0853239 W
|
||||
|
||||
*****************************************************************************************
|
410
ext/mcpat/results/A9_2000_withIOC
Normal file
410
ext/mcpat/results/A9_2000_withIOC
Normal file
|
@ -0,0 +1,410 @@
|
|||
McPAT (version 0.8 of Aug, 2010) is computing the target processor...
|
||||
|
||||
Warning: Branch Target Buffer array structure cannot satisfy latency constraint.
|
||||
SerDer_dyn 0.00216115
|
||||
ctrl_dyn 0.0278216
|
||||
ctrl_dyn 6.14856e-11
|
||||
SerDer_dyn 1.54368e-11
|
||||
|
||||
McPAT (version 0.8 of Aug, 2010) results (current print level is 5)
|
||||
*****************************************************************************************
|
||||
Technology 40 nm
|
||||
Using Long Channel Devices When Appropriate
|
||||
Interconnect metal projection= conservative interconnect technology projection
|
||||
Core clock Rate(MHz) 2000
|
||||
|
||||
*****************************************************************************************
|
||||
Processor:
|
||||
Area = 7.05775 mm^2
|
||||
Peak Power = 2.06734 W
|
||||
Total Leakage = 0.204814 W
|
||||
Peak Dynamic = 1.86253 W
|
||||
Subthreshold Leakage = 0.0916805 W
|
||||
Gate Leakage = 0.113134 W
|
||||
Runtime Dynamic = 5.3744 W
|
||||
|
||||
Total Cores: 2 cores
|
||||
Device Type= ITRS low operating power device type
|
||||
Area = 5.33485 mm^2
|
||||
Peak Dynamic = 1.07823 W
|
||||
Subthreshold Leakage = 0.0827641 W
|
||||
Gate Leakage = 0.0887315 W
|
||||
Runtime Dynamic = 0.975395 W
|
||||
|
||||
Total First Level Directory:
|
||||
Device Type= ITRS low operating power device type
|
||||
Area = 0.489711 mm^2
|
||||
Peak Dynamic = 0.0449752 W
|
||||
Subthreshold Leakage = 0.00397708 W
|
||||
Gate Leakage = 0.00655632 W
|
||||
Runtime Dynamic = 1.80289 W
|
||||
|
||||
Total NoCs (Network/Bus):
|
||||
Device Type= ITRS low operating power device type
|
||||
Area = 0.0162858 mm^2
|
||||
Peak Dynamic = 0.0187629 W
|
||||
Subthreshold Leakage = 0.000240784 W
|
||||
Gate Leakage = 0.000341076 W
|
||||
Runtime Dynamic = 0.0938146 W
|
||||
|
||||
Total MCs: 1 Memory Controllers
|
||||
Device Type= ITRS low operating power device type
|
||||
Area = 0.554183 mm^2
|
||||
Peak Dynamic = 0.31033 W
|
||||
Subthreshold Leakage = 0.0020922 W
|
||||
Gate Leakage = 0.00751531 W
|
||||
Runtime Dynamic = 2.21514 W
|
||||
|
||||
Total Flash/SSD Controllers: 1 Flash/SSD Controllers
|
||||
Device Type= ITRS low operating power device type
|
||||
Area = 0.109065 mm^2
|
||||
Peak Dynamic = 0.0299827 W
|
||||
Subthreshold Leakage = 0.000522213 W
|
||||
Gate Leakage = 0.0020015 W
|
||||
Runtime Dynamic = 0.0209879 W
|
||||
|
||||
Total NIUs: 1 Network Interface Units
|
||||
Device Type= ITRS low operating power device type
|
||||
Area = 0.261302 mm^2
|
||||
Peak Dynamic = 0.164859 W
|
||||
Subthreshold Leakage = 0.000730171 W
|
||||
Gate Leakage = 0.00279855 W
|
||||
Runtime Dynamic = 0.115402 W
|
||||
|
||||
Total PCIes: 1 PCIe Controllers
|
||||
Device Type= ITRS low operating power device type
|
||||
Area = 0.292355 mm^2
|
||||
Peak Dynamic = 0.215383 W
|
||||
Subthreshold Leakage = 0.00135405 W
|
||||
Gate Leakage = 0.00518971 W
|
||||
Runtime Dynamic = 0.150768 W
|
||||
|
||||
*****************************************************************************************
|
||||
Core:
|
||||
Area = 2.66742 mm^2
|
||||
Peak Dynamic = 0.539116 W
|
||||
Subthreshold Leakage = 0.041382 W
|
||||
Gate Leakage = 0.0443657 W
|
||||
Runtime Dynamic = 0.975395 W
|
||||
|
||||
Instruction Fetch Unit:
|
||||
Area = 0.565848 mm^2
|
||||
Peak Dynamic = 0.184724 W
|
||||
Subthreshold Leakage = 0.00572394 W
|
||||
Gate Leakage = 0.00380598 W
|
||||
Runtime Dynamic = 0.283222 W
|
||||
|
||||
Instruction Cache:
|
||||
Area = 0.235613 mm^2
|
||||
Peak Dynamic = 0.0310428 W
|
||||
Subthreshold Leakage = 0.00309635 W
|
||||
Gate Leakage = 0.00216385 W
|
||||
Runtime Dynamic = 0.0461626 W
|
||||
|
||||
Branch Target Buffer:
|
||||
Area = 0.251259 mm^2
|
||||
Peak Dynamic = 0.0174433 W
|
||||
Subthreshold Leakage = 0.00170231 W
|
||||
Gate Leakage = 0.000908123 W
|
||||
Runtime Dynamic = 0.0697733 W
|
||||
|
||||
Branch Predictor:
|
||||
Area = 0.064441 mm^2
|
||||
Peak Dynamic = 0.00815792 W
|
||||
Subthreshold Leakage = 0.00070444 W
|
||||
Gate Leakage = 0.000477387 W
|
||||
Runtime Dynamic = 0.0113878 W
|
||||
|
||||
Global Predictor:
|
||||
Area = 0.0313969 mm^2
|
||||
Peak Dynamic = 0.00374527 W
|
||||
Subthreshold Leakage = 0.00034631 W
|
||||
Gate Leakage = 0.000233555 W
|
||||
Runtime Dynamic = 0.00545806 W
|
||||
|
||||
Local Predictor:
|
||||
Area = 0.000711939 mm^2
|
||||
Peak Dynamic = 0.000301014 W
|
||||
Subthreshold Leakage = 6.13457e-06 W
|
||||
Gate Leakage = 5.63471e-06 W
|
||||
Runtime Dynamic = 0.000471566 W
|
||||
|
||||
Area = 0.000650815 mm^2
|
||||
Peak Dynamic = 0.000230123 W
|
||||
Subthreshold Leakage = 5.7769e-06 W
|
||||
Gate Leakage = 4.75075e-06 W
|
||||
Runtime Dynamic = 0.000354988 W
|
||||
|
||||
Chooser:
|
||||
Area = 0.0313969 mm^2
|
||||
Peak Dynamic = 0.00374527 W
|
||||
Subthreshold Leakage = 0.00034631 W
|
||||
Gate Leakage = 0.000233555 W
|
||||
Runtime Dynamic = 0.00545806 W
|
||||
|
||||
RAS:
|
||||
Area = 0.000996272 mm^2
|
||||
Peak Dynamic = 0.000366372 W
|
||||
Subthreshold Leakage = 5.68653e-06 W
|
||||
Gate Leakage = 4.64147e-06 W
|
||||
Runtime Dynamic = 6.23994e-08 W
|
||||
|
||||
Instruction Buffer:
|
||||
Area = 0.00820192 mm^2
|
||||
Peak Dynamic = 0.0669878 W
|
||||
Subthreshold Leakage = 6.33536e-05 W
|
||||
Gate Leakage = 4.34841e-05 W
|
||||
Runtime Dynamic = 0.0382787 W
|
||||
|
||||
Instruction Decoder:
|
||||
Area = 0.00468731 mm^2
|
||||
Peak Dynamic = 0.05881 W
|
||||
Subthreshold Leakage = 0.000127696 W
|
||||
Gate Leakage = 0.000115494 W
|
||||
Runtime Dynamic = 0.11762 W
|
||||
|
||||
Renaming Unit:
|
||||
Area = 0.0903068 mm^2
|
||||
Peak Dynamic = 0.0451514 W
|
||||
Subthreshold Leakage = 0.000345688 W
|
||||
Gate Leakage = 0.00032022 W
|
||||
Runtime Dynamic = 0.0731287 W
|
||||
|
||||
Int Front End RAT:
|
||||
Area = 0.0543672 mm^2
|
||||
Peak Dynamic = 0.0237617 W
|
||||
Subthreshold Leakage = 0.000175223 W
|
||||
Gate Leakage = 0.000121525 W
|
||||
Runtime Dynamic = 0.0475234 W
|
||||
|
||||
FP Front End RAT:
|
||||
Area = 0.0185325 mm^2
|
||||
Peak Dynamic = 0.00949419 W
|
||||
Subthreshold Leakage = 0.000100325 W
|
||||
Gate Leakage = 6.76251e-05 W
|
||||
Runtime Dynamic = 0.00949419 W
|
||||
|
||||
Free List:
|
||||
Area = 0.00599955 mm^2
|
||||
Peak Dynamic = 0.00225065 W
|
||||
Subthreshold Leakage = 1.24363e-05 W
|
||||
Gate Leakage = 1.00844e-05 W
|
||||
Runtime Dynamic = 0.0090026 W
|
||||
|
||||
Int Retire RAT:
|
||||
Area = 0.00605969 mm^2
|
||||
Peak Dynamic = 0.00448392 W
|
||||
Subthreshold Leakage = 1.33231e-05 W
|
||||
Gate Leakage = 1.16235e-05 W
|
||||
Runtime Dynamic = 0.00448392 W
|
||||
|
||||
FP Retire RAT:
|
||||
Area = 0.000650815 mm^2
|
||||
Peak Dynamic = 0.00067334 W
|
||||
Subthreshold Leakage = 5.7769e-06 W
|
||||
Gate Leakage = 4.75075e-06 W
|
||||
Runtime Dynamic = 0.00067334 W
|
||||
|
||||
FP Free List:
|
||||
Area = 0.00305098 mm^2
|
||||
Peak Dynamic = 0.00195124 W
|
||||
Subthreshold Leakage = 8.81712e-06 W
|
||||
Gate Leakage = 6.96054e-06 W
|
||||
Runtime Dynamic = 0.00195124 W
|
||||
|
||||
Load Store Unit:
|
||||
Area = 0.274913 mm^2
|
||||
Peak Dynamic = 0.0347482 W
|
||||
Subthreshold Leakage = 0.0032012 W
|
||||
Gate Leakage = 0.00235752 W
|
||||
Runtime Dynamic = 0.195304 W
|
||||
|
||||
Data Cache:
|
||||
Area = 0.240878 mm^2
|
||||
Peak Dynamic = 0.0293665 W
|
||||
Subthreshold Leakage = 0.00312878 W
|
||||
Gate Leakage = 0.00220794 W
|
||||
Runtime Dynamic = 0.19026 W
|
||||
|
||||
StoreQ:
|
||||
Area = 0.00754674 mm^2
|
||||
Peak Dynamic = 0.00358087 W
|
||||
Subthreshold Leakage = 4.2633e-05 W
|
||||
Gate Leakage = 5.19212e-05 W
|
||||
Runtime Dynamic = 0.00504348 W
|
||||
|
||||
Memory Management Unit:
|
||||
Area = 0.021508 mm^2
|
||||
Peak Dynamic = 0.0127337 W
|
||||
Subthreshold Leakage = 0.000210621 W
|
||||
Gate Leakage = 0.000290666 W
|
||||
Runtime Dynamic = 0.037071 W
|
||||
|
||||
Itlb:
|
||||
Area = 0.00993091 mm^2
|
||||
Peak Dynamic = 0.00617846 W
|
||||
Subthreshold Leakage = 9.04168e-05 W
|
||||
Gate Leakage = 9.65082e-05 W
|
||||
Runtime Dynamic = 0.012357 W
|
||||
|
||||
Dtlb:
|
||||
Area = 0.00993091 mm^2
|
||||
Peak Dynamic = 0.00438671 W
|
||||
Subthreshold Leakage = 9.04168e-05 W
|
||||
Gate Leakage = 9.65082e-05 W
|
||||
Runtime Dynamic = 0.0247139 W
|
||||
|
||||
Execution Unit:
|
||||
Area = 1.65498 mm^2
|
||||
Peak Dynamic = 0.261758 W
|
||||
Subthreshold Leakage = 0.0305522 W
|
||||
Gate Leakage = 0.0360036 W
|
||||
Runtime Dynamic = 0.386669 W
|
||||
|
||||
Register Files:
|
||||
Area = 0.203203 mm^2
|
||||
Peak Dynamic = 0.0763282 W
|
||||
Subthreshold Leakage = 0.000197046 W
|
||||
Gate Leakage = 0.00016338 W
|
||||
Runtime Dynamic = 0.0386066 W
|
||||
|
||||
Integer RF:
|
||||
Area = 0.146073 mm^2
|
||||
Peak Dynamic = 0.0763282 W
|
||||
Subthreshold Leakage = 0.000120303 W
|
||||
Gate Leakage = 9.97867e-05 W
|
||||
Runtime Dynamic = 0.0345689 W
|
||||
|
||||
Floating Point RF:
|
||||
Area = 0.05713 mm^2
|
||||
Peak Dynamic = 0 W
|
||||
Subthreshold Leakage = 7.67427e-05 W
|
||||
Gate Leakage = 6.35938e-05 W
|
||||
Runtime Dynamic = 0.00403765 W
|
||||
|
||||
Instruction Scheduler:
|
||||
Area = 0.0582889 mm^2
|
||||
Peak Dynamic = 0.0522571 W
|
||||
Subthreshold Leakage = 0.000128698 W
|
||||
Gate Leakage = 0.000185714 W
|
||||
Runtime Dynamic = 0.0787473 W
|
||||
|
||||
Instruction Window:
|
||||
Area = 0.053925 mm^2
|
||||
Peak Dynamic = 0.0445895 W
|
||||
Subthreshold Leakage = 9.52936e-05 W
|
||||
Gate Leakage = 0.000130718 W
|
||||
Runtime Dynamic = 0.0602231 W
|
||||
|
||||
FP Instruction Window:
|
||||
Area = 0.00436388 mm^2
|
||||
Peak Dynamic = 0.00766759 W
|
||||
Subthreshold Leakage = 3.34043e-05 W
|
||||
Gate Leakage = 5.49962e-05 W
|
||||
Runtime Dynamic = 0.0185242 W
|
||||
|
||||
Integer ALUs (Count: 3 ):
|
||||
Area = 0.312404 mm^2
|
||||
Peak Dynamic = 0.0283684 W
|
||||
Subthreshold Leakage = 0.0140724 W
|
||||
Gate Leakage = 0.0165703 W
|
||||
Runtime Dynamic = 0.0373268 W
|
||||
|
||||
Floating Point Units (FPUs) (Count: 1 ):
|
||||
Area = 0.971259 mm^2
|
||||
Peak Dynamic = 0 W
|
||||
Subthreshold Leakage = 0.0109377 W
|
||||
Gate Leakage = 0.0128792 W
|
||||
Runtime Dynamic = 0.0373268 W
|
||||
|
||||
Complex ALUs (Mul/Div) (Count: 1 ):
|
||||
Area = 0.104135 mm^2
|
||||
Peak Dynamic = 0.0204053 W
|
||||
Subthreshold Leakage = 0.00469079 W
|
||||
Gate Leakage = 0.00552345 W
|
||||
Runtime Dynamic = 0.049769 W
|
||||
|
||||
Results Broadcast Bus:
|
||||
Area Overhead = 0.00404385 mm^2
|
||||
Peak Dynamic = 0.0824719 W
|
||||
Subthreshold Leakage = 0.000495836 W
|
||||
Gate Leakage = 0.000583852 W
|
||||
Runtime Dynamic = 0.144892 W
|
||||
|
||||
*****************************************************************************************
|
||||
First Level Directory
|
||||
Area = 0.244856 mm^2
|
||||
Peak Dynamic = 0.0224876 W
|
||||
Subthreshold Leakage = 0.00198854 W
|
||||
Gate Leakage = 0.00327816 W
|
||||
Runtime Dynamic = 1.80289 W
|
||||
|
||||
*****************************************************************************************
|
||||
Memory Controller:
|
||||
Area = 0.554183 mm^2
|
||||
Peak Dynamic = 0.31033 W
|
||||
Subthreshold Leakage = 0.0020922 W
|
||||
Gate Leakage = 0.00751531 W
|
||||
Runtime Dynamic = 2.21514 W
|
||||
|
||||
Front End Engine:
|
||||
Area = 0.111447 mm^2
|
||||
Peak Dynamic = 0.0117646 W
|
||||
Subthreshold Leakage = 0.000188068 W
|
||||
Gate Leakage = 0.000217277 W
|
||||
Runtime Dynamic = 0.0796061 W
|
||||
|
||||
Transaction Engine:
|
||||
Area = 0.113609 mm^2
|
||||
Peak Dynamic = 0.160252 W
|
||||
Subthreshold Leakage = 0.000380826 W
|
||||
Gate Leakage = 0.00145961 W
|
||||
Runtime Dynamic = 1.08436 W
|
||||
|
||||
PHY:
|
||||
Area = 0.329127 mm^2
|
||||
Peak Dynamic = 0.138314 W
|
||||
Subthreshold Leakage = 0.00152331 W
|
||||
Gate Leakage = 0.00583843 W
|
||||
Runtime Dynamic = 1.05117 W
|
||||
|
||||
*****************************************************************************************
|
||||
Flash Controller:
|
||||
Area = 0.109065 mm^2
|
||||
Peak Dynamic = 0.0299827 W
|
||||
Subthreshold Leakage = 0.000522213 W
|
||||
Gate Leakage = 0.0020015 W
|
||||
Runtime Dynamic = 0.0209879 W
|
||||
|
||||
*****************************************************************************************
|
||||
NIU:
|
||||
Area = 0.261302 mm^2
|
||||
Peak Dynamic = 0.164859 W
|
||||
Subthreshold Leakage = 0.000730171 W
|
||||
Gate Leakage = 0.00279855 W
|
||||
Runtime Dynamic = 0.115402 W
|
||||
|
||||
*****************************************************************************************
|
||||
PCIe:
|
||||
Area = 0.292355 mm^2
|
||||
Peak Dynamic = 0.215383 W
|
||||
Subthreshold Leakage = 0.00135405 W
|
||||
Gate Leakage = 0.00518971 W
|
||||
Runtime Dynamic = 0.150768 W
|
||||
|
||||
*****************************************************************************************
|
||||
BUSES
|
||||
Area = 0.0162858 mm^2
|
||||
Peak Dynamic = 0.0187629 W
|
||||
Subthreshold Leakage = 0.000240784 W
|
||||
Gate Leakage = 0.000341076 W
|
||||
Runtime Dynamic = 0.0938146 W
|
||||
|
||||
Bus:
|
||||
Area = 0.0162858 mm^2
|
||||
Peak Dynamic = 0.0187629 W
|
||||
Subthreshold Leakage = 0.000240784 W
|
||||
Gate Leakage = 0.000341076 W
|
||||
Runtime Dynamic = 0.0938146 W
|
||||
|
||||
*****************************************************************************************
|
320
ext/mcpat/results/A9_800
Normal file
320
ext/mcpat/results/A9_800
Normal file
|
@ -0,0 +1,320 @@
|
|||
McPAT (version 0.8 of Aug, 2010) is computing the target processor...
|
||||
|
||||
|
||||
McPAT (version 0.8 of Aug, 2010) results (current print level is 5)
|
||||
*****************************************************************************************
|
||||
Technology 40 nm
|
||||
Using Long Channel Devices When Appropriate
|
||||
Interconnect metal projection= conservative interconnect technology projection
|
||||
Core clock Rate(MHz) 800
|
||||
|
||||
*****************************************************************************************
|
||||
Processor:
|
||||
Area = 5.48929 mm^2
|
||||
Peak Power = 0.577263 W
|
||||
Total Leakage = 0.127046 W
|
||||
Peak Dynamic = 0.450217 W
|
||||
Subthreshold Leakage = 0.0608257 W
|
||||
Gate Leakage = 0.0662198 W
|
||||
Runtime Dynamic = 1.13304 W
|
||||
|
||||
Total Cores:
|
||||
Device Type= ITRS low operating power device type
|
||||
Area = 4.98521 mm^2
|
||||
Peak Dynamic = 0.425609 W
|
||||
Subthreshold Leakage = 0.0577408 W
|
||||
Gate Leakage = 0.061241 W
|
||||
Runtime Dynamic = 0.37879 W
|
||||
|
||||
Total First Level Directory:
|
||||
Device Type= ITRS low operating power device type
|
||||
Area = 0.489711 mm^2
|
||||
Peak Dynamic = 0.0179901 W
|
||||
Subthreshold Leakage = 0.0029286 W
|
||||
Gate Leakage = 0.00476045 W
|
||||
Runtime Dynamic = 0.721156 W
|
||||
|
||||
Total NoCs (Network/Bus):
|
||||
Device Type= ITRS low operating power device type
|
||||
Area = 0.0143604 mm^2
|
||||
Peak Dynamic = 0.00661787 W
|
||||
Subthreshold Leakage = 0.000156344 W
|
||||
Gate Leakage = 0.000218372 W
|
||||
Runtime Dynamic = 0.0330893 W
|
||||
|
||||
*****************************************************************************************
|
||||
Core:
|
||||
Area = 2.49261 mm^2
|
||||
Peak Dynamic = 0.212805 W
|
||||
Subthreshold Leakage = 0.0288704 W
|
||||
Gate Leakage = 0.0306205 W
|
||||
Runtime Dynamic = 0.37879 W
|
||||
|
||||
Instruction Fetch Unit:
|
||||
Area = 0.450898 mm^2
|
||||
Peak Dynamic = 0.0710479 W
|
||||
Subthreshold Leakage = 0.00360576 W
|
||||
Gate Leakage = 0.00232348 W
|
||||
Runtime Dynamic = 0.101921 W
|
||||
|
||||
Instruction Cache:
|
||||
Area = 0.235613 mm^2
|
||||
Peak Dynamic = 0.0124171 W
|
||||
Subthreshold Leakage = 0.00228006 W
|
||||
Gate Leakage = 0.00157114 W
|
||||
Runtime Dynamic = 0.018465 W
|
||||
|
||||
Branch Target Buffer:
|
||||
Area = 0.136309 mm^2
|
||||
Peak Dynamic = 0.00413545 W
|
||||
Subthreshold Leakage = 0.000644359 W
|
||||
Gate Leakage = 0.000219381 W
|
||||
Runtime Dynamic = 0.0165418 W
|
||||
|
||||
Branch Predictor:
|
||||
Area = 0.064441 mm^2
|
||||
Peak Dynamic = 0.00326317 W
|
||||
Subthreshold Leakage = 0.000518728 W
|
||||
Gate Leakage = 0.000346624 W
|
||||
Runtime Dynamic = 0.0045551 W
|
||||
|
||||
Global Predictor:
|
||||
Area = 0.0313969 mm^2
|
||||
Peak Dynamic = 0.00149811 W
|
||||
Subthreshold Leakage = 0.000255012 W
|
||||
Gate Leakage = 0.000169581 W
|
||||
Runtime Dynamic = 0.00218323 W
|
||||
|
||||
Local Predictor:
|
||||
Area = 0.000711939 mm^2
|
||||
Peak Dynamic = 0.000120406 W
|
||||
Subthreshold Leakage = 4.51731e-06 W
|
||||
Gate Leakage = 4.09128e-06 W
|
||||
Runtime Dynamic = 0.000188626 W
|
||||
|
||||
Area = 0.000650815 mm^2
|
||||
Peak Dynamic = 9.20494e-05 W
|
||||
Subthreshold Leakage = 4.25393e-06 W
|
||||
Gate Leakage = 3.44945e-06 W
|
||||
Runtime Dynamic = 0.000141995 W
|
||||
|
||||
Chooser:
|
||||
Area = 0.0313969 mm^2
|
||||
Peak Dynamic = 0.00149811 W
|
||||
Subthreshold Leakage = 0.000255012 W
|
||||
Gate Leakage = 0.000169581 W
|
||||
Runtime Dynamic = 0.00218323 W
|
||||
|
||||
RAS:
|
||||
Area = 0.000996272 mm^2
|
||||
Peak Dynamic = 0.000146549 W
|
||||
Subthreshold Leakage = 4.18739e-06 W
|
||||
Gate Leakage = 3.3701e-06 W
|
||||
Runtime Dynamic = 2.49598e-08 W
|
||||
|
||||
Instruction Buffer:
|
||||
Area = 0.00820192 mm^2
|
||||
Peak Dynamic = 0.0267951 W
|
||||
Subthreshold Leakage = 4.66516e-05 W
|
||||
Gate Leakage = 3.15732e-05 W
|
||||
Runtime Dynamic = 0.0153115 W
|
||||
|
||||
Instruction Decoder:
|
||||
Area = 0.00468731 mm^2
|
||||
Peak Dynamic = 0.023524 W
|
||||
Subthreshold Leakage = 9.40317e-05 W
|
||||
Gate Leakage = 8.38587e-05 W
|
||||
Runtime Dynamic = 0.047048 W
|
||||
|
||||
Renaming Unit:
|
||||
Area = 0.0903068 mm^2
|
||||
Peak Dynamic = 0.0180606 W
|
||||
Subthreshold Leakage = 0.000254554 W
|
||||
Gate Leakage = 0.000232507 W
|
||||
Runtime Dynamic = 0.0292515 W
|
||||
|
||||
Int Front End RAT:
|
||||
Area = 0.0543672 mm^2
|
||||
Peak Dynamic = 0.00950468 W
|
||||
Subthreshold Leakage = 0.000129029 W
|
||||
Gate Leakage = 8.82378e-05 W
|
||||
Runtime Dynamic = 0.0190094 W
|
||||
|
||||
FP Front End RAT:
|
||||
Area = 0.0185325 mm^2
|
||||
Peak Dynamic = 0.00379768 W
|
||||
Subthreshold Leakage = 7.38761e-05 W
|
||||
Gate Leakage = 4.91016e-05 W
|
||||
Runtime Dynamic = 0.00379768 W
|
||||
|
||||
Free List:
|
||||
Area = 0.00599955 mm^2
|
||||
Peak Dynamic = 0.00090026 W
|
||||
Subthreshold Leakage = 9.15772e-06 W
|
||||
Gate Leakage = 7.32213e-06 W
|
||||
Runtime Dynamic = 0.00360104 W
|
||||
|
||||
Int Retire RAT:
|
||||
Area = 0.00605969 mm^2
|
||||
Peak Dynamic = 0.00179357 W
|
||||
Subthreshold Leakage = 9.8107e-06 W
|
||||
Gate Leakage = 8.43969e-06 W
|
||||
Runtime Dynamic = 0.00179357 W
|
||||
|
||||
FP Retire RAT:
|
||||
Area = 0.000650815 mm^2
|
||||
Peak Dynamic = 0.000269336 W
|
||||
Subthreshold Leakage = 4.25393e-06 W
|
||||
Gate Leakage = 3.44945e-06 W
|
||||
Runtime Dynamic = 0.000269336 W
|
||||
|
||||
FP Free List:
|
||||
Area = 0.00305098 mm^2
|
||||
Peak Dynamic = 0.000780497 W
|
||||
Subthreshold Leakage = 6.49266e-06 W
|
||||
Gate Leakage = 5.05395e-06 W
|
||||
Runtime Dynamic = 0.000780497 W
|
||||
|
||||
Load Store Unit:
|
||||
Area = 0.274913 mm^2
|
||||
Peak Dynamic = 0.0138993 W
|
||||
Subthreshold Leakage = 0.00235727 W
|
||||
Gate Leakage = 0.00171176 W
|
||||
Runtime Dynamic = 0.0781216 W
|
||||
|
||||
Data Cache:
|
||||
Area = 0.240878 mm^2
|
||||
Peak Dynamic = 0.0117466 W
|
||||
Subthreshold Leakage = 0.00230394 W
|
||||
Gate Leakage = 0.00160316 W
|
||||
Runtime Dynamic = 0.0761042 W
|
||||
|
||||
StoreQ:
|
||||
Area = 0.00754674 mm^2
|
||||
Peak Dynamic = 0.00143235 W
|
||||
Subthreshold Leakage = 3.13936e-05 W
|
||||
Gate Leakage = 3.76992e-05 W
|
||||
Runtime Dynamic = 0.00201739 W
|
||||
|
||||
Memory Management Unit:
|
||||
Area = 0.021508 mm^2
|
||||
Peak Dynamic = 0.0050935 W
|
||||
Subthreshold Leakage = 0.000155095 W
|
||||
Gate Leakage = 0.000211049 W
|
||||
Runtime Dynamic = 0.0148284 W
|
||||
|
||||
Itlb:
|
||||
Area = 0.00993091 mm^2
|
||||
Peak Dynamic = 0.00247139 W
|
||||
Subthreshold Leakage = 6.65801e-05 W
|
||||
Gate Leakage = 7.00732e-05 W
|
||||
Runtime Dynamic = 0.0049428 W
|
||||
|
||||
Dtlb:
|
||||
Area = 0.00993091 mm^2
|
||||
Peak Dynamic = 0.00175468 W
|
||||
Subthreshold Leakage = 6.65801e-05 W
|
||||
Gate Leakage = 7.00732e-05 W
|
||||
Runtime Dynamic = 0.00988557 W
|
||||
|
||||
Execution Unit:
|
||||
Area = 1.65498 mm^2
|
||||
Peak Dynamic = 0.104703 W
|
||||
Subthreshold Leakage = 0.0224977 W
|
||||
Gate Leakage = 0.0261417 W
|
||||
Runtime Dynamic = 0.154667 W
|
||||
|
||||
Register Files:
|
||||
Area = 0.203203 mm^2
|
||||
Peak Dynamic = 0.0305313 W
|
||||
Subthreshold Leakage = 0.000145099 W
|
||||
Gate Leakage = 0.000118628 W
|
||||
Runtime Dynamic = 0.0154426 W
|
||||
|
||||
Integer RF:
|
||||
Area = 0.146073 mm^2
|
||||
Peak Dynamic = 0.0305313 W
|
||||
Subthreshold Leakage = 8.85877e-05 W
|
||||
Gate Leakage = 7.24537e-05 W
|
||||
Runtime Dynamic = 0.0138276 W
|
||||
|
||||
Floating Point RF:
|
||||
Area = 0.05713 mm^2
|
||||
Peak Dynamic = 0 W
|
||||
Subthreshold Leakage = 5.6511e-05 W
|
||||
Gate Leakage = 4.61745e-05 W
|
||||
Runtime Dynamic = 0.00161506 W
|
||||
|
||||
Instruction Scheduler:
|
||||
Area = 0.0582889 mm^2
|
||||
Peak Dynamic = 0.0209028 W
|
||||
Subthreshold Leakage = 9.47693e-05 W
|
||||
Gate Leakage = 0.000134844 W
|
||||
Runtime Dynamic = 0.0314989 W
|
||||
|
||||
Instruction Window:
|
||||
Area = 0.053925 mm^2
|
||||
Peak Dynamic = 0.0178358 W
|
||||
Subthreshold Leakage = 7.01713e-05 W
|
||||
Gate Leakage = 9.49122e-05 W
|
||||
Runtime Dynamic = 0.0240893 W
|
||||
|
||||
FP Instruction Window:
|
||||
Area = 0.00436388 mm^2
|
||||
Peak Dynamic = 0.00306704 W
|
||||
Subthreshold Leakage = 2.45979e-05 W
|
||||
Gate Leakage = 3.99319e-05 W
|
||||
Runtime Dynamic = 0.00740966 W
|
||||
|
||||
Integer ALUs (Count: 3 ):
|
||||
Area = 0.312404 mm^2
|
||||
Peak Dynamic = 0.0113473 W
|
||||
Subthreshold Leakage = 0.0103625 W
|
||||
Gate Leakage = 0.0120315 W
|
||||
Runtime Dynamic = 0.0149307 W
|
||||
|
||||
Floating Point Units (FPUs) (Count: 1 ):
|
||||
Area = 0.971259 mm^2
|
||||
Peak Dynamic = 0 W
|
||||
Subthreshold Leakage = 0.00805417 W
|
||||
Gate Leakage = 0.00935142 W
|
||||
Runtime Dynamic = 0.0149307 W
|
||||
|
||||
Complex ALUs (Mul/Div) (Count: 1 ):
|
||||
Area = 0.104135 mm^2
|
||||
Peak Dynamic = 0.00816212 W
|
||||
Subthreshold Leakage = 0.00345415 W
|
||||
Gate Leakage = 0.0040105 W
|
||||
Runtime Dynamic = 0.0199076 W
|
||||
|
||||
Results Broadcast Bus:
|
||||
Area Overhead = 0.00404385 mm^2
|
||||
Peak Dynamic = 0.0329888 W
|
||||
Subthreshold Leakage = 0.000365119 W
|
||||
Gate Leakage = 0.000423926 W
|
||||
Runtime Dynamic = 0.0579569 W
|
||||
|
||||
*****************************************************************************************
|
||||
First Level Directory
|
||||
Area = 0.244856 mm^2
|
||||
Peak Dynamic = 0.00899504 W
|
||||
Subthreshold Leakage = 0.0014643 W
|
||||
Gate Leakage = 0.00238022 W
|
||||
Runtime Dynamic = 0.721156 W
|
||||
|
||||
*****************************************************************************************
|
||||
BUSES
|
||||
Area = 0.0143604 mm^2
|
||||
Peak Dynamic = 0.00661787 W
|
||||
Subthreshold Leakage = 0.000156344 W
|
||||
Gate Leakage = 0.000218372 W
|
||||
Runtime Dynamic = 0.0330893 W
|
||||
|
||||
Bus:
|
||||
Area = 0.0143604 mm^2
|
||||
Peak Dynamic = 0.00661787 W
|
||||
Subthreshold Leakage = 0.000156344 W
|
||||
Gate Leakage = 0.000218372 W
|
||||
Runtime Dynamic = 0.0330893 W
|
||||
|
||||
*****************************************************************************************
|
441
ext/mcpat/results/Alpha21364
Normal file
441
ext/mcpat/results/Alpha21364
Normal file
|
@ -0,0 +1,441 @@
|
|||
McPAT (version 0.7 of May, 2010) is computing the target processor...
|
||||
|
||||
Warning: icache array structure cannot satisfy throughput constraint.
|
||||
Warning: icache array structure cannot satisfy latency constraint.
|
||||
Warning: InstBuffer array structure cannot satisfy throughput constraint.
|
||||
Warning: InstBuffer array structure cannot satisfy latency constraint.
|
||||
Warning: Branch Target Buffer array structure cannot satisfy throughput constraint.
|
||||
Warning: Branch Target Buffer array structure cannot satisfy latency constraint.
|
||||
Warning: Global Predictor array structure cannot satisfy throughput constraint.
|
||||
Warning: Global Predictor array structure cannot satisfy latency constraint.
|
||||
Warning: L1 local Predictor array structure cannot satisfy throughput constraint.
|
||||
Warning: L1 local Predictor array structure cannot satisfy latency constraint.
|
||||
Warning: L2 local Predictor array structure cannot satisfy throughput constraint.
|
||||
Warning: L2 local Predictor array structure cannot satisfy latency constraint.
|
||||
Warning: Predictor Chooser array structure cannot satisfy throughput constraint.
|
||||
Warning: Predictor Chooser array structure cannot satisfy latency constraint.
|
||||
Warning: RAS array structure cannot satisfy throughput constraint.
|
||||
Warning: RAS array structure cannot satisfy latency constraint.
|
||||
Warning: dcache array structure cannot satisfy throughput constraint.
|
||||
Warning: dcache array structure cannot satisfy latency constraint.
|
||||
Warning: Integer Register File array structure cannot satisfy throughput constraint.
|
||||
Warning: Integer Register File array structure cannot satisfy latency constraint.
|
||||
Warning: Floating point Register File array structure cannot satisfy throughput constraint.
|
||||
Warning: Floating point Register File array structure cannot satisfy latency constraint.
|
||||
Warning: ReorderBuffer array structure cannot satisfy throughput constraint.
|
||||
Warning: ReorderBuffer array structure cannot satisfy latency constraint.
|
||||
Warning: Int RetireRAT array structure cannot satisfy throughput constraint.
|
||||
Warning: Int RetireRAT array structure cannot satisfy latency constraint.
|
||||
Warning: Int RetireRAT array structure cannot satisfy latency constraint.
|
||||
Warning: Int Free List array structure cannot satisfy throughput constraint.
|
||||
Warning: Int Free List array structure cannot satisfy latency constraint.
|
||||
Warning: Int Free List array structure cannot satisfy throughput constraint.
|
||||
Warning: Int Free List array structure cannot satisfy latency constraint.
|
||||
Warning: MC ReadBuffer array structure cannot satisfy throughput constraint.
|
||||
Warning: MC ReadBuffer array structure cannot satisfy latency constraint.
|
||||
Warning: MC writeBuffer array structure cannot satisfy throughput constraint.
|
||||
Warning: MC writeBuffer array structure cannot satisfy latency constraint.
|
||||
|
||||
McPAT (version 0.7 of May, 2010) results (current print level is 5)
|
||||
*****************************************************************************************
|
||||
Technology 180 nm
|
||||
Interconnect metal projection= aggressive interconnect technology projection
|
||||
Core clock Rate(MHz) 1200
|
||||
|
||||
*****************************************************************************************
|
||||
Processor:
|
||||
Area = 323.859 mm^2
|
||||
Peak Power = 90.0375 W
|
||||
Total Leakage = 0.156795 W
|
||||
Peak Dynamic = 89.8807 W
|
||||
Subthreshold Leakage = 0.151936 W
|
||||
Gate Leakage = 0.00485969 W
|
||||
Runtime Dynamic = 85.2036 W
|
||||
|
||||
Total Cores:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 137.839 mm^2
|
||||
Peak Dynamic = 60.6776 W
|
||||
Subthreshold Leakage = 0.067186 W
|
||||
Gate Leakage = 0.00428355 W
|
||||
Runtime Dynamic = 73.9555 W
|
||||
|
||||
Total L2s:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 137.063 mm^2
|
||||
Peak Dynamic = 3.55835 W
|
||||
Subthreshold Leakage = 0.0778886 W
|
||||
Gate Leakage = 0.00016078 W
|
||||
Runtime Dynamic = 6.34872 W
|
||||
|
||||
Total First Level Directory:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 1.59954 mm^2
|
||||
Peak Dynamic = 0.805902 W
|
||||
Subthreshold Leakage = 0.000311783 W
|
||||
Gate Leakage = 2.63568e-05 W
|
||||
Runtime Dynamic = 0.547665 W
|
||||
|
||||
Total NoCs (Network/Bus):
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 29.1057 mm^2
|
||||
Peak Dynamic = 16.5188 W
|
||||
Subthreshold Leakage = 0.00292556 W
|
||||
Gate Leakage = 0.000166293 W
|
||||
Runtime Dynamic = 2.54446 W
|
||||
|
||||
Total MCs:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 18.2519 mm^2
|
||||
Peak Dynamic = 8.32001 W
|
||||
Subthreshold Leakage = 0.00362353 W
|
||||
Gate Leakage = 0.000222708 W
|
||||
Runtime Dynamic = 1.80731 W
|
||||
|
||||
*****************************************************************************************
|
||||
Core:
|
||||
Area = 137.839 mm^2
|
||||
Peak Dynamic = 60.6776 W
|
||||
Subthreshold Leakage = 0.067186 W
|
||||
Gate Leakage = 0.00428355 W
|
||||
Runtime Dynamic = 73.9555 W
|
||||
|
||||
Instruction Fetch Unit:
|
||||
Area = 27.6096 mm^2
|
||||
Peak Dynamic = 9.86655 W
|
||||
Subthreshold Leakage = 0.00622106 W
|
||||
Gate Leakage = 0.000344671 W
|
||||
Runtime Dynamic = 10.0567 W
|
||||
|
||||
Instruction Cache:
|
||||
Area = 11.4511 mm^2
|
||||
Peak Dynamic = 1.53259 W
|
||||
Subthreshold Leakage = 0.00371341 W
|
||||
Gate Leakage = 0.000171069 W
|
||||
Runtime Dynamic = 2.13168 W
|
||||
|
||||
Branch Target Buffer:
|
||||
Area = 13.3377 mm^2
|
||||
Peak Dynamic = 0.56236 W
|
||||
Subthreshold Leakage = 0.001581 W
|
||||
Gate Leakage = 9.5198e-05 W
|
||||
Runtime Dynamic = 2.24944 W
|
||||
|
||||
Branch Predictor:
|
||||
Area = 2.1618 mm^2
|
||||
Peak Dynamic = 0.234643 W
|
||||
Subthreshold Leakage = 0.000469396 W
|
||||
Gate Leakage = 2.01907e-05 W
|
||||
Runtime Dynamic = 0.198646 W
|
||||
|
||||
Global Predictor:
|
||||
Area = 0.893575 mm^2
|
||||
Peak Dynamic = 0.0726984 W
|
||||
Subthreshold Leakage = 0.000182866 W
|
||||
Gate Leakage = 7.91951e-06 W
|
||||
Runtime Dynamic = 0.0726984 W
|
||||
|
||||
Local Predictor:
|
||||
Area = 0.420241 mm^2
|
||||
Peak Dynamic = 0.0532456 W
|
||||
Subthreshold Leakage = 9.20027e-05 W
|
||||
Gate Leakage = 3.89162e-06 W
|
||||
Runtime Dynamic = 0.0532456 W
|
||||
|
||||
Area = 0.291886 mm^2
|
||||
Peak Dynamic = 0.0292091 W
|
||||
Subthreshold Leakage = 5.262e-05 W
|
||||
Gate Leakage = 2.51093e-06 W
|
||||
Runtime Dynamic = 0.0292091 W
|
||||
|
||||
Chooser:
|
||||
Area = 0.893575 mm^2
|
||||
Peak Dynamic = 0.0726984 W
|
||||
Subthreshold Leakage = 0.000182866 W
|
||||
Gate Leakage = 7.91951e-06 W
|
||||
Runtime Dynamic = 0.0726984 W
|
||||
|
||||
RAS:
|
||||
Area = 0.0827607 mm^2
|
||||
Peak Dynamic = 0.0360009 W
|
||||
Subthreshold Leakage = 1.16623e-05 W
|
||||
Gate Leakage = 4.60036e-07 W
|
||||
Runtime Dynamic = 3.58028e-06 W
|
||||
|
||||
Instruction Buffer:
|
||||
Area = 0.465385 mm^2
|
||||
Peak Dynamic = 2.10455 W
|
||||
Subthreshold Leakage = 6.13248e-05 W
|
||||
Gate Leakage = 4.88113e-06 W
|
||||
Runtime Dynamic = 1.40303 W
|
||||
|
||||
Instruction Decoder:
|
||||
Area = 0.146031 mm^2
|
||||
Peak Dynamic = 4.07384 W
|
||||
Subthreshold Leakage = 7.07416e-05 W
|
||||
Gate Leakage = 3.32268e-06 W
|
||||
Runtime Dynamic = 4.07384 W
|
||||
|
||||
Renaming Unit:
|
||||
Area = 11.7262 mm^2
|
||||
Peak Dynamic = 12.5584 W
|
||||
Subthreshold Leakage = 0.000886804 W
|
||||
Gate Leakage = 9.92419e-05 W
|
||||
Runtime Dynamic = 9.90647 W
|
||||
|
||||
Int Front End RAT:
|
||||
Area = 8.24345 mm^2
|
||||
Peak Dynamic = 8.04227 W
|
||||
Subthreshold Leakage = 0.000376247 W
|
||||
Gate Leakage = 3.40623e-05 W
|
||||
Runtime Dynamic = 8.04227 W
|
||||
|
||||
FP Front End RAT:
|
||||
Area = 2.549 mm^2
|
||||
Peak Dynamic = 2.75082 W
|
||||
Subthreshold Leakage = 0.000149367 W
|
||||
Gate Leakage = 1.30084e-05 W
|
||||
Runtime Dynamic = 1.37541 W
|
||||
|
||||
Free List:
|
||||
Area = 0.446019 mm^2
|
||||
Peak Dynamic = 0.156051 W
|
||||
Subthreshold Leakage = 1.32133e-05 W
|
||||
Gate Leakage = 7.4667e-07 W
|
||||
Runtime Dynamic = 0.312102 W
|
||||
|
||||
Int Retire RAT:
|
||||
Area = 0.184445 mm^2
|
||||
Peak Dynamic = 0.102656 W
|
||||
Subthreshold Leakage = 8.50239e-06 W
|
||||
Gate Leakage = 5.28869e-07 W
|
||||
Runtime Dynamic = 0.102656 W
|
||||
|
||||
FP Retire RAT:
|
||||
Area = 0.0567228 mm^2
|
||||
Peak Dynamic = 0.0367258 W
|
||||
Subthreshold Leakage = 5.67894e-06 W
|
||||
Gate Leakage = 3.75578e-07 W
|
||||
Runtime Dynamic = 0.0183629 W
|
||||
|
||||
FP Free List:
|
||||
Area = 0.198929 mm^2
|
||||
Peak Dynamic = 0.111293 W
|
||||
Subthreshold Leakage = 8.61952e-06 W
|
||||
Gate Leakage = 5.10875e-07 W
|
||||
Runtime Dynamic = 0.0556467 W
|
||||
|
||||
Load Store Unit:
|
||||
Area = 49.742 mm^2
|
||||
Peak Dynamic = 11.7952 W
|
||||
Subthreshold Leakage = 0.00715349 W
|
||||
Gate Leakage = 0.00052778 W
|
||||
Runtime Dynamic = 31.7658 W
|
||||
|
||||
Data Cache:
|
||||
Area = 36.106 mm^2
|
||||
Peak Dynamic = 9.28008 W
|
||||
Subthreshold Leakage = 0.00663485 W
|
||||
Gate Leakage = 0.000466572 W
|
||||
Runtime Dynamic = 31.332 W
|
||||
|
||||
LoadQ:
|
||||
Area = 2.60005 mm^2
|
||||
Peak Dynamic = 0.578279 W
|
||||
Subthreshold Leakage = 9.67302e-05 W
|
||||
Gate Leakage = 5.59905e-06 W
|
||||
Runtime Dynamic = 0.14457 W
|
||||
|
||||
StoreQ:
|
||||
Area = 2.60005 mm^2
|
||||
Peak Dynamic = 0.578279 W
|
||||
Subthreshold Leakage = 9.67302e-05 W
|
||||
Gate Leakage = 5.59905e-06 W
|
||||
Runtime Dynamic = 0.289139 W
|
||||
|
||||
Memory Management Unit:
|
||||
Area = 8.74543 mm^2
|
||||
Peak Dynamic = 3.77198 W
|
||||
Subthreshold Leakage = 0.00119904 W
|
||||
Gate Leakage = 0.000127183 W
|
||||
Runtime Dynamic = 4.82688 W
|
||||
|
||||
Itlb:
|
||||
Area = 1.97969 mm^2
|
||||
Peak Dynamic = 0.537563 W
|
||||
Subthreshold Leakage = 0.000270576 W
|
||||
Gate Leakage = 2.0845e-05 W
|
||||
Runtime Dynamic = 1.07513 W
|
||||
|
||||
Dtlb:
|
||||
Area = 6.71814 mm^2
|
||||
Peak Dynamic = 1.87586 W
|
||||
Subthreshold Leakage = 0.00060329 W
|
||||
Gate Leakage = 5.63286e-05 W
|
||||
Runtime Dynamic = 3.75174 W
|
||||
|
||||
Execution Unit:
|
||||
Area = 31.4918 mm^2
|
||||
Peak Dynamic = 22.6855 W
|
||||
Subthreshold Leakage = 0.0320294 W
|
||||
Gate Leakage = 0.00198102 W
|
||||
Runtime Dynamic = 17.3997 W
|
||||
|
||||
Register Files:
|
||||
Area = 9.9318 mm^2
|
||||
Peak Dynamic = 3.92301 W
|
||||
Subthreshold Leakage = 0.000295352 W
|
||||
Gate Leakage = 1.33517e-05 W
|
||||
Runtime Dynamic = 1.7929 W
|
||||
|
||||
Integer RF:
|
||||
Area = 6.76678 mm^2
|
||||
Peak Dynamic = 2.35597 W
|
||||
Subthreshold Leakage = 0.000185762 W
|
||||
Gate Leakage = 8.51701e-06 W
|
||||
Runtime Dynamic = 1.60634 W
|
||||
|
||||
Floating Point RF:
|
||||
Area = 3.16503 mm^2
|
||||
Peak Dynamic = 1.56704 W
|
||||
Subthreshold Leakage = 0.00010959 W
|
||||
Gate Leakage = 4.83467e-06 W
|
||||
Runtime Dynamic = 0.186553 W
|
||||
|
||||
Instruction Scheduler:
|
||||
Area = 5.20691 mm^2
|
||||
Peak Dynamic = 2.77224 W
|
||||
Subthreshold Leakage = 0.000202187 W
|
||||
Gate Leakage = 1.05832e-05 W
|
||||
Runtime Dynamic = 3.11355 W
|
||||
|
||||
Instruction Window:
|
||||
Area = 1.23862 mm^2
|
||||
Peak Dynamic = 0.985117 W
|
||||
Subthreshold Leakage = 5.55506e-05 W
|
||||
Gate Leakage = 3.78978e-06 W
|
||||
Runtime Dynamic = 1.23906 W
|
||||
|
||||
FP Instruction Window:
|
||||
Area = 0.481718 mm^2
|
||||
Peak Dynamic = 0.438839 W
|
||||
Subthreshold Leakage = 2.5962e-05 W
|
||||
Gate Leakage = 2.00351e-06 W
|
||||
Runtime Dynamic = 0.526208 W
|
||||
|
||||
ROB:
|
||||
Area = 3.48657 mm^2
|
||||
Peak Dynamic = 1.34828 W
|
||||
Subthreshold Leakage = 0.000120674 W
|
||||
Gate Leakage = 4.78991e-06 W
|
||||
Runtime Dynamic = 1.34828 W
|
||||
|
||||
Integer ALUs (Count: 4 ):
|
||||
Area = 3.4944 mm^2
|
||||
Peak Dynamic = 4.23312 W
|
||||
Subthreshold Leakage = 0.016149 W
|
||||
Gate Leakage = 0.000986885 W
|
||||
Runtime Dynamic = 3.21343 W
|
||||
|
||||
Floating Point Units (FPUs) (Count: 1 ):
|
||||
Area = 12.705 mm^2
|
||||
Peak Dynamic = 3.52215 W
|
||||
Subthreshold Leakage = 0.0146787 W
|
||||
Gate Leakage = 0.000897034 W
|
||||
Runtime Dynamic = 3.52215 W
|
||||
|
||||
Results Broadcast Bus:
|
||||
Area Overhead = 0.106062 mm^2
|
||||
Peak Dynamic = 6.87645 W
|
||||
Subthreshold Leakage = 0.000378957 W
|
||||
Gate Leakage = 2.31585e-05 W
|
||||
Runtime Dynamic = 5.75766 W
|
||||
|
||||
*****************************************************************************************
|
||||
L2
|
||||
Area = 137.063 mm^2
|
||||
Peak Dynamic = 3.55835 W
|
||||
Subthreshold Leakage = 0.0778886 W
|
||||
Gate Leakage = 0.00016078 W
|
||||
Runtime Dynamic = 6.34872 W
|
||||
|
||||
*****************************************************************************************
|
||||
Second Level Directory
|
||||
Area = 1.59954 mm^2
|
||||
Peak Dynamic = 0.805902 W
|
||||
Subthreshold Leakage = 0.000311783 W
|
||||
Gate Leakage = 2.63568e-05 W
|
||||
Runtime Dynamic = 0.547665 W
|
||||
|
||||
*****************************************************************************************
|
||||
Memory Controller:
|
||||
Area = 9.12595 mm^2
|
||||
Peak Dynamic = 4.16 W
|
||||
Subthreshold Leakage = 0.00181177 W
|
||||
Gate Leakage = 0.000111354 W
|
||||
Runtime Dynamic = 1.80731 W
|
||||
|
||||
Front End Engine:
|
||||
Area = 5.49326 mm^2
|
||||
Peak Dynamic = 1.42883 W
|
||||
Subthreshold Leakage = 0.000132955 W
|
||||
Gate Leakage = 8.76015e-06 W
|
||||
Runtime Dynamic = 0.348049 W
|
||||
|
||||
Transaction Engine:
|
||||
Area = 1.50616 mm^2
|
||||
Peak Dynamic = 1.93117 W
|
||||
Subthreshold Leakage = 0.000696058 W
|
||||
Gate Leakage = 4.25369e-05 W
|
||||
Runtime Dynamic = 0.579332 W
|
||||
|
||||
PHY:
|
||||
Area = 2.12653 mm^2
|
||||
Peak Dynamic = 0.8 W
|
||||
Subthreshold Leakage = 0.000982753 W
|
||||
Gate Leakage = 6.00571e-05 W
|
||||
Runtime Dynamic = 0.879928 W
|
||||
|
||||
*****************************************************************************************
|
||||
NOC
|
||||
Area = 29.1057 mm^2
|
||||
Peak Dynamic = 16.5188 W
|
||||
Subthreshold Leakage = 0.00292556 W
|
||||
Gate Leakage = 0.000166293 W
|
||||
Runtime Dynamic = 2.54446 W
|
||||
|
||||
Router:
|
||||
Area = 28.4197 mm^2
|
||||
Peak Dynamic = 8.76431 W
|
||||
Subthreshold Leakage = 0.00199965 W
|
||||
Gate Leakage = 0.000109709 W
|
||||
Runtime Dynamic = 1.25204 W
|
||||
|
||||
Virtual Channel Buffer:
|
||||
Area = 17.0424 mm^2
|
||||
Peak Dynamic = 7.30291 W
|
||||
Subthreshold Leakage = 0.00119658 W
|
||||
Gate Leakage = 4.15511e-05 W
|
||||
Runtime Dynamic = 1.04327 W
|
||||
|
||||
Crossbar:
|
||||
Area = 0.357655 mm^2
|
||||
Peak Dynamic = 1.27997 W
|
||||
Subthreshold Leakage = 0.000801415 W
|
||||
Gate Leakage = 6.80527e-05 W
|
||||
Runtime Dynamic = 0.182853 W
|
||||
|
||||
Arbiter:
|
||||
Peak Dynamic = 0.18143 W
|
||||
Subthreshold Leakage = 1.65956e-06 W
|
||||
Gate Leakage = 1.05559e-07 W
|
||||
Runtime Dynamic = 0.0259186 W
|
||||
|
||||
Per Router :
|
||||
Area = 0.685989 mm^2
|
||||
Peak Dynamic = 7.75447 W
|
||||
Subthreshold Leakage = 0.000925911 W
|
||||
Gate Leakage = 5.65834e-05 W
|
||||
Runtime Dynamic = 1.29241 W
|
||||
|
||||
*****************************************************************************************
|
408
ext/mcpat/results/Alpha21364_90nm
Normal file
408
ext/mcpat/results/Alpha21364_90nm
Normal file
|
@ -0,0 +1,408 @@
|
|||
McPAT (version 0.8 of Aug, 2010) is computing the target processor...
|
||||
|
||||
Warning: icache array structure cannot satisfy latency constraint.
|
||||
Warning: dcache array structure cannot satisfy latency constraint.
|
||||
|
||||
McPAT (version 0.8 of Aug, 2010) results (current print level is 5)
|
||||
*****************************************************************************************
|
||||
Technology 90 nm
|
||||
Interconnect metal projection= aggressive interconnect technology projection
|
||||
Core clock Rate(MHz) 1200
|
||||
|
||||
*****************************************************************************************
|
||||
Processor:
|
||||
Area = 139.86 mm^2
|
||||
Peak Power = 34.9936 W
|
||||
Total Leakage = 4.16949 W
|
||||
Peak Dynamic = 30.8241 W
|
||||
Subthreshold Leakage = 3.86203 W
|
||||
Gate Leakage = 0.307463 W
|
||||
Runtime Dynamic = 34.0612 W
|
||||
|
||||
Total Cores:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 61.1957 mm^2
|
||||
Peak Dynamic = 19.6269 W
|
||||
Subthreshold Leakage = 2.04452 W
|
||||
Gate Leakage = 0.277429 W
|
||||
Runtime Dynamic = 29.5972 W
|
||||
|
||||
Total L2s:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 62.2653 mm^2
|
||||
Peak Dynamic = 1.42987 W
|
||||
Subthreshold Leakage = 1.65481 W
|
||||
Gate Leakage = 0.00860545 W
|
||||
Runtime Dynamic = 2.73329 W
|
||||
|
||||
Total First Level Directory:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 0.533824 mm^2
|
||||
Peak Dynamic = 0.275566 W
|
||||
Subthreshold Leakage = 0.00929753 W
|
||||
Gate Leakage = 0.00179126 W
|
||||
Runtime Dynamic = 0.193681 W
|
||||
|
||||
Total NoCs (Network/Bus):
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 8.77595 mm^2
|
||||
Peak Dynamic = 6.17873 W
|
||||
Subthreshold Leakage = 0.108357 W
|
||||
Gate Leakage = 0.0139259 W
|
||||
Runtime Dynamic = 0.963385 W
|
||||
|
||||
Total MCs:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 7.08925 mm^2
|
||||
Peak Dynamic = 3.3131 W
|
||||
Subthreshold Leakage = 0.0450389 W
|
||||
Gate Leakage = 0.00571171 W
|
||||
Runtime Dynamic = 0.573656 W
|
||||
|
||||
*****************************************************************************************
|
||||
Core:
|
||||
Area = 61.1957 mm^2
|
||||
Peak Dynamic = 19.6269 W
|
||||
Subthreshold Leakage = 2.04452 W
|
||||
Gate Leakage = 0.277429 W
|
||||
Runtime Dynamic = 29.5972 W
|
||||
|
||||
Instruction Fetch Unit:
|
||||
Area = 7.40352 mm^2
|
||||
Peak Dynamic = 2.10646 W
|
||||
Subthreshold Leakage = 0.126581 W
|
||||
Gate Leakage = 0.0150397 W
|
||||
Runtime Dynamic = 2.55478 W
|
||||
|
||||
Instruction Cache:
|
||||
Area = 5.01657 mm^2
|
||||
Peak Dynamic = 0.745807 W
|
||||
Subthreshold Leakage = 0.0906167 W
|
||||
Gate Leakage = 0.010922 W
|
||||
Runtime Dynamic = 1.22193 W
|
||||
|
||||
Branch Target Buffer:
|
||||
Area = 1.63475 mm^2
|
||||
Peak Dynamic = 0.0974373 W
|
||||
Subthreshold Leakage = 0.0188281 W
|
||||
Gate Leakage = 0.00126965 W
|
||||
Runtime Dynamic = 0.389749 W
|
||||
|
||||
Branch Predictor:
|
||||
Area = 0.474272 mm^2
|
||||
Peak Dynamic = 0.0682449 W
|
||||
Subthreshold Leakage = 0.00901262 W
|
||||
Gate Leakage = 0.00067136 W
|
||||
Runtime Dynamic = 0.0636543 W
|
||||
|
||||
Global Predictor:
|
||||
Area = 0.190297 mm^2
|
||||
Peak Dynamic = 0.0224229 W
|
||||
Subthreshold Leakage = 0.00351842 W
|
||||
Gate Leakage = 0.000260107 W
|
||||
Runtime Dynamic = 0.0239711 W
|
||||
|
||||
Local Predictor:
|
||||
Area = 0.0959237 mm^2
|
||||
Peak Dynamic = 0.0143301 W
|
||||
Subthreshold Leakage = 0.00171829 W
|
||||
Gate Leakage = 0.00012889 W
|
||||
Runtime Dynamic = 0.015711 W
|
||||
|
||||
Area = 0.0484908 mm^2
|
||||
Peak Dynamic = 0.0077514 W
|
||||
Subthreshold Leakage = 0.000926283 W
|
||||
Gate Leakage = 7.55051e-05 W
|
||||
Runtime Dynamic = 0.00850163 W
|
||||
|
||||
Chooser:
|
||||
Area = 0.190297 mm^2
|
||||
Peak Dynamic = 0.0224229 W
|
||||
Subthreshold Leakage = 0.00351842 W
|
||||
Gate Leakage = 0.000260107 W
|
||||
Runtime Dynamic = 0.0239711 W
|
||||
|
||||
RAS:
|
||||
Area = 0.0451868 mm^2
|
||||
Peak Dynamic = 0.00906891 W
|
||||
Subthreshold Leakage = 0.00025749 W
|
||||
Gate Leakage = 2.22565e-05 W
|
||||
Runtime Dynamic = 1.06361e-06 W
|
||||
|
||||
Instruction Buffer:
|
||||
Area = 0.11139 mm^2
|
||||
Peak Dynamic = 0.30298 W
|
||||
Subthreshold Leakage = 0.000556928 W
|
||||
Gate Leakage = 4.34124e-05 W
|
||||
Runtime Dynamic = 0.201987 W
|
||||
|
||||
Instruction Decoder:
|
||||
Area = 0.0481902 mm^2
|
||||
Peak Dynamic = 0.677465 W
|
||||
Subthreshold Leakage = 0.00135195 W
|
||||
Gate Leakage = 0.000132907 W
|
||||
Runtime Dynamic = 0.677465 W
|
||||
|
||||
Renaming Unit:
|
||||
Area = 4.5037 mm^2
|
||||
Peak Dynamic = 4.11785 W
|
||||
Subthreshold Leakage = 0.0296009 W
|
||||
Gate Leakage = 0.00668098 W
|
||||
Runtime Dynamic = 3.24944 W
|
||||
|
||||
Int Front End RAT:
|
||||
Area = 2.76467 mm^2
|
||||
Peak Dynamic = 2.43279 W
|
||||
Subthreshold Leakage = 0.0129405 W
|
||||
Gate Leakage = 0.00255854 W
|
||||
Runtime Dynamic = 2.43279 W
|
||||
|
||||
FP Front End RAT:
|
||||
Area = 1.39233 mm^2
|
||||
Peak Dynamic = 1.35403 W
|
||||
Subthreshold Leakage = 0.00981219 W
|
||||
Gate Leakage = 0.00205621 W
|
||||
Runtime Dynamic = 0.677017 W
|
||||
|
||||
Free List:
|
||||
Area = 0.116928 mm^2
|
||||
Peak Dynamic = 0.0436483 W
|
||||
Subthreshold Leakage = 0.000259915 W
|
||||
Gate Leakage = 2.53395e-05 W
|
||||
Runtime Dynamic = 0.0872966 W
|
||||
|
||||
Int Retire RAT:
|
||||
Area = 0.0429772 mm^2
|
||||
Peak Dynamic = 0.0318091 W
|
||||
Subthreshold Leakage = 0.000152798 W
|
||||
Gate Leakage = 1.86722e-05 W
|
||||
Runtime Dynamic = 0.0318091 W
|
||||
|
||||
FP Retire RAT:
|
||||
Area = 0.0153516 mm^2
|
||||
Peak Dynamic = 0.00997874 W
|
||||
Subthreshold Leakage = 8.06509e-05 W
|
||||
Gate Leakage = 7.17049e-06 W
|
||||
Runtime Dynamic = 0.00498937 W
|
||||
|
||||
FP Free List:
|
||||
Area = 0.0530951 mm^2
|
||||
Peak Dynamic = 0.0310624 W
|
||||
Subthreshold Leakage = 0.000140326 W
|
||||
Gate Leakage = 1.46766e-05 W
|
||||
Runtime Dynamic = 0.0155312 W
|
||||
|
||||
Load Store Unit:
|
||||
Area = 20.5622 mm^2
|
||||
Peak Dynamic = 5.14439 W
|
||||
Subthreshold Leakage = 0.207699 W
|
||||
Gate Leakage = 0.0357344 W
|
||||
Runtime Dynamic = 16.0217 W
|
||||
|
||||
Data Cache:
|
||||
Area = 15.2468 mm^2
|
||||
Peak Dynamic = 4.5468 W
|
||||
Subthreshold Leakage = 0.19694 W
|
||||
Gate Leakage = 0.0331746 W
|
||||
Runtime Dynamic = 15.8781 W
|
||||
|
||||
LoadQ:
|
||||
Area = 0.863734 mm^2
|
||||
Peak Dynamic = 0.191536 W
|
||||
Subthreshold Leakage = 0.00227213 W
|
||||
Gate Leakage = 0.000279753 W
|
||||
Runtime Dynamic = 0.047884 W
|
||||
|
||||
StoreQ:
|
||||
Area = 0.863734 mm^2
|
||||
Peak Dynamic = 0.191536 W
|
||||
Subthreshold Leakage = 0.00227213 W
|
||||
Gate Leakage = 0.000279753 W
|
||||
Runtime Dynamic = 0.0957681 W
|
||||
|
||||
Memory Management Unit:
|
||||
Area = 3.49533 mm^2
|
||||
Peak Dynamic = 1.34391 W
|
||||
Subthreshold Leakage = 0.0412098 W
|
||||
Gate Leakage = 0.00931467 W
|
||||
Runtime Dynamic = 2.25879 W
|
||||
|
||||
Itlb:
|
||||
Area = 1.12903 mm^2
|
||||
Peak Dynamic = 0.425717 W
|
||||
Subthreshold Leakage = 0.0152632 W
|
||||
Gate Leakage = 0.00308734 W
|
||||
Runtime Dynamic = 0.851444 W
|
||||
|
||||
Dtlb:
|
||||
Area = 2.24796 mm^2
|
||||
Peak Dynamic = 0.703668 W
|
||||
Subthreshold Leakage = 0.0197321 W
|
||||
Gate Leakage = 0.00422696 W
|
||||
Runtime Dynamic = 1.40735 W
|
||||
|
||||
Execution Unit:
|
||||
Area = 18.9802 mm^2
|
||||
Peak Dynamic = 6.91426 W
|
||||
Subthreshold Leakage = 1.01207 W
|
||||
Gate Leakage = 0.130415 W
|
||||
Runtime Dynamic = 5.51245 W
|
||||
|
||||
Register Files:
|
||||
Area = 4.63431 mm^2
|
||||
Peak Dynamic = 1.07973 W
|
||||
Subthreshold Leakage = 0.00557121 W
|
||||
Gate Leakage = 0.000534421 W
|
||||
Runtime Dynamic = 0.491409 W
|
||||
|
||||
Integer RF:
|
||||
Area = 3.11444 mm^2
|
||||
Peak Dynamic = 0.64479 W
|
||||
Subthreshold Leakage = 0.00348926 W
|
||||
Gate Leakage = 0.000338898 W
|
||||
Runtime Dynamic = 0.43963 W
|
||||
|
||||
Floating Point RF:
|
||||
Area = 1.51987 mm^2
|
||||
Peak Dynamic = 0.434944 W
|
||||
Subthreshold Leakage = 0.00208194 W
|
||||
Gate Leakage = 0.000195523 W
|
||||
Runtime Dynamic = 0.051779 W
|
||||
|
||||
Instruction Scheduler:
|
||||
Area = 2.2958 mm^2
|
||||
Peak Dynamic = 0.682653 W
|
||||
Subthreshold Leakage = 0.0043779 W
|
||||
Gate Leakage = 0.000496354 W
|
||||
Runtime Dynamic = 0.783433 W
|
||||
|
||||
Instruction Window:
|
||||
Area = 0.416485 mm^2
|
||||
Peak Dynamic = 0.230852 W
|
||||
Subthreshold Leakage = 0.001531 W
|
||||
Gate Leakage = 0.000214549 W
|
||||
Runtime Dynamic = 0.308242 W
|
||||
|
||||
FP Instruction Window:
|
||||
Area = 0.160067 mm^2
|
||||
Peak Dynamic = 0.0899719 W
|
||||
Subthreshold Leakage = 0.000573841 W
|
||||
Gate Leakage = 9.08104e-05 W
|
||||
Runtime Dynamic = 0.113361 W
|
||||
|
||||
ROB:
|
||||
Area = 1.71925 mm^2
|
||||
Peak Dynamic = 0.361829 W
|
||||
Subthreshold Leakage = 0.00227307 W
|
||||
Gate Leakage = 0.000190995 W
|
||||
Runtime Dynamic = 0.361829 W
|
||||
|
||||
Integer ALUs (Count: 4 ):
|
||||
Area = 2.56256 mm^2
|
||||
Peak Dynamic = 1.45952 W
|
||||
Subthreshold Leakage = 0.514377 W
|
||||
Gate Leakage = 0.0657924 W
|
||||
Runtime Dynamic = 1.12031 W
|
||||
|
||||
Floating Point Units (FPUs) (Count: 1 ):
|
||||
Area = 9.317 mm^2
|
||||
Peak Dynamic = 1.32571 W
|
||||
Subthreshold Leakage = 0.467545 W
|
||||
Gate Leakage = 0.0598023 W
|
||||
Runtime Dynamic = 1.32571 W
|
||||
|
||||
Results Broadcast Bus:
|
||||
Area Overhead = 0.0521609 mm^2
|
||||
Peak Dynamic = 2.15212 W
|
||||
Subthreshold Leakage = 0.0139887 W
|
||||
Gate Leakage = 0.00178925 W
|
||||
Runtime Dynamic = 1.79159 W
|
||||
|
||||
*****************************************************************************************
|
||||
L2
|
||||
Area = 62.2653 mm^2
|
||||
Peak Dynamic = 1.42987 W
|
||||
Subthreshold Leakage = 1.65481 W
|
||||
Gate Leakage = 0.00860545 W
|
||||
Runtime Dynamic = 2.73329 W
|
||||
|
||||
*****************************************************************************************
|
||||
Second Level Directory
|
||||
Area = 0.533824 mm^2
|
||||
Peak Dynamic = 0.275566 W
|
||||
Subthreshold Leakage = 0.00929753 W
|
||||
Gate Leakage = 0.00179126 W
|
||||
Runtime Dynamic = 0.193681 W
|
||||
|
||||
*****************************************************************************************
|
||||
Memory Controller:
|
||||
Area = 3.54463 mm^2
|
||||
Peak Dynamic = 1.65655 W
|
||||
Subthreshold Leakage = 0.0225194 W
|
||||
Gate Leakage = 0.00285586 W
|
||||
Runtime Dynamic = 0.573656 W
|
||||
|
||||
Front End Engine:
|
||||
Area = 1.72828 mm^2
|
||||
Peak Dynamic = 0.389588 W
|
||||
Subthreshold Leakage = 0.00246696 W
|
||||
Gate Leakage = 0.000291005 W
|
||||
Runtime Dynamic = 0.0911898 W
|
||||
|
||||
Transaction Engine:
|
||||
Area = 0.75308 mm^2
|
||||
Peak Dynamic = 1.13896 W
|
||||
Subthreshold Leakage = 0.00831402 W
|
||||
Gate Leakage = 0.00106342 W
|
||||
Runtime Dynamic = 0.341678 W
|
||||
|
||||
PHY:
|
||||
Area = 1.06326 mm^2
|
||||
Peak Dynamic = 0.128 W
|
||||
Subthreshold Leakage = 0.0117384 W
|
||||
Gate Leakage = 0.00150143 W
|
||||
Runtime Dynamic = 0.140788 W
|
||||
|
||||
*****************************************************************************************
|
||||
NOC
|
||||
Area = 8.77595 mm^2
|
||||
Peak Dynamic = 6.17873 W
|
||||
Subthreshold Leakage = 0.108357 W
|
||||
Gate Leakage = 0.0139259 W
|
||||
Runtime Dynamic = 0.963385 W
|
||||
|
||||
Router:
|
||||
Area = 8.3047 mm^2
|
||||
Peak Dynamic = 2.78895 W
|
||||
Subthreshold Leakage = 0.0606175 W
|
||||
Gate Leakage = 0.00781974 W
|
||||
Runtime Dynamic = 0.398421 W
|
||||
|
||||
Virtual Channel Buffer:
|
||||
Area = 4.2978 mm^2
|
||||
Peak Dynamic = 2.31409 W
|
||||
Subthreshold Leakage = 0.028002 W
|
||||
Gate Leakage = 0.00227471 W
|
||||
Runtime Dynamic = 0.330584 W
|
||||
|
||||
Crossbar:
|
||||
Area = 0.160538 mm^2
|
||||
Peak Dynamic = 0.437862 W
|
||||
Subthreshold Leakage = 0.0325996 W
|
||||
Gate Leakage = 0.00554292 W
|
||||
Runtime Dynamic = 0.0625517 W
|
||||
|
||||
Arbiter:
|
||||
Peak Dynamic = 0.0370018 W
|
||||
Subthreshold Leakage = 1.5858e-05 W
|
||||
Gate Leakage = 2.11117e-06 W
|
||||
Runtime Dynamic = 0.00528597 W
|
||||
|
||||
Per Router Links:
|
||||
Area = 0.471256 mm^2
|
||||
Peak Dynamic = 3.38978 W
|
||||
Subthreshold Leakage = 0.0477391 W
|
||||
Gate Leakage = 0.00610616 W
|
||||
Runtime Dynamic = 0.564963 W
|
||||
|
||||
*****************************************************************************************
|
315
ext/mcpat/results/Penryn
Normal file
315
ext/mcpat/results/Penryn
Normal file
|
@ -0,0 +1,315 @@
|
|||
McPAT (version 0.8 of Aug, 2010) is computing the target processor...
|
||||
|
||||
|
||||
McPAT (version 0.8 of Aug, 2010) results (current print level is 5)
|
||||
*****************************************************************************************
|
||||
Technology 45 nm
|
||||
Using Long Channel Devices When Appropriate
|
||||
Interconnect metal projection= aggressive interconnect technology projection
|
||||
Core clock Rate(MHz) 3700
|
||||
|
||||
*****************************************************************************************
|
||||
Processor:
|
||||
Area = 92.2661 mm^2
|
||||
Peak Power = 61.0228 W
|
||||
Total Leakage = 10.8609 W
|
||||
Peak Dynamic = 50.1619 W
|
||||
Subthreshold Leakage = 10.2773 W
|
||||
Gate Leakage = 0.583567 W
|
||||
Runtime Dynamic = 69.6347 W
|
||||
|
||||
Total Cores: 2 cores
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 48.2438 mm^2
|
||||
Peak Dynamic = 39.6676 W
|
||||
Subthreshold Leakage = 6.96165 W
|
||||
Gate Leakage = 0.541077 W
|
||||
Runtime Dynamic = 51.4987 W
|
||||
|
||||
Total L2s:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 43.1009 mm^2
|
||||
Peak Dynamic = 6.43272 W
|
||||
Subthreshold Leakage = 3.28049 W
|
||||
Gate Leakage = 0.0386655 W
|
||||
Runtime Dynamic = 13.716 W
|
||||
|
||||
Total NoCs (Network/Bus):
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 0.921404 mm^2
|
||||
Peak Dynamic = 4.06164 W
|
||||
Subthreshold Leakage = 0.035183 W
|
||||
Gate Leakage = 0.00382481 W
|
||||
Runtime Dynamic = 4.42002 W
|
||||
|
||||
*****************************************************************************************
|
||||
Core:
|
||||
Area = 24.1219 mm^2
|
||||
Peak Dynamic = 19.8338 W
|
||||
Subthreshold Leakage = 3.48083 W
|
||||
Gate Leakage = 0.270538 W
|
||||
Runtime Dynamic = 51.4987 W
|
||||
|
||||
Instruction Fetch Unit:
|
||||
Area = 3.13582 mm^2
|
||||
Peak Dynamic = 2.49774 W
|
||||
Subthreshold Leakage = 0.421089 W
|
||||
Gate Leakage = 0.0246791 W
|
||||
Runtime Dynamic = 2.42869 W
|
||||
|
||||
Instruction Cache:
|
||||
Area = 0.702441 mm^2
|
||||
Peak Dynamic = 0.419702 W
|
||||
Subthreshold Leakage = 0.0413175 W
|
||||
Gate Leakage = 0.00175164 W
|
||||
Runtime Dynamic = 0.487111 W
|
||||
|
||||
Branch Target Buffer:
|
||||
Area = 0.349484 mm^2
|
||||
Peak Dynamic = 0.0903353 W
|
||||
Subthreshold Leakage = 0.0243658 W
|
||||
Gate Leakage = 0.000966387 W
|
||||
Runtime Dynamic = 0.361341 W
|
||||
|
||||
Branch Predictor:
|
||||
Area = 0.153017 mm^2
|
||||
Peak Dynamic = 0.0718712 W
|
||||
Subthreshold Leakage = 0.0142615 W
|
||||
Gate Leakage = 0.000619154 W
|
||||
Runtime Dynamic = 0.0647272 W
|
||||
|
||||
Global Predictor:
|
||||
Area = 0.0475693 mm^2
|
||||
Peak Dynamic = 0.0231158 W
|
||||
Subthreshold Leakage = 0.00544747 W
|
||||
Gate Leakage = 0.000234591 W
|
||||
Runtime Dynamic = 0.0245764 W
|
||||
|
||||
Local Predictor:
|
||||
L1_Local Predictor:
|
||||
Area = 0.0239764 mm^2
|
||||
Peak Dynamic = 0.0142817 W
|
||||
Subthreshold Leakage = 0.00265926 W
|
||||
Gate Leakage = 0.00011608 W
|
||||
Runtime Dynamic = 0.0155731 W
|
||||
|
||||
L2_Local Predictor:
|
||||
Area = 0.012121 mm^2
|
||||
Peak Dynamic = 0.00767395 W
|
||||
Subthreshold Leakage = 0.00143248 W
|
||||
Gate Leakage = 6.77717e-05 W
|
||||
Runtime Dynamic = 0.00837399 W
|
||||
|
||||
Chooser:
|
||||
Area = 0.0475693 mm^2
|
||||
Peak Dynamic = 0.0231158 W
|
||||
Subthreshold Leakage = 0.00544747 W
|
||||
Gate Leakage = 0.000234591 W
|
||||
Runtime Dynamic = 0.0245764 W
|
||||
|
||||
RAS:
|
||||
Area = 0.0217815 mm^2
|
||||
Peak Dynamic = 0.0113578 W
|
||||
Subthreshold Leakage = 0.000707258 W
|
||||
Gate Leakage = 3.38921e-05 W
|
||||
Runtime Dynamic = 1.2459e-06 W
|
||||
|
||||
Instruction Buffer:
|
||||
Area = 0.0278406 mm^2
|
||||
Peak Dynamic = 0.282368 W
|
||||
Subthreshold Leakage = 0.000861686 W
|
||||
Gate Leakage = 3.91839e-05 W
|
||||
Runtime Dynamic = 0.188245 W
|
||||
|
||||
Instruction Decoder:
|
||||
Area = 1.85799 mm^2
|
||||
Peak Dynamic = 1.32726 W
|
||||
Subthreshold Leakage = 0.325606 W
|
||||
Gate Leakage = 0.0185411 W
|
||||
Runtime Dynamic = 1.32726 W
|
||||
|
||||
Renaming Unit:
|
||||
Area = 1.02517 mm^2
|
||||
Peak Dynamic = 2.25746 W
|
||||
Subthreshold Leakage = 0.042129 W
|
||||
Gate Leakage = 0.00480502 W
|
||||
Runtime Dynamic = 1.55315 W
|
||||
|
||||
Int Front End RAT:
|
||||
Area = 0.59725 mm^2
|
||||
Peak Dynamic = 1.25286 W
|
||||
Subthreshold Leakage = 0.0159587 W
|
||||
Gate Leakage = 0.00122436 W
|
||||
Runtime Dynamic = 1.11309 W
|
||||
|
||||
FP Front End RAT:
|
||||
Area = 0.350662 mm^2
|
||||
Peak Dynamic = 0.652971 W
|
||||
Subthreshold Leakage = 0.0110219 W
|
||||
Gate Leakage = 0.00079321 W
|
||||
Runtime Dynamic = 0.326485 W
|
||||
|
||||
Free List:
|
||||
Area = 0.0322035 mm^2
|
||||
Peak Dynamic = 0.0454309 W
|
||||
Subthreshold Leakage = 0.000471802 W
|
||||
Gate Leakage = 2.57995e-05 W
|
||||
Runtime Dynamic = 0.113577 W
|
||||
|
||||
Load Store Unit:
|
||||
Area = 7.24152 mm^2
|
||||
Peak Dynamic = 6.57278 W
|
||||
Subthreshold Leakage = 0.310798 W
|
||||
Gate Leakage = 0.0358085 W
|
||||
Runtime Dynamic = 34.9208 W
|
||||
|
||||
Data Cache:
|
||||
Area = 4.65034 mm^2
|
||||
Peak Dynamic = 5.03369 W
|
||||
Subthreshold Leakage = 0.237004 W
|
||||
Gate Leakage = 0.0253255 W
|
||||
Runtime Dynamic = 33.601 W
|
||||
|
||||
LoadQ:
|
||||
Area = 0.260806 mm^2
|
||||
Peak Dynamic = 0.132332 W
|
||||
Subthreshold Leakage = 0.00523814 W
|
||||
Gate Leakage = 0.000359005 W
|
||||
Runtime Dynamic = 0.0661662 W
|
||||
|
||||
StoreQ:
|
||||
Area = 1.06006 mm^2
|
||||
Peak Dynamic = 1.25365 W
|
||||
Subthreshold Leakage = 0.0538794 W
|
||||
Gate Leakage = 0.00736236 W
|
||||
Runtime Dynamic = 1.25365 W
|
||||
|
||||
Memory Management Unit:
|
||||
Area = 0.363299 mm^2
|
||||
Peak Dynamic = 0.610831 W
|
||||
Subthreshold Leakage = 0.0388017 W
|
||||
Gate Leakage = 0.00431691 W
|
||||
Runtime Dynamic = 1.29234 W
|
||||
|
||||
Itlb:
|
||||
Area = 0.0590462 mm^2
|
||||
Peak Dynamic = 0.116192 W
|
||||
Subthreshold Leakage = 0.00608044 W
|
||||
Gate Leakage = 0.000398475 W
|
||||
Runtime Dynamic = 0.232386 W
|
||||
|
||||
Dtlb:
|
||||
Area = 0.259199 mm^2
|
||||
Peak Dynamic = 0.264986 W
|
||||
Subthreshold Leakage = 0.0180446 W
|
||||
Gate Leakage = 0.00115678 W
|
||||
Runtime Dynamic = 1.05995 W
|
||||
|
||||
Execution Unit:
|
||||
Area = 7.9594 mm^2
|
||||
Peak Dynamic = 7.89497 W
|
||||
Subthreshold Leakage = 1.28761 W
|
||||
Gate Leakage = 0.0977152 W
|
||||
Runtime Dynamic = 11.3037 W
|
||||
|
||||
Register Files:
|
||||
Area = 0.528076 mm^2
|
||||
Peak Dynamic = 0.554172 W
|
||||
Subthreshold Leakage = 0.00459231 W
|
||||
Gate Leakage = 0.000305031 W
|
||||
Runtime Dynamic = 0.283985 W
|
||||
|
||||
Integer RF:
|
||||
Area = 0.336446 mm^2
|
||||
Peak Dynamic = 0.461344 W
|
||||
Subthreshold Leakage = 0.00257976 W
|
||||
Gate Leakage = 0.00018025 W
|
||||
Runtime Dynamic = 0.247149 W
|
||||
|
||||
Floating Point RF:
|
||||
Area = 0.19163 mm^2
|
||||
Peak Dynamic = 0.0928276 W
|
||||
Subthreshold Leakage = 0.00201255 W
|
||||
Gate Leakage = 0.000124781 W
|
||||
Runtime Dynamic = 0.0368364 W
|
||||
|
||||
Instruction Scheduler:
|
||||
Area = 1.97424 mm^2
|
||||
Peak Dynamic = 1.76421 W
|
||||
Subthreshold Leakage = 0.0212898 W
|
||||
Gate Leakage = 0.0014052 W
|
||||
Runtime Dynamic = 1.96388 W
|
||||
|
||||
Instruction Window:
|
||||
Area = 0.889691 mm^2
|
||||
Peak Dynamic = 0.468182 W
|
||||
Subthreshold Leakage = 0.0081033 W
|
||||
Gate Leakage = 0.000620258 W
|
||||
Runtime Dynamic = 0.601258 W
|
||||
|
||||
FP Instruction Window:
|
||||
Area = 0.347423 mm^2
|
||||
Peak Dynamic = 0.230453 W
|
||||
Subthreshold Leakage = 0.00381664 W
|
||||
Gate Leakage = 0.000293336 W
|
||||
Runtime Dynamic = 0.29704 W
|
||||
|
||||
ROB:
|
||||
Area = 0.737129 mm^2
|
||||
Peak Dynamic = 1.06558 W
|
||||
Subthreshold Leakage = 0.00936988 W
|
||||
Gate Leakage = 0.000491606 W
|
||||
Runtime Dynamic = 1.06558 W
|
||||
|
||||
Integer ALUs (Count: 6 ):
|
||||
Area = 0.47087 mm^2
|
||||
Peak Dynamic = 2.2206 W
|
||||
Subthreshold Leakage = 0.295671 W
|
||||
Gate Leakage = 0.0221076 W
|
||||
Runtime Dynamic = 1.14549 W
|
||||
|
||||
Floating Point Units (FPUs) (Count: 2 ):
|
||||
Area = 4.6585 mm^2
|
||||
Peak Dynamic = 0.708407 W
|
||||
Subthreshold Leakage = 0.731296 W
|
||||
Gate Leakage = 0.0546797 W
|
||||
Runtime Dynamic = 1.28625 W
|
||||
|
||||
Complex ALUs (Mul/Div) (Count: 1 ):
|
||||
Area = 0.235435 mm^2
|
||||
Peak Dynamic = 0.257249 W
|
||||
Subthreshold Leakage = 0.147835 W
|
||||
Gate Leakage = 0.0110538 W
|
||||
Runtime Dynamic = 1.57424 W
|
||||
|
||||
Results Broadcast Bus:
|
||||
Area Overhead = 0.0472187 mm^2
|
||||
Peak Dynamic = 2.08413 W
|
||||
Subthreshold Leakage = 0.0722513 W
|
||||
Gate Leakage = 0.00540229 W
|
||||
Runtime Dynamic = 5.04986 W
|
||||
|
||||
*****************************************************************************************
|
||||
L2
|
||||
Area = 43.1009 mm^2
|
||||
Peak Dynamic = 6.43272 W
|
||||
Subthreshold Leakage = 3.28049 W
|
||||
Gate Leakage = 0.0386655 W
|
||||
Runtime Dynamic = 13.716 W
|
||||
|
||||
*****************************************************************************************
|
||||
BUSES
|
||||
Area = 0.921404 mm^2
|
||||
Peak Dynamic = 4.06164 W
|
||||
Subthreshold Leakage = 0.035183 W
|
||||
Gate Leakage = 0.00382481 W
|
||||
Runtime Dynamic = 4.42002 W
|
||||
|
||||
Bus:
|
||||
Area = 0.921404 mm^2
|
||||
Peak Dynamic = 4.06164 W
|
||||
Subthreshold Leakage = 0.035183 W
|
||||
Gate Leakage = 0.00382481 W
|
||||
Runtime Dynamic = 4.42002 W
|
||||
|
||||
*****************************************************************************************
|
296
ext/mcpat/results/T1
Normal file
296
ext/mcpat/results/T1
Normal file
|
@ -0,0 +1,296 @@
|
|||
McPAT (version 0.8 of Aug, 2010) is computing the target processor...
|
||||
|
||||
|
||||
McPAT (version 0.8 of Aug, 2010) results (current print level is 5)
|
||||
*****************************************************************************************
|
||||
Technology 90 nm
|
||||
Using Long Channel Devices When Appropriate
|
||||
Interconnect metal projection= aggressive interconnect technology projection
|
||||
Core clock Rate(MHz) 1200
|
||||
|
||||
*****************************************************************************************
|
||||
Processor:
|
||||
Area = 283.287 mm^2
|
||||
Peak Power = 55.0318 W
|
||||
Total Leakage = 9.78078 W
|
||||
Peak Dynamic = 45.2511 W
|
||||
Subthreshold Leakage = 8.64906 W
|
||||
Gate Leakage = 1.13172 W
|
||||
Runtime Dynamic = 45.5013 W
|
||||
|
||||
Total Cores:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 117.887 mm^2
|
||||
Peak Dynamic = 28.1307 W
|
||||
Subthreshold Leakage = 5.19354 W
|
||||
Gate Leakage = 0.730037 W
|
||||
Runtime Dynamic = 18.917 W
|
||||
|
||||
Total L2s:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 116.308 mm^2
|
||||
Peak Dynamic = 5.51367 W
|
||||
Subthreshold Leakage = 2.41316 W
|
||||
Gate Leakage = 0.242513 W
|
||||
Runtime Dynamic = 4.00707 W
|
||||
|
||||
Total First Level Directory:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 8.77473 mm^2
|
||||
Peak Dynamic = 3.38588 W
|
||||
Subthreshold Leakage = 0.224524 W
|
||||
Gate Leakage = 0.0320801 W
|
||||
Runtime Dynamic = 15.1158 W
|
||||
|
||||
Total NoCs (Network/Bus):
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 8.87598 mm^2
|
||||
Peak Dynamic = 3.67515 W
|
||||
Subthreshold Leakage = 0.488892 W
|
||||
Gate Leakage = 0.0852308 W
|
||||
Runtime Dynamic = 2.20509 W
|
||||
|
||||
Total MCs:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 31.441 mm^2
|
||||
Peak Dynamic = 4.5457 W
|
||||
Subthreshold Leakage = 0.328953 W
|
||||
Gate Leakage = 0.0418558 W
|
||||
Runtime Dynamic = 5.25637 W
|
||||
|
||||
*****************************************************************************************
|
||||
Core:
|
||||
Area = 14.7359 mm^2
|
||||
Peak Dynamic = 3.51633 W
|
||||
Subthreshold Leakage = 0.649192 W
|
||||
Gate Leakage = 0.0912546 W
|
||||
Runtime Dynamic = 18.917 W
|
||||
|
||||
Instruction Fetch Unit:
|
||||
Area = 3.60967 mm^2
|
||||
Peak Dynamic = 0.560912 W
|
||||
Subthreshold Leakage = 0.0396492 W
|
||||
Gate Leakage = 0.00709504 W
|
||||
Runtime Dynamic = 3.76593 W
|
||||
|
||||
Instruction Cache:
|
||||
Area = 3.41818 mm^2
|
||||
Peak Dynamic = 0.308492 W
|
||||
Subthreshold Leakage = 0.0286475 W
|
||||
Gate Leakage = 0.00418329 W
|
||||
Runtime Dynamic = 0.95332 W
|
||||
|
||||
Instruction Buffer:
|
||||
Area = 0.0122742 mm^2
|
||||
Peak Dynamic = 0.0121268 W
|
||||
Subthreshold Leakage = 0.0002042 W
|
||||
Gate Leakage = 1.78658e-05 W
|
||||
Runtime Dynamic = 0.0970143 W
|
||||
|
||||
Instruction Decoder:
|
||||
Area = 0.0229327 mm^2
|
||||
Peak Dynamic = 0.169467 W
|
||||
Subthreshold Leakage = 0.00259055 W
|
||||
Gate Leakage = 0.000252139 W
|
||||
Runtime Dynamic = 1.35574 W
|
||||
|
||||
Load Store Unit:
|
||||
Area = 3.07616 mm^2
|
||||
Peak Dynamic = 0.390349 W
|
||||
Subthreshold Leakage = 0.0362126 W
|
||||
Gate Leakage = 0.00713432 W
|
||||
Runtime Dynamic = 3.85623 W
|
||||
|
||||
Data Cache:
|
||||
Area = 1.47986 mm^2
|
||||
Peak Dynamic = 0.191211 W
|
||||
Subthreshold Leakage = 0.0157454 W
|
||||
Gate Leakage = 0.00208738 W
|
||||
Runtime Dynamic = 0.443377 W
|
||||
|
||||
Load/Store Queue:
|
||||
Area = 1.17458 mm^2
|
||||
Peak Dynamic = 0.128312 W
|
||||
Subthreshold Leakage = 0.0122603 W
|
||||
Gate Leakage = 0.0024052 W
|
||||
Runtime Dynamic = 2.05299 W
|
||||
|
||||
Memory Management Unit:
|
||||
Area = 1.27751 mm^2
|
||||
Peak Dynamic = 0.324071 W
|
||||
Subthreshold Leakage = 0.0192968 W
|
||||
Gate Leakage = 0.0049902 W
|
||||
Runtime Dynamic = 2.53591 W
|
||||
|
||||
Itlb:
|
||||
Area = 0.560615 mm^2
|
||||
Peak Dynamic = 0.117604 W
|
||||
Subthreshold Leakage = 0.00554488 W
|
||||
Gate Leakage = 0.00117423 W
|
||||
Runtime Dynamic = 0.940838 W
|
||||
|
||||
Dtlb:
|
||||
Area = 0.560615 mm^2
|
||||
Peak Dynamic = 0.0294011 W
|
||||
Subthreshold Leakage = 0.00554488 W
|
||||
Gate Leakage = 0.00117423 W
|
||||
Runtime Dynamic = 0.235211 W
|
||||
|
||||
Execution Unit:
|
||||
Area = 3.47025 mm^2
|
||||
Peak Dynamic = 2.241 W
|
||||
Subthreshold Leakage = 0.222601 W
|
||||
Gate Leakage = 0.0296426 W
|
||||
Runtime Dynamic = 8.75894 W
|
||||
|
||||
Register Files:
|
||||
Area = 1.38355 mm^2
|
||||
Peak Dynamic = 0.0746572 W
|
||||
Subthreshold Leakage = 0.00827136 W
|
||||
Gate Leakage = 0.000628178 W
|
||||
Runtime Dynamic = 0.320633 W
|
||||
|
||||
Integer RF:
|
||||
Area = 0.592652 mm^2
|
||||
Peak Dynamic = 0.0582404 W
|
||||
Subthreshold Leakage = 0.00161128 W
|
||||
Gate Leakage = 0.000148771 W
|
||||
Runtime Dynamic = 0.312722 W
|
||||
|
||||
Floating Point RF:
|
||||
Area = 0.592652 mm^2
|
||||
Peak Dynamic = 0.0164168 W
|
||||
Subthreshold Leakage = 0.00161128 W
|
||||
Gate Leakage = 0.000148771 W
|
||||
Runtime Dynamic = 0.00783962 W
|
||||
|
||||
Register Windows:
|
||||
Area = 0.198243 mm^2
|
||||
Peak Dynamic = 0 W
|
||||
Subthreshold Leakage = 0.00504879 W
|
||||
Gate Leakage = 0.000330636 W
|
||||
Runtime Dynamic = 7.11291e-05 W
|
||||
|
||||
Instruction Scheduler:
|
||||
Area = 0.04377 mm^2
|
||||
Peak Dynamic = 0.0284368 W
|
||||
Subthreshold Leakage = 0.000336066 W
|
||||
Gate Leakage = 5.10703e-05 W
|
||||
Runtime Dynamic = 0.244528 W
|
||||
|
||||
Instruction Window:
|
||||
Area = 0.04377 mm^2
|
||||
Peak Dynamic = 0.0284368 W
|
||||
Subthreshold Leakage = 0.000336066 W
|
||||
Gate Leakage = 5.10703e-05 W
|
||||
Runtime Dynamic = 0.244528 W
|
||||
|
||||
Integer ALUs (Count: 1 ):
|
||||
Area = 0.16016 mm^2
|
||||
Peak Dynamic = 0.305285 W
|
||||
Subthreshold Leakage = 0.0321485 W
|
||||
Gate Leakage = 0.00411202 W
|
||||
Runtime Dynamic = 2.71365 W
|
||||
|
||||
Floating Point Units (FPUs) (Count: 0.125 ):
|
||||
Area = 1.16463 mm^2
|
||||
Peak Dynamic = 0.0508808 W
|
||||
Subthreshold Leakage = 0.0584431 W
|
||||
Gate Leakage = 0.00747528 W
|
||||
Runtime Dynamic = 0.101762 W
|
||||
|
||||
Complex ALUs (Mul/Div) (Count: 1 ):
|
||||
Area = 0.48048 mm^2
|
||||
Peak Dynamic = 0.339206 W
|
||||
Subthreshold Leakage = 0.0964456 W
|
||||
Gate Leakage = 0.0123361 W
|
||||
Runtime Dynamic = 0.678411 W
|
||||
|
||||
Results Broadcast Bus:
|
||||
Area Overhead = 0.0813807 mm^2
|
||||
Peak Dynamic = 1.18756 W
|
||||
Subthreshold Leakage = 0.0187498 W
|
||||
Gate Leakage = 0.00239823 W
|
||||
Runtime Dynamic = 3.3401 W
|
||||
|
||||
*****************************************************************************************
|
||||
L2
|
||||
Area = 29.0771 mm^2
|
||||
Peak Dynamic = 1.37842 W
|
||||
Subthreshold Leakage = 0.603289 W
|
||||
Gate Leakage = 0.0606283 W
|
||||
Runtime Dynamic = 4.00707 W
|
||||
|
||||
*****************************************************************************************
|
||||
First Level Directory
|
||||
Area = 2.19368 mm^2
|
||||
Peak Dynamic = 0.84647 W
|
||||
Subthreshold Leakage = 0.0561311 W
|
||||
Gate Leakage = 0.00802003 W
|
||||
Runtime Dynamic = 15.1158 W
|
||||
|
||||
*****************************************************************************************
|
||||
Memory Controller:
|
||||
Area = 7.86025 mm^2
|
||||
Peak Dynamic = 1.13642 W
|
||||
Subthreshold Leakage = 0.0822383 W
|
||||
Gate Leakage = 0.0104639 W
|
||||
Runtime Dynamic = 5.25637 W
|
||||
|
||||
Front End Engine:
|
||||
Area = 0.63078 mm^2
|
||||
Peak Dynamic = 0.0549429 W
|
||||
Subthreshold Leakage = 0.00242476 W
|
||||
Gate Leakage = 0.00025524 W
|
||||
Runtime Dynamic = 0.241753 W
|
||||
|
||||
Transaction Engine:
|
||||
Area = 2.59502 mm^2
|
||||
Peak Dynamic = 0.569482 W
|
||||
Subthreshold Leakage = 0.0286491 W
|
||||
Gate Leakage = 0.00366442 W
|
||||
Runtime Dynamic = 2.50577 W
|
||||
|
||||
PHY:
|
||||
Area = 4.63445 mm^2
|
||||
Peak Dynamic = 0.512 W
|
||||
Subthreshold Leakage = 0.0511644 W
|
||||
Gate Leakage = 0.00654429 W
|
||||
Runtime Dynamic = 2.50885 W
|
||||
|
||||
*****************************************************************************************
|
||||
NOC
|
||||
Area = 8.87598 mm^2
|
||||
Peak Dynamic = 3.67515 W
|
||||
Subthreshold Leakage = 0.488892 W
|
||||
Gate Leakage = 0.0852308 W
|
||||
Runtime Dynamic = 2.20509 W
|
||||
|
||||
Router:
|
||||
Area = 4.43799 mm^2
|
||||
Peak Dynamic = 1.83757 W
|
||||
Subthreshold Leakage = 0.244446 W
|
||||
Gate Leakage = 0.0426154 W
|
||||
Runtime Dynamic = 2.20509 W
|
||||
|
||||
Virtual Channel Buffer:
|
||||
Area = 1.22928 mm^2
|
||||
Peak Dynamic = 0.0508654 W
|
||||
Subthreshold Leakage = 0.000485491 W
|
||||
Gate Leakage = 7.24213e-05 W
|
||||
Runtime Dynamic = 0.0610385 W
|
||||
|
||||
Crossbar:
|
||||
Area = 1.35717 mm^2
|
||||
Peak Dynamic = 1.77185 W
|
||||
Subthreshold Leakage = 0.243949 W
|
||||
Gate Leakage = 0.0425414 W
|
||||
Runtime Dynamic = 2.12622 W
|
||||
|
||||
Arbiter:
|
||||
Peak Dynamic = 0.0148566 W
|
||||
Subthreshold Leakage = 1.15783e-05 W
|
||||
Gate Leakage = 1.54103e-06 W
|
||||
Runtime Dynamic = 0.0178279 W
|
||||
|
||||
*****************************************************************************************
|
270
ext/mcpat/results/T1_DC_64
Normal file
270
ext/mcpat/results/T1_DC_64
Normal file
|
@ -0,0 +1,270 @@
|
|||
McPAT (version 0.8 of Aug, 2010) is computing the target processor...
|
||||
|
||||
line64
|
||||
size1.04858e+06
|
||||
line9
|
||||
size1.04858e+06
|
||||
|
||||
McPAT (version 0.8 of Aug, 2010) results (current print level is 5)
|
||||
*****************************************************************************************
|
||||
Technology 22 nm
|
||||
Using Long Channel Devices When Appropriate
|
||||
Interconnect metal projection= aggressive interconnect technology projection
|
||||
Core clock Rate(MHz) 3500
|
||||
|
||||
*****************************************************************************************
|
||||
Processor:
|
||||
Area = 322.362 mm^2
|
||||
Peak Power = 112.557 W
|
||||
Total Leakage = 28.0714 W
|
||||
Peak Dynamic = 84.4853 W
|
||||
Subthreshold Leakage = 27.7571 W
|
||||
Gate Leakage = 0.314289 W
|
||||
Runtime Dynamic = 13.4278 W
|
||||
|
||||
Total Cores: 64 cores
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 87.1986 mm^2
|
||||
Peak Dynamic = 42.426 W
|
||||
Subthreshold Leakage = 7.80232 W
|
||||
Gate Leakage = 0.0799149 W
|
||||
Runtime Dynamic = 9.61388 W
|
||||
|
||||
Total L2s:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 161.532 mm^2
|
||||
Peak Dynamic = 21.1059 W
|
||||
Subthreshold Leakage = 8.9583 W
|
||||
Gate Leakage = 0.100733 W
|
||||
Runtime Dynamic = 1.14063 W
|
||||
|
||||
Total First Level Directory:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 22.1741 mm^2
|
||||
Peak Dynamic = 0.831407 W
|
||||
Subthreshold Leakage = 1.57123 W
|
||||
Gate Leakage = 0.0148674 W
|
||||
Runtime Dynamic = 0.175856 W
|
||||
|
||||
Total NoCs (Network/Bus):
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 51.4571 mm^2
|
||||
Peak Dynamic = 20.122 W
|
||||
Subthreshold Leakage = 9.42527 W
|
||||
Gate Leakage = 0.118774 W
|
||||
Runtime Dynamic = 2.49747 W
|
||||
|
||||
*****************************************************************************************
|
||||
Core:
|
||||
Area = 1.36248 mm^2
|
||||
Peak Dynamic = 0.662906 W
|
||||
Subthreshold Leakage = 0.121911 W
|
||||
Gate Leakage = 0.00124867 W
|
||||
Runtime Dynamic = 9.61388 W
|
||||
|
||||
Instruction Fetch Unit:
|
||||
Area = 0.140786 mm^2
|
||||
Peak Dynamic = 0.0863256 W
|
||||
Subthreshold Leakage = 0.00636762 W
|
||||
Gate Leakage = 7.4998e-05 W
|
||||
Runtime Dynamic = 2.08883 W
|
||||
|
||||
Instruction Cache:
|
||||
Area = 0.129377 mm^2
|
||||
Peak Dynamic = 0.0476007 W
|
||||
Subthreshold Leakage = 0.00381804 W
|
||||
Gate Leakage = 2.35266e-05 W
|
||||
Runtime Dynamic = 0.0698158 W
|
||||
|
||||
Instruction Buffer:
|
||||
Area = 0.000754971 mm^2
|
||||
Peak Dynamic = 0.00238165 W
|
||||
Subthreshold Leakage = 4.99334e-05 W
|
||||
Gate Leakage = 3.27157e-07 W
|
||||
Runtime Dynamic = 0.0190532 W
|
||||
|
||||
Instruction Decoder:
|
||||
Area = 0.00131543 mm^2
|
||||
Peak Dynamic = 0.0246042 W
|
||||
Subthreshold Leakage = 0.000538954 W
|
||||
Gate Leakage = 3.91915e-06 W
|
||||
Runtime Dynamic = 0.196833 W
|
||||
|
||||
Load Store Unit:
|
||||
Area = 0.0977414 mm^2
|
||||
Peak Dynamic = 0.0587123 W
|
||||
Subthreshold Leakage = 0.00580883 W
|
||||
Gate Leakage = 7.48788e-05 W
|
||||
Runtime Dynamic = 2.07447 W
|
||||
|
||||
Data Cache:
|
||||
Area = 0.0569223 mm^2
|
||||
Peak Dynamic = 0.0329939 W
|
||||
Subthreshold Leakage = 0.00249221 W
|
||||
Gate Leakage = 1.63814e-05 W
|
||||
Runtime Dynamic = 0.0476753 W
|
||||
|
||||
Load/Store Queue:
|
||||
Area = 0.023444 mm^2
|
||||
Peak Dynamic = 0.0139792 W
|
||||
Subthreshold Leakage = 0.00135593 W
|
||||
Gate Leakage = 1.12722e-05 W
|
||||
Runtime Dynamic = 0.223667 W
|
||||
|
||||
Memory Management Unit:
|
||||
Area = 0.0313997 mm^2
|
||||
Peak Dynamic = 0.0446647 W
|
||||
Subthreshold Leakage = 0.0029577 W
|
||||
Gate Leakage = 5.57335e-05 W
|
||||
Runtime Dynamic = 1.92566 W
|
||||
|
||||
Itlb:
|
||||
Area = 0.0110306 mm^2
|
||||
Peak Dynamic = 0.0122535 W
|
||||
Subthreshold Leakage = 0.000498504 W
|
||||
Gate Leakage = 4.25417e-06 W
|
||||
Runtime Dynamic = 0.0980282 W
|
||||
|
||||
Dtlb:
|
||||
Area = 0.0110306 mm^2
|
||||
Peak Dynamic = 0.00306337 W
|
||||
Subthreshold Leakage = 0.000498504 W
|
||||
Gate Leakage = 4.25417e-06 W
|
||||
Runtime Dynamic = 0.0245072 W
|
||||
|
||||
Execution Unit:
|
||||
Area = 0.299667 mm^2
|
||||
Peak Dynamic = 0.473204 W
|
||||
Subthreshold Leakage = 0.0379242 W
|
||||
Gate Leakage = 0.000384077 W
|
||||
Runtime Dynamic = 3.52491 W
|
||||
|
||||
Register Files:
|
||||
Area = 0.0598365 mm^2
|
||||
Peak Dynamic = 0.0168768 W
|
||||
Subthreshold Leakage = 0.0020814 W
|
||||
Gate Leakage = 1.24237e-05 W
|
||||
Runtime Dynamic = 0.072481 W
|
||||
|
||||
Integer RF:
|
||||
Area = 0.0240072 mm^2
|
||||
Peak Dynamic = 0.0131657 W
|
||||
Subthreshold Leakage = 0.000449165 W
|
||||
Gate Leakage = 3.33111e-06 W
|
||||
Runtime Dynamic = 0.0706931 W
|
||||
|
||||
Floating Point RF:
|
||||
Area = 0.0240072 mm^2
|
||||
Peak Dynamic = 0.00371113 W
|
||||
Subthreshold Leakage = 0.000449165 W
|
||||
Gate Leakage = 3.33111e-06 W
|
||||
Runtime Dynamic = 0.0017722 W
|
||||
|
||||
Register Windows:
|
||||
Area = 0.0118221 mm^2
|
||||
Peak Dynamic = 0 W
|
||||
Subthreshold Leakage = 0.00118307 W
|
||||
Gate Leakage = 5.76149e-06 W
|
||||
Runtime Dynamic = 1.56951e-05 W
|
||||
|
||||
Instruction Scheduler:
|
||||
Area = 0.00263062 mm^2
|
||||
Peak Dynamic = 0.00540689 W
|
||||
Subthreshold Leakage = 8.27524e-05 W
|
||||
Gate Leakage = 9.38261e-07 W
|
||||
Runtime Dynamic = 0.0464411 W
|
||||
|
||||
Instruction Window:
|
||||
Area = 0.00263062 mm^2
|
||||
Peak Dynamic = 0.00540689 W
|
||||
Subthreshold Leakage = 8.27524e-05 W
|
||||
Gate Leakage = 9.38261e-07 W
|
||||
Runtime Dynamic = 0.0464411 W
|
||||
|
||||
Integer ALUs (Count: 1 ):
|
||||
Area = 0.0384544 mm^2
|
||||
Peak Dynamic = 0.0946992 W
|
||||
Subthreshold Leakage = 0.00667865 W
|
||||
Gate Leakage = 6.39207e-05 W
|
||||
Runtime Dynamic = 0.841771 W
|
||||
|
||||
Floating Point Units (FPUs) (Count: 0.125 ):
|
||||
Area = 0.0695899 mm^2
|
||||
Peak Dynamic = 0.0157832 W
|
||||
Subthreshold Leakage = 0.00302155 W
|
||||
Gate Leakage = 2.89189e-05 W
|
||||
Runtime Dynamic = 0.0315664 W
|
||||
|
||||
Complex ALUs (Mul/Div) (Count: 1 ):
|
||||
Area = 0.115363 mm^2
|
||||
Peak Dynamic = 0.105221 W
|
||||
Subthreshold Leakage = 0.020036 W
|
||||
Gate Leakage = 0.000191762 W
|
||||
Runtime Dynamic = 0.210443 W
|
||||
|
||||
Results Broadcast Bus:
|
||||
Area Overhead = 0.00445381 mm^2
|
||||
Peak Dynamic = 0.192955 W
|
||||
Subthreshold Leakage = 0.00406321 W
|
||||
Gate Leakage = 3.88886e-05 W
|
||||
Runtime Dynamic = 0.519078 W
|
||||
|
||||
*****************************************************************************************
|
||||
L2
|
||||
Area = 2.52394 mm^2
|
||||
Peak Dynamic = 0.32978 W
|
||||
Subthreshold Leakage = 0.139973 W
|
||||
Gate Leakage = 0.00157395 W
|
||||
Runtime Dynamic = 1.14063 W
|
||||
|
||||
*****************************************************************************************
|
||||
Second Level Directory
|
||||
Area = 2.77176 mm^2
|
||||
Peak Dynamic = 0.103926 W
|
||||
Subthreshold Leakage = 0.196403 W
|
||||
Gate Leakage = 0.00185842 W
|
||||
Runtime Dynamic = 0.175856 W
|
||||
|
||||
*****************************************************************************************
|
||||
NOC
|
||||
Area = 51.4571 mm^2
|
||||
Peak Dynamic = 20.122 W
|
||||
Subthreshold Leakage = 9.42527 W
|
||||
Gate Leakage = 0.118774 W
|
||||
Runtime Dynamic = 2.49747 W
|
||||
|
||||
Router:
|
||||
Area = 0.578434 mm^2
|
||||
Peak Dynamic = 0.184548 W
|
||||
Subthreshold Leakage = 0.125515 W
|
||||
Gate Leakage = 0.0016409 W
|
||||
Runtime Dynamic = 1.32875 W
|
||||
|
||||
Virtual Channel Buffer:
|
||||
Area = 0.159162 mm^2
|
||||
Peak Dynamic = 0.00394081 W
|
||||
Subthreshold Leakage = 0.000194478 W
|
||||
Gate Leakage = 1.84946e-06 W
|
||||
Runtime Dynamic = 0.0283738 W
|
||||
|
||||
Crossbar:
|
||||
Area = 0.160976 mm^2
|
||||
Peak Dynamic = 0.179891 W
|
||||
Subthreshold Leakage = 0.12532 W
|
||||
Gate Leakage = 0.00163905 W
|
||||
Runtime Dynamic = 1.29522 W
|
||||
|
||||
Arbiter:
|
||||
Peak Dynamic = 0.000716053 W
|
||||
Subthreshold Leakage = 3.67148e-07 W
|
||||
Gate Leakage = 3.86991e-09 W
|
||||
Runtime Dynamic = 0.00515558 W
|
||||
|
||||
Per Router Links:
|
||||
Area = 0.225583 mm^2
|
||||
Peak Dynamic = 0.129858 W
|
||||
Subthreshold Leakage = 0.0217549 W
|
||||
Gate Leakage = 0.000214933 W
|
||||
Runtime Dynamic = 1.16872 W
|
||||
|
||||
*****************************************************************************************
|
252
ext/mcpat/results/T1_SBT_64
Normal file
252
ext/mcpat/results/T1_SBT_64
Normal file
|
@ -0,0 +1,252 @@
|
|||
McPAT (version 0.8 of Aug, 2010) is computing the target processor...
|
||||
|
||||
line72
|
||||
size1.17965e+06
|
||||
|
||||
McPAT (version 0.8 of Aug, 2010) results (current print level is 5)
|
||||
*****************************************************************************************
|
||||
Technology 22 nm
|
||||
Using Long Channel Devices When Appropriate
|
||||
Interconnect metal projection= aggressive interconnect technology projection
|
||||
Core clock Rate(MHz) 3500
|
||||
|
||||
*****************************************************************************************
|
||||
Processor:
|
||||
Area = 321.412 mm^2
|
||||
Peak Power = 114.076 W
|
||||
Total Leakage = 27.4353 W
|
||||
Peak Dynamic = 86.6406 W
|
||||
Subthreshold Leakage = 27.1256 W
|
||||
Gate Leakage = 0.309772 W
|
||||
Runtime Dynamic = 13.4064 W
|
||||
|
||||
Total Cores: 64 cores
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 87.1986 mm^2
|
||||
Peak Dynamic = 42.426 W
|
||||
Subthreshold Leakage = 7.80232 W
|
||||
Gate Leakage = 0.0799149 W
|
||||
Runtime Dynamic = 9.61388 W
|
||||
|
||||
Total L2s:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 182.778 mm^2
|
||||
Peak Dynamic = 24.1051 W
|
||||
Subthreshold Leakage = 9.90006 W
|
||||
Gate Leakage = 0.111104 W
|
||||
Runtime Dynamic = 1.29686 W
|
||||
|
||||
Total NoCs (Network/Bus):
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 51.4353 mm^2
|
||||
Peak Dynamic = 20.1095 W
|
||||
Subthreshold Leakage = 9.42317 W
|
||||
Gate Leakage = 0.118753 W
|
||||
Runtime Dynamic = 2.4957 W
|
||||
|
||||
*****************************************************************************************
|
||||
Core:
|
||||
Area = 1.36248 mm^2
|
||||
Peak Dynamic = 0.662906 W
|
||||
Subthreshold Leakage = 0.121911 W
|
||||
Gate Leakage = 0.00124867 W
|
||||
Runtime Dynamic = 9.61388 W
|
||||
|
||||
Instruction Fetch Unit:
|
||||
Area = 0.140786 mm^2
|
||||
Peak Dynamic = 0.0863256 W
|
||||
Subthreshold Leakage = 0.00636762 W
|
||||
Gate Leakage = 7.4998e-05 W
|
||||
Runtime Dynamic = 2.08883 W
|
||||
|
||||
Instruction Cache:
|
||||
Area = 0.129377 mm^2
|
||||
Peak Dynamic = 0.0476007 W
|
||||
Subthreshold Leakage = 0.00381804 W
|
||||
Gate Leakage = 2.35266e-05 W
|
||||
Runtime Dynamic = 0.0698158 W
|
||||
|
||||
Instruction Buffer:
|
||||
Area = 0.000754971 mm^2
|
||||
Peak Dynamic = 0.00238165 W
|
||||
Subthreshold Leakage = 4.99334e-05 W
|
||||
Gate Leakage = 3.27157e-07 W
|
||||
Runtime Dynamic = 0.0190532 W
|
||||
|
||||
Instruction Decoder:
|
||||
Area = 0.00131543 mm^2
|
||||
Peak Dynamic = 0.0246042 W
|
||||
Subthreshold Leakage = 0.000538954 W
|
||||
Gate Leakage = 3.91915e-06 W
|
||||
Runtime Dynamic = 0.196833 W
|
||||
|
||||
Load Store Unit:
|
||||
Area = 0.0977414 mm^2
|
||||
Peak Dynamic = 0.0587123 W
|
||||
Subthreshold Leakage = 0.00580883 W
|
||||
Gate Leakage = 7.48788e-05 W
|
||||
Runtime Dynamic = 2.07447 W
|
||||
|
||||
Data Cache:
|
||||
Area = 0.0569223 mm^2
|
||||
Peak Dynamic = 0.0329939 W
|
||||
Subthreshold Leakage = 0.00249221 W
|
||||
Gate Leakage = 1.63814e-05 W
|
||||
Runtime Dynamic = 0.0476753 W
|
||||
|
||||
Load/Store Queue:
|
||||
Area = 0.023444 mm^2
|
||||
Peak Dynamic = 0.0139792 W
|
||||
Subthreshold Leakage = 0.00135593 W
|
||||
Gate Leakage = 1.12722e-05 W
|
||||
Runtime Dynamic = 0.223667 W
|
||||
|
||||
Memory Management Unit:
|
||||
Area = 0.0313997 mm^2
|
||||
Peak Dynamic = 0.0446647 W
|
||||
Subthreshold Leakage = 0.0029577 W
|
||||
Gate Leakage = 5.57335e-05 W
|
||||
Runtime Dynamic = 1.92566 W
|
||||
|
||||
Itlb:
|
||||
Area = 0.0110306 mm^2
|
||||
Peak Dynamic = 0.0122535 W
|
||||
Subthreshold Leakage = 0.000498504 W
|
||||
Gate Leakage = 4.25417e-06 W
|
||||
Runtime Dynamic = 0.0980282 W
|
||||
|
||||
Dtlb:
|
||||
Area = 0.0110306 mm^2
|
||||
Peak Dynamic = 0.00306337 W
|
||||
Subthreshold Leakage = 0.000498504 W
|
||||
Gate Leakage = 4.25417e-06 W
|
||||
Runtime Dynamic = 0.0245072 W
|
||||
|
||||
Execution Unit:
|
||||
Area = 0.299667 mm^2
|
||||
Peak Dynamic = 0.473204 W
|
||||
Subthreshold Leakage = 0.0379242 W
|
||||
Gate Leakage = 0.000384077 W
|
||||
Runtime Dynamic = 3.52491 W
|
||||
|
||||
Register Files:
|
||||
Area = 0.0598365 mm^2
|
||||
Peak Dynamic = 0.0168768 W
|
||||
Subthreshold Leakage = 0.0020814 W
|
||||
Gate Leakage = 1.24237e-05 W
|
||||
Runtime Dynamic = 0.072481 W
|
||||
|
||||
Integer RF:
|
||||
Area = 0.0240072 mm^2
|
||||
Peak Dynamic = 0.0131657 W
|
||||
Subthreshold Leakage = 0.000449165 W
|
||||
Gate Leakage = 3.33111e-06 W
|
||||
Runtime Dynamic = 0.0706931 W
|
||||
|
||||
Floating Point RF:
|
||||
Area = 0.0240072 mm^2
|
||||
Peak Dynamic = 0.00371113 W
|
||||
Subthreshold Leakage = 0.000449165 W
|
||||
Gate Leakage = 3.33111e-06 W
|
||||
Runtime Dynamic = 0.0017722 W
|
||||
|
||||
Register Windows:
|
||||
Area = 0.0118221 mm^2
|
||||
Peak Dynamic = 0 W
|
||||
Subthreshold Leakage = 0.00118307 W
|
||||
Gate Leakage = 5.76149e-06 W
|
||||
Runtime Dynamic = 1.56951e-05 W
|
||||
|
||||
Instruction Scheduler:
|
||||
Area = 0.00263062 mm^2
|
||||
Peak Dynamic = 0.00540689 W
|
||||
Subthreshold Leakage = 8.27524e-05 W
|
||||
Gate Leakage = 9.38261e-07 W
|
||||
Runtime Dynamic = 0.0464411 W
|
||||
|
||||
Instruction Window:
|
||||
Area = 0.00263062 mm^2
|
||||
Peak Dynamic = 0.00540689 W
|
||||
Subthreshold Leakage = 8.27524e-05 W
|
||||
Gate Leakage = 9.38261e-07 W
|
||||
Runtime Dynamic = 0.0464411 W
|
||||
|
||||
Integer ALUs (Count: 1 ):
|
||||
Area = 0.0384544 mm^2
|
||||
Peak Dynamic = 0.0946992 W
|
||||
Subthreshold Leakage = 0.00667865 W
|
||||
Gate Leakage = 6.39207e-05 W
|
||||
Runtime Dynamic = 0.841771 W
|
||||
|
||||
Floating Point Units (FPUs) (Count: 0.125 ):
|
||||
Area = 0.0695899 mm^2
|
||||
Peak Dynamic = 0.0157832 W
|
||||
Subthreshold Leakage = 0.00302155 W
|
||||
Gate Leakage = 2.89189e-05 W
|
||||
Runtime Dynamic = 0.0315664 W
|
||||
|
||||
Complex ALUs (Mul/Div) (Count: 1 ):
|
||||
Area = 0.115363 mm^2
|
||||
Peak Dynamic = 0.105221 W
|
||||
Subthreshold Leakage = 0.020036 W
|
||||
Gate Leakage = 0.000191762 W
|
||||
Runtime Dynamic = 0.210443 W
|
||||
|
||||
Results Broadcast Bus:
|
||||
Area Overhead = 0.00445381 mm^2
|
||||
Peak Dynamic = 0.192955 W
|
||||
Subthreshold Leakage = 0.00406321 W
|
||||
Gate Leakage = 3.88886e-05 W
|
||||
Runtime Dynamic = 0.519078 W
|
||||
|
||||
*****************************************************************************************
|
||||
L2
|
||||
Area = 2.85591 mm^2
|
||||
Peak Dynamic = 0.376642 W
|
||||
Subthreshold Leakage = 0.154688 W
|
||||
Gate Leakage = 0.001736 W
|
||||
Runtime Dynamic = 1.29686 W
|
||||
|
||||
*****************************************************************************************
|
||||
NOC
|
||||
Area = 51.4353 mm^2
|
||||
Peak Dynamic = 20.1095 W
|
||||
Subthreshold Leakage = 9.42317 W
|
||||
Gate Leakage = 0.118753 W
|
||||
Runtime Dynamic = 2.4957 W
|
||||
|
||||
Router:
|
||||
Area = 0.578434 mm^2
|
||||
Peak Dynamic = 0.184548 W
|
||||
Subthreshold Leakage = 0.125515 W
|
||||
Gate Leakage = 0.0016409 W
|
||||
Runtime Dynamic = 1.32875 W
|
||||
|
||||
Virtual Channel Buffer:
|
||||
Area = 0.159162 mm^2
|
||||
Peak Dynamic = 0.00394081 W
|
||||
Subthreshold Leakage = 0.000194478 W
|
||||
Gate Leakage = 1.84946e-06 W
|
||||
Runtime Dynamic = 0.0283738 W
|
||||
|
||||
Crossbar:
|
||||
Area = 0.160976 mm^2
|
||||
Peak Dynamic = 0.179891 W
|
||||
Subthreshold Leakage = 0.12532 W
|
||||
Gate Leakage = 0.00163905 W
|
||||
Runtime Dynamic = 1.29522 W
|
||||
|
||||
Arbiter:
|
||||
Peak Dynamic = 0.000716053 W
|
||||
Subthreshold Leakage = 3.67148e-07 W
|
||||
Gate Leakage = 3.86991e-09 W
|
||||
Runtime Dynamic = 0.00515558 W
|
||||
|
||||
Per Router Links:
|
||||
Area = 0.225243 mm^2
|
||||
Peak Dynamic = 0.129662 W
|
||||
Subthreshold Leakage = 0.0217221 W
|
||||
Gate Leakage = 0.000214609 W
|
||||
Runtime Dynamic = 1.16696 W
|
||||
|
||||
*****************************************************************************************
|
270
ext/mcpat/results/T1_ST_64
Normal file
270
ext/mcpat/results/T1_ST_64
Normal file
|
@ -0,0 +1,270 @@
|
|||
McPAT (version 0.8 of Aug, 2010) is computing the target processor...
|
||||
|
||||
line64
|
||||
size1.04858e+06
|
||||
line9
|
||||
size8.38861e+06
|
||||
|
||||
McPAT (version 0.8 of Aug, 2010) results (current print level is 5)
|
||||
*****************************************************************************************
|
||||
Technology 22 nm
|
||||
Using Long Channel Devices When Appropriate
|
||||
Interconnect metal projection= aggressive interconnect technology projection
|
||||
Core clock Rate(MHz) 3500
|
||||
|
||||
*****************************************************************************************
|
||||
Processor:
|
||||
Area = 358.016 mm^2
|
||||
Peak Power = 168.519 W
|
||||
Total Leakage = 30.8855 W
|
||||
Peak Dynamic = 137.634 W
|
||||
Subthreshold Leakage = 30.5351 W
|
||||
Gate Leakage = 0.350385 W
|
||||
Runtime Dynamic = 84.2366 W
|
||||
|
||||
Total Cores: 64 cores
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 87.1986 mm^2
|
||||
Peak Dynamic = 42.426 W
|
||||
Subthreshold Leakage = 7.80232 W
|
||||
Gate Leakage = 0.0799149 W
|
||||
Runtime Dynamic = 9.61388 W
|
||||
|
||||
Total L2s:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 161.532 mm^2
|
||||
Peak Dynamic = 21.1059 W
|
||||
Subthreshold Leakage = 8.9583 W
|
||||
Gate Leakage = 0.100733 W
|
||||
Runtime Dynamic = 1.14063 W
|
||||
|
||||
Total First Level Directory:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 57.033 mm^2
|
||||
Peak Dynamic = 53.5219 W
|
||||
Subthreshold Leakage = 4.27249 W
|
||||
Gate Leakage = 0.050206 W
|
||||
Runtime Dynamic = 70.9203 W
|
||||
|
||||
Total NoCs (Network/Bus):
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 52.2524 mm^2
|
||||
Peak Dynamic = 20.5798 W
|
||||
Subthreshold Leakage = 9.50197 W
|
||||
Gate Leakage = 0.119531 W
|
||||
Runtime Dynamic = 2.56185 W
|
||||
|
||||
*****************************************************************************************
|
||||
Core:
|
||||
Area = 1.36248 mm^2
|
||||
Peak Dynamic = 0.662906 W
|
||||
Subthreshold Leakage = 0.121911 W
|
||||
Gate Leakage = 0.00124867 W
|
||||
Runtime Dynamic = 9.61388 W
|
||||
|
||||
Instruction Fetch Unit:
|
||||
Area = 0.140786 mm^2
|
||||
Peak Dynamic = 0.0863256 W
|
||||
Subthreshold Leakage = 0.00636762 W
|
||||
Gate Leakage = 7.4998e-05 W
|
||||
Runtime Dynamic = 2.08883 W
|
||||
|
||||
Instruction Cache:
|
||||
Area = 0.129377 mm^2
|
||||
Peak Dynamic = 0.0476007 W
|
||||
Subthreshold Leakage = 0.00381804 W
|
||||
Gate Leakage = 2.35266e-05 W
|
||||
Runtime Dynamic = 0.0698158 W
|
||||
|
||||
Instruction Buffer:
|
||||
Area = 0.000754971 mm^2
|
||||
Peak Dynamic = 0.00238165 W
|
||||
Subthreshold Leakage = 4.99334e-05 W
|
||||
Gate Leakage = 3.27157e-07 W
|
||||
Runtime Dynamic = 0.0190532 W
|
||||
|
||||
Instruction Decoder:
|
||||
Area = 0.00131543 mm^2
|
||||
Peak Dynamic = 0.0246042 W
|
||||
Subthreshold Leakage = 0.000538954 W
|
||||
Gate Leakage = 3.91915e-06 W
|
||||
Runtime Dynamic = 0.196833 W
|
||||
|
||||
Load Store Unit:
|
||||
Area = 0.0977414 mm^2
|
||||
Peak Dynamic = 0.0587123 W
|
||||
Subthreshold Leakage = 0.00580883 W
|
||||
Gate Leakage = 7.48788e-05 W
|
||||
Runtime Dynamic = 2.07447 W
|
||||
|
||||
Data Cache:
|
||||
Area = 0.0569223 mm^2
|
||||
Peak Dynamic = 0.0329939 W
|
||||
Subthreshold Leakage = 0.00249221 W
|
||||
Gate Leakage = 1.63814e-05 W
|
||||
Runtime Dynamic = 0.0476753 W
|
||||
|
||||
Load/Store Queue:
|
||||
Area = 0.023444 mm^2
|
||||
Peak Dynamic = 0.0139792 W
|
||||
Subthreshold Leakage = 0.00135593 W
|
||||
Gate Leakage = 1.12722e-05 W
|
||||
Runtime Dynamic = 0.223667 W
|
||||
|
||||
Memory Management Unit:
|
||||
Area = 0.0313997 mm^2
|
||||
Peak Dynamic = 0.0446647 W
|
||||
Subthreshold Leakage = 0.0029577 W
|
||||
Gate Leakage = 5.57335e-05 W
|
||||
Runtime Dynamic = 1.92566 W
|
||||
|
||||
Itlb:
|
||||
Area = 0.0110306 mm^2
|
||||
Peak Dynamic = 0.0122535 W
|
||||
Subthreshold Leakage = 0.000498504 W
|
||||
Gate Leakage = 4.25417e-06 W
|
||||
Runtime Dynamic = 0.0980282 W
|
||||
|
||||
Dtlb:
|
||||
Area = 0.0110306 mm^2
|
||||
Peak Dynamic = 0.00306337 W
|
||||
Subthreshold Leakage = 0.000498504 W
|
||||
Gate Leakage = 4.25417e-06 W
|
||||
Runtime Dynamic = 0.0245072 W
|
||||
|
||||
Execution Unit:
|
||||
Area = 0.299667 mm^2
|
||||
Peak Dynamic = 0.473204 W
|
||||
Subthreshold Leakage = 0.0379242 W
|
||||
Gate Leakage = 0.000384077 W
|
||||
Runtime Dynamic = 3.52491 W
|
||||
|
||||
Register Files:
|
||||
Area = 0.0598365 mm^2
|
||||
Peak Dynamic = 0.0168768 W
|
||||
Subthreshold Leakage = 0.0020814 W
|
||||
Gate Leakage = 1.24237e-05 W
|
||||
Runtime Dynamic = 0.072481 W
|
||||
|
||||
Integer RF:
|
||||
Area = 0.0240072 mm^2
|
||||
Peak Dynamic = 0.0131657 W
|
||||
Subthreshold Leakage = 0.000449165 W
|
||||
Gate Leakage = 3.33111e-06 W
|
||||
Runtime Dynamic = 0.0706931 W
|
||||
|
||||
Floating Point RF:
|
||||
Area = 0.0240072 mm^2
|
||||
Peak Dynamic = 0.00371113 W
|
||||
Subthreshold Leakage = 0.000449165 W
|
||||
Gate Leakage = 3.33111e-06 W
|
||||
Runtime Dynamic = 0.0017722 W
|
||||
|
||||
Register Windows:
|
||||
Area = 0.0118221 mm^2
|
||||
Peak Dynamic = 0 W
|
||||
Subthreshold Leakage = 0.00118307 W
|
||||
Gate Leakage = 5.76149e-06 W
|
||||
Runtime Dynamic = 1.56951e-05 W
|
||||
|
||||
Instruction Scheduler:
|
||||
Area = 0.00263062 mm^2
|
||||
Peak Dynamic = 0.00540689 W
|
||||
Subthreshold Leakage = 8.27524e-05 W
|
||||
Gate Leakage = 9.38261e-07 W
|
||||
Runtime Dynamic = 0.0464411 W
|
||||
|
||||
Instruction Window:
|
||||
Area = 0.00263062 mm^2
|
||||
Peak Dynamic = 0.00540689 W
|
||||
Subthreshold Leakage = 8.27524e-05 W
|
||||
Gate Leakage = 9.38261e-07 W
|
||||
Runtime Dynamic = 0.0464411 W
|
||||
|
||||
Integer ALUs (Count: 1 ):
|
||||
Area = 0.0384544 mm^2
|
||||
Peak Dynamic = 0.0946992 W
|
||||
Subthreshold Leakage = 0.00667865 W
|
||||
Gate Leakage = 6.39207e-05 W
|
||||
Runtime Dynamic = 0.841771 W
|
||||
|
||||
Floating Point Units (FPUs) (Count: 0.125 ):
|
||||
Area = 0.0695899 mm^2
|
||||
Peak Dynamic = 0.0157832 W
|
||||
Subthreshold Leakage = 0.00302155 W
|
||||
Gate Leakage = 2.89189e-05 W
|
||||
Runtime Dynamic = 0.0315664 W
|
||||
|
||||
Complex ALUs (Mul/Div) (Count: 1 ):
|
||||
Area = 0.115363 mm^2
|
||||
Peak Dynamic = 0.105221 W
|
||||
Subthreshold Leakage = 0.020036 W
|
||||
Gate Leakage = 0.000191762 W
|
||||
Runtime Dynamic = 0.210443 W
|
||||
|
||||
Results Broadcast Bus:
|
||||
Area Overhead = 0.00445381 mm^2
|
||||
Peak Dynamic = 0.192955 W
|
||||
Subthreshold Leakage = 0.00406321 W
|
||||
Gate Leakage = 3.88886e-05 W
|
||||
Runtime Dynamic = 0.519078 W
|
||||
|
||||
*****************************************************************************************
|
||||
L2
|
||||
Area = 2.52394 mm^2
|
||||
Peak Dynamic = 0.32978 W
|
||||
Subthreshold Leakage = 0.139973 W
|
||||
Gate Leakage = 0.00157395 W
|
||||
Runtime Dynamic = 1.14063 W
|
||||
|
||||
*****************************************************************************************
|
||||
Second Level Directory
|
||||
Area = 57.033 mm^2
|
||||
Peak Dynamic = 53.5219 W
|
||||
Subthreshold Leakage = 4.27249 W
|
||||
Gate Leakage = 0.050206 W
|
||||
Runtime Dynamic = 70.9203 W
|
||||
|
||||
*****************************************************************************************
|
||||
NOC
|
||||
Area = 52.2524 mm^2
|
||||
Peak Dynamic = 20.5798 W
|
||||
Subthreshold Leakage = 9.50197 W
|
||||
Gate Leakage = 0.119531 W
|
||||
Runtime Dynamic = 2.56185 W
|
||||
|
||||
Router:
|
||||
Area = 0.578434 mm^2
|
||||
Peak Dynamic = 0.184548 W
|
||||
Subthreshold Leakage = 0.125515 W
|
||||
Gate Leakage = 0.0016409 W
|
||||
Runtime Dynamic = 1.32875 W
|
||||
|
||||
Virtual Channel Buffer:
|
||||
Area = 0.159162 mm^2
|
||||
Peak Dynamic = 0.00394081 W
|
||||
Subthreshold Leakage = 0.000194478 W
|
||||
Gate Leakage = 1.84946e-06 W
|
||||
Runtime Dynamic = 0.0283738 W
|
||||
|
||||
Crossbar:
|
||||
Area = 0.160976 mm^2
|
||||
Peak Dynamic = 0.179891 W
|
||||
Subthreshold Leakage = 0.12532 W
|
||||
Gate Leakage = 0.00163905 W
|
||||
Runtime Dynamic = 1.29522 W
|
||||
|
||||
Arbiter:
|
||||
Peak Dynamic = 0.000716053 W
|
||||
Subthreshold Leakage = 3.67148e-07 W
|
||||
Gate Leakage = 3.86991e-09 W
|
||||
Runtime Dynamic = 0.00515558 W
|
||||
|
||||
Per Router Links:
|
||||
Area = 0.238009 mm^2
|
||||
Peak Dynamic = 0.137011 W
|
||||
Subthreshold Leakage = 0.0229533 W
|
||||
Gate Leakage = 0.000226773 W
|
||||
Runtime Dynamic = 1.2331 W
|
||||
|
||||
*****************************************************************************************
|
321
ext/mcpat/results/T2
Normal file
321
ext/mcpat/results/T2
Normal file
|
@ -0,0 +1,321 @@
|
|||
McPAT (version 0.8 of Aug, 2010) is computing the target processor...
|
||||
|
||||
|
||||
McPAT (version 0.8 of Aug, 2010) results (current print level is 5)
|
||||
*****************************************************************************************
|
||||
Technology 65 nm
|
||||
Using Long Channel Devices When Appropriate
|
||||
Interconnect metal projection= aggressive interconnect technology projection
|
||||
Core clock Rate(MHz) 1400
|
||||
|
||||
*****************************************************************************************
|
||||
Processor:
|
||||
Area = 277.068 mm^2
|
||||
Peak Power = 71.8237 W
|
||||
Total Leakage = 18.2234 W
|
||||
Peak Dynamic = 53.6003 W
|
||||
Subthreshold Leakage = 14.7124 W
|
||||
Gate Leakage = 3.51096 W
|
||||
Runtime Dynamic = 48.652 W
|
||||
|
||||
Total Cores: 8 cores
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 116.441 mm^2
|
||||
Peak Dynamic = 28.0277 W
|
||||
Subthreshold Leakage = 9.00023 W
|
||||
Gate Leakage = 1.93139 W
|
||||
Runtime Dynamic = 27.9237 W
|
||||
|
||||
Total L2s:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 85.0391 mm^2
|
||||
Peak Dynamic = 9.87481 W
|
||||
Subthreshold Leakage = 2.71188 W
|
||||
Gate Leakage = 0.684324 W
|
||||
Runtime Dynamic = 3.97632 W
|
||||
|
||||
Total First Level Directory:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 11.6417 mm^2
|
||||
Peak Dynamic = 5.32369 W
|
||||
Subthreshold Leakage = 0.249885 W
|
||||
Gate Leakage = 0.107486 W
|
||||
Runtime Dynamic = 5.38275 W
|
||||
|
||||
Total NoCs (Network/Bus):
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 9.56584 mm^2
|
||||
Peak Dynamic = 1.07754 W
|
||||
Subthreshold Leakage = 1.61961 W
|
||||
Gate Leakage = 0.389994 W
|
||||
Runtime Dynamic = 1.07754 W
|
||||
|
||||
Total MCs: 4 Memory Controllers
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 32.2777 mm^2
|
||||
Peak Dynamic = 5.92507 W
|
||||
Subthreshold Leakage = 0.559071 W
|
||||
Gate Leakage = 0.10416 W
|
||||
Runtime Dynamic = 7.93157 W
|
||||
|
||||
Total NIUs: 2 Network Interface Units
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 15.8633 mm^2
|
||||
Peak Dynamic = 1.86482 W
|
||||
Subthreshold Leakage = 0.357626 W
|
||||
Gate Leakage = 0.183662 W
|
||||
Runtime Dynamic = 1.30537 W
|
||||
|
||||
Total PCIes: 1 PCIe Controllers
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 6.24 mm^2
|
||||
Peak Dynamic = 1.5067 W
|
||||
Subthreshold Leakage = 0.214091 W
|
||||
Gate Leakage = 0.109948 W
|
||||
Runtime Dynamic = 1.05469 W
|
||||
|
||||
*****************************************************************************************
|
||||
Core:
|
||||
Area = 14.5551 mm^2
|
||||
Peak Dynamic = 3.50346 W
|
||||
Subthreshold Leakage = 1.12503 W
|
||||
Gate Leakage = 0.241423 W
|
||||
Runtime Dynamic = 27.9237 W
|
||||
|
||||
Instruction Fetch Unit:
|
||||
Area = 2.75911 mm^2
|
||||
Peak Dynamic = 0.817936 W
|
||||
Subthreshold Leakage = 0.0912466 W
|
||||
Gate Leakage = 0.0284483 W
|
||||
Runtime Dynamic = 4.81754 W
|
||||
|
||||
Instruction Cache:
|
||||
Area = 2.51671 mm^2
|
||||
Peak Dynamic = 0.513783 W
|
||||
Subthreshold Leakage = 0.062355 W
|
||||
Gate Leakage = 0.0164185 W
|
||||
Runtime Dynamic = 1.59033 W
|
||||
|
||||
Instruction Buffer:
|
||||
Area = 0.0130935 mm^2
|
||||
Peak Dynamic = 0.0100268 W
|
||||
Subthreshold Leakage = 0.000434992 W
|
||||
Gate Leakage = 6.02581e-05 W
|
||||
Runtime Dynamic = 0.160429 W
|
||||
|
||||
Instruction Decoder:
|
||||
Area = 0.0119193 mm^2
|
||||
Peak Dynamic = 0.0892213 W
|
||||
Subthreshold Leakage = 0.00298091 W
|
||||
Gate Leakage = 0.000408973 W
|
||||
Runtime Dynamic = 1.42754 W
|
||||
|
||||
Load Store Unit:
|
||||
Area = 2.14252 mm^2
|
||||
Peak Dynamic = 0.487978 W
|
||||
Subthreshold Leakage = 0.0802768 W
|
||||
Gate Leakage = 0.0247378 W
|
||||
Runtime Dynamic = 10.9331 W
|
||||
|
||||
Data Cache:
|
||||
Area = 0.52868 mm^2
|
||||
Peak Dynamic = 0.0991646 W
|
||||
Subthreshold Leakage = 0.0119043 W
|
||||
Gate Leakage = 0.00145618 W
|
||||
Runtime Dynamic = 0.1303 W
|
||||
|
||||
Load/Store Queue:
|
||||
Area = 1.22144 mm^2
|
||||
Peak Dynamic = 0.286361 W
|
||||
Subthreshold Leakage = 0.0428969 W
|
||||
Gate Leakage = 0.011721 W
|
||||
Runtime Dynamic = 9.16355 W
|
||||
|
||||
Memory Management Unit:
|
||||
Area = 1.1006 mm^2
|
||||
Peak Dynamic = 0.399121 W
|
||||
Subthreshold Leakage = 0.0527367 W
|
||||
Gate Leakage = 0.0195353 W
|
||||
Runtime Dynamic = 2.78316 W
|
||||
|
||||
Itlb:
|
||||
Area = 0.293144 mm^2
|
||||
Peak Dynamic = 0.0743045 W
|
||||
Subthreshold Leakage = 0.00720086 W
|
||||
Gate Leakage = 0.00218791 W
|
||||
Runtime Dynamic = 0.594438 W
|
||||
|
||||
Dtlb:
|
||||
Area = 0.590071 mm^2
|
||||
Peak Dynamic = 0.0686851 W
|
||||
Subthreshold Leakage = 0.0200602 W
|
||||
Gate Leakage = 0.00578676 W
|
||||
Runtime Dynamic = 0.549486 W
|
||||
|
||||
Execution Unit:
|
||||
Area = 6.79584 mm^2
|
||||
Peak Dynamic = 1.79843 W
|
||||
Subthreshold Leakage = 0.610924 W
|
||||
Gate Leakage = 0.116437 W
|
||||
Runtime Dynamic = 9.38994 W
|
||||
|
||||
Register Files:
|
||||
Area = 1.18037 mm^2
|
||||
Peak Dynamic = 0.0639548 W
|
||||
Subthreshold Leakage = 0.00981018 W
|
||||
Gate Leakage = 0.00106415 W
|
||||
Runtime Dynamic = 0.401933 W
|
||||
|
||||
Integer RF:
|
||||
Area = 0.648931 mm^2
|
||||
Peak Dynamic = 0.0485174 W
|
||||
Subthreshold Leakage = 0.00196627 W
|
||||
Gate Leakage = 0.000259389 W
|
||||
Runtime Dynamic = 0.392074 W
|
||||
|
||||
Floating Point RF:
|
||||
Area = 0.324465 mm^2
|
||||
Peak Dynamic = 0.0154374 W
|
||||
Subthreshold Leakage = 0.00196627 W
|
||||
Gate Leakage = 0.000259389 W
|
||||
Runtime Dynamic = 0.0098154 W
|
||||
|
||||
Register Windows:
|
||||
Area = 0.206972 mm^2
|
||||
Peak Dynamic = 0 W
|
||||
Subthreshold Leakage = 0.00587765 W
|
||||
Gate Leakage = 0.000545372 W
|
||||
Runtime Dynamic = 4.40062e-05 W
|
||||
|
||||
Instruction Scheduler:
|
||||
Area = 0.0458096 mm^2
|
||||
Peak Dynamic = 0.0333897 W
|
||||
Subthreshold Leakage = 0.000402487 W
|
||||
Gate Leakage = 8.61395e-05 W
|
||||
Runtime Dynamic = 0.287483 W
|
||||
|
||||
Instruction Window:
|
||||
Area = 0.0458096 mm^2
|
||||
Peak Dynamic = 0.0333897 W
|
||||
Subthreshold Leakage = 0.000402487 W
|
||||
Gate Leakage = 8.61395e-05 W
|
||||
Runtime Dynamic = 0.287483 W
|
||||
|
||||
Integer ALUs (Count: 2 ):
|
||||
Area = 0.448448 mm^2
|
||||
Peak Dynamic = 0.425547 W
|
||||
Subthreshold Leakage = 0.147955 W
|
||||
Gate Leakage = 0.0266792 W
|
||||
Runtime Dynamic = 3.78264 W
|
||||
|
||||
Floating Point Units (FPUs) (Count: 1 ):
|
||||
Area = 4.85979 mm^2
|
||||
Peak Dynamic = 0.425547 W
|
||||
Subthreshold Leakage = 0.400843 W
|
||||
Gate Leakage = 0.07228 W
|
||||
Runtime Dynamic = 0.0709246 W
|
||||
|
||||
Results Broadcast Bus:
|
||||
Area Overhead = 0.0440413 mm^2
|
||||
Peak Dynamic = 0.481158 W
|
||||
Subthreshold Leakage = 0.0264373 W
|
||||
Gate Leakage = 0.00476717 W
|
||||
Runtime Dynamic = 3.20772 W
|
||||
|
||||
*****************************************************************************************
|
||||
L2
|
||||
Area = 10.6299 mm^2
|
||||
Peak Dynamic = 1.23435 W
|
||||
Subthreshold Leakage = 0.338985 W
|
||||
Gate Leakage = 0.0855405 W
|
||||
Runtime Dynamic = 3.97632 W
|
||||
|
||||
*****************************************************************************************
|
||||
First Level Directory
|
||||
Area = 1.45521 mm^2
|
||||
Peak Dynamic = 0.665462 W
|
||||
Subthreshold Leakage = 0.0312356 W
|
||||
Gate Leakage = 0.0134358 W
|
||||
Runtime Dynamic = 5.38275 W
|
||||
|
||||
*****************************************************************************************
|
||||
Memory Controller:
|
||||
Area = 8.06942 mm^2
|
||||
Peak Dynamic = 1.48127 W
|
||||
Subthreshold Leakage = 0.139768 W
|
||||
Gate Leakage = 0.0260401 W
|
||||
Runtime Dynamic = 7.93157 W
|
||||
|
||||
Front End Engine:
|
||||
Area = 0.250458 mm^2
|
||||
Peak Dynamic = 0.05883 W
|
||||
Subthreshold Leakage = 0.0029079 W
|
||||
Gate Leakage = 0.000455875 W
|
||||
Runtime Dynamic = 0.298069 W
|
||||
|
||||
Transaction Engine:
|
||||
Area = 2.66058 mm^2
|
||||
Peak Dynamic = 0.6912 W
|
||||
Subthreshold Leakage = 0.0465697 W
|
||||
Gate Leakage = 0.00870562 W
|
||||
Runtime Dynamic = 3.50205 W
|
||||
|
||||
PHY:
|
||||
Area = 5.15838 mm^2
|
||||
Peak Dynamic = 0.731237 W
|
||||
Subthreshold Leakage = 0.0902901 W
|
||||
Gate Leakage = 0.0168786 W
|
||||
Runtime Dynamic = 4.13145 W
|
||||
|
||||
*****************************************************************************************
|
||||
NIU:
|
||||
Area = 7.93167 mm^2
|
||||
Peak Dynamic = 0.93241 W
|
||||
Subthreshold Leakage = 0.178813 W
|
||||
Gate Leakage = 0.0918312 W
|
||||
Runtime Dynamic = 0.652687 W
|
||||
|
||||
*****************************************************************************************
|
||||
PCIe:
|
||||
Area = 6.24 mm^2
|
||||
Peak Dynamic = 1.5067 W
|
||||
Subthreshold Leakage = 0.214091 W
|
||||
Gate Leakage = 0.109948 W
|
||||
Runtime Dynamic = 1.05469 W
|
||||
|
||||
*****************************************************************************************
|
||||
NOC
|
||||
Area = 9.56584 mm^2
|
||||
Peak Dynamic = 1.07754 W
|
||||
Subthreshold Leakage = 1.61961 W
|
||||
Gate Leakage = 0.389994 W
|
||||
Runtime Dynamic = 1.07754 W
|
||||
|
||||
Router:
|
||||
Area = 4.78292 mm^2
|
||||
Peak Dynamic = 0.538772 W
|
||||
Subthreshold Leakage = 0.809805 W
|
||||
Gate Leakage = 0.194997 W
|
||||
Runtime Dynamic = 1.07754 W
|
||||
|
||||
Virtual Channel Buffer:
|
||||
Area = 0.827721 mm^2
|
||||
Peak Dynamic = 0.0223838 W
|
||||
Subthreshold Leakage = 0.00314985 W
|
||||
Gate Leakage = 0.000413272 W
|
||||
Runtime Dynamic = 0.0447677 W
|
||||
|
||||
Crossbar:
|
||||
Area = 1.69589 mm^2
|
||||
Peak Dynamic = 0.511174 W
|
||||
Subthreshold Leakage = 0.806641 W
|
||||
Gate Leakage = 0.194581 W
|
||||
Runtime Dynamic = 1.02235 W
|
||||
|
||||
Arbiter:
|
||||
Peak Dynamic = 0.00521447 W
|
||||
Subthreshold Leakage = 1.42757e-05 W
|
||||
Gate Leakage = 2.78294e-06 W
|
||||
Runtime Dynamic = 0.0104289 W
|
||||
|
||||
*****************************************************************************************
|
341
ext/mcpat/results/Xeon_core
Normal file
341
ext/mcpat/results/Xeon_core
Normal file
|
@ -0,0 +1,341 @@
|
|||
McPAT (version 0.7 of May, 2010) is computing the target processor...
|
||||
|
||||
|
||||
McPAT (version 0.7 of May, 2010) results (current print level is 5)
|
||||
*****************************************************************************************
|
||||
Technology 65 nm
|
||||
Using Long Channel Devices When Appropriate
|
||||
Interconnect metal projection= aggressive interconnect technology projection
|
||||
Core clock Rate(MHz) 3400
|
||||
|
||||
*****************************************************************************************
|
||||
Processor:
|
||||
Area = 417.445 mm^2
|
||||
Peak Power = 142.148 W
|
||||
Total Leakage = 55.8021 W
|
||||
Peak Dynamic = 86.3458 W
|
||||
Subthreshold Leakage = 52.785 W
|
||||
Gate Leakage = 3.01712 W
|
||||
Runtime Dynamic = 63.1851 W
|
||||
|
||||
Total Cores:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 133.278 mm^2
|
||||
Peak Dynamic = 63.8414 W
|
||||
Subthreshold Leakage = 32.4393 W
|
||||
Gate Leakage = 2.72517 W
|
||||
Runtime Dynamic = 41.616 W
|
||||
|
||||
Total L3s:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 278.612 mm^2
|
||||
Peak Dynamic = 6.11346 W
|
||||
Subthreshold Leakage = 20.1995 W
|
||||
Gate Leakage = 0.267752 W
|
||||
Runtime Dynamic = 5.1782 W
|
||||
|
||||
Total NoCs (Network/Bus):
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 5.5548 mm^2
|
||||
Peak Dynamic = 16.3909 W
|
||||
Subthreshold Leakage = 0.146229 W
|
||||
Gate Leakage = 0.0241913 W
|
||||
Runtime Dynamic = 16.3909 W
|
||||
|
||||
*****************************************************************************************
|
||||
Core:
|
||||
Area = 66.6389 mm^2
|
||||
Peak Dynamic = 31.9207 W
|
||||
Subthreshold Leakage = 16.2197 W
|
||||
Gate Leakage = 1.36259 W
|
||||
Runtime Dynamic = 41.616 W
|
||||
|
||||
Instruction Fetch Unit:
|
||||
Area = 7.41271 mm^2
|
||||
Peak Dynamic = 5.04492 W
|
||||
Subthreshold Leakage = 1.26751 W
|
||||
Gate Leakage = 0.09429 W
|
||||
Runtime Dynamic = 5.39803 W
|
||||
|
||||
Instruction Cache:
|
||||
Area = 2.44324 mm^2
|
||||
Peak Dynamic = 1.42048 W
|
||||
Subthreshold Leakage = 0.359444 W
|
||||
Gate Leakage = 0.0187045 W
|
||||
Runtime Dynamic = 2.13804 W
|
||||
|
||||
Branch Target Buffer:
|
||||
Area = 0.729086 mm^2
|
||||
Peak Dynamic = 0.161698 W
|
||||
Subthreshold Leakage = 0.0616324 W
|
||||
Gate Leakage = 0.00336254 W
|
||||
Runtime Dynamic = 0.646794 W
|
||||
|
||||
Branch Predictor:
|
||||
Area = 0.430961 mm^2
|
||||
Peak Dynamic = 0.188469 W
|
||||
Subthreshold Leakage = 0.0698834 W
|
||||
Gate Leakage = 0.00415943 W
|
||||
Runtime Dynamic = 0.166045 W
|
||||
|
||||
Global Predictor:
|
||||
Area = 0.174771 mm^2
|
||||
Peak Dynamic = 0.0633335 W
|
||||
Subthreshold Leakage = 0.0274086 W
|
||||
Gate Leakage = 0.00158249 W
|
||||
Runtime Dynamic = 0.0633335 W
|
||||
|
||||
Local Predictor:
|
||||
Area = 0.0735854 mm^2
|
||||
Peak Dynamic = 0.0393754 W
|
||||
Subthreshold Leakage = 0.0111166 W
|
||||
Gate Leakage = 0.000721196 W
|
||||
Runtime Dynamic = 0.0393754 W
|
||||
|
||||
Area = 0.0507308 mm^2
|
||||
Peak Dynamic = 0.0258383 W
|
||||
Subthreshold Leakage = 0.00749994 W
|
||||
Gate Leakage = 0.000498805 W
|
||||
Runtime Dynamic = 0.0258383 W
|
||||
|
||||
Chooser:
|
||||
Area = 0.174771 mm^2
|
||||
Peak Dynamic = 0.0633335 W
|
||||
Subthreshold Leakage = 0.0274086 W
|
||||
Gate Leakage = 0.00158249 W
|
||||
Runtime Dynamic = 0.0633335 W
|
||||
|
||||
RAS:
|
||||
Area = 0.0613744 mm^2
|
||||
Peak Dynamic = 0.0224266 W
|
||||
Subthreshold Leakage = 0.00394955 W
|
||||
Gate Leakage = 0.000273252 W
|
||||
Runtime Dynamic = 2.51602e-06 W
|
||||
|
||||
Instruction Buffer:
|
||||
Area = 0.0684348 mm^2
|
||||
Peak Dynamic = 0.704461 W
|
||||
Subthreshold Leakage = 0.00411741 W
|
||||
Gate Leakage = 0.000240288 W
|
||||
Runtime Dynamic = 0.46964 W
|
||||
|
||||
Instruction Decoder:
|
||||
Area = 3.73007 mm^2
|
||||
Peak Dynamic = 1.97751 W
|
||||
Subthreshold Leakage = 0.733056 W
|
||||
Gate Leakage = 0.0575912 W
|
||||
Runtime Dynamic = 1.97751 W
|
||||
|
||||
Renaming Unit:
|
||||
Area = 1.82421 mm^2
|
||||
Peak Dynamic = 2.76284 W
|
||||
Subthreshold Leakage = 0.0765654 W
|
||||
Gate Leakage = 0.0125478 W
|
||||
Runtime Dynamic = 1.94438 W
|
||||
|
||||
Int Front End RAT:
|
||||
Area = 0.875874 mm^2
|
||||
Peak Dynamic = 1.249 W
|
||||
Subthreshold Leakage = 0.0113878 W
|
||||
Gate Leakage = 0.000693471 W
|
||||
Runtime Dynamic = 1.249 W
|
||||
|
||||
FP Front End RAT:
|
||||
Area = 0.405459 mm^2
|
||||
Peak Dynamic = 0.610062 W
|
||||
Subthreshold Leakage = 0.0144803 W
|
||||
Gate Leakage = 0.000906674 W
|
||||
Runtime Dynamic = 0.305031 W
|
||||
|
||||
Free List:
|
||||
Area = 0.297629 mm^2
|
||||
Peak Dynamic = 0.137664 W
|
||||
Subthreshold Leakage = 0.0054316 W
|
||||
Gate Leakage = 0.000326171 W
|
||||
Runtime Dynamic = 0.275328 W
|
||||
|
||||
Int Retire RAT:
|
||||
Area = 0.0530903 mm^2
|
||||
Peak Dynamic = 0.056222 W
|
||||
Subthreshold Leakage = 0.00135314 W
|
||||
Gate Leakage = 0.00011607 W
|
||||
Runtime Dynamic = 0.056222 W
|
||||
|
||||
FP Retire RAT:
|
||||
Area = 0.018828 mm^2
|
||||
Peak Dynamic = 0.0186388 W
|
||||
Subthreshold Leakage = 0.000788229 W
|
||||
Gate Leakage = 6.41952e-05 W
|
||||
Runtime Dynamic = 0.00931941 W
|
||||
|
||||
FP Free List:
|
||||
Area = 0.162422 mm^2
|
||||
Peak Dynamic = 0.0989385 W
|
||||
Subthreshold Leakage = 0.00375181 W
|
||||
Gate Leakage = 0.000209083 W
|
||||
Runtime Dynamic = 0.0494693 W
|
||||
|
||||
Load Store Unit:
|
||||
Area = 4.35998 mm^2
|
||||
Peak Dynamic = 2.94939 W
|
||||
Subthreshold Leakage = 0.208781 W
|
||||
Gate Leakage = 0.0232213 W
|
||||
Runtime Dynamic = 3.60184 W
|
||||
|
||||
Data Cache:
|
||||
Area = 2.2051 mm^2
|
||||
Peak Dynamic = 1.08067 W
|
||||
Subthreshold Leakage = 0.0877157 W
|
||||
Gate Leakage = 0.00573003 W
|
||||
Runtime Dynamic = 2.30478 W
|
||||
|
||||
LoadQ:
|
||||
Area = 0.637121 mm^2
|
||||
Peak Dynamic = 0.551016 W
|
||||
Subthreshold Leakage = 0.0283256 W
|
||||
Gate Leakage = 0.00254841 W
|
||||
Runtime Dynamic = 0.275508 W
|
||||
|
||||
StoreQ:
|
||||
Area = 0.809965 mm^2
|
||||
Peak Dynamic = 1.02155 W
|
||||
Subthreshold Leakage = 0.053367 W
|
||||
Gate Leakage = 0.00471074 W
|
||||
Runtime Dynamic = 1.02155 W
|
||||
|
||||
Memory Management Unit:
|
||||
Area = 0.517456 mm^2
|
||||
Peak Dynamic = 0.979218 W
|
||||
Subthreshold Leakage = 0.0808171 W
|
||||
Gate Leakage = 0.0139952 W
|
||||
Runtime Dynamic = 1.66678 W
|
||||
|
||||
Itlb:
|
||||
Area = 0.127123 mm^2
|
||||
Peak Dynamic = 0.236587 W
|
||||
Subthreshold Leakage = 0.0160962 W
|
||||
Gate Leakage = 0.00146431 W
|
||||
Runtime Dynamic = 0.473177 W
|
||||
|
||||
Dtlb:
|
||||
Area = 0.379422 mm^2
|
||||
Peak Dynamic = 0.298399 W
|
||||
Subthreshold Leakage = 0.0253484 W
|
||||
Gate Leakage = 0.00229878 W
|
||||
Runtime Dynamic = 1.1936 W
|
||||
|
||||
Execution Unit:
|
||||
Area = 27.5381 mm^2
|
||||
Peak Dynamic = 16.9637 W
|
||||
Subthreshold Leakage = 7.08185 W
|
||||
Gate Leakage = 0.73316 W
|
||||
Runtime Dynamic = 22.7198 W
|
||||
|
||||
Register Files:
|
||||
Area = 11.2548 mm^2
|
||||
Peak Dynamic = 3.2925 W
|
||||
Subthreshold Leakage = 0.11111 W
|
||||
Gate Leakage = 0.00754256 W
|
||||
Runtime Dynamic = 1.69823 W
|
||||
|
||||
Integer RF:
|
||||
Area = 7.55916 mm^2
|
||||
Peak Dynamic = 2.82012 W
|
||||
Subthreshold Leakage = 0.0664048 W
|
||||
Gate Leakage = 0.00458288 W
|
||||
Runtime Dynamic = 1.51078 W
|
||||
|
||||
Floating Point RF:
|
||||
Area = 3.69565 mm^2
|
||||
Peak Dynamic = 0.472385 W
|
||||
Subthreshold Leakage = 0.0447053 W
|
||||
Gate Leakage = 0.00295968 W
|
||||
Runtime Dynamic = 0.187454 W
|
||||
|
||||
Instruction Scheduler:
|
||||
Area = 2.08681 mm^2
|
||||
Peak Dynamic = 2.1684 W
|
||||
Subthreshold Leakage = 0.0325294 W
|
||||
Gate Leakage = 0.00296372 W
|
||||
Runtime Dynamic = 2.59089 W
|
||||
|
||||
Instruction Window:
|
||||
Area = 0.287309 mm^2
|
||||
Peak Dynamic = 0.929972 W
|
||||
Subthreshold Leakage = 0.0127376 W
|
||||
Gate Leakage = 0.00137073 W
|
||||
Runtime Dynamic = 1.2089 W
|
||||
|
||||
FP Instruction Window:
|
||||
Area = 0.128977 mm^2
|
||||
Peak Dynamic = 0.478661 W
|
||||
Subthreshold Leakage = 0.00802287 W
|
||||
Gate Leakage = 0.000873414 W
|
||||
Runtime Dynamic = 0.622222 W
|
||||
|
||||
ROB:
|
||||
Area = 1.67052 mm^2
|
||||
Peak Dynamic = 0.759764 W
|
||||
Subthreshold Leakage = 0.0117689 W
|
||||
Gate Leakage = 0.000719579 W
|
||||
Runtime Dynamic = 0.759764 W
|
||||
|
||||
Integer ALUs (Count: 6 ):
|
||||
Area = 4.03603 mm^2
|
||||
Peak Dynamic = 4.55818 W
|
||||
Subthreshold Leakage = 3.9898 W
|
||||
Gate Leakage = 0.412015 W
|
||||
Runtime Dynamic = 2.33394 W
|
||||
|
||||
Floating Point Units (FPUs) (Count: 2 ):
|
||||
Area = 9.71959 mm^2
|
||||
Peak Dynamic = 1.43327 W
|
||||
Subthreshold Leakage = 2.40207 W
|
||||
Gate Leakage = 0.248054 W
|
||||
Runtime Dynamic = 2.55333 W
|
||||
|
||||
Complex ALUs (Mul/Div) (Count: 1 ):
|
||||
Area = 0.336336 mm^2
|
||||
Peak Dynamic = 0.510666 W
|
||||
Subthreshold Leakage = 0.332484 W
|
||||
Gate Leakage = 0.0343346 W
|
||||
Runtime Dynamic = 3.18505 W
|
||||
|
||||
Results Broadcast Bus:
|
||||
Area Overhead = 0.0936618 mm^2
|
||||
Peak Dynamic = 4.4084 W
|
||||
Subthreshold Leakage = 0.174486 W
|
||||
Gate Leakage = 0.0180186 W
|
||||
Runtime Dynamic = 10.3584 W
|
||||
|
||||
L2
|
||||
Area = 15.914 mm^2
|
||||
Peak Dynamic = 3.22061 W
|
||||
Subthreshold Leakage = 3.01991 W
|
||||
Gate Leakage = 0.0223008 W
|
||||
Runtime Dynamic = 6.28514 W
|
||||
|
||||
*****************************************************************************************
|
||||
L3
|
||||
Area = 278.612 mm^2
|
||||
Peak Dynamic = 6.11346 W
|
||||
Subthreshold Leakage = 20.1995 W
|
||||
Gate Leakage = 0.267752 W
|
||||
Runtime Dynamic = 5.1782 W
|
||||
|
||||
*****************************************************************************************
|
||||
BUSES
|
||||
Area = 5.5548 mm^2
|
||||
Peak Dynamic = 16.3909 W
|
||||
Subthreshold Leakage = 0.146229 W
|
||||
Gate Leakage = 0.0241913 W
|
||||
Runtime Dynamic = 16.3909 W
|
||||
|
||||
Bus:
|
||||
Area = 5.5548 mm^2
|
||||
Peak Dynamic = 16.3909 W
|
||||
Subthreshold Leakage = 0.146229 W
|
||||
Gate Leakage = 0.0241913 W
|
||||
Runtime Dynamic = 16.3909 W
|
||||
|
||||
*****************************************************************************************
|
341
ext/mcpat/results/Xeon_uncore
Normal file
341
ext/mcpat/results/Xeon_uncore
Normal file
|
@ -0,0 +1,341 @@
|
|||
McPAT (version 0.7 of May, 2010) is computing the target processor...
|
||||
|
||||
|
||||
McPAT (version 0.7 of May, 2010) results (current print level is 5)
|
||||
*****************************************************************************************
|
||||
Technology 65 nm
|
||||
Using Long Channel Devices When Appropriate
|
||||
Interconnect metal projection= aggressive interconnect technology projection
|
||||
Core clock Rate(MHz) 3400
|
||||
|
||||
*****************************************************************************************
|
||||
Processor:
|
||||
Area = 418.629 mm^2
|
||||
Peak Power = 96.2032 W
|
||||
Total Leakage = 27.5568 W
|
||||
Peak Dynamic = 68.6463 W
|
||||
Subthreshold Leakage = 25.8287 W
|
||||
Gate Leakage = 1.72809 W
|
||||
Runtime Dynamic = 50.332 W
|
||||
|
||||
Total Cores:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 134.217 mm^2
|
||||
Peak Dynamic = 50.8677 W
|
||||
Subthreshold Leakage = 15.0187 W
|
||||
Gate Leakage = 1.57092 W
|
||||
Runtime Dynamic = 33.3003 W
|
||||
|
||||
Total L3s:
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 278.843 mm^2
|
||||
Peak Dynamic = 4.84476 W
|
||||
Subthreshold Leakage = 10.7416 W
|
||||
Gate Leakage = 0.144361 W
|
||||
Runtime Dynamic = 4.09781 W
|
||||
|
||||
Total NoCs (Network/Bus):
|
||||
Device Type= ITRS high performance device type
|
||||
Area = 5.56828 mm^2
|
||||
Peak Dynamic = 12.9339 W
|
||||
Subthreshold Leakage = 0.0684953 W
|
||||
Gate Leakage = 0.0128043 W
|
||||
Runtime Dynamic = 12.9339 W
|
||||
|
||||
*****************************************************************************************
|
||||
Core:
|
||||
Area = 67.1085 mm^2
|
||||
Peak Dynamic = 25.4338 W
|
||||
Subthreshold Leakage = 7.50933 W
|
||||
Gate Leakage = 0.78546 W
|
||||
Runtime Dynamic = 33.3003 W
|
||||
|
||||
Instruction Fetch Unit:
|
||||
Area = 7.56843 mm^2
|
||||
Peak Dynamic = 4.27305 W
|
||||
Subthreshold Leakage = 0.571346 W
|
||||
Gate Leakage = 0.0523885 W
|
||||
Runtime Dynamic = 4.67953 W
|
||||
|
||||
Instruction Cache:
|
||||
Area = 2.44678 mm^2
|
||||
Peak Dynamic = 1.1785 W
|
||||
Subthreshold Leakage = 0.151766 W
|
||||
Gate Leakage = 0.009764 W
|
||||
Runtime Dynamic = 1.7926 W
|
||||
|
||||
Branch Target Buffer:
|
||||
Area = 0.718635 mm^2
|
||||
Peak Dynamic = 0.151619 W
|
||||
Subthreshold Leakage = 0.0238082 W
|
||||
Gate Leakage = 0.0015503 W
|
||||
Runtime Dynamic = 0.606475 W
|
||||
|
||||
Branch Predictor:
|
||||
Area = 0.446844 mm^2
|
||||
Peak Dynamic = 0.158508 W
|
||||
Subthreshold Leakage = 0.0293041 W
|
||||
Gate Leakage = 0.0021362 W
|
||||
Runtime Dynamic = 0.14087 W
|
||||
|
||||
Global Predictor:
|
||||
Area = 0.174801 mm^2
|
||||
Peak Dynamic = 0.0543932 W
|
||||
Subthreshold Leakage = 0.0116121 W
|
||||
Gate Leakage = 0.000827171 W
|
||||
Runtime Dynamic = 0.0543932 W
|
||||
|
||||
Local Predictor:
|
||||
Area = 0.0788692 mm^2
|
||||
Peak Dynamic = 0.0320817 W
|
||||
Subthreshold Leakage = 0.00452837 W
|
||||
Gate Leakage = 0.000354718 W
|
||||
Runtime Dynamic = 0.0320817 W
|
||||
|
||||
Area = 0.050748 mm^2
|
||||
Peak Dynamic = 0.0218669 W
|
||||
Subthreshold Leakage = 0.00318852 W
|
||||
Gate Leakage = 0.000264126 W
|
||||
Runtime Dynamic = 0.0218669 W
|
||||
|
||||
Chooser:
|
||||
Area = 0.174801 mm^2
|
||||
Peak Dynamic = 0.0543932 W
|
||||
Subthreshold Leakage = 0.0116121 W
|
||||
Gate Leakage = 0.000827171 W
|
||||
Runtime Dynamic = 0.0543932 W
|
||||
|
||||
RAS:
|
||||
Area = 0.0929863 mm^2
|
||||
Peak Dynamic = 0.0176394 W
|
||||
Subthreshold Leakage = 0.00155163 W
|
||||
Gate Leakage = 0.00012714 W
|
||||
Runtime Dynamic = 1.96119e-06 W
|
||||
|
||||
Instruction Buffer:
|
||||
Area = 0.0687233 mm^2
|
||||
Peak Dynamic = 0.579633 W
|
||||
Subthreshold Leakage = 0.00177049 W
|
||||
Gate Leakage = 0.000129185 W
|
||||
Runtime Dynamic = 0.386422 W
|
||||
|
||||
Instruction Decoder:
|
||||
Area = 3.87654 mm^2
|
||||
Peak Dynamic = 1.75316 W
|
||||
Subthreshold Leakage = 0.348225 W
|
||||
Gate Leakage = 0.0335628 W
|
||||
Runtime Dynamic = 1.75316 W
|
||||
|
||||
Renaming Unit:
|
||||
Area = 1.83366 mm^2
|
||||
Peak Dynamic = 2.16025 W
|
||||
Subthreshold Leakage = 0.0324638 W
|
||||
Gate Leakage = 0.00648876 W
|
||||
Runtime Dynamic = 1.53428 W
|
||||
|
||||
Int Front End RAT:
|
||||
Area = 0.879521 mm^2
|
||||
Peak Dynamic = 0.975897 W
|
||||
Subthreshold Leakage = 0.00490782 W
|
||||
Gate Leakage = 0.000372282 W
|
||||
Runtime Dynamic = 0.975897 W
|
||||
|
||||
FP Front End RAT:
|
||||
Area = 0.407642 mm^2
|
||||
Peak Dynamic = 0.477469 W
|
||||
Subthreshold Leakage = 0.00619591 W
|
||||
Gate Leakage = 0.000483134 W
|
||||
Runtime Dynamic = 0.238735 W
|
||||
|
||||
Free List:
|
||||
Area = 0.300513 mm^2
|
||||
Peak Dynamic = 0.112906 W
|
||||
Subthreshold Leakage = 0.00233243 W
|
||||
Gate Leakage = 0.000174984 W
|
||||
Runtime Dynamic = 0.225813 W
|
||||
|
||||
Int Retire RAT:
|
||||
Area = 0.0534147 mm^2
|
||||
Peak Dynamic = 0.0453154 W
|
||||
Subthreshold Leakage = 0.00058142 W
|
||||
Gate Leakage = 6.26682e-05 W
|
||||
Runtime Dynamic = 0.0453154 W
|
||||
|
||||
FP Retire RAT:
|
||||
Area = 0.018897 mm^2
|
||||
Peak Dynamic = 0.0151716 W
|
||||
Subthreshold Leakage = 0.000337803 W
|
||||
Gate Leakage = 3.45545e-05 W
|
||||
Runtime Dynamic = 0.00758578 W
|
||||
|
||||
FP Free List:
|
||||
Area = 0.162758 mm^2
|
||||
Peak Dynamic = 0.081858 W
|
||||
Subthreshold Leakage = 0.00163685 W
|
||||
Gate Leakage = 0.000115075 W
|
||||
Runtime Dynamic = 0.040929 W
|
||||
|
||||
Load Store Unit:
|
||||
Area = 4.4281 mm^2
|
||||
Peak Dynamic = 2.34722 W
|
||||
Subthreshold Leakage = 0.0896936 W
|
||||
Gate Leakage = 0.0121845 W
|
||||
Runtime Dynamic = 2.89901 W
|
||||
|
||||
Data Cache:
|
||||
Area = 2.25853 mm^2
|
||||
Peak Dynamic = 0.888323 W
|
||||
Subthreshold Leakage = 0.0382167 W
|
||||
Gate Leakage = 0.00311455 W
|
||||
Runtime Dynamic = 1.88387 W
|
||||
|
||||
LoadQ:
|
||||
Area = 0.638298 mm^2
|
||||
Peak Dynamic = 0.435889 W
|
||||
Subthreshold Leakage = 0.0121526 W
|
||||
Gate Leakage = 0.00134375 W
|
||||
Runtime Dynamic = 0.217944 W
|
||||
|
||||
StoreQ:
|
||||
Area = 0.811765 mm^2
|
||||
Peak Dynamic = 0.79719 W
|
||||
Subthreshold Leakage = 0.0228527 W
|
||||
Gate Leakage = 0.00248017 W
|
||||
Runtime Dynamic = 0.79719 W
|
||||
|
||||
Memory Management Unit:
|
||||
Area = 0.518866 mm^2
|
||||
Peak Dynamic = 0.760463 W
|
||||
Subthreshold Leakage = 0.0342246 W
|
||||
Gate Leakage = 0.00722713 W
|
||||
Runtime Dynamic = 1.31193 W
|
||||
|
||||
Itlb:
|
||||
Area = 0.12744 mm^2
|
||||
Peak Dynamic = 0.187517 W
|
||||
Subthreshold Leakage = 0.00686539 W
|
||||
Gate Leakage = 0.000767441 W
|
||||
Runtime Dynamic = 0.375037 W
|
||||
|
||||
Dtlb:
|
||||
Area = 0.380515 mm^2
|
||||
Peak Dynamic = 0.234221 W
|
||||
Subthreshold Leakage = 0.0108877 W
|
||||
Gate Leakage = 0.00121362 W
|
||||
Runtime Dynamic = 0.936886 W
|
||||
|
||||
Execution Unit:
|
||||
Area = 27.5564 mm^2
|
||||
Peak Dynamic = 13.34 W
|
||||
Subthreshold Leakage = 3.35055 W
|
||||
Gate Leakage = 0.425 W
|
||||
Runtime Dynamic = 17.8618 W
|
||||
|
||||
Register Files:
|
||||
Area = 11.2668 mm^2
|
||||
Peak Dynamic = 2.65925 W
|
||||
Subthreshold Leakage = 0.0472795 W
|
||||
Gate Leakage = 0.00398463 W
|
||||
Runtime Dynamic = 1.37147 W
|
||||
|
||||
Integer RF:
|
||||
Area = 7.56635 mm^2
|
||||
Peak Dynamic = 2.27672 W
|
||||
Subthreshold Leakage = 0.0282472 W
|
||||
Gate Leakage = 0.00241709 W
|
||||
Runtime Dynamic = 1.21967 W
|
||||
|
||||
Floating Point RF:
|
||||
Area = 3.70048 mm^2
|
||||
Peak Dynamic = 0.382527 W
|
||||
Subthreshold Leakage = 0.0190323 W
|
||||
Gate Leakage = 0.00156754 W
|
||||
Runtime Dynamic = 0.151797 W
|
||||
|
||||
Instruction Scheduler:
|
||||
Area = 2.09118 mm^2
|
||||
Peak Dynamic = 1.7092 W
|
||||
Subthreshold Leakage = 0.0139125 W
|
||||
Gate Leakage = 0.00156067 W
|
||||
Runtime Dynamic = 2.04197 W
|
||||
|
||||
Instruction Window:
|
||||
Area = 0.287606 mm^2
|
||||
Peak Dynamic = 0.721714 W
|
||||
Subthreshold Leakage = 0.00547415 W
|
||||
Gate Leakage = 0.000721338 W
|
||||
Runtime Dynamic = 0.940723 W
|
||||
|
||||
FP Instruction Window:
|
||||
Area = 0.129287 mm^2
|
||||
Peak Dynamic = 0.372875 W
|
||||
Subthreshold Leakage = 0.0034355 W
|
||||
Gate Leakage = 0.00045775 W
|
||||
Runtime Dynamic = 0.486639 W
|
||||
|
||||
ROB:
|
||||
Area = 1.67428 mm^2
|
||||
Peak Dynamic = 0.61461 W
|
||||
Subthreshold Leakage = 0.00500288 W
|
||||
Gate Leakage = 0.00038158 W
|
||||
Runtime Dynamic = 0.61461 W
|
||||
|
||||
Integer ALUs (Count: 6 ):
|
||||
Area = 4.03603 mm^2
|
||||
Peak Dynamic = 3.52986 W
|
||||
Subthreshold Leakage = 1.89726 W
|
||||
Gate Leakage = 0.240113 W
|
||||
Runtime Dynamic = 1.8074 W
|
||||
|
||||
Floating Point Units (FPUs) (Count: 2 ):
|
||||
Area = 9.71959 mm^2
|
||||
Peak Dynamic = 1.10993 W
|
||||
Subthreshold Leakage = 1.14225 W
|
||||
Gate Leakage = 0.14456 W
|
||||
Runtime Dynamic = 1.9773 W
|
||||
|
||||
Complex ALUs (Mul/Div) (Count: 1 ):
|
||||
Area = 0.336336 mm^2
|
||||
Peak Dynamic = 0.405148 W
|
||||
Subthreshold Leakage = 0.158105 W
|
||||
Gate Leakage = 0.0200094 W
|
||||
Runtime Dynamic = 2.4988 W
|
||||
|
||||
Results Broadcast Bus:
|
||||
Area Overhead = 0.0954831 mm^2
|
||||
Peak Dynamic = 3.47499 W
|
||||
Subthreshold Leakage = 0.0752739 W
|
||||
Gate Leakage = 0.00952648 W
|
||||
Runtime Dynamic = 8.1649 W
|
||||
|
||||
L2
|
||||
Area = 16.1307 mm^2
|
||||
Peak Dynamic = 2.55285 W
|
||||
Subthreshold Leakage = 1.29868 W
|
||||
Gate Leakage = 0.012304 W
|
||||
Runtime Dynamic = 5.01368 W
|
||||
|
||||
*****************************************************************************************
|
||||
L3
|
||||
Area = 278.843 mm^2
|
||||
Peak Dynamic = 4.84476 W
|
||||
Subthreshold Leakage = 10.7416 W
|
||||
Gate Leakage = 0.144361 W
|
||||
Runtime Dynamic = 4.09781 W
|
||||
|
||||
*****************************************************************************************
|
||||
BUSES
|
||||
Area = 5.56828 mm^2
|
||||
Peak Dynamic = 12.9339 W
|
||||
Subthreshold Leakage = 0.0684953 W
|
||||
Gate Leakage = 0.0128043 W
|
||||
Runtime Dynamic = 12.9339 W
|
||||
|
||||
Bus:
|
||||
Area = 5.56828 mm^2
|
||||
Peak Dynamic = 12.9339 W
|
||||
Subthreshold Leakage = 0.0684953 W
|
||||
Gate Leakage = 0.0128043 W
|
||||
Runtime Dynamic = 12.9339 W
|
||||
|
||||
*****************************************************************************************
|
1162
ext/mcpat/sharedcache.cc
Normal file
1162
ext/mcpat/sharedcache.cc
Normal file
File diff suppressed because it is too large
Load diff
89
ext/mcpat/sharedcache.h
Normal file
89
ext/mcpat/sharedcache.h
Normal file
|
@ -0,0 +1,89 @@
|
|||
/*****************************************************************************
|
||||
* McPAT
|
||||
* SOFTWARE LICENSE AGREEMENT
|
||||
* Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* All Rights Reserved
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef SHAREDCACHE_H_
|
||||
#define SHAREDCACHE_H_
|
||||
#include <vector>
|
||||
|
||||
#include "XML_Parse.h"
|
||||
#include "area.h"
|
||||
#include "array.h"
|
||||
#include "basic_components.h"
|
||||
#include "logic.h"
|
||||
#include "parameter.h"
|
||||
|
||||
// Component subclass describing one shared cache instance. The constructor's
// cache-level argument defaults to L2. This header only declares the data
// members and the parameter-setup / energy entry points; the definitions are
// not visible here (presumably in sharedcache.cc -- confirm).
class SharedCache :public Component{
|
||||
public:
|
||||
ParseXML * XML; // parsed XML input; presumably the constructor's XML_interface, not owned here -- TODO confirm
|
||||
int ithCache; // index of this cache instance; presumably the constructor's ithCache_ -- TODO confirm
|
||||
InputParameter interface_ip; // held by value although the constructor receives a pointer
|
||||
enum cache_level cacheL; // cache level of this instance (constructor default is L2)
|
||||
DataCache unicache;//Shared cache
|
||||
CacheDynParam cachep; // cache parameters; presumably filled in by set_cache_param() -- TODO confirm
|
||||
statsDef homenode_tdp_stats; // NOTE(review): "tdp" looks like the peak-power statistics set -- confirm
|
||||
statsDef homenode_rtp_stats; // NOTE(review): "rtp" looks like the runtime statistics set -- confirm
|
||||
statsDef homenode_stats_t; // NOTE(review): presumably a scratch/working statistics set -- confirm
|
||||
double dir_overhead; // NOTE(review): presumably a directory overhead factor; units not visible here
|
||||
// cache_processor llCache,directory, directory1, inv_dir;
|
||||
|
||||
//pipeline pipeLogicCache, pipeLogicDirectory;
|
||||
|
||||
//clock_network clockNetwork;
|
||||
|
||||
double scktRatio, executionTime; // meaning and units are not established by this header
|
||||
// Component L2Tot, cc, cc1, ccTot;
|
||||
|
||||
// Constructs the model for the ithCache_-th cache; cacheL_ defaults to L2.
SharedCache(ParseXML *XML_interface, int ithCache_, InputParameter* interface_ip_,enum cache_level cacheL_ =L2);
|
||||
// No arguments, no return value; presumably populates cachep from XML -- TODO confirm.
void set_cache_param();
|
||||
// is_tdp defaults to true; presumably selects peak (TDP) vs. runtime statistics -- TODO confirm.
void computeEnergy(bool is_tdp=true);
|
||||
// indent defaults to 0 and is_tdp to true; presumably prints the per-component report -- TODO confirm.
void displayEnergy(uint32_t indent = 0,bool is_tdp=true);
|
||||
// Empty inline destructor: performs no explicit cleanup.
~SharedCache(){};
|
||||
};
|
||||
|
||||
// Component subclass; the name suggests a cache-coherence directory model
// (NOTE(review): inferred from the name only -- confirm). This header only
// declares the data members and the energy entry points; the definitions are
// not visible here.
class CCdir :public Component{
|
||||
public:
|
||||
ParseXML * XML; // parsed XML input; presumably the constructor's XML_interface, not owned here -- TODO confirm
|
||||
int ithCache; // index of this instance; presumably the constructor's ithCache_ -- TODO confirm
|
||||
InputParameter interface_ip; // held by value although the constructor receives a pointer
|
||||
// NOTE(review): the trailing "Shared cache" comment below matches the one on
// SharedCache::unicache and may be a copy-paste; confirm what dc models.
DataCache dc;//Shared cache
|
||||
ArrayST * shadow_dir; // raw pointer; ownership is not established by this header
|
||||
// cache_processor llCache,directory, directory1, inv_dir;
|
||||
|
||||
//pipeline pipeLogicCache, pipeLogicDirectory;
|
||||
|
||||
//clock_network clockNetwork;
|
||||
|
||||
double scktRatio, clockRate, executionTime; // meaning and units are not established by this header
|
||||
Component L2Tot, cc, cc1, ccTot; // Component sub-objects; presumably per-part result totals -- TODO confirm
|
||||
|
||||
// Constructs the model for the ithCache_-th instance.
CCdir(ParseXML *XML_interface, int ithCache_, InputParameter* interface_ip_);
|
||||
// is_tdp defaults to true; presumably selects peak (TDP) vs. runtime statistics -- TODO confirm.
void computeEnergy(bool is_tdp=true);
|
||||
// indent defaults to 0 and is_tdp to true; presumably prints the per-component report -- TODO confirm.
void displayEnergy(uint32_t indent = 0,bool is_tdp=true);
|
||||
// Declared here, defined out of line. NOTE(review): check that it releases shadow_dir.
~CCdir();
|
||||
};
|
||||
|
||||
#endif /* SHAREDCACHE_H_ */
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue